xref: /illumos-gate/usr/src/uts/common/fs/zfs/dsl_dataset.c (revision 622200ad88c6c6382403a01985a94e22484baac6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/dmu_objset.h>
29 #include <sys/dsl_dataset.h>
30 #include <sys/dsl_dir.h>
31 #include <sys/dmu_traverse.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/arc.h>
34 #include <sys/zio.h>
35 #include <sys/zap.h>
36 #include <sys/unique.h>
37 #include <sys/zfs_context.h>
38 
/*
 * Upper bound on a dataset's weighted open refcount.  It is also the
 * weight of an EXCLUSIVE open (see ds_refcnt_weight[] below).
 */
39 #define	DOS_REF_MAX	(1ULL << 62)
40 
/* Block size handed to bplist_create() for every dataset deadlist. */
41 #define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
42 
/*
 * Size to charge as the "uncompressed" size of a block: indirect blocks
 * (level > 0) and blocks whose object type is flagged ot_metadata are
 * charged at their physical size, everything else at its logical size.
 *
 * The expansion is a single parenthesized expression with no trailing
 * semicolon, so it can be used anywhere an expression is legal.
 */
#define	BP_GET_UCSIZE(bp) \
	((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
46 
47 /*
48  * We use weighted reference counts to express the various forms of exclusion
49  * between different open modes.  A STANDARD open is 1 point, an EXCLUSIVE open
50  * is DOS_REF_MAX, and a PRIMARY open is a little more than half of an
 * EXCLUSIVE.
51  * This makes the exclusion logic simple: the total refcnt for all opens cannot
52  * exceed DOS_REF_MAX.  For example, EXCLUSIVE opens are exclusive because their
53  * weight (DOS_REF_MAX) consumes the entire refcnt space.  PRIMARY opens consume
54  * just over half of the refcnt space, so there can't be more than one, but it
55  * can peacefully coexist with any number of STANDARD opens.
 *
 * The table is indexed by DS_MODE_LEVEL(mode).  Opens at the NONE level
 * weigh nothing, so they never affect ds_open_refcount.
56  */
57 static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
58 	0,			/* DOS_MODE_NONE - invalid		*/
59 	1,			/* DOS_MODE_STANDARD - unlimited number	*/
60 	(DOS_REF_MAX >> 1) + 1,	/* DOS_MODE_PRIMARY - only one of these	*/
61 	DOS_REF_MAX		/* DOS_MODE_EXCLUSIVE - no other opens	*/
62 };
63 
64 
65 void
66 dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
67 {
68 	int used = BP_GET_ASIZE(bp);
69 	int compressed = BP_GET_PSIZE(bp);
70 	int uncompressed = BP_GET_UCSIZE(bp);
71 
72 	dprintf_bp(bp, "born, ds=%p\n", ds);
73 
74 	ASSERT(dmu_tx_is_syncing(tx));
75 	/* It could have been compressed away to nothing */
76 	if (BP_IS_HOLE(bp))
77 		return;
78 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
79 	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
80 	if (ds == NULL) {
81 		/*
82 		 * Account for the meta-objset space in its placeholder
83 		 * dsl_dir.
84 		 */
85 		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
86 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
87 		    used, compressed, uncompressed, tx);
88 		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
89 		return;
90 	}
91 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
92 	mutex_enter(&ds->ds_lock);
93 	ds->ds_phys->ds_used_bytes += used;
94 	ds->ds_phys->ds_compressed_bytes += compressed;
95 	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
96 	ds->ds_phys->ds_unique_bytes += used;
97 	mutex_exit(&ds->ds_lock);
98 	dsl_dir_diduse_space(ds->ds_dir,
99 	    used, compressed, uncompressed, tx);
100 }
101 
/*
 * Account for the death of block 'bp' in dataset 'ds' (NULL means the
 * meta-objset).  Must be called in syncing context; holes are ignored.
 *
 * A meta-objset block is freed with arc_free() and its space is returned
 * to the pool's dp_mos_dir.  For a real dataset, a block born after the
 * most recent snapshot (blk_birth > ds_prev_snap_txg) is freed right away
 * and its space is returned to ds_dir; an older block is still referenced
 * by a snapshot, so it is appended to the dataset's deadlist instead.
 *
 * In every non-hole case *bp is zeroed before returning, and for a real
 * dataset the used/compressed/uncompressed totals are reduced.
 */
102 void
103 dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
104 {
105 	int used = BP_GET_ASIZE(bp);
106 	int compressed = BP_GET_PSIZE(bp);
107 	int uncompressed = BP_GET_UCSIZE(bp);
108 
109 	ASSERT(dmu_tx_is_syncing(tx));
110 	if (BP_IS_HOLE(bp))
111 		return;
112 
113 	ASSERT(used > 0);
114 	if (ds == NULL) {
115 		/*
116 		 * Account for the meta-objset space in its placeholder
117 		 * dsl_dir.
118 		 */
119 		/* XXX this can fail, what do we do when it does? */
120 		(void) arc_free(NULL, tx->tx_pool->dp_spa,
121 		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
122 		bzero(bp, sizeof (blkptr_t));
123 
124 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
125 		    -used, -compressed, -uncompressed, tx);
126 		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
127 		return;
128 	}
129 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
130 
131 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
132 
133 	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		/* Born after the last snapshot: nobody else references it. */
134 		dprintf_bp(bp, "freeing: %s", "");
135 		/* XXX check return code? */
136 		(void) arc_free(NULL, tx->tx_pool->dp_spa,
137 		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
138 
139 		mutex_enter(&ds->ds_lock);
140 		/* XXX unique_bytes is not accurate for head datasets */
141 		/* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
142 		ds->ds_phys->ds_unique_bytes -= used;
143 		mutex_exit(&ds->ds_lock);
144 		dsl_dir_diduse_space(ds->ds_dir,
145 		    -used, -compressed, -uncompressed, tx);
146 	} else {
147 		dprintf_bp(bp, "putting on dead list: %s", "");
148 		VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
149 		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
150 		if (ds->ds_phys->ds_prev_snap_obj != 0) {
151 			ASSERT3U(ds->ds_prev->ds_object, ==,
152 			    ds->ds_phys->ds_prev_snap_obj);
153 			ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
			/*
			 * Only credit the previous snapshot when this
			 * dataset is its direct successor and the block
			 * was born after the snapshot before it.
			 */
154 			if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
155 			    ds->ds_object &&
156 			    bp->blk_birth >
157 			    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
158 				dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
159 				mutex_enter(&ds->ds_prev->ds_lock);
160 				ds->ds_prev->ds_phys->ds_unique_bytes +=
161 				    used;
162 				mutex_exit(&ds->ds_prev->ds_lock);
163 			}
164 		}
165 	}
	/* Clear the caller's block pointer; sizes were captured above. */
166 	bzero(bp, sizeof (blkptr_t));
167 	mutex_enter(&ds->ds_lock);
168 	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
169 	ds->ds_phys->ds_used_bytes -= used;
170 	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
171 	ds->ds_phys->ds_compressed_bytes -= compressed;
172 	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
173 	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
174 	mutex_exit(&ds->ds_lock);
175 }
176 
177 uint64_t
178 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
179 {
180 	uint64_t txg;
181 	dsl_dir_t *dd;
182 
183 	if (ds == NULL)
184 		return (0);
185 	/*
186 	 * The snapshot creation could fail, but that would cause an
187 	 * incorrect FALSE return, which would only result in an
188 	 * overestimation of the amount of space that an operation would
189 	 * consume, which is OK.
190 	 *
191 	 * There's also a small window where we could miss a pending
192 	 * snapshot, because we could set the sync task in the quiescing
193 	 * phase.  So this should only be used as a guess.
194 	 */
195 	dd = ds->ds_dir;
196 	mutex_enter(&dd->dd_lock);
197 	if (dd->dd_sync_func == dsl_dataset_snapshot_sync)
198 		txg = dd->dd_sync_txg;
199 	else
200 		txg = ds->ds_phys->ds_prev_snap_txg;
201 	mutex_exit(&dd->dd_lock);
202 
203 	return (txg);
204 }
205 
206 int
207 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
208 {
209 	return (blk_birth > dsl_dataset_prev_snap_txg(ds));
210 }
211 
/*
 * Eviction callback for the dataset's bonus dbuf; registered through
 * dmu_buf_set_user_ie() in dsl_dataset_open_obj().  'dsv' is the in-core
 * dsl_dataset_t attached to the dbuf; 'db' is unused.
 *
 * Tears the in-core dataset down: drops its fsid guid from the unique
 * table, runs the user eviction callback if a user pointer is set,
 * releases the hold on the previous snapshot, closes the deadlist and
 * the dsl_dir, unlinks it from the pool's synced-objsets list and frees
 * the structure.
 */
212 /* ARGSUSED */
213 static void
214 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
215 {
216 	dsl_dataset_t *ds = dsv;
	/* Save the pool now: ds_dir is closed before dp is last used. */
217 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
218 
219 	/* open_refcount == DOS_REF_MAX when deleting */
220 	ASSERT(ds->ds_open_refcount == 0 ||
221 	    ds->ds_open_refcount == DOS_REF_MAX);
222 
223 	dprintf_ds(ds, "evicting %s\n", "");
224 
225 	unique_remove(ds->ds_phys->ds_fsid_guid);
226 
227 	if (ds->ds_user_ptr != NULL)
228 		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
229 
230 	if (ds->ds_prev) {
		/* ds_prev was opened with 'ds' itself as the tag. */
231 		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
232 		ds->ds_prev = NULL;
233 	}
234 
235 	bplist_close(&ds->ds_deadlist);
236 	dsl_dir_close(ds->ds_dir, ds);
237 
238 	if (list_link_active(&ds->ds_synced_link))
239 		list_remove(&dp->dp_synced_objsets, ds);
240 
241 	kmem_free(ds, sizeof (dsl_dataset_t));
242 }
243 
244 static int
245 dsl_dataset_get_snapname(dsl_dataset_t *ds)
246 {
247 	dsl_dataset_phys_t *headphys;
248 	int err;
249 	dmu_buf_t *headdbuf;
250 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
251 	objset_t *mos = dp->dp_meta_objset;
252 
253 	if (ds->ds_snapname[0])
254 		return (0);
255 	if (ds->ds_phys->ds_next_snap_obj == 0)
256 		return (0);
257 
258 	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
259 	    FTAG, &headdbuf);
260 	if (err)
261 		return (err);
262 	headphys = headdbuf->db_data;
263 	err = zap_value_search(dp->dp_meta_objset,
264 	    headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
265 	dmu_buf_rele(headdbuf, FTAG);
266 	return (err);
267 }
268 
/*
 * Open the dataset stored in MOS object 'dsobj'.
 *
 * dp		pool containing the dataset
 * dsobj	object number of the dataset in the meta-objset
 * snapname	optional snapshot name; if non-NULL and the dataset is not
 *		the head of its dsl_dir, it is copied into ds_snapname
 * mode		open mode; DS_MODE_LEVEL(mode) selects the refcount weight
 * tag		hold tag for the dataset's bonus buffer
 * dsp		on success, receives the opened dataset
 *
 * The caller must hold dp_config_rwlock or be in pool sync context.
 *
 * The in-core dsl_dataset_t is attached to the bonus dbuf as its user
 * data.  If the dbuf has none yet, one is constructed here; if another
 * thread attaches its own first, ours is discarded and theirs is used.
 *
 * Returns 0 on success.  Returns EBUSY if adding this open's weight
 * would push ds_open_refcount past DOS_REF_MAX, or if a PRIMARY,
 * non-restore open targets a dataset with ds_restoring set.  Otherwise
 * returns the error of whichever lookup/open step failed.
 */
269 int
270 dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
271     int mode, void *tag, dsl_dataset_t **dsp)
272 {
273 	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
274 	objset_t *mos = dp->dp_meta_objset;
275 	dmu_buf_t *dbuf;
276 	dsl_dataset_t *ds;
277 	int err;
278 
279 	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
280 	    dsl_pool_sync_context(dp));
281 
282 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
283 	if (err)
284 		return (err);
285 	ds = dmu_buf_get_user(dbuf);
286 	if (ds == NULL) {
		/* No in-core dataset attached to this dbuf yet; build one. */
287 		dsl_dataset_t *winner;
288 
289 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
290 		ds->ds_dbuf = dbuf;
291 		ds->ds_object = dsobj;
292 		ds->ds_phys = dbuf->db_data;
293 
294 		err = bplist_open(&ds->ds_deadlist,
295 		    mos, ds->ds_phys->ds_deadlist_obj);
296 		if (err == 0) {
297 			err = dsl_dir_open_obj(dp,
298 			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
299 		}
300 		if (err) {
301 			/*
302 			 * we don't really need to close the bplist if we
303 			 * just opened it.
304 			 */
305 			kmem_free(ds, sizeof (dsl_dataset_t));
306 			dmu_buf_rele(dbuf, tag);
307 			return (err);
308 		}
309 
310 		if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
			/* Head dataset: no snapshot name; hold its prev. */
311 			ds->ds_snapname[0] = '\0';
312 			if (ds->ds_phys->ds_prev_snap_obj) {
313 				err = dsl_dataset_open_obj(dp,
314 				    ds->ds_phys->ds_prev_snap_obj, NULL,
315 				    DS_MODE_NONE, ds, &ds->ds_prev);
316 			}
317 		} else {
318 			if (snapname) {
				/*
				 * NOTE(review): in ZFS_DEBUG builds the
				 * check below leaves any lookup failure in
				 * 'err', which fails the open further down,
				 * and foundobj is compared even when
				 * zap_lookup() failed.  Non-debug builds
				 * skip all of this.  Confirm this is
				 * intended.
				 */
319 #ifdef ZFS_DEBUG
320 				dsl_dataset_phys_t *headphys;
321 				dmu_buf_t *headdbuf;
322 				err = dmu_bonus_hold(mos,
323 				    ds->ds_dir->dd_phys->dd_head_dataset_obj,
324 				    FTAG, &headdbuf);
325 				if (err == 0) {
326 					headphys = headdbuf->db_data;
327 					uint64_t foundobj;
328 					err = zap_lookup(dp->dp_meta_objset,
329 					    headphys->ds_snapnames_zapobj,
330 					    snapname, sizeof (foundobj), 1,
331 					    &foundobj);
332 					ASSERT3U(foundobj, ==, dsobj);
333 					dmu_buf_rele(headdbuf, FTAG);
334 				}
335 #endif
				/* ds was zero-filled, so this is a copy. */
336 				(void) strcat(ds->ds_snapname, snapname);
337 			} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
338 				err = dsl_dataset_get_snapname(ds);
339 			}
340 		}
341 
342 		if (err == 0) {
343 			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
344 			    dsl_dataset_evict);
345 		}
		/*
		 * 'winner' is assigned only when err == 0; the || below
		 * short-circuits, so it is never read uninitialized.
		 */
346 		if (err || winner) {
			/* Failed, or lost the race: discard our copy. */
347 			bplist_close(&ds->ds_deadlist);
348 			if (ds->ds_prev) {
349 				dsl_dataset_close(ds->ds_prev,
350 				    DS_MODE_NONE, ds);
351 			}
352 			dsl_dir_close(ds->ds_dir, ds);
353 			kmem_free(ds, sizeof (dsl_dataset_t));
354 			if (err) {
355 				dmu_buf_rele(dbuf, tag);
356 				return (err);
357 			}
358 			ds = winner;
359 		} else {
360 			uint64_t new =
361 			    unique_insert(ds->ds_phys->ds_fsid_guid);
362 			if (new != ds->ds_phys->ds_fsid_guid) {
363 				/* XXX it won't necessarily be synced... */
364 				ds->ds_phys->ds_fsid_guid = new;
365 			}
366 		}
367 	}
368 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
369 	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
370 
371 	mutex_enter(&ds->ds_lock);
372 	if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
373 	    ds->ds_phys->ds_restoring && !DS_MODE_IS_RESTORE(mode)) ||
374 	    (ds->ds_open_refcount + weight > DOS_REF_MAX)) {
375 		mutex_exit(&ds->ds_lock);
		/*
		 * Our weight was never added, so close with DS_MODE_NONE:
		 * this only drops the dbuf hold taken above.
		 */
376 		dsl_dataset_close(ds, DS_MODE_NONE, tag);
377 		return (EBUSY);
378 	}
379 	ds->ds_open_refcount += weight;
380 	mutex_exit(&ds->ds_lock);
381 
382 	*dsp = ds;
383 	return (0);
384 }
385 
/*
 * Open a dataset by name.
 *
 * spa	passed through to dsl_dir_open_spa()
 * name	dataset name; may end in "@snapshot"
 * mode	open mode, passed on to dsl_dataset_open_obj()
 * tag	hold tag for the opened dataset
 * dsp	receives the dataset on success, NULL on failure
 *
 * If dsl_dir_open_spa() leaves a tail, it must begin with '@' and name a
 * snapshot, which is looked up in the head dataset's snapshot-name ZAP.
 *
 * Returns 0 on success.  Returns ENOENT if the dsl_dir has no head
 * dataset or the tail does not begin with '@', EROFS if a snapshot is
 * opened with a mode that is not read-only, or the error from the
 * underlying open/lookup calls.
 */
386 int
387 dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
388     void *tag, dsl_dataset_t **dsp)
389 {
390 	dsl_dir_t *dd;
391 	dsl_pool_t *dp;
392 	const char *tail;
393 	uint64_t obj;
394 	dsl_dataset_t *ds = NULL;
395 	int err = 0;
396 
397 	err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
398 	if (err)
399 		return (err);
400 
401 	dp = dd->dd_pool;
402 	obj = dd->dd_phys->dd_head_dataset_obj;
403 	rw_enter(&dp->dp_config_rwlock, RW_READER);
404 	if (obj == 0) {
405 		/* A dataset with no associated objset */
406 		err = ENOENT;
407 		goto out;
408 	}
409 
410 	if (tail != NULL) {
411 		objset_t *mos = dp->dp_meta_objset;
412 
		/*
		 * Briefly open the head dataset just to learn the object
		 * number of its snapshot-name ZAP; 'obj' is reused for it.
		 */
413 		err = dsl_dataset_open_obj(dp, obj, NULL,
414 		    DS_MODE_NONE, tag, &ds);
415 		if (err)
416 			goto out;
417 		obj = ds->ds_phys->ds_snapnames_zapobj;
418 		dsl_dataset_close(ds, DS_MODE_NONE, tag);
419 		ds = NULL;
420 
421 		if (tail[0] != '@') {
422 			err = ENOENT;
423 			goto out;
424 		}
425 		tail++;
426 
427 		/* Look for a snapshot */
428 		if (!DS_MODE_IS_READONLY(mode)) {
429 			err = EROFS;
430 			goto out;
431 		}
432 		dprintf("looking for snapshot '%s'\n", tail);
		/* On success 'obj' becomes the snapshot's object number. */
433 		err = zap_lookup(mos, obj, tail, 8, 1, &obj);
434 		if (err)
435 			goto out;
436 	}
437 	err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
438 
439 out:
440 	rw_exit(&dp->dp_config_rwlock);
441 	dsl_dir_close(dd, FTAG);
442 
443 	ASSERT3U((err == 0), ==, (ds != NULL));
444 	/* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
445 
446 	*dsp = ds;
447 	return (err);
448 }
449 
450 int
451 dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
452 {
453 	return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
454 }
455 
456 void
457 dsl_dataset_name(dsl_dataset_t *ds, char *name)
458 {
459 	if (ds == NULL) {
460 		(void) strcpy(name, "mos");
461 	} else {
462 		dsl_dir_name(ds->ds_dir, name);
463 		VERIFY(0 == dsl_dataset_get_snapname(ds));
464 		if (ds->ds_snapname[0]) {
465 			(void) strcat(name, "@");
466 			if (!MUTEX_HELD(&ds->ds_lock)) {
467 				/*
468 				 * We use a "recursive" mutex so that we
469 				 * can call dprintf_ds() with ds_lock held.
470 				 */
471 				mutex_enter(&ds->ds_lock);
472 				(void) strcat(name, ds->ds_snapname);
473 				mutex_exit(&ds->ds_lock);
474 			} else {
475 				(void) strcat(name, ds->ds_snapname);
476 			}
477 		}
478 	}
479 }
480 
481 void
482 dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
483 {
484 	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
485 	mutex_enter(&ds->ds_lock);
486 	ASSERT3U(ds->ds_open_refcount, >=, weight);
487 	ds->ds_open_refcount -= weight;
488 	dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
489 	    mode, ds->ds_open_refcount);
490 	mutex_exit(&ds->ds_lock);
491 
492 	dmu_buf_rele(ds->ds_dbuf, tag);
493 }
494 
/*
 * Create the pool's root dsl_dir and its head dataset.
 *
 * dp		pool being created
 * ddobjp	receives the object number of the new root dsl_dir
 * tx		transaction the on-disk objects are created in
 *
 * Allocates and initializes the dataset object in the meta-objset,
 * points the dsl_dir's dd_head_dataset_obj at it, then opens the new
 * dataset to create a DMU_OST_ZFS objset in it.  Every failure here is
 * fatal (VERIFY); the result of dmu_objset_create_impl() is ignored.
 */
495 void
496 dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
497 {
498 	objset_t *mos = dp->dp_meta_objset;
499 	dmu_buf_t *dbuf;
500 	dsl_dataset_phys_t *dsphys;
501 	dsl_dataset_t *ds;
502 	uint64_t dsobj;
503 	dsl_dir_t *dd;
504 
505 	dsl_dir_create_root(mos, ddobjp, tx);
506 	VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
507 
508 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
509 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
510 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	/* Dirty the bonus buffer before writing the on-disk fields. */
511 	dmu_buf_will_dirty(dbuf, tx);
512 	dsphys = dbuf->db_data;
513 	dsphys->ds_dir_obj = dd->dd_object;
514 	dsphys->ds_fsid_guid = unique_create();
515 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
516 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
517 	    sizeof (dsphys->ds_guid));
518 	dsphys->ds_snapnames_zapobj =
519 	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
520 	dsphys->ds_creation_time = gethrestime_sec();
521 	dsphys->ds_creation_txg = tx->tx_txg;
522 	dsphys->ds_deadlist_obj =
523 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
524 	dmu_buf_rele(dbuf, FTAG);
525 
	/* Make the new dataset the head of the root dsl_dir. */
526 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
527 	dd->dd_phys->dd_head_dataset_obj = dsobj;
528 	dsl_dir_close(dd, FTAG);
529 
530 	VERIFY(0 ==
531 	    dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
532 	(void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx);
533 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
534 }
535 
/*
 * Create a new dsl_dir named 'lastname' under 'pds', together with its
 * head dataset.  Must be called in syncing context.
 *
 * pds		parent dsl_dir
 * fullname	full name of the new dataset; used to reopen the new dir
 * lastname	final name component; must not begin with '@'
 * clone_parent	snapshot to clone from, or NULL for an empty dataset
 * tx		syncing transaction
 *
 * When cloning, the new dataset starts out with the snapshot's block
 * pointer and space totals, the snapshot's ds_num_children is bumped and
 * the new dir records it in dd_clone_parent_obj.
 *
 * Returns 0 on success.  Returns EXDEV if clone_parent is in another
 * pool, EINVAL if clone_parent is not a snapshot (ds_num_children == 0),
 * or the error from dsl_dir_create_sync().
 */
536 int
537 dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
538     const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
539 {
540 	int err;
541 	dsl_pool_t *dp = pds->dd_pool;
542 	dmu_buf_t *dbuf;
543 	dsl_dataset_phys_t *dsphys;
544 	uint64_t dsobj;
545 	objset_t *mos = dp->dp_meta_objset;
546 	dsl_dir_t *dd;
547 
548 	if (clone_parent != NULL) {
549 		/*
550 		 * You can't clone across pools.
551 		 */
552 		if (clone_parent->ds_dir->dd_pool != dp)
553 			return (EXDEV);
554 
555 		/*
556 		 * You can only clone snapshots, not the head datasets.
557 		 */
558 		if (clone_parent->ds_phys->ds_num_children == 0)
559 			return (EINVAL);
560 	}
561 
562 	ASSERT(lastname[0] != '@');
563 	ASSERT(dmu_tx_is_syncing(tx));
564 
565 	err = dsl_dir_create_sync(pds, lastname, tx);
566 	if (err)
567 		return (err);
568 	VERIFY(0 == dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, &dd, NULL));
569 
570 	/* This is the point of no (unsuccessful) return */
571 
572 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
573 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
574 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	/* Dirty the bonus buffer before writing the on-disk fields. */
575 	dmu_buf_will_dirty(dbuf, tx);
576 	dsphys = dbuf->db_data;
577 	dsphys->ds_dir_obj = dd->dd_object;
578 	dsphys->ds_fsid_guid = unique_create();
579 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
580 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
581 	    sizeof (dsphys->ds_guid));
582 	dsphys->ds_snapnames_zapobj =
583 	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
584 	dsphys->ds_creation_time = gethrestime_sec();
585 	dsphys->ds_creation_txg = tx->tx_txg;
586 	dsphys->ds_deadlist_obj =
587 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
588 	if (clone_parent) {
		/* Start the clone with the origin snapshot's contents. */
589 		dsphys->ds_prev_snap_obj = clone_parent->ds_object;
590 		dsphys->ds_prev_snap_txg =
591 		    clone_parent->ds_phys->ds_creation_txg;
592 		dsphys->ds_used_bytes =
593 		    clone_parent->ds_phys->ds_used_bytes;
594 		dsphys->ds_compressed_bytes =
595 		    clone_parent->ds_phys->ds_compressed_bytes;
596 		dsphys->ds_uncompressed_bytes =
597 		    clone_parent->ds_phys->ds_uncompressed_bytes;
598 		dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
599 
600 		dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
601 		clone_parent->ds_phys->ds_num_children++;
602 
603 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
604 		dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
605 	}
606 	dmu_buf_rele(dbuf, FTAG);
607 
	/* Make the new dataset the head of the new dsl_dir. */
608 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
609 	dd->dd_phys->dd_head_dataset_obj = dsobj;
610 	dsl_dir_close(dd, FTAG);
611 
612 	return (0);
613 }
614 
615 
/*
 * Destroy the dataset or snapshot called 'name'.
 *
 * For "dir@snap" the snapshot is destroyed by running
 * dsl_dataset_destroy_sync() as a sync task on the dsl_dir.  For a plain
 * name, the dsl_dir and its head dataset are destroyed by running
 * dsl_dir_destroy_sync() on the parent dsl_dir, passing the last
 * component of the name.  In both cases the task is preceded by
 * txg_wait_synced() and retried for as long as it returns EAGAIN.
 *
 * Returns 0 on success.  Returns ENOENT if the name has a tail that does
 * not begin with '@', EINVAL if the dsl_dir has no parent, or the error
 * from the open or sync-task calls.
 */
616 int
617 dsl_dataset_destroy(const char *name)
618 {
619 	int err;
620 	dsl_pool_t *dp;
621 	dsl_dir_t *dd;
622 	const char *tail;
623 
624 	err = dsl_dir_open(name, FTAG, &dd, &tail);
625 	if (err)
626 		return (err);
627 
628 	dp = dd->dd_pool;
629 	if (tail != NULL) {
630 		if (tail[0] != '@') {
631 			dsl_dir_close(dd, FTAG);
632 			return (ENOENT);
633 		}
634 		tail++;
635 		/* Just blow away the snapshot */
636 		do {
637 			txg_wait_synced(dp, 0);
638 			err = dsl_dir_sync_task(dd,
639 			    dsl_dataset_destroy_sync, (void*)tail, 0);
640 		} while (err == EAGAIN);
641 		dsl_dir_close(dd, FTAG);
642 	} else {
643 		char buf[MAXNAMELEN];
644 		char *cp;
645 
646 		dsl_dir_t *pds;
647 		if (dd->dd_phys->dd_parent_obj == 0) {
648 			dsl_dir_close(dd, FTAG);
649 			return (EINVAL);
650 		}
651 		/*
652 		 * Make sure it's not dirty before we destroy it.
653 		 */
654 		txg_wait_synced(dd->dd_pool, 0);
655 		/*
656 		 * Blow away the dsl_dir + head dataset.
657 		 * dsl_dir_destroy_sync() will call
658 		 * dsl_dataset_destroy_sync() to destroy the head dataset.
659 		 */
660 		rw_enter(&dp->dp_config_rwlock, RW_READER);
661 		err = dsl_dir_open_obj(dd->dd_pool,
662 		    dd->dd_phys->dd_parent_obj, NULL, FTAG, &pds);
		/* dd is not used after this point; dp was saved above. */
663 		dsl_dir_close(dd, FTAG);
664 		rw_exit(&dp->dp_config_rwlock);
665 		if (err)
666 			return (err);
667 
		/*
		 * cp points at the last component of the name.
		 * NOTE(review): assumes 'name' contains a '/' whenever the
		 * dsl_dir has a parent (strrchr() result is not checked)
		 * and that it fits in MAXNAMELEN -- confirm with callers.
		 */
668 		(void) strcpy(buf, name);
669 		cp = strrchr(buf, '/') + 1;
670 		ASSERT(cp[0] != '\0');
671 		do {
672 			txg_wait_synced(dp, 0);
673 			err = dsl_dir_sync_task(pds,
674 			    dsl_dir_destroy_sync, cp, 0);
675 		} while (err == EAGAIN);
676 		dsl_dir_close(pds, FTAG);
677 	}
678 
679 	return (err);
680 }
681 
682 int
683 dsl_dataset_rollback(const char *name)
684 {
685 	int err;
686 	dsl_dir_t *dd;
687 	const char *tail;
688 
689 	err = dsl_dir_open(name, FTAG, &dd, &tail);
690 	if (err)
691 		return (err);
692 
693 	if (tail != NULL) {
694 		dsl_dir_close(dd, FTAG);
695 		return (EINVAL);
696 	}
697 	do {
698 		txg_wait_synced(dd->dd_pool, 0);
699 		err = dsl_dir_sync_task(dd,
700 		    dsl_dataset_rollback_sync, NULL, 0);
701 	} while (err == EAGAIN);
702 	dsl_dir_close(dd, FTAG);
703 
704 	return (err);
705 }
706 
707 void *
708 dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
709     void *p, dsl_dataset_evict_func_t func)
710 {
711 	void *old;
712 
713 	mutex_enter(&ds->ds_lock);
714 	old = ds->ds_user_ptr;
715 	if (old == NULL) {
716 		ds->ds_user_ptr = p;
717 		ds->ds_user_evict_func = func;
718 	}
719 	mutex_exit(&ds->ds_lock);
720 	return (old);
721 }
722 
723 void *
724 dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
725 {
726 	return (ds->ds_user_ptr);
727 }
728 
729 
730 void
731 dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp)
732 {
733 	*bp = ds->ds_phys->ds_bp;
734 }
735 
736 void
737 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
738 {
739 	ASSERT(dmu_tx_is_syncing(tx));
740 	/* If it's the meta-objset, set dp_meta_rootbp */
741 	if (ds == NULL) {
742 		tx->tx_pool->dp_meta_rootbp = *bp;
743 	} else {
744 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
745 		ds->ds_phys->ds_bp = *bp;
746 	}
747 }
748 
749 spa_t *
750 dsl_dataset_get_spa(dsl_dataset_t *ds)
751 {
752 	return (ds->ds_dir->dd_pool->dp_spa);
753 }
754 
755 void
756 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
757 {
758 	dsl_pool_t *dp;
759 
760 	if (ds == NULL) /* this is the meta-objset */
761 		return;
762 
763 	ASSERT(ds->ds_user_ptr != NULL);
764 	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
765 
766 	dp = ds->ds_dir->dd_pool;
767 
768 	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
769 		/* up the hold count until we can be written out */
770 		dmu_buf_add_ref(ds->ds_dbuf, ds);
771 	}
772 }
773 
/*
 * Argument block for kill_blkptr(): where to accumulate the sizes of the
 * blocks it frees, and what to free them with.
 */
774 struct killarg {
775 	uint64_t *usedp;		/* += BP_GET_ASIZE() per block */
776 	uint64_t *compressedp;		/* += BP_GET_PSIZE() per block */
777 	uint64_t *uncompressedp;	/* += BP_GET_UCSIZE() per block */
778 	zio_t *zio;			/* zio handed to arc_free() */
779 	dmu_tx_t *tx;			/* its tx_txg is the freeing txg */
780 };
781 
782 static int
783 kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
784 {
785 	struct killarg *ka = arg;
786 	blkptr_t *bp = &bc->bc_blkptr;
787 
788 	ASSERT3U(bc->bc_errno, ==, 0);
789 
790 	/*
791 	 * Since this callback is not called concurrently, no lock is
792 	 * needed on the accounting values.
793 	 */
794 	*ka->usedp += BP_GET_ASIZE(bp);
795 	*ka->compressedp += BP_GET_PSIZE(bp);
796 	*ka->uncompressedp += BP_GET_UCSIZE(bp);
797 	/* XXX check for EIO? */
798 	(void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
799 	    ARC_NOWAIT);
800 	return (0);
801 }
802 
/*
 * Sync task that rolls the head dataset of 'dd' back to its most recent
 * snapshot.  'arg' is unused.
 *
 * The deadlist is replaced with an empty one, every block born after
 * the previous snapshot is freed (returning that space to 'dd'), and
 * the dataset's block pointer and space totals are reset to those of
 * the previous snapshot.
 *
 * Returns 0 on success.  Returns EINVAL if 'dd' has no head dataset or
 * the dataset has no previous snapshot, EBUSY if the dataset has any
 * weighted opens, EAGAIN if the dataset was modified in this txg (the
 * caller retries), or the error from dsl_dataset_open_obj().
 */
803 /* ARGSUSED */
804 int
805 dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
806 {
807 	objset_t *mos = dd->dd_pool->dp_meta_objset;
808 	dsl_dataset_t *ds;
809 	int err;
810 
811 	if (dd->dd_phys->dd_head_dataset_obj == 0)
812 		return (EINVAL);
813 	err = dsl_dataset_open_obj(dd->dd_pool,
814 	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &ds);
815 	if (err)
816 		return (err);
817 
818 	if (ds->ds_phys->ds_prev_snap_txg == 0) {
819 		/*
820 		 * There's no previous snapshot.  I suppose we could
821 		 * roll it back to being empty (and re-initialize the
822 		 * upper (ZPL) layer).  But for now there's no way to do
823 		 * this via the user interface.
824 		 */
825 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
826 		return (EINVAL);
827 	}
828 
829 	mutex_enter(&ds->ds_lock);
	/* Our own open is DS_MODE_NONE (weight 0), so it is not counted. */
830 	if (ds->ds_open_refcount > 0) {
831 		mutex_exit(&ds->ds_lock);
832 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
833 		return (EBUSY);
834 	}
835 
836 	/*
837 	 * If we made changes this txg, traverse_dsl_dataset won't find
838 	 * them.  Try again.
839 	 */
840 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
841 		mutex_exit(&ds->ds_lock);
842 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
843 		return (EAGAIN);
844 	}
845 
846 	/* THE POINT OF NO (unsuccessful) RETURN */
	/*
	 * With the refcount at DOS_REF_MAX, any open that carries weight
	 * fails with EBUSY in dsl_dataset_open_obj() until it is reset
	 * to 0 below.
	 */
847 	ds->ds_open_refcount = DOS_REF_MAX;
848 	mutex_exit(&ds->ds_lock);
849 
850 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
851 
852 	/* Zero out the deadlist. */
853 	dprintf("old deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
854 	bplist_close(&ds->ds_deadlist);
855 	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
856 	ds->ds_phys->ds_deadlist_obj =
857 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
858 	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
859 	    ds->ds_phys->ds_deadlist_obj));
860 	dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
861 
862 	{
863 		/* Free blkptrs that we gave birth to */
864 		zio_t *zio;
865 		uint64_t used = 0, compressed = 0, uncompressed = 0;
866 		struct killarg ka;
867 
868 		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
869 		    ZIO_FLAG_MUSTSUCCEED);
870 		ka.usedp = &used;
871 		ka.compressedp = &compressed;
872 		ka.uncompressedp = &uncompressed;
873 		ka.zio = zio;
874 		ka.tx = tx;
		/* Traversal and zio_wait() results are both ignored. */
875 		(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
876 		    ADVANCE_POST, kill_blkptr, &ka);
877 		(void) zio_wait(zio);
878 
		/* Return the freed space to the dsl_dir. */
879 		dsl_dir_diduse_space(dd,
880 		    -used, -compressed, -uncompressed, tx);
881 	}
882 
883 	/* Change our contents to that of the prev snapshot (finally!) */
884 	ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
885 	ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
886 	ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
887 	ds->ds_phys->ds_compressed_bytes =
888 	    ds->ds_prev->ds_phys->ds_compressed_bytes;
889 	ds->ds_phys->ds_uncompressed_bytes =
890 	    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
891 	ds->ds_phys->ds_restoring = ds->ds_prev->ds_phys->ds_restoring;
892 	ds->ds_phys->ds_unique_bytes = 0;
893 
894 	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
895 	ds->ds_prev->ds_phys->ds_unique_bytes = 0;
896 
897 	dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
	/* Undo the DOS_REF_MAX set above (written without ds_lock). */
898 	ds->ds_open_refcount = 0;
899 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
900 
901 	return (0);
902 }
903 
904 int
905 dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
906 {
907 	const char *snapname = arg;
908 	uint64_t used = 0, compressed = 0, uncompressed = 0;
909 	blkptr_t bp;
910 	zio_t *zio;
911 	int err;
912 	int after_branch_point = FALSE;
913 	int drop_lock = FALSE;
914 	dsl_pool_t *dp = dd->dd_pool;
915 	objset_t *mos = dp->dp_meta_objset;
916 	dsl_dataset_t *ds, *ds_prev = NULL;
917 	uint64_t obj;
918 
919 	if (dd->dd_phys->dd_head_dataset_obj == 0)
920 		return (EINVAL);
921 
922 	if (!RW_WRITE_HELD(&dp->dp_config_rwlock)) {
923 		rw_enter(&dp->dp_config_rwlock, RW_WRITER);
924 		drop_lock = TRUE;
925 	}
926 
927 	err = dsl_dataset_open_obj(dd->dd_pool,
928 	    dd->dd_phys->dd_head_dataset_obj, NULL,
929 	    snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG, &ds);
930 
931 	if (err == 0 && snapname) {
932 		err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
933 		    snapname, 8, 1, &obj);
934 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
935 		if (err == 0) {
936 			err = dsl_dataset_open_obj(dd->dd_pool, obj, NULL,
937 			    DS_MODE_EXCLUSIVE, FTAG, &ds);
938 		}
939 	}
940 	if (err) {
941 		if (drop_lock)
942 			rw_exit(&dp->dp_config_rwlock);
943 		return (err);
944 	}
945 
946 	obj = ds->ds_object;
947 
948 	/* Can't delete a branch point. */
949 	if (ds->ds_phys->ds_num_children > 1) {
950 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
951 		if (drop_lock)
952 			rw_exit(&dp->dp_config_rwlock);
953 		return (EINVAL);
954 	}
955 
956 	/*
957 	 * Can't delete a head dataset if there are snapshots of it.
958 	 * (Except if the only snapshots are from the branch we cloned
959 	 * from.)
960 	 */
961 	if (ds->ds_prev != NULL &&
962 	    ds->ds_prev->ds_phys->ds_next_snap_obj == obj) {
963 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
964 		if (drop_lock)
965 			rw_exit(&dp->dp_config_rwlock);
966 		return (EINVAL);
967 	}
968 
969 	/*
970 	 * If we made changes this txg, traverse_dsl_dataset won't find
971 	 * them.  Try again.
972 	 */
973 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
974 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
975 		if (drop_lock)
976 			rw_exit(&dp->dp_config_rwlock);
977 		return (EAGAIN);
978 	}
979 
980 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
981 		if (ds->ds_prev) {
982 			ds_prev = ds->ds_prev;
983 		} else {
984 			err = dsl_dataset_open_obj(dd->dd_pool,
985 			    ds->ds_phys->ds_prev_snap_obj, NULL,
986 			    DS_MODE_NONE, FTAG, &ds_prev);
987 			if (err) {
988 				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
989 				if (drop_lock)
990 					rw_exit(&dp->dp_config_rwlock);
991 				return (err);
992 			}
993 		}
994 		after_branch_point =
995 		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
996 
997 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
998 		if (after_branch_point &&
999 		    ds->ds_phys->ds_next_snap_obj == 0) {
1000 			/* This clone is toast. */
1001 			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1002 			ds_prev->ds_phys->ds_num_children--;
1003 		} else if (!after_branch_point) {
1004 			ds_prev->ds_phys->ds_next_snap_obj =
1005 			    ds->ds_phys->ds_next_snap_obj;
1006 		}
1007 	}
1008 
1009 	/* THE POINT OF NO (unsuccessful) RETURN */
1010 
1011 	ASSERT3P(tx->tx_pool, ==, dd->dd_pool);
1012 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1013 
1014 	if (ds->ds_phys->ds_next_snap_obj != 0) {
1015 		dsl_dataset_t *ds_next;
1016 		uint64_t itor = 0;
1017 
1018 		spa_scrub_restart(dp->dp_spa, tx->tx_txg);
1019 
1020 		VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
1021 		    ds->ds_phys->ds_next_snap_obj, NULL,
1022 		    DS_MODE_NONE, FTAG, &ds_next));
1023 		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1024 
1025 		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1026 		ds_next->ds_phys->ds_prev_snap_obj =
1027 		    ds->ds_phys->ds_prev_snap_obj;
1028 		ds_next->ds_phys->ds_prev_snap_txg =
1029 		    ds->ds_phys->ds_prev_snap_txg;
1030 		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1031 		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1032 
1033 		/*
1034 		 * Transfer to our deadlist (which will become next's
1035 		 * new deadlist) any entries from next's current
1036 		 * deadlist which were born before prev, and free the
1037 		 * other entries.
1038 		 *
1039 		 * XXX we're doing this long task with the config lock held
1040 		 */
1041 		while (bplist_iterate(&ds_next->ds_deadlist, &itor,
1042 		    &bp) == 0) {
1043 			if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
1044 				VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
1045 				    &bp, tx));
1046 				if (ds_prev && !after_branch_point &&
1047 				    bp.blk_birth >
1048 				    ds_prev->ds_phys->ds_prev_snap_txg) {
1049 					ds_prev->ds_phys->ds_unique_bytes +=
1050 					    BP_GET_ASIZE(&bp);
1051 				}
1052 			} else {
1053 				used += BP_GET_ASIZE(&bp);
1054 				compressed += BP_GET_PSIZE(&bp);
1055 				uncompressed += BP_GET_UCSIZE(&bp);
1056 				/* XXX check return value? */
1057 				(void) arc_free(zio, dp->dp_spa, tx->tx_txg,
1058 				    &bp, NULL, NULL, ARC_NOWAIT);
1059 			}
1060 		}
1061 
1062 		/* free next's deadlist */
1063 		bplist_close(&ds_next->ds_deadlist);
1064 		bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
1065 
1066 		/* set next's deadlist to our deadlist */
1067 		ds_next->ds_phys->ds_deadlist_obj =
1068 		    ds->ds_phys->ds_deadlist_obj;
1069 		VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
1070 		    ds_next->ds_phys->ds_deadlist_obj));
1071 		ds->ds_phys->ds_deadlist_obj = 0;
1072 
1073 		if (ds_next->ds_phys->ds_next_snap_obj != 0) {
1074 			/*
1075 			 * Update next's unique to include blocks which
1076 			 * were previously shared by only this snapshot
1077 			 * and it.  Those blocks will be born after the
1078 			 * prev snap and before this snap, and will have
1079 			 * died after the next snap and before the one
1080 			 * after that (ie. be on the snap after next's
1081 			 * deadlist).
1082 			 *
1083 			 * XXX we're doing this long task with the
1084 			 * config lock held
1085 			 */
1086 			dsl_dataset_t *ds_after_next;
1087 
1088 			VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
1089 			    ds_next->ds_phys->ds_next_snap_obj, NULL,
1090 			    DS_MODE_NONE, FTAG, &ds_after_next));
1091 			itor = 0;
1092 			while (bplist_iterate(&ds_after_next->ds_deadlist,
1093 			    &itor, &bp) == 0) {
1094 				if (bp.blk_birth >
1095 				    ds->ds_phys->ds_prev_snap_txg &&
1096 				    bp.blk_birth <=
1097 				    ds->ds_phys->ds_creation_txg) {
1098 					ds_next->ds_phys->ds_unique_bytes +=
1099 					    BP_GET_ASIZE(&bp);
1100 				}
1101 			}
1102 
1103 			dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
1104 			ASSERT3P(ds_next->ds_prev, ==, NULL);
1105 		} else {
1106 			/*
1107 			 * It would be nice to update the head dataset's
1108 			 * unique.  To do so we would have to traverse
1109 			 * it for blocks born after ds_prev, which is
1110 			 * pretty expensive just to maintain something
1111 			 * for debugging purposes.
1112 			 */
1113 			ASSERT3P(ds_next->ds_prev, ==, ds);
1114 			dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
1115 			    ds_next);
1116 			if (ds_prev) {
1117 				VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
1118 				    ds->ds_phys->ds_prev_snap_obj, NULL,
1119 				    DS_MODE_NONE, ds_next, &ds_next->ds_prev));
1120 			} else {
1121 				ds_next->ds_prev = NULL;
1122 			}
1123 		}
1124 		dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
1125 
1126 		/*
1127 		 * NB: unique_bytes is not accurate for head objsets
1128 		 * because we don't update it when we delete the most
1129 		 * recent snapshot -- see above comment.
1130 		 */
1131 		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
1132 	} else {
1133 		/*
1134 		 * There's no next snapshot, so this is a head dataset.
1135 		 * Destroy the deadlist.  Unless it's a clone, the
1136 		 * deadlist should be empty.  (If it's a clone, it's
1137 		 * safe to ignore the deadlist contents.)
1138 		 */
1139 		struct killarg ka;
1140 
1141 		ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
1142 		bplist_close(&ds->ds_deadlist);
1143 		bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
1144 		ds->ds_phys->ds_deadlist_obj = 0;
1145 
1146 		/*
1147 		 * Free everything that we point to (that's born after
1148 		 * the previous snapshot, if we are a clone)
1149 		 *
1150 		 * XXX we're doing this long task with the config lock held
1151 		 */
1152 		ka.usedp = &used;
1153 		ka.compressedp = &compressed;
1154 		ka.uncompressedp = &uncompressed;
1155 		ka.zio = zio;
1156 		ka.tx = tx;
1157 		err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
1158 		    ADVANCE_POST, kill_blkptr, &ka);
1159 		ASSERT3U(err, ==, 0);
1160 	}
1161 
1162 	err = zio_wait(zio);
1163 	ASSERT3U(err, ==, 0);
1164 
1165 	dsl_dir_diduse_space(dd, -used, -compressed, -uncompressed, tx);
1166 
1167 	if (ds->ds_phys->ds_snapnames_zapobj) {
1168 		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1169 		ASSERT(err == 0);
1170 	}
1171 
1172 	if (dd->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1173 		/* Erase the link in the dataset */
1174 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
1175 		dd->dd_phys->dd_head_dataset_obj = 0;
1176 		/*
1177 		 * dsl_dir_sync_destroy() called us, they'll destroy
1178 		 * the dataset.
1179 		 */
1180 	} else {
1181 		/* remove from snapshot namespace */
1182 		dsl_dataset_t *ds_head;
1183 		VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
1184 		    dd->dd_phys->dd_head_dataset_obj, NULL,
1185 		    DS_MODE_NONE, FTAG, &ds_head));
1186 #ifdef ZFS_DEBUG
1187 		{
1188 			uint64_t val;
1189 			err = zap_lookup(mos,
1190 			    ds_head->ds_phys->ds_snapnames_zapobj,
1191 			    snapname, 8, 1, &val);
1192 			ASSERT3U(err, ==, 0);
1193 			ASSERT3U(val, ==, obj);
1194 		}
1195 #endif
1196 		err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
1197 		    snapname, tx);
1198 		ASSERT(err == 0);
1199 		dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
1200 	}
1201 
1202 	if (ds_prev && ds->ds_prev != ds_prev)
1203 		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
1204 
1205 	err = dmu_object_free(mos, obj, tx);
1206 	ASSERT(err == 0);
1207 
1208 	/*
1209 	 * Close the objset with mode NONE, thus leaving it with
1210 	 * DOS_REF_MAX set, so that noone can access it.
1211 	 */
1212 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
1213 
1214 	if (drop_lock)
1215 		rw_exit(&dp->dp_config_rwlock);
1216 	return (0);
1217 }
1218 
1219 int
1220 dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
1221 {
1222 	const char *snapname = arg;
1223 	dsl_pool_t *dp = dd->dd_pool;
1224 	dmu_buf_t *dbuf;
1225 	dsl_dataset_phys_t *dsphys;
1226 	uint64_t dsobj, value;
1227 	objset_t *mos = dp->dp_meta_objset;
1228 	dsl_dataset_t *ds;
1229 	int err;
1230 
1231 	ASSERT(dmu_tx_is_syncing(tx));
1232 
1233 	if (dd->dd_phys->dd_head_dataset_obj == 0)
1234 		return (EINVAL);
1235 	err = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL,
1236 	    DS_MODE_NONE, FTAG, &ds);
1237 	if (err)
1238 		return (err);
1239 
1240 	err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
1241 	    snapname, 8, 1, &value);
1242 	if (err == 0) {
1243 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
1244 		return (EEXIST);
1245 	}
1246 	ASSERT(err == ENOENT);
1247 
1248 	/* The point of no (unsuccessful) return */
1249 
1250 	dprintf_dd(dd, "taking snapshot %s in txg %llu\n",
1251 	    snapname, tx->tx_txg);
1252 
1253 	spa_scrub_restart(dp->dp_spa, tx->tx_txg);
1254 
1255 	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
1256 
1257 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
1258 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
1259 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
1260 	dmu_buf_will_dirty(dbuf, tx);
1261 	dsphys = dbuf->db_data;
1262 	dsphys->ds_dir_obj = dd->dd_object;
1263 	dsphys->ds_fsid_guid = unique_create();
1264 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
1265 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
1266 	    sizeof (dsphys->ds_guid));
1267 	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
1268 	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
1269 	dsphys->ds_next_snap_obj = ds->ds_object;
1270 	dsphys->ds_num_children = 1;
1271 	dsphys->ds_creation_time = gethrestime_sec();
1272 	dsphys->ds_creation_txg = tx->tx_txg;
1273 	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
1274 	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
1275 	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
1276 	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
1277 	dsphys->ds_restoring = ds->ds_phys->ds_restoring;
1278 	dsphys->ds_bp = ds->ds_phys->ds_bp;
1279 	dmu_buf_rele(dbuf, FTAG);
1280 
1281 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
1282 		dsl_dataset_t *ds_prev;
1283 
1284 		VERIFY(0 == dsl_dataset_open_obj(dp,
1285 		    ds->ds_phys->ds_prev_snap_obj, NULL,
1286 		    DS_MODE_NONE, FTAG, &ds_prev));
1287 		ASSERT(ds_prev->ds_phys->ds_next_snap_obj ==
1288 		    ds->ds_object ||
1289 		    ds_prev->ds_phys->ds_num_children > 1);
1290 		if (ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
1291 			dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1292 			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1293 			    ds_prev->ds_phys->ds_creation_txg);
1294 			ds_prev->ds_phys->ds_next_snap_obj = dsobj;
1295 		}
1296 		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
1297 	} else {
1298 		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 0);
1299 	}
1300 
1301 	bplist_close(&ds->ds_deadlist);
1302 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1303 	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
1304 	ds->ds_phys->ds_prev_snap_obj = dsobj;
1305 	ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
1306 	ds->ds_phys->ds_unique_bytes = 0;
1307 	ds->ds_phys->ds_deadlist_obj =
1308 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
1309 	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
1310 	    ds->ds_phys->ds_deadlist_obj));
1311 
1312 	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
1313 	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
1314 	    snapname, 8, 1, &dsobj, tx);
1315 	ASSERT(err == 0);
1316 
1317 	if (ds->ds_prev)
1318 		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
1319 	VERIFY(0 == dsl_dataset_open_obj(dp,
1320 	    ds->ds_phys->ds_prev_snap_obj, snapname,
1321 	    DS_MODE_NONE, ds, &ds->ds_prev));
1322 
1323 	rw_exit(&dp->dp_config_rwlock);
1324 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
1325 
1326 	return (0);
1327 }
1328 
/*
 * Sync dataset ds for transaction tx.
 *
 * Asserted preconditions: tx is in syncing context, ds->ds_user_ptr is
 * non-NULL, and ds has no next snapshot (ds_next_snap_obj == 0).
 */
void
dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(ds->ds_user_ptr != NULL);
	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);

	/* ds_user_ptr is what gets handed to dmu_objset_sync(). */
	dmu_objset_sync(ds->ds_user_ptr, tx);
	dsl_dir_dirty(ds->ds_dir, tx);
	bplist_close(&ds->ds_deadlist);

	/*
	 * Release the hold on ds_dbuf that is tagged with ds itself.
	 * NOTE(review): presumably that hold was taken when the dataset
	 * was dirtied -- confirm against the dirtying path.
	 */
	dmu_buf_rele(ds->ds_dbuf, ds);
}
1342 
1343 void
1344 dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds)
1345 {
1346 	/* fill in properties crap */
1347 	dsl_dir_stats(ds->ds_dir, dds);
1348 
1349 	if (ds->ds_phys->ds_num_children != 0) {
1350 		dds->dds_is_snapshot = TRUE;
1351 		dds->dds_num_clones = ds->ds_phys->ds_num_children - 1;
1352 	}
1353 
1354 	dds->dds_last_txg = ds->ds_phys->ds_bp.blk_birth;
1355 
1356 	dds->dds_objects_used = ds->ds_phys->ds_bp.blk_fill;
1357 	dds->dds_objects_avail = DN_MAX_OBJECT - dds->dds_objects_used;
1358 
1359 	/* We override the dataset's creation time... they should be the same */
1360 	dds->dds_creation_time = ds->ds_phys->ds_creation_time;
1361 	dds->dds_creation_txg = ds->ds_phys->ds_creation_txg;
1362 	dds->dds_space_refd = ds->ds_phys->ds_used_bytes;
1363 	dds->dds_fsid_guid = ds->ds_phys->ds_fsid_guid;
1364 
1365 	if (ds->ds_phys->ds_next_snap_obj) {
1366 		/*
1367 		 * This is a snapshot; override the dd's space used with
1368 		 * our unique space
1369 		 */
1370 		dds->dds_space_used = ds->ds_phys->ds_unique_bytes;
1371 		dds->dds_compressed_bytes =
1372 		    ds->ds_phys->ds_compressed_bytes;
1373 		dds->dds_uncompressed_bytes =
1374 		    ds->ds_phys->ds_uncompressed_bytes;
1375 	}
1376 }
1377 
1378 dsl_pool_t *
1379 dsl_dataset_pool(dsl_dataset_t *ds)
1380 {
1381 	return (ds->ds_dir->dd_pool);
1382 }
1383 
/*
 * Argument bundle handed (as the void *arg) from dsl_dataset_rename()
 * to dsl_dataset_snapshot_rename_sync().
 */
struct osrenamearg {
	const char *oldname;	/* current full name of the snapshot */
	const char *newname;	/* requested full name, "...@snap" form */
};
1388 
1389 static int
1390 dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
1391 {
1392 	struct osrenamearg *ora = arg;
1393 	objset_t *mos = dd->dd_pool->dp_meta_objset;
1394 	dsl_dir_t *nds;
1395 	const char *tail;
1396 	int err;
1397 	dsl_dataset_t *snds, *fsds;
1398 	uint64_t val;
1399 
1400 	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, ora->oldname,
1401 	    DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &snds);
1402 	if (err)
1403 		return (err);
1404 
1405 	if (snds->ds_dir != dd) {
1406 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1407 		return (EINVAL);
1408 	}
1409 
1410 	/* better be changing a snapshot */
1411 	if (snds->ds_phys->ds_next_snap_obj == 0) {
1412 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1413 		return (EINVAL);
1414 	}
1415 
1416 	/* new fs better exist */
1417 	err = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname,
1418 	    FTAG, &nds, &tail);
1419 	if (err) {
1420 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1421 		return (err);
1422 	}
1423 
1424 	dsl_dir_close(nds, FTAG);
1425 
1426 	/* new name better be in same fs */
1427 	if (nds != dd) {
1428 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1429 		return (EINVAL);
1430 	}
1431 
1432 	/* new name better be a snapshot */
1433 	if (tail == NULL || tail[0] != '@') {
1434 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1435 		return (EINVAL);
1436 	}
1437 
1438 	tail++;
1439 
1440 	err = dsl_dataset_open_obj(dd->dd_pool,
1441 	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &fsds);
1442 	if (err) {
1443 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1444 		return (err);
1445 	}
1446 
1447 	/* new name better not be in use */
1448 	err = zap_lookup(mos, fsds->ds_phys->ds_snapnames_zapobj,
1449 	    tail, 8, 1, &val);
1450 	if (err != ENOENT) {
1451 		if (err == 0)
1452 			err = EEXIST;
1453 		dsl_dataset_close(fsds, DS_MODE_NONE, FTAG);
1454 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1455 		return (EEXIST);
1456 	}
1457 
1458 	/* The point of no (unsuccessful) return */
1459 
1460 	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);
1461 	VERIFY(0 == dsl_dataset_get_snapname(snds));
1462 	err = zap_remove(mos, fsds->ds_phys->ds_snapnames_zapobj,
1463 	    snds->ds_snapname, tx);
1464 	ASSERT3U(err, ==, 0);
1465 	mutex_enter(&snds->ds_lock);
1466 	(void) strcpy(snds->ds_snapname, tail);
1467 	mutex_exit(&snds->ds_lock);
1468 	err = zap_add(mos, fsds->ds_phys->ds_snapnames_zapobj,
1469 	    snds->ds_snapname, 8, 1, &snds->ds_object, tx);
1470 	ASSERT3U(err, ==, 0);
1471 	rw_exit(&dd->dd_pool->dp_config_rwlock);
1472 
1473 	dsl_dataset_close(fsds, DS_MODE_NONE, FTAG);
1474 	dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1475 	return (0);
1476 }
1477 
1478 #pragma weak dmu_objset_rename = dsl_dataset_rename
1479 int
1480 dsl_dataset_rename(const char *osname, const char *newname)
1481 {
1482 	dsl_dir_t *dd;
1483 	const char *tail;
1484 	struct osrenamearg ora;
1485 	int err;
1486 
1487 	err = dsl_dir_open(osname, FTAG, &dd, &tail);
1488 	if (err)
1489 		return (err);
1490 	if (tail == NULL) {
1491 		err = dsl_dir_sync_task(dd,
1492 		    dsl_dir_rename_sync, (void*)newname, 1<<12);
1493 		dsl_dir_close(dd, FTAG);
1494 		return (err);
1495 	}
1496 	if (tail[0] != '@') {
1497 		/* the name ended in a nonexistant component */
1498 		dsl_dir_close(dd, FTAG);
1499 		return (ENOENT);
1500 	}
1501 
1502 	ora.oldname = osname;
1503 	ora.newname = newname;
1504 
1505 	err = dsl_dir_sync_task(dd,
1506 	    dsl_dataset_snapshot_rename_sync, &ora, 1<<12);
1507 	dsl_dir_close(dd, FTAG);
1508 	return (err);
1509 }
1510