xref: /titanic_51/usr/src/uts/common/fs/zfs/dsl_dataset.c (revision 75d01c9ab5ef6f1bbac9f9d4eb379d5c38583d82)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/dmu_objset.h>
30 #include <sys/dsl_dataset.h>
31 #include <sys/dsl_dir.h>
32 #include <sys/dmu_traverse.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/arc.h>
35 #include <sys/zio.h>
36 #include <sys/zap.h>
37 #include <sys/unique.h>
38 #include <sys/zfs_context.h>
39 
/*
 * Upper bound on the sum of the weights of all opens of one dataset; it
 * is also the weight of a single exclusive open.
 */
#define	DOS_REF_MAX	(1ULL << 62)

/* Block size handed to bplist_create() when a deadlist object is created. */
#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
43 
/*
 * Size charged as "uncompressed" for a block pointer: BP_GET_PSIZE() for
 * indirect blocks (level > 0) and for object types flagged ot_metadata in
 * dmu_ot[], BP_GET_LSIZE() for everything else.
 *
 * bp is evaluated more than once, so it must not have side effects.  The
 * expansion intentionally carries no trailing semicolon so that the macro
 * behaves like an ordinary expression.
 */
#define	BP_GET_UCSIZE(bp) \
	((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
47 
/*
 * We use weighted reference counts to express the various forms of exclusion
 * between different open modes.  A STANDARD open is 1 point, an EXCLUSIVE open
 * is DOS_REF_MAX, and a PRIMARY open is a little more than half of an
 * EXCLUSIVE.  This makes the exclusion logic simple: the total refcnt for all
 * opens cannot exceed DOS_REF_MAX.  For example, EXCLUSIVE opens are exclusive
 * because their weight (DOS_REF_MAX) consumes the entire refcnt space.
 * PRIMARY opens consume just over half of the refcnt space, so there can't be
 * more than one, but it can peacefully coexist with any number of STANDARD
 * opens.
 *
 * The table is indexed by DS_MODE_LEVEL(mode).
 */
static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
	0,			/* DS_MODE_NONE - carries no weight	*/
	1,			/* DS_MODE_STANDARD - unlimited number	*/
	(DOS_REF_MAX >> 1) + 1,	/* DS_MODE_PRIMARY - only one of these	*/
	DOS_REF_MAX		/* DS_MODE_EXCLUSIVE - no other opens	*/
};
64 
65 
66 void
67 dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
68 {
69 	int used = BP_GET_ASIZE(bp);
70 	int compressed = BP_GET_PSIZE(bp);
71 	int uncompressed = BP_GET_UCSIZE(bp);
72 
73 	dprintf_bp(bp, "born, ds=%p\n", ds);
74 
75 	ASSERT(dmu_tx_is_syncing(tx));
76 	/* It could have been compressed away to nothing */
77 	if (BP_IS_HOLE(bp))
78 		return;
79 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
80 	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
81 	if (ds == NULL) {
82 		/*
83 		 * Account for the meta-objset space in its placeholder
84 		 * dsl_dir.
85 		 */
86 		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
87 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
88 		    used, compressed, uncompressed, tx);
89 		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
90 		return;
91 	}
92 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
93 	mutex_enter(&ds->ds_lock);
94 	ds->ds_phys->ds_used_bytes += used;
95 	ds->ds_phys->ds_compressed_bytes += compressed;
96 	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
97 	ds->ds_phys->ds_unique_bytes += used;
98 	mutex_exit(&ds->ds_lock);
99 	dsl_dir_diduse_space(ds->ds_dir,
100 	    used, compressed, uncompressed, tx);
101 }
102 
/*
 * Account for the death of block bp, owned by dataset ds (or by the
 * meta-objset when ds is NULL), and zero out the caller's block pointer.
 * Holes are ignored.  Must be called in syncing context.
 *
 * A block born after the dataset's previous snapshot txg is freed right
 * away through arc_free(); an older block is appended to the dataset's
 * deadlist instead.
 */
void
dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
	int used = BP_GET_ASIZE(bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	ASSERT(dmu_tx_is_syncing(tx));
	if (BP_IS_HOLE(bp))
		return;

	ASSERT(used > 0);
	if (ds == NULL) {
		/*
		 * Account for the meta-objset space in its placeholder
		 * dsl_dir (dp_mos_dir).
		 */
		/* XXX this can fail, what do we do when it does? */
		(void) arc_free(NULL, tx->tx_pool->dp_spa,
		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
		bzero(bp, sizeof (blkptr_t));

		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
		    -used, -compressed, -uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return;
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		/* Born after the previous snapshot: free it immediately. */
		dprintf_bp(bp, "freeing: %s", "");
		/* XXX check return code? */
		(void) arc_free(NULL, tx->tx_pool->dp_spa,
		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);

		mutex_enter(&ds->ds_lock);
		/* XXX unique_bytes is not accurate for head datasets */
		/* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
		ds->ds_phys->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir,
		    -used, -compressed, -uncompressed, tx);
	} else {
		/* Not newer than the previous snapshot: defer the free. */
		dprintf_bp(bp, "putting on dead list: %s", "");
		bplist_enqueue(&ds->ds_deadlist, bp, tx);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (ds->ds_phys->ds_prev_snap_obj != 0) {
			ASSERT3U(ds->ds_prev->ds_object, ==,
			    ds->ds_phys->ds_prev_snap_obj);
			ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
			/*
			 * Only credit the previous snapshot when this
			 * dataset is its direct successor and the block
			 * was born after that snapshot's own predecessor.
			 */
			if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
			    ds->ds_object &&
			    bp->blk_birth >
			    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
				dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
				mutex_enter(&ds->ds_prev->ds_lock);
				ds->ds_prev->ds_phys->ds_unique_bytes +=
				    used;
				mutex_exit(&ds->ds_prev->ds_lock);
			}
		}
	}
	/* In both cases the caller's copy of the block pointer is cleared. */
	bzero(bp, sizeof (blkptr_t));
	mutex_enter(&ds->ds_lock);
	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
	ds->ds_phys->ds_used_bytes -= used;
	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
	ds->ds_phys->ds_compressed_bytes -= compressed;
	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);
}
177 
178 int
179 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx)
180 {
181 	uint64_t prev_snap_txg;
182 	dsl_dir_t *dd;
183 	/* ASSERT that it is not a snapshot */
184 	if (ds == NULL)
185 		return (TRUE);
186 	/*
187 	 * The snapshot creation could fail, but that would cause an
188 	 * incorrect FALSE return, which would only result in an
189 	 * overestimation of the amount of space that an operation would
190 	 * consume, which is OK.
191 	 *
192 	 * There's also a small window where we could miss a pending
193 	 * snapshot, because we could set the sync task in the quiescing
194 	 * phase.  So this should only be used as a guess.
195 	 */
196 	dd = ds->ds_dir;
197 	mutex_enter(&dd->dd_lock);
198 	if (dd->dd_sync_func == dsl_dataset_snapshot_sync &&
199 	    dd->dd_sync_txg < tx->tx_txg)
200 		prev_snap_txg = dd->dd_sync_txg;
201 	else
202 		prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
203 	mutex_exit(&dd->dd_lock);
204 	return (blk_birth > prev_snap_txg);
205 }
206 
207 /* ARGSUSED */
208 static void
209 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
210 {
211 	dsl_dataset_t *ds = dsv;
212 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
213 
214 	/* open_refcount == DOS_REF_MAX when deleting */
215 	ASSERT(ds->ds_open_refcount == 0 ||
216 	    ds->ds_open_refcount == DOS_REF_MAX);
217 
218 	dprintf_ds(ds, "evicting %s\n", "");
219 
220 	unique_remove(ds->ds_phys->ds_fsid_guid);
221 
222 	if (ds->ds_user_ptr != NULL)
223 		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
224 
225 	if (ds->ds_prev) {
226 		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
227 		ds->ds_prev = NULL;
228 	}
229 
230 	bplist_close(&ds->ds_deadlist);
231 	dsl_dir_close(ds->ds_dir, ds);
232 
233 	if (list_link_active(&ds->ds_synced_link))
234 		list_remove(&dp->dp_synced_objsets, ds);
235 
236 	kmem_free(ds, sizeof (dsl_dataset_t));
237 }
238 
239 static void
240 dsl_dataset_get_snapname(dsl_dataset_t *ds)
241 {
242 	dsl_dataset_phys_t *headphys;
243 	int err;
244 	dmu_buf_t *headdbuf;
245 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
246 	objset_t *mos = dp->dp_meta_objset;
247 
248 	if (ds->ds_snapname[0])
249 		return;
250 	if (ds->ds_phys->ds_next_snap_obj == 0)
251 		return;
252 
253 	headdbuf = dmu_bonus_hold_tag(mos,
254 	    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG);
255 	dmu_buf_read(headdbuf);
256 	headphys = headdbuf->db_data;
257 	err = zap_value_search(dp->dp_meta_objset,
258 	    headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
259 	ASSERT(err == 0);
260 	dmu_buf_rele_tag(headdbuf, FTAG);
261 }
262 
/*
 * Open the dataset stored in MOS object dsobj in the given mode, holding
 * its bonus buffer with tag.  Returns the in-core dsl_dataset_t, or NULL
 * (with the hold dropped again) when the requested mode cannot be granted.
 *
 * snapname, when non-NULL, is copied into the new dataset's ds_snapname if
 * the object is not its dsl_dir's head dataset.
 *
 * The caller must hold dp_config_rwlock or be in pool sync context.
 */
dsl_dataset_t *
dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
    int mode, void *tag)
{
	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	dbuf = dmu_bonus_hold_tag(mos, dsobj, tag);
	dmu_buf_read(dbuf);
	/* Reuse the in-core dataset already attached to the buffer, if any. */
	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
		dsl_dataset_t *winner;

		/* Nothing attached yet: build a fresh in-core dataset. */
		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
		ds->ds_dbuf = dbuf;
		ds->ds_object = dsobj;
		ds->ds_phys = dbuf->db_data;
		ds->ds_dir = dsl_dir_open_obj(dp,
		    ds->ds_phys->ds_dir_obj, NULL, ds);

		bplist_open(&ds->ds_deadlist,
		    mos, ds->ds_phys->ds_deadlist_obj);

		if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
			/*
			 * Head dataset: it has no snapshot name, and it
			 * keeps a hold on its previous snapshot.
			 */
			ds->ds_snapname[0] = '\0';
			if (ds->ds_phys->ds_prev_snap_obj) {
				ds->ds_prev =
				    dsl_dataset_open_obj(dp,
				    ds->ds_phys->ds_prev_snap_obj, NULL,
				    DS_MODE_NONE, ds);
			}
		} else {
			if (snapname) {
#ifdef ZFS_DEBUG
				/*
				 * Debug only: verify that snapname really
				 * maps to dsobj in the head's name ZAP.
				 */
				dsl_dataset_phys_t *headphys;
				int err;
				dmu_buf_t *headdbuf = dmu_bonus_hold_tag(mos,
				    ds->ds_dir->dd_phys->
				    dd_head_dataset_obj, FTAG);
				dmu_buf_read(headdbuf);
				headphys = headdbuf->db_data;
				uint64_t foundobj;
				err = zap_lookup(dp->dp_meta_objset,
				    headphys->ds_snapnames_zapobj,
				    snapname, sizeof (foundobj), 1, &foundobj);
				ASSERT3U(err, ==, 0);
				ASSERT3U(foundobj, ==, dsobj);
				dmu_buf_rele_tag(headdbuf, FTAG);
#endif
				/*
				 * ds_snapname is still all zeroes from
				 * kmem_zalloc(), so this acts as a copy.
				 * NOTE(review): the copy is unbounded; it
				 * relies on snapname fitting in ds_snapname
				 * -- confirm callers guarantee that.
				 */
				(void) strcat(ds->ds_snapname, snapname);
			} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
				dsl_dataset_get_snapname(ds);
			}
		}

		/*
		 * Try to attach our dataset to the buffer.  A non-NULL
		 * return is treated as someone else's dataset having been
		 * attached first, in which case ours is torn down again.
		 */
		winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
		    dsl_dataset_evict);
		if (winner) {
			bplist_close(&ds->ds_deadlist);
			if (ds->ds_prev) {
				dsl_dataset_close(ds->ds_prev,
				    DS_MODE_NONE, ds);
			}
			dsl_dir_close(ds->ds_dir, ds);
			kmem_free(ds, sizeof (dsl_dataset_t));
			ds = winner;
		} else {
			/* Register the fsid guid; adopt the returned value. */
			uint64_t new =
			    unique_insert(ds->ds_phys->ds_fsid_guid);
			if (new != ds->ds_phys->ds_fsid_guid) {
				/* XXX it won't necessarily be synced... */
				ds->ds_phys->ds_fsid_guid = new;
			}
		}
	}
	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);

	/*
	 * Refuse the open if it is a PRIMARY open of a dataset that is being
	 * restored (unless the caller is the restore itself), or if adding
	 * this mode's weight would push the refcount past DOS_REF_MAX.
	 */
	mutex_enter(&ds->ds_lock);
	if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
	    ds->ds_phys->ds_restoring && !DS_MODE_IS_RESTORE(mode)) ||
	    (ds->ds_open_refcount + weight > DOS_REF_MAX)) {
		mutex_exit(&ds->ds_lock);
		/* No weight was added yet, so close with DS_MODE_NONE. */
		dsl_dataset_close(ds, DS_MODE_NONE, tag);
		return (NULL);
	}
	ds->ds_open_refcount += weight;
	mutex_exit(&ds->ds_lock);

	return (ds);
}
359 
360 int
361 dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
362     void *tag, dsl_dataset_t **dsp)
363 {
364 	dsl_dir_t *dd;
365 	dsl_pool_t *dp;
366 	const char *tail;
367 	uint64_t obj;
368 	dsl_dataset_t *ds = NULL;
369 	int err = 0;
370 
371 	dd = dsl_dir_open_spa(spa, name, FTAG, &tail);
372 	if (dd == NULL)
373 		return (ENOENT);
374 
375 	dp = dd->dd_pool;
376 	obj = dd->dd_phys->dd_head_dataset_obj;
377 	rw_enter(&dp->dp_config_rwlock, RW_READER);
378 	if (obj == 0) {
379 		/* A dataset with no associated objset */
380 		err = ENOENT;
381 		goto out;
382 	}
383 
384 	if (tail != NULL) {
385 		objset_t *mos = dp->dp_meta_objset;
386 
387 		ds = dsl_dataset_open_obj(dp, obj, NULL, DS_MODE_NONE, tag);
388 		obj = ds->ds_phys->ds_snapnames_zapobj;
389 		dsl_dataset_close(ds, DS_MODE_NONE, tag);
390 		ds = NULL;
391 
392 		if (tail[0] != '@') {
393 			err = ENOENT;
394 			goto out;
395 		}
396 		tail++;
397 
398 		/* Look for a snapshot */
399 		if (!DS_MODE_IS_READONLY(mode)) {
400 			err = EROFS;
401 			goto out;
402 		}
403 		dprintf("looking for snapshot '%s'\n", tail);
404 		err = zap_lookup(mos, obj, tail, 8, 1, &obj);
405 		if (err)
406 			goto out;
407 	}
408 	ds = dsl_dataset_open_obj(dp, obj, tail, mode, tag);
409 	if (ds == NULL)
410 		err = EBUSY;
411 
412 out:
413 	rw_exit(&dp->dp_config_rwlock);
414 	dsl_dir_close(dd, FTAG);
415 
416 	ASSERT3U((err == 0), ==, (ds != NULL));
417 	/* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
418 
419 	*dsp = ds;
420 	return (err);
421 }
422 
423 int
424 dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
425 {
426 	return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
427 }
428 
429 void
430 dsl_dataset_name(dsl_dataset_t *ds, char *name)
431 {
432 	if (ds == NULL) {
433 		(void) strcpy(name, "mos");
434 	} else {
435 		dsl_dir_name(ds->ds_dir, name);
436 		dsl_dataset_get_snapname(ds);
437 		if (ds->ds_snapname[0]) {
438 			(void) strcat(name, "@");
439 			if (!MUTEX_HELD(&ds->ds_lock)) {
440 				/*
441 				 * We use a "recursive" mutex so that we
442 				 * can call dprintf_ds() with ds_lock held.
443 				 */
444 				mutex_enter(&ds->ds_lock);
445 				(void) strcat(name, ds->ds_snapname);
446 				mutex_exit(&ds->ds_lock);
447 			} else {
448 				(void) strcat(name, ds->ds_snapname);
449 			}
450 		}
451 	}
452 }
453 
454 void
455 dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
456 {
457 	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
458 	mutex_enter(&ds->ds_lock);
459 	ASSERT3U(ds->ds_open_refcount, >=, weight);
460 	ds->ds_open_refcount -= weight;
461 	dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
462 	    mode, ds->ds_open_refcount);
463 	mutex_exit(&ds->ds_lock);
464 
465 	dmu_buf_rele_tag(ds->ds_dbuf, tag);
466 }
467 
468 void
469 dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
470 {
471 	objset_t *mos = dp->dp_meta_objset;
472 	dmu_buf_t *dbuf;
473 	dsl_dataset_phys_t *dsphys;
474 	dsl_dataset_t *ds;
475 	uint64_t dsobj;
476 	dsl_dir_t *dd;
477 
478 	dsl_dir_create_root(mos, ddobjp, tx);
479 	dd = dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG);
480 	ASSERT(dd != NULL);
481 
482 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
483 	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
484 	dbuf = dmu_bonus_hold(mos, dsobj);
485 	dmu_buf_will_dirty(dbuf, tx);
486 	dsphys = dbuf->db_data;
487 	dsphys->ds_dir_obj = dd->dd_object;
488 	dsphys->ds_fsid_guid = unique_create();
489 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
490 	    sizeof (dsphys->ds_guid));
491 	dsphys->ds_snapnames_zapobj =
492 	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
493 	dsphys->ds_creation_time = gethrestime_sec();
494 	dsphys->ds_creation_txg = tx->tx_txg;
495 	dsphys->ds_deadlist_obj =
496 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
497 	dmu_buf_rele(dbuf);
498 
499 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
500 	dd->dd_phys->dd_head_dataset_obj = dsobj;
501 	dsl_dir_close(dd, FTAG);
502 
503 	ds = dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG);
504 	(void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx);
505 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
506 }
507 
508 int
509 dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
510     const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
511 {
512 	int err;
513 	dsl_pool_t *dp = pds->dd_pool;
514 	dmu_buf_t *dbuf;
515 	dsl_dataset_phys_t *dsphys;
516 	uint64_t dsobj;
517 	objset_t *mos = dp->dp_meta_objset;
518 	dsl_dir_t *dd;
519 
520 	if (clone_parent != NULL) {
521 		/*
522 		 * You can't clone across pools.
523 		 */
524 		if (clone_parent->ds_dir->dd_pool != dp)
525 			return (EXDEV);
526 
527 		/*
528 		 * You can only clone snapshots, not the head datasets.
529 		 */
530 		if (clone_parent->ds_phys->ds_num_children == 0)
531 			return (EINVAL);
532 	}
533 
534 	ASSERT(lastname[0] != '@');
535 	ASSERT(dmu_tx_is_syncing(tx));
536 
537 	err = dsl_dir_create_sync(pds, lastname, tx);
538 	if (err)
539 		return (err);
540 	dd = dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, NULL);
541 	ASSERT(dd != NULL);
542 
543 	/* This is the point of no (unsuccessful) return */
544 
545 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
546 	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
547 	dbuf = dmu_bonus_hold(mos, dsobj);
548 	dmu_buf_will_dirty(dbuf, tx);
549 	dsphys = dbuf->db_data;
550 	dsphys->ds_dir_obj = dd->dd_object;
551 	dsphys->ds_fsid_guid = unique_create();
552 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
553 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
554 	    sizeof (dsphys->ds_guid));
555 	dsphys->ds_snapnames_zapobj =
556 	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
557 	dsphys->ds_creation_time = gethrestime_sec();
558 	dsphys->ds_creation_txg = tx->tx_txg;
559 	dsphys->ds_deadlist_obj =
560 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
561 	if (clone_parent) {
562 		dsphys->ds_prev_snap_obj = clone_parent->ds_object;
563 		dsphys->ds_prev_snap_txg =
564 		    clone_parent->ds_phys->ds_creation_txg;
565 		dsphys->ds_used_bytes =
566 		    clone_parent->ds_phys->ds_used_bytes;
567 		dsphys->ds_compressed_bytes =
568 		    clone_parent->ds_phys->ds_compressed_bytes;
569 		dsphys->ds_uncompressed_bytes =
570 		    clone_parent->ds_phys->ds_uncompressed_bytes;
571 		dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
572 
573 		dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
574 		clone_parent->ds_phys->ds_num_children++;
575 
576 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
577 		dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
578 	}
579 	dmu_buf_rele(dbuf);
580 
581 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
582 	dd->dd_phys->dd_head_dataset_obj = dsobj;
583 	dsl_dir_close(dd, FTAG);
584 
585 	return (0);
586 }
587 
588 
589 int
590 dsl_dataset_destroy(const char *name)
591 {
592 	int err;
593 	dsl_pool_t *dp;
594 	dsl_dir_t *dd;
595 	const char *tail;
596 
597 	dd = dsl_dir_open(name, FTAG, &tail);
598 	if (dd == NULL)
599 		return (ENOENT);
600 
601 	dp = dd->dd_pool;
602 	if (tail != NULL) {
603 		if (tail[0] != '@') {
604 			dsl_dir_close(dd, FTAG);
605 			return (ENOENT);
606 		}
607 		tail++;
608 		/* Just blow away the snapshot */
609 		do {
610 			txg_wait_synced(dp, 0);
611 			err = dsl_dir_sync_task(dd,
612 			    dsl_dataset_destroy_sync, (void*)tail, 0);
613 		} while (err == EAGAIN);
614 		dsl_dir_close(dd, FTAG);
615 	} else {
616 		char buf[MAXNAMELEN];
617 		char *cp;
618 
619 		dsl_dir_t *pds;
620 		if (dd->dd_phys->dd_parent_obj == 0) {
621 			dsl_dir_close(dd, FTAG);
622 			return (EINVAL);
623 		}
624 		/*
625 		 * Make sure it's not dirty before we destroy it.
626 		 */
627 		txg_wait_synced(dd->dd_pool, 0);
628 		/*
629 		 * Blow away the dsl_dir + head dataset.
630 		 * dsl_dir_destroy_sync() will call
631 		 * dsl_dataset_destroy_sync() to destroy the head dataset.
632 		 */
633 		rw_enter(&dp->dp_config_rwlock, RW_READER);
634 		pds = dsl_dir_open_obj(dd->dd_pool,
635 		    dd->dd_phys->dd_parent_obj, NULL, FTAG);
636 		dsl_dir_close(dd, FTAG);
637 		rw_exit(&dp->dp_config_rwlock);
638 
639 		(void) strcpy(buf, name);
640 		cp = strrchr(buf, '/') + 1;
641 		ASSERT(cp[0] != '\0');
642 		do {
643 			txg_wait_synced(dp, 0);
644 			err = dsl_dir_sync_task(pds,
645 			    dsl_dir_destroy_sync, cp, 0);
646 		} while (err == EAGAIN);
647 		dsl_dir_close(pds, FTAG);
648 	}
649 
650 	return (err);
651 }
652 
653 int
654 dsl_dataset_rollback(const char *name)
655 {
656 	int err;
657 	dsl_dir_t *dd;
658 	const char *tail;
659 
660 	dd = dsl_dir_open(name, FTAG, &tail);
661 	if (dd == NULL)
662 		return (ENOENT);
663 
664 	if (tail != NULL) {
665 		dsl_dir_close(dd, FTAG);
666 		return (EINVAL);
667 	}
668 	do {
669 		txg_wait_synced(dd->dd_pool, 0);
670 		err = dsl_dir_sync_task(dd,
671 		    dsl_dataset_rollback_sync, NULL, 0);
672 	} while (err == EAGAIN);
673 	dsl_dir_close(dd, FTAG);
674 
675 	return (err);
676 }
677 
678 void *
679 dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
680     void *p, dsl_dataset_evict_func_t func)
681 {
682 	void *old;
683 
684 	mutex_enter(&ds->ds_lock);
685 	old = ds->ds_user_ptr;
686 	if (old == NULL) {
687 		ds->ds_user_ptr = p;
688 		ds->ds_user_evict_func = func;
689 	}
690 	mutex_exit(&ds->ds_lock);
691 	return (old);
692 }
693 
694 void *
695 dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
696 {
697 	return (ds->ds_user_ptr);
698 }
699 
700 
701 void
702 dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp)
703 {
704 	*bp = ds->ds_phys->ds_bp;
705 }
706 
707 void
708 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
709 {
710 	ASSERT(dmu_tx_is_syncing(tx));
711 	/* If it's the meta-objset, set dp_meta_rootbp */
712 	if (ds == NULL) {
713 		tx->tx_pool->dp_meta_rootbp = *bp;
714 	} else {
715 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
716 		ds->ds_phys->ds_bp = *bp;
717 	}
718 }
719 
720 spa_t *
721 dsl_dataset_get_spa(dsl_dataset_t *ds)
722 {
723 	return (ds->ds_dir->dd_pool->dp_spa);
724 }
725 
726 void
727 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
728 {
729 	dsl_pool_t *dp;
730 
731 	if (ds == NULL) /* this is the meta-objset */
732 		return;
733 
734 	ASSERT(ds->ds_user_ptr != NULL);
735 	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
736 
737 	dp = ds->ds_dir->dd_pool;
738 
739 	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
740 		/* up the hold count until we can be written out */
741 		dmu_buf_add_ref(ds->ds_dbuf, ds);
742 	}
743 }
744 
/*
 * State handed to kill_blkptr() through its void * argument.
 */
struct killarg {
	uint64_t *usedp;		/* running total of BP_GET_ASIZE() */
	uint64_t *compressedp;		/* running total of BP_GET_PSIZE() */
	uint64_t *uncompressedp;	/* running total of BP_GET_UCSIZE() */
	zio_t *zio;			/* passed to arc_free() for each block */
	dmu_tx_t *tx;			/* supplies the txg for arc_free() */
};
752 
753 static int
754 kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
755 {
756 	struct killarg *ka = arg;
757 	blkptr_t *bp = &bc->bc_blkptr;
758 
759 	ASSERT3U(bc->bc_errno, ==, 0);
760 
761 	/*
762 	 * Since this callback is not called concurrently, no lock is
763 	 * needed on the accounting values.
764 	 */
765 	*ka->usedp += BP_GET_ASIZE(bp);
766 	*ka->compressedp += BP_GET_PSIZE(bp);
767 	*ka->uncompressedp += BP_GET_UCSIZE(bp);
768 	/* XXX check for EIO? */
769 	(void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
770 	    ARC_NOWAIT);
771 	return (0);
772 }
773 
/*
 * Sync task that rolls the head dataset of dd back to the contents of its
 * previous snapshot: every block born after that snapshot is freed, the
 * deadlist is replaced with an empty one, and the block pointer and space
 * accounting are copied from the snapshot.
 *
 * Returns 0 on success; EINVAL if dd has no head dataset or the dataset
 * has no previous snapshot; EBUSY if the dataset is open; EAGAIN if the
 * dataset was modified in this txg, in which case the caller should retry.
 */
/* ARGSUSED */
int
dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	dsl_dataset_t *ds;

	if (dd->dd_phys->dd_head_dataset_obj == 0)
		return (EINVAL);
	ds = dsl_dataset_open_obj(dd->dd_pool,
	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);

	if (ds->ds_phys->ds_prev_snap_txg == 0) {
		/*
		 * There's no previous snapshot.  I suppose we could
		 * roll it back to being empty (and re-initialize the
		 * upper (ZPL) layer).  But for now there's no way to do
		 * this via the user interface.
		 */
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EINVAL);
	}

	mutex_enter(&ds->ds_lock);
	/* Our own DS_MODE_NONE open adds no weight, so > 0 means in use. */
	if (ds->ds_open_refcount > 0) {
		mutex_exit(&ds->ds_lock);
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EBUSY);
	}

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
		mutex_exit(&ds->ds_lock);
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EAGAIN);
	}

	/* THE POINT OF NO (unsuccessful) RETURN */
	/* Claim the whole refcount space so no other open can succeed. */
	ds->ds_open_refcount = DOS_REF_MAX;
	mutex_exit(&ds->ds_lock);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	/* Zero out the deadlist. */
	dprintf("old deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
	bplist_close(&ds->ds_deadlist);
	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
	ds->ds_phys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
	dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);

	{
		/* Free blkptrs that we gave birth to */
		zio_t *zio;
		uint64_t used = 0, compressed = 0, uncompressed = 0;
		struct killarg ka;

		/* All frees are issued under one root zio and waited on. */
		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
		    ZIO_FLAG_MUSTSUCCEED);
		ka.usedp = &used;
		ka.compressedp = &compressed;
		ka.uncompressedp = &uncompressed;
		ka.zio = zio;
		ka.tx = tx;
		(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
		    ADVANCE_POST, kill_blkptr, &ka);
		(void) zio_wait(zio);

		/* Give the freed space back to the dsl_dir. */
		dsl_dir_diduse_space(dd,
		    -used, -compressed, -uncompressed, tx);
	}

	/* Change our contents to that of the prev snapshot (finally!) */
	ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
	ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
	ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
	ds->ds_phys->ds_compressed_bytes =
	    ds->ds_prev->ds_phys->ds_compressed_bytes;
	ds->ds_phys->ds_uncompressed_bytes =
	    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
	ds->ds_phys->ds_restoring = ds->ds_prev->ds_phys->ds_restoring;
	ds->ds_phys->ds_unique_bytes = 0;

	/* The head and the snapshot are now identical: nothing is unique. */
	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
	ds->ds_prev->ds_phys->ds_unique_bytes = 0;

	dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
	/* Release the exclusive claim taken above before closing. */
	ds->ds_open_refcount = 0;
	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);

	return (0);
}
870 
871 int
872 dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
873 {
874 	const char *snapname = arg;
875 	uint64_t used = 0, compressed = 0, uncompressed = 0;
876 	blkptr_t bp;
877 	zio_t *zio;
878 	int err;
879 	int after_branch_point = FALSE;
880 	int drop_lock = FALSE;
881 	dsl_pool_t *dp = dd->dd_pool;
882 	objset_t *mos = dp->dp_meta_objset;
883 	dsl_dataset_t *ds, *ds_prev = NULL;
884 	uint64_t obj;
885 
886 	if (dd->dd_phys->dd_head_dataset_obj == 0)
887 		return (EINVAL);
888 
889 	if (!RW_WRITE_HELD(&dp->dp_config_rwlock)) {
890 		rw_enter(&dp->dp_config_rwlock, RW_WRITER);
891 		drop_lock = TRUE;
892 	}
893 
894 	ds = dsl_dataset_open_obj(dd->dd_pool,
895 	    dd->dd_phys->dd_head_dataset_obj, NULL,
896 	    snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG);
897 
898 	if (snapname) {
899 		err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
900 		    snapname, 8, 1, &obj);
901 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
902 		if (err) {
903 			if (drop_lock)
904 				rw_exit(&dp->dp_config_rwlock);
905 			return (err);
906 		}
907 
908 		ds = dsl_dataset_open_obj(dd->dd_pool, obj, NULL,
909 		    DS_MODE_EXCLUSIVE, FTAG);
910 	}
911 	if (ds == NULL) {
912 		if (drop_lock)
913 			rw_exit(&dp->dp_config_rwlock);
914 		return (EBUSY);
915 	}
916 
917 	obj = ds->ds_object;
918 
919 	/* Can't delete a branch point. */
920 	if (ds->ds_phys->ds_num_children > 1) {
921 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
922 		if (drop_lock)
923 			rw_exit(&dp->dp_config_rwlock);
924 		return (EINVAL);
925 	}
926 
927 	/*
928 	 * Can't delete a head dataset if there are snapshots of it.
929 	 * (Except if the only snapshots are from the branch we cloned
930 	 * from.)
931 	 */
932 	if (ds->ds_prev != NULL &&
933 	    ds->ds_prev->ds_phys->ds_next_snap_obj == obj) {
934 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
935 		if (drop_lock)
936 			rw_exit(&dp->dp_config_rwlock);
937 		return (EINVAL);
938 	}
939 
940 	/*
941 	 * If we made changes this txg, traverse_dsl_dataset won't find
942 	 * them.  Try again.
943 	 */
944 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
945 		mutex_exit(&ds->ds_lock);
946 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
947 		if (drop_lock)
948 			rw_exit(&dp->dp_config_rwlock);
949 		return (EAGAIN);
950 	}
951 
952 	/* THE POINT OF NO (unsuccessful) RETURN */
953 
954 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
955 		if (ds->ds_prev) {
956 			ds_prev = ds->ds_prev;
957 		} else {
958 			ds_prev = dsl_dataset_open_obj(dd->dd_pool,
959 			    ds->ds_phys->ds_prev_snap_obj, NULL,
960 			    DS_MODE_NONE, FTAG);
961 		}
962 		after_branch_point =
963 		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
964 
965 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
966 		if (after_branch_point &&
967 		    ds->ds_phys->ds_next_snap_obj == 0) {
968 			/* This clone is toast. */
969 			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
970 			ds_prev->ds_phys->ds_num_children--;
971 		} else if (!after_branch_point) {
972 			ds_prev->ds_phys->ds_next_snap_obj =
973 			    ds->ds_phys->ds_next_snap_obj;
974 		}
975 	}
976 
977 	ASSERT3P(tx->tx_pool, ==, dd->dd_pool);
978 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
979 
980 	if (ds->ds_phys->ds_next_snap_obj != 0) {
981 		dsl_dataset_t *ds_next;
982 		uint64_t itor = 0;
983 
984 		spa_scrub_restart(dp->dp_spa, tx->tx_txg);
985 
986 		ds_next = dsl_dataset_open_obj(dd->dd_pool,
987 		    ds->ds_phys->ds_next_snap_obj, NULL, DS_MODE_NONE, FTAG);
988 		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
989 
990 		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
991 		ds_next->ds_phys->ds_prev_snap_obj =
992 		    ds->ds_phys->ds_prev_snap_obj;
993 		ds_next->ds_phys->ds_prev_snap_txg =
994 		    ds->ds_phys->ds_prev_snap_txg;
995 		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
996 		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
997 
998 		/*
999 		 * Transfer to our deadlist (which will become next's
1000 		 * new deadlist) any entries from next's current
1001 		 * deadlist which were born before prev, and free the
1002 		 * other entries.
1003 		 *
1004 		 * XXX we're doing this long task with the config lock held
1005 		 */
1006 		while (bplist_iterate(&ds_next->ds_deadlist, &itor,
1007 		    &bp) == 0) {
1008 			if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
1009 				bplist_enqueue(&ds->ds_deadlist, &bp, tx);
1010 				if (ds_prev && !after_branch_point &&
1011 				    bp.blk_birth >
1012 				    ds_prev->ds_phys->ds_prev_snap_txg) {
1013 					ds_prev->ds_phys->ds_unique_bytes +=
1014 					    BP_GET_ASIZE(&bp);
1015 				}
1016 			} else {
1017 				used += BP_GET_ASIZE(&bp);
1018 				compressed += BP_GET_PSIZE(&bp);
1019 				uncompressed += BP_GET_UCSIZE(&bp);
1020 				/* XXX check return value? */
1021 				(void) arc_free(zio, dp->dp_spa, tx->tx_txg,
1022 				    &bp, NULL, NULL, ARC_NOWAIT);
1023 			}
1024 		}
1025 
1026 		/* free next's deadlist */
1027 		bplist_close(&ds_next->ds_deadlist);
1028 		bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
1029 
1030 		/* set next's deadlist to our deadlist */
1031 		ds_next->ds_phys->ds_deadlist_obj =
1032 		    ds->ds_phys->ds_deadlist_obj;
1033 		bplist_open(&ds_next->ds_deadlist, mos,
1034 		    ds_next->ds_phys->ds_deadlist_obj);
1035 		ds->ds_phys->ds_deadlist_obj = 0;
1036 
1037 		if (ds_next->ds_phys->ds_next_snap_obj != 0) {
1038 			/*
1039 			 * Update next's unique to include blocks which
1040 			 * were previously shared by only this snapshot
1041 			 * and it.  Those blocks will be born after the
1042 			 * prev snap and before this snap, and will have
1043 			 * died after the next snap and before the one
1044 			 * after that (ie. be on the snap after next's
1045 			 * deadlist).
1046 			 *
1047 			 * XXX we're doing this long task with the
1048 			 * config lock held
1049 			 */
1050 			dsl_dataset_t *ds_after_next;
1051 
1052 			ds_after_next = dsl_dataset_open_obj(dd->dd_pool,
1053 			    ds_next->ds_phys->ds_next_snap_obj, NULL,
1054 			    DS_MODE_NONE, FTAG);
1055 			itor = 0;
1056 			while (bplist_iterate(&ds_after_next->ds_deadlist,
1057 			    &itor, &bp) == 0) {
1058 				if (bp.blk_birth >
1059 				    ds->ds_phys->ds_prev_snap_txg &&
1060 				    bp.blk_birth <=
1061 				    ds->ds_phys->ds_creation_txg) {
1062 					ds_next->ds_phys->ds_unique_bytes +=
1063 					    BP_GET_ASIZE(&bp);
1064 				}
1065 			}
1066 
1067 			dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
1068 			ASSERT3P(ds_next->ds_prev, ==, NULL);
1069 		} else {
1070 			/*
1071 			 * It would be nice to update the head dataset's
1072 			 * unique.  To do so we would have to traverse
1073 			 * it for blocks born after ds_prev, which is
1074 			 * pretty expensive just to maintain something
1075 			 * for debugging purposes.
1076 			 */
1077 			ASSERT3P(ds_next->ds_prev, ==, ds);
1078 			dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
1079 			    ds_next);
1080 			if (ds_prev) {
1081 				ds_next->ds_prev = dsl_dataset_open_obj(
1082 				    dd->dd_pool, ds->ds_phys->ds_prev_snap_obj,
1083 				    NULL, DS_MODE_NONE, ds_next);
1084 			} else {
1085 				ds_next->ds_prev = NULL;
1086 			}
1087 		}
1088 		dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
1089 
1090 		/*
1091 		 * NB: unique_bytes is not accurate for head objsets
1092 		 * because we don't update it when we delete the most
1093 		 * recent snapshot -- see above comment.
1094 		 */
1095 		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
1096 	} else {
1097 		/*
1098 		 * There's no next snapshot, so this is a head dataset.
1099 		 * Destroy the deadlist.  Unless it's a clone, the
1100 		 * deadlist should be empty.  (If it's a clone, it's
1101 		 * safe to ignore the deadlist contents.)
1102 		 */
1103 		struct killarg ka;
1104 
1105 		ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
1106 		bplist_close(&ds->ds_deadlist);
1107 		bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
1108 		ds->ds_phys->ds_deadlist_obj = 0;
1109 
1110 		/*
1111 		 * Free everything that we point to (that's born after
1112 		 * the previous snapshot, if we are a clone)
1113 		 *
1114 		 * XXX we're doing this long task with the config lock held
1115 		 */
1116 		ka.usedp = &used;
1117 		ka.compressedp = &compressed;
1118 		ka.uncompressedp = &uncompressed;
1119 		ka.zio = zio;
1120 		ka.tx = tx;
1121 		err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
1122 		    ADVANCE_POST, kill_blkptr, &ka);
1123 		ASSERT3U(err, ==, 0);
1124 	}
1125 
1126 	err = zio_wait(zio);
1127 	ASSERT3U(err, ==, 0);
1128 
1129 	dsl_dir_diduse_space(dd, -used, -compressed, -uncompressed, tx);
1130 
1131 	if (ds->ds_phys->ds_snapnames_zapobj) {
1132 		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1133 		ASSERT(err == 0);
1134 	}
1135 
1136 	if (dd->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1137 		/* Erase the link in the dataset */
1138 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
1139 		dd->dd_phys->dd_head_dataset_obj = 0;
1140 		/*
1141 		 * dsl_dir_sync_destroy() called us, they'll destroy
1142 		 * the dataset.
1143 		 */
1144 	} else {
1145 		/* remove from snapshot namespace */
1146 		dsl_dataset_t *ds_head;
1147 		ds_head = dsl_dataset_open_obj(dd->dd_pool,
1148 		    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
1149 #ifdef ZFS_DEBUG
1150 		{
1151 			uint64_t val;
1152 			err = zap_lookup(mos,
1153 			    ds_head->ds_phys->ds_snapnames_zapobj,
1154 			    snapname, 8, 1, &val);
1155 			ASSERT3U(err, ==, 0);
1156 			ASSERT3U(val, ==, obj);
1157 		}
1158 #endif
1159 		err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
1160 		    snapname, tx);
1161 		ASSERT(err == 0);
1162 		dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
1163 	}
1164 
1165 	if (ds_prev && ds->ds_prev != ds_prev)
1166 		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
1167 
1168 	err = dmu_object_free(mos, obj, tx);
1169 	ASSERT(err == 0);
1170 
1171 	/*
1172 	 * Close the objset with mode NONE, thus leaving it with
1173 	 * DOS_REF_MAX set, so that noone can access it.
1174 	 */
1175 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
1176 
1177 	if (drop_lock)
1178 		rw_exit(&dp->dp_config_rwlock);
1179 	return (0);
1180 }
1181 
1182 int
1183 dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
1184 {
1185 	const char *snapname = arg;
1186 	dsl_pool_t *dp = dd->dd_pool;
1187 	dmu_buf_t *dbuf;
1188 	dsl_dataset_phys_t *dsphys;
1189 	uint64_t dsobj, value;
1190 	objset_t *mos = dp->dp_meta_objset;
1191 	dsl_dataset_t *ds;
1192 	int err;
1193 
1194 	ASSERT(dmu_tx_is_syncing(tx));
1195 
1196 	if (dd->dd_phys->dd_head_dataset_obj == 0)
1197 		return (EINVAL);
1198 	ds = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL,
1199 	    DS_MODE_NONE, FTAG);
1200 
1201 	err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
1202 	    snapname, 8, 1, &value);
1203 	if (err == 0) {
1204 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
1205 		return (EEXIST);
1206 	}
1207 	ASSERT(err == ENOENT);
1208 
1209 	/* The point of no (unsuccessful) return */
1210 
1211 	dprintf_dd(dd, "taking snapshot %s in txg %llu\n",
1212 	    snapname, tx->tx_txg);
1213 
1214 	spa_scrub_restart(dp->dp_spa, tx->tx_txg);
1215 
1216 	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
1217 
1218 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
1219 	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
1220 	dbuf = dmu_bonus_hold(mos, dsobj);
1221 	dmu_buf_will_dirty(dbuf, tx);
1222 	dsphys = dbuf->db_data;
1223 	dsphys->ds_dir_obj = dd->dd_object;
1224 	dsphys->ds_fsid_guid = unique_create();
1225 	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
1226 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
1227 	    sizeof (dsphys->ds_guid));
1228 	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
1229 	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
1230 	dsphys->ds_next_snap_obj = ds->ds_object;
1231 	dsphys->ds_num_children = 1;
1232 	dsphys->ds_creation_time = gethrestime_sec();
1233 	dsphys->ds_creation_txg = tx->tx_txg;
1234 	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
1235 	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
1236 	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
1237 	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
1238 	dsphys->ds_restoring = ds->ds_phys->ds_restoring;
1239 	dsphys->ds_bp = ds->ds_phys->ds_bp;
1240 	dmu_buf_rele(dbuf);
1241 
1242 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
1243 		dsl_dataset_t *ds_prev;
1244 
1245 		ds_prev = dsl_dataset_open_obj(dp,
1246 		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_NONE, FTAG);
1247 		ASSERT(ds_prev->ds_phys->ds_next_snap_obj ==
1248 		    ds->ds_object ||
1249 		    ds_prev->ds_phys->ds_num_children > 1);
1250 		if (ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
1251 			dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1252 			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1253 			    ds_prev->ds_phys->ds_creation_txg);
1254 			ds_prev->ds_phys->ds_next_snap_obj = dsobj;
1255 		}
1256 		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
1257 	} else {
1258 		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 0);
1259 	}
1260 
1261 	bplist_close(&ds->ds_deadlist);
1262 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1263 	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
1264 	ds->ds_phys->ds_prev_snap_obj = dsobj;
1265 	ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
1266 	ds->ds_phys->ds_unique_bytes = 0;
1267 	ds->ds_phys->ds_deadlist_obj =
1268 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
1269 	bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1270 
1271 	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
1272 	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
1273 	    snapname, 8, 1, &dsobj, tx);
1274 	ASSERT(err == 0);
1275 
1276 	if (ds->ds_prev)
1277 		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
1278 	ds->ds_prev = dsl_dataset_open_obj(dp,
1279 	    ds->ds_phys->ds_prev_snap_obj, snapname, DS_MODE_NONE, ds);
1280 
1281 	rw_exit(&dp->dp_config_rwlock);
1282 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
1283 
1284 	return (0);
1285 }
1286 
1287 void
1288 dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx)
1289 {
1290 	ASSERT(dmu_tx_is_syncing(tx));
1291 	ASSERT(ds->ds_user_ptr != NULL);
1292 	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
1293 
1294 	dmu_objset_sync(ds->ds_user_ptr, tx);
1295 	dsl_dir_dirty(ds->ds_dir, tx);
1296 	bplist_close(&ds->ds_deadlist);
1297 
1298 	dmu_buf_remove_ref(ds->ds_dbuf, ds);
1299 }
1300 
1301 void
1302 dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds)
1303 {
1304 	/* fill in properties crap */
1305 	dsl_dir_stats(ds->ds_dir, dds);
1306 
1307 	if (ds->ds_phys->ds_num_children != 0) {
1308 		dds->dds_is_snapshot = TRUE;
1309 		dds->dds_num_clones = ds->ds_phys->ds_num_children - 1;
1310 	}
1311 
1312 	dds->dds_last_txg = ds->ds_phys->ds_bp.blk_birth;
1313 
1314 	dds->dds_objects_used = ds->ds_phys->ds_bp.blk_fill;
1315 	dds->dds_objects_avail = DN_MAX_OBJECT - dds->dds_objects_used;
1316 
1317 	/* We override the dataset's creation time... they should be the same */
1318 	dds->dds_creation_time = ds->ds_phys->ds_creation_time;
1319 	dds->dds_creation_txg = ds->ds_phys->ds_creation_txg;
1320 	dds->dds_space_refd = ds->ds_phys->ds_used_bytes;
1321 	dds->dds_fsid_guid = ds->ds_phys->ds_fsid_guid;
1322 	dds->dds_guid = ds->ds_phys->ds_guid;
1323 
1324 	if (ds->ds_phys->ds_next_snap_obj) {
1325 		/*
1326 		 * This is a snapshot; override the dd's space used with
1327 		 * our unique space
1328 		 */
1329 		dds->dds_space_used = ds->ds_phys->ds_unique_bytes;
1330 		dds->dds_compressed_bytes =
1331 		    ds->ds_phys->ds_compressed_bytes;
1332 		dds->dds_uncompressed_bytes =
1333 		    ds->ds_phys->ds_uncompressed_bytes;
1334 	}
1335 
1336 	dds->dds_objset_obj = ds->ds_object;
1337 }
1338 
1339 dsl_pool_t *
1340 dsl_dataset_pool(dsl_dataset_t *ds)
1341 {
1342 	return (ds->ds_dir->dd_pool);
1343 }
1344 
/*
 * Argument bundle handed to dsl_dataset_snapshot_rename_sync().
 * Neither string is copied or modified by the sync task.
 */
struct osrenamearg {
	const char *oldname;	/* full current name of the snapshot */
	const char *newname;	/* full requested name */
};
1349 
1350 static int
1351 dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
1352 {
1353 	struct osrenamearg *ora = arg;
1354 	objset_t *mos = dd->dd_pool->dp_meta_objset;
1355 	dsl_dir_t *nds;
1356 	const char *tail;
1357 	int err;
1358 	dsl_dataset_t *snds, *fsds;
1359 	uint64_t val;
1360 
1361 	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, ora->oldname,
1362 	    DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &snds);
1363 	if (err)
1364 		return (err);
1365 
1366 	if (snds->ds_dir != dd) {
1367 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1368 		return (EINVAL);
1369 	}
1370 
1371 	/* better be changing a snapshot */
1372 	if (snds->ds_phys->ds_next_snap_obj == 0) {
1373 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1374 		return (EINVAL);
1375 	}
1376 
1377 	/* new fs better exist */
1378 	nds = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, FTAG, &tail);
1379 	if (nds == NULL) {
1380 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1381 		return (ENOENT);
1382 	}
1383 
1384 	dsl_dir_close(nds, FTAG);
1385 
1386 	/* new name better be in same fs */
1387 	if (nds != dd) {
1388 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1389 		return (EINVAL);
1390 	}
1391 
1392 	/* new name better be a snapshot */
1393 	if (tail == NULL || tail[0] != '@') {
1394 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1395 		return (EINVAL);
1396 	}
1397 
1398 	tail++;
1399 
1400 	fsds = dsl_dataset_open_obj(dd->dd_pool,
1401 	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
1402 
1403 	/* new name better not be in use */
1404 	err = zap_lookup(mos, fsds->ds_phys->ds_snapnames_zapobj,
1405 	    tail, 8, 1, &val);
1406 	if (err != ENOENT) {
1407 		if (err == 0)
1408 			err = EEXIST;
1409 		dsl_dataset_close(fsds, DS_MODE_NONE, FTAG);
1410 		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1411 		return (EEXIST);
1412 	}
1413 
1414 	/* The point of no (unsuccessful) return */
1415 
1416 	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);
1417 	dsl_dataset_get_snapname(snds);
1418 	err = zap_remove(mos, fsds->ds_phys->ds_snapnames_zapobj,
1419 	    snds->ds_snapname, tx);
1420 	ASSERT3U(err, ==, 0);
1421 	mutex_enter(&snds->ds_lock);
1422 	(void) strcpy(snds->ds_snapname, tail);
1423 	mutex_exit(&snds->ds_lock);
1424 	err = zap_add(mos, fsds->ds_phys->ds_snapnames_zapobj,
1425 	    snds->ds_snapname, 8, 1, &snds->ds_object, tx);
1426 	ASSERT3U(err, ==, 0);
1427 	rw_exit(&dd->dd_pool->dp_config_rwlock);
1428 
1429 	dsl_dataset_close(fsds, DS_MODE_NONE, FTAG);
1430 	dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
1431 	return (0);
1432 }
1433 
1434 #pragma weak dmu_objset_rename = dsl_dataset_rename
1435 int
1436 dsl_dataset_rename(const char *osname, const char *newname)
1437 {
1438 	dsl_dir_t *dd;
1439 	const char *tail;
1440 	struct osrenamearg ora;
1441 	int err;
1442 
1443 	dd = dsl_dir_open(osname, FTAG, &tail);
1444 	if (dd == NULL)
1445 		return (ENOENT);
1446 	if (tail == NULL) {
1447 		err = dsl_dir_sync_task(dd,
1448 		    dsl_dir_rename_sync, (void*)newname, 1<<12);
1449 		dsl_dir_close(dd, FTAG);
1450 		return (err);
1451 	}
1452 	if (tail[0] != '@') {
1453 		/* the name ended in a nonexistant component */
1454 		dsl_dir_close(dd, FTAG);
1455 		return (ENOENT);
1456 	}
1457 
1458 	ora.oldname = osname;
1459 	ora.newname = newname;
1460 
1461 	err = dsl_dir_sync_task(dd,
1462 	    dsl_dataset_snapshot_rename_sync, &ora, 1<<12);
1463 	dsl_dir_close(dd, FTAG);
1464 	return (err);
1465 }
1466