xref: /titanic_41/usr/src/uts/common/fs/zfs/dsl_dataset.c (revision 3dabdd6ebae8bdf3ae7bc5787556261a9a12a2b6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013 by Delphix. All rights reserved.
24  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25  */
26 
27 #include <sys/dmu_objset.h>
28 #include <sys/dsl_dataset.h>
29 #include <sys/dsl_dir.h>
30 #include <sys/dsl_prop.h>
31 #include <sys/dsl_synctask.h>
32 #include <sys/dmu_traverse.h>
33 #include <sys/dmu_impl.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/arc.h>
36 #include <sys/zio.h>
37 #include <sys/zap.h>
38 #include <sys/zfeature.h>
39 #include <sys/unique.h>
40 #include <sys/zfs_context.h>
41 #include <sys/zfs_ioctl.h>
42 #include <sys/spa.h>
43 #include <sys/zfs_znode.h>
44 #include <sys/zfs_onexit.h>
45 #include <sys/zvol.h>
46 #include <sys/dsl_scan.h>
47 #include <sys/dsl_deadlist.h>
48 #include <sys/dsl_destroy.h>
49 #include <sys/dsl_userhold.h>
50 #include <sys/dsl_bookmark.h>
51 
/*
 * Exchange the values of two uint64_t lvalues.
 *
 * The expansion is wrapped in do { } while (0) so that "SWITCH64(a, b);"
 * is exactly one statement and can be used as the body of an unbraced
 * if/else.  Always invoke it with a trailing semicolon.  Each argument is
 * evaluated twice, so do not pass expressions with side effects.
 */
#define	SWITCH64(x, y) \
	do { \
		uint64_t switch64_tmp_ = (x); \
		(x) = (y); \
		(y) = switch64_tmp_; \
	} while (0)

/* Reference-count constant (2^62); not referenced in this part of the file. */
#define	DS_REF_MAX	(1ULL << 62)

/* Block size used for deadlists: the SPA's maximum block size. */
#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
62 
63 /*
64  * Figure out how much of this delta should be propogated to the dsl_dir
65  * layer.  If there's a refreservation, that space has already been
66  * partially accounted for in our ancestors.
67  */
68 static int64_t
69 parent_delta(dsl_dataset_t *ds, int64_t delta)
70 {
71 	uint64_t old_bytes, new_bytes;
72 
73 	if (ds->ds_reserved == 0)
74 		return (delta);
75 
76 	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
77 	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
78 
79 	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
80 	return (new_bytes - old_bytes);
81 }
82 
83 void
84 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
85 {
86 	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
87 	int compressed = BP_GET_PSIZE(bp);
88 	int uncompressed = BP_GET_UCSIZE(bp);
89 	int64_t delta;
90 
91 	dprintf_bp(bp, "ds=%p", ds);
92 
93 	ASSERT(dmu_tx_is_syncing(tx));
94 	/* It could have been compressed away to nothing */
95 	if (BP_IS_HOLE(bp))
96 		return;
97 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
98 	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
99 	if (ds == NULL) {
100 		dsl_pool_mos_diduse_space(tx->tx_pool,
101 		    used, compressed, uncompressed);
102 		return;
103 	}
104 
105 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
106 	mutex_enter(&ds->ds_lock);
107 	delta = parent_delta(ds, used);
108 	ds->ds_phys->ds_referenced_bytes += used;
109 	ds->ds_phys->ds_compressed_bytes += compressed;
110 	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
111 	ds->ds_phys->ds_unique_bytes += used;
112 	mutex_exit(&ds->ds_lock);
113 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
114 	    compressed, uncompressed, tx);
115 	dsl_dir_transfer_space(ds->ds_dir, used - delta,
116 	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
117 }
118 
119 int
120 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
121     boolean_t async)
122 {
123 	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
124 	int compressed = BP_GET_PSIZE(bp);
125 	int uncompressed = BP_GET_UCSIZE(bp);
126 
127 	if (BP_IS_HOLE(bp))
128 		return (0);
129 
130 	ASSERT(dmu_tx_is_syncing(tx));
131 	ASSERT(bp->blk_birth <= tx->tx_txg);
132 
133 	if (ds == NULL) {
134 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
135 		dsl_pool_mos_diduse_space(tx->tx_pool,
136 		    -used, -compressed, -uncompressed);
137 		return (used);
138 	}
139 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
140 
141 	ASSERT(!dsl_dataset_is_snapshot(ds));
142 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
143 
144 	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
145 		int64_t delta;
146 
147 		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
148 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
149 
150 		mutex_enter(&ds->ds_lock);
151 		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
152 		    !DS_UNIQUE_IS_ACCURATE(ds));
153 		delta = parent_delta(ds, -used);
154 		ds->ds_phys->ds_unique_bytes -= used;
155 		mutex_exit(&ds->ds_lock);
156 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
157 		    delta, -compressed, -uncompressed, tx);
158 		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
159 		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
160 	} else {
161 		dprintf_bp(bp, "putting on dead list: %s", "");
162 		if (async) {
163 			/*
164 			 * We are here as part of zio's write done callback,
165 			 * which means we're a zio interrupt thread.  We can't
166 			 * call dsl_deadlist_insert() now because it may block
167 			 * waiting for I/O.  Instead, put bp on the deferred
168 			 * queue and let dsl_pool_sync() finish the job.
169 			 */
170 			bplist_append(&ds->ds_pending_deadlist, bp);
171 		} else {
172 			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
173 		}
174 		ASSERT3U(ds->ds_prev->ds_object, ==,
175 		    ds->ds_phys->ds_prev_snap_obj);
176 		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
177 		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
178 		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
179 		    ds->ds_object && bp->blk_birth >
180 		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
181 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
182 			mutex_enter(&ds->ds_prev->ds_lock);
183 			ds->ds_prev->ds_phys->ds_unique_bytes += used;
184 			mutex_exit(&ds->ds_prev->ds_lock);
185 		}
186 		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
187 			dsl_dir_transfer_space(ds->ds_dir, used,
188 			    DD_USED_HEAD, DD_USED_SNAP, tx);
189 		}
190 	}
191 	mutex_enter(&ds->ds_lock);
192 	ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
193 	ds->ds_phys->ds_referenced_bytes -= used;
194 	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
195 	ds->ds_phys->ds_compressed_bytes -= compressed;
196 	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
197 	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
198 	mutex_exit(&ds->ds_lock);
199 
200 	return (used);
201 }
202 
203 uint64_t
204 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
205 {
206 	uint64_t trysnap = 0;
207 
208 	if (ds == NULL)
209 		return (0);
210 	/*
211 	 * The snapshot creation could fail, but that would cause an
212 	 * incorrect FALSE return, which would only result in an
213 	 * overestimation of the amount of space that an operation would
214 	 * consume, which is OK.
215 	 *
216 	 * There's also a small window where we could miss a pending
217 	 * snapshot, because we could set the sync task in the quiescing
218 	 * phase.  So this should only be used as a guess.
219 	 */
220 	if (ds->ds_trysnap_txg >
221 	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
222 		trysnap = ds->ds_trysnap_txg;
223 	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
224 }
225 
226 boolean_t
227 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
228     uint64_t blk_birth)
229 {
230 	if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
231 	    (bp != NULL && BP_IS_HOLE(bp)))
232 		return (B_FALSE);
233 
234 	ddt_prefetch(dsl_dataset_get_spa(ds), bp);
235 
236 	return (B_TRUE);
237 }
238 
239 /* ARGSUSED */
240 static void
241 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
242 {
243 	dsl_dataset_t *ds = dsv;
244 
245 	ASSERT(ds->ds_owner == NULL);
246 
247 	unique_remove(ds->ds_fsid_guid);
248 
249 	if (ds->ds_objset != NULL)
250 		dmu_objset_evict(ds->ds_objset);
251 
252 	if (ds->ds_prev) {
253 		dsl_dataset_rele(ds->ds_prev, ds);
254 		ds->ds_prev = NULL;
255 	}
256 
257 	bplist_destroy(&ds->ds_pending_deadlist);
258 	if (ds->ds_phys->ds_deadlist_obj != 0)
259 		dsl_deadlist_close(&ds->ds_deadlist);
260 	if (ds->ds_dir)
261 		dsl_dir_rele(ds->ds_dir, ds);
262 
263 	ASSERT(!list_link_active(&ds->ds_synced_link));
264 
265 	mutex_destroy(&ds->ds_lock);
266 	mutex_destroy(&ds->ds_opening_lock);
267 	refcount_destroy(&ds->ds_longholds);
268 
269 	kmem_free(ds, sizeof (dsl_dataset_t));
270 }
271 
272 int
273 dsl_dataset_get_snapname(dsl_dataset_t *ds)
274 {
275 	dsl_dataset_phys_t *headphys;
276 	int err;
277 	dmu_buf_t *headdbuf;
278 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
279 	objset_t *mos = dp->dp_meta_objset;
280 
281 	if (ds->ds_snapname[0])
282 		return (0);
283 	if (ds->ds_phys->ds_next_snap_obj == 0)
284 		return (0);
285 
286 	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
287 	    FTAG, &headdbuf);
288 	if (err != 0)
289 		return (err);
290 	headphys = headdbuf->db_data;
291 	err = zap_value_search(dp->dp_meta_objset,
292 	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
293 	dmu_buf_rele(headdbuf, FTAG);
294 	return (err);
295 }
296 
297 int
298 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
299 {
300 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
301 	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
302 	matchtype_t mt;
303 	int err;
304 
305 	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
306 		mt = MT_FIRST;
307 	else
308 		mt = MT_EXACT;
309 
310 	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
311 	    value, mt, NULL, 0, NULL);
312 	if (err == ENOTSUP && mt == MT_FIRST)
313 		err = zap_lookup(mos, snapobj, name, 8, 1, value);
314 	return (err);
315 }
316 
317 int
318 dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
319 {
320 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
321 	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
322 	matchtype_t mt;
323 	int err;
324 
325 	dsl_dir_snap_cmtime_update(ds->ds_dir);
326 
327 	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
328 		mt = MT_FIRST;
329 	else
330 		mt = MT_EXACT;
331 
332 	err = zap_remove_norm(mos, snapobj, name, mt, tx);
333 	if (err == ENOTSUP && mt == MT_FIRST)
334 		err = zap_remove(mos, snapobj, name, tx);
335 	return (err);
336 }
337 
338 int
339 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
340     dsl_dataset_t **dsp)
341 {
342 	objset_t *mos = dp->dp_meta_objset;
343 	dmu_buf_t *dbuf;
344 	dsl_dataset_t *ds;
345 	int err;
346 	dmu_object_info_t doi;
347 
348 	ASSERT(dsl_pool_config_held(dp));
349 
350 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
351 	if (err != 0)
352 		return (err);
353 
354 	/* Make sure dsobj has the correct object type. */
355 	dmu_object_info_from_db(dbuf, &doi);
356 	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
357 		dmu_buf_rele(dbuf, tag);
358 		return (SET_ERROR(EINVAL));
359 	}
360 
361 	ds = dmu_buf_get_user(dbuf);
362 	if (ds == NULL) {
363 		dsl_dataset_t *winner = NULL;
364 
365 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
366 		ds->ds_dbuf = dbuf;
367 		ds->ds_object = dsobj;
368 		ds->ds_phys = dbuf->db_data;
369 
370 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
371 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
372 		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
373 		refcount_create(&ds->ds_longholds);
374 
375 		bplist_create(&ds->ds_pending_deadlist);
376 		dsl_deadlist_open(&ds->ds_deadlist,
377 		    mos, ds->ds_phys->ds_deadlist_obj);
378 
379 		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
380 		    offsetof(dmu_sendarg_t, dsa_link));
381 
382 		if (err == 0) {
383 			err = dsl_dir_hold_obj(dp,
384 			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
385 		}
386 		if (err != 0) {
387 			mutex_destroy(&ds->ds_lock);
388 			mutex_destroy(&ds->ds_opening_lock);
389 			refcount_destroy(&ds->ds_longholds);
390 			bplist_destroy(&ds->ds_pending_deadlist);
391 			dsl_deadlist_close(&ds->ds_deadlist);
392 			kmem_free(ds, sizeof (dsl_dataset_t));
393 			dmu_buf_rele(dbuf, tag);
394 			return (err);
395 		}
396 
397 		if (!dsl_dataset_is_snapshot(ds)) {
398 			ds->ds_snapname[0] = '\0';
399 			if (ds->ds_phys->ds_prev_snap_obj != 0) {
400 				err = dsl_dataset_hold_obj(dp,
401 				    ds->ds_phys->ds_prev_snap_obj,
402 				    ds, &ds->ds_prev);
403 			}
404 			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
405 				int zaperr = zap_lookup(mos, ds->ds_object,
406 				    DS_FIELD_BOOKMARK_NAMES,
407 				    sizeof (ds->ds_bookmarks), 1,
408 				    &ds->ds_bookmarks);
409 				if (zaperr != ENOENT)
410 					VERIFY0(zaperr);
411 			}
412 		} else {
413 			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
414 				err = dsl_dataset_get_snapname(ds);
415 			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
416 				err = zap_count(
417 				    ds->ds_dir->dd_pool->dp_meta_objset,
418 				    ds->ds_phys->ds_userrefs_obj,
419 				    &ds->ds_userrefs);
420 			}
421 		}
422 
423 		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
424 			err = dsl_prop_get_int_ds(ds,
425 			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
426 			    &ds->ds_reserved);
427 			if (err == 0) {
428 				err = dsl_prop_get_int_ds(ds,
429 				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
430 				    &ds->ds_quota);
431 			}
432 		} else {
433 			ds->ds_reserved = ds->ds_quota = 0;
434 		}
435 
436 		if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
437 		    &ds->ds_phys, dsl_dataset_evict)) != NULL) {
438 			bplist_destroy(&ds->ds_pending_deadlist);
439 			dsl_deadlist_close(&ds->ds_deadlist);
440 			if (ds->ds_prev)
441 				dsl_dataset_rele(ds->ds_prev, ds);
442 			dsl_dir_rele(ds->ds_dir, ds);
443 			mutex_destroy(&ds->ds_lock);
444 			mutex_destroy(&ds->ds_opening_lock);
445 			refcount_destroy(&ds->ds_longholds);
446 			kmem_free(ds, sizeof (dsl_dataset_t));
447 			if (err != 0) {
448 				dmu_buf_rele(dbuf, tag);
449 				return (err);
450 			}
451 			ds = winner;
452 		} else {
453 			ds->ds_fsid_guid =
454 			    unique_insert(ds->ds_phys->ds_fsid_guid);
455 		}
456 	}
457 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
458 	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
459 	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
460 	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
461 	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
462 	*dsp = ds;
463 	return (0);
464 }
465 
466 int
467 dsl_dataset_hold(dsl_pool_t *dp, const char *name,
468     void *tag, dsl_dataset_t **dsp)
469 {
470 	dsl_dir_t *dd;
471 	const char *snapname;
472 	uint64_t obj;
473 	int err = 0;
474 
475 	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
476 	if (err != 0)
477 		return (err);
478 
479 	ASSERT(dsl_pool_config_held(dp));
480 	obj = dd->dd_phys->dd_head_dataset_obj;
481 	if (obj != 0)
482 		err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
483 	else
484 		err = SET_ERROR(ENOENT);
485 
486 	/* we may be looking for a snapshot */
487 	if (err == 0 && snapname != NULL) {
488 		dsl_dataset_t *ds;
489 
490 		if (*snapname++ != '@') {
491 			dsl_dataset_rele(*dsp, tag);
492 			dsl_dir_rele(dd, FTAG);
493 			return (SET_ERROR(ENOENT));
494 		}
495 
496 		dprintf("looking for snapshot '%s'\n", snapname);
497 		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
498 		if (err == 0)
499 			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
500 		dsl_dataset_rele(*dsp, tag);
501 
502 		if (err == 0) {
503 			mutex_enter(&ds->ds_lock);
504 			if (ds->ds_snapname[0] == 0)
505 				(void) strlcpy(ds->ds_snapname, snapname,
506 				    sizeof (ds->ds_snapname));
507 			mutex_exit(&ds->ds_lock);
508 			*dsp = ds;
509 		}
510 	}
511 
512 	dsl_dir_rele(dd, FTAG);
513 	return (err);
514 }
515 
516 int
517 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
518     void *tag, dsl_dataset_t **dsp)
519 {
520 	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
521 	if (err != 0)
522 		return (err);
523 	if (!dsl_dataset_tryown(*dsp, tag)) {
524 		dsl_dataset_rele(*dsp, tag);
525 		*dsp = NULL;
526 		return (SET_ERROR(EBUSY));
527 	}
528 	return (0);
529 }
530 
531 int
532 dsl_dataset_own(dsl_pool_t *dp, const char *name,
533     void *tag, dsl_dataset_t **dsp)
534 {
535 	int err = dsl_dataset_hold(dp, name, tag, dsp);
536 	if (err != 0)
537 		return (err);
538 	if (!dsl_dataset_tryown(*dsp, tag)) {
539 		dsl_dataset_rele(*dsp, tag);
540 		return (SET_ERROR(EBUSY));
541 	}
542 	return (0);
543 }
544 
545 /*
546  * See the comment above dsl_pool_hold() for details.  In summary, a long
547  * hold is used to prevent destruction of a dataset while the pool hold
548  * is dropped, allowing other concurrent operations (e.g. spa_sync()).
549  *
550  * The dataset and pool must be held when this function is called.  After it
551  * is called, the pool hold may be released while the dataset is still held
552  * and accessed.
553  */
554 void
555 dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
556 {
557 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
558 	(void) refcount_add(&ds->ds_longholds, tag);
559 }
560 
561 void
562 dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
563 {
564 	(void) refcount_remove(&ds->ds_longholds, tag);
565 }
566 
567 /* Return B_TRUE if there are any long holds on this dataset. */
568 boolean_t
569 dsl_dataset_long_held(dsl_dataset_t *ds)
570 {
571 	return (!refcount_is_zero(&ds->ds_longholds));
572 }
573 
574 void
575 dsl_dataset_name(dsl_dataset_t *ds, char *name)
576 {
577 	if (ds == NULL) {
578 		(void) strcpy(name, "mos");
579 	} else {
580 		dsl_dir_name(ds->ds_dir, name);
581 		VERIFY0(dsl_dataset_get_snapname(ds));
582 		if (ds->ds_snapname[0]) {
583 			(void) strcat(name, "@");
584 			/*
585 			 * We use a "recursive" mutex so that we
586 			 * can call dprintf_ds() with ds_lock held.
587 			 */
588 			if (!MUTEX_HELD(&ds->ds_lock)) {
589 				mutex_enter(&ds->ds_lock);
590 				(void) strcat(name, ds->ds_snapname);
591 				mutex_exit(&ds->ds_lock);
592 			} else {
593 				(void) strcat(name, ds->ds_snapname);
594 			}
595 		}
596 	}
597 }
598 
599 void
600 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
601 {
602 	dmu_buf_rele(ds->ds_dbuf, tag);
603 }
604 
605 void
606 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
607 {
608 	ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL);
609 
610 	mutex_enter(&ds->ds_lock);
611 	ds->ds_owner = NULL;
612 	mutex_exit(&ds->ds_lock);
613 	dsl_dataset_long_rele(ds, tag);
614 	if (ds->ds_dbuf != NULL)
615 		dsl_dataset_rele(ds, tag);
616 	else
617 		dsl_dataset_evict(NULL, ds);
618 }
619 
620 boolean_t
621 dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
622 {
623 	boolean_t gotit = FALSE;
624 
625 	mutex_enter(&ds->ds_lock);
626 	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
627 		ds->ds_owner = tag;
628 		dsl_dataset_long_hold(ds, tag);
629 		gotit = TRUE;
630 	}
631 	mutex_exit(&ds->ds_lock);
632 	return (gotit);
633 }
634 
635 uint64_t
636 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
637     uint64_t flags, dmu_tx_t *tx)
638 {
639 	dsl_pool_t *dp = dd->dd_pool;
640 	dmu_buf_t *dbuf;
641 	dsl_dataset_phys_t *dsphys;
642 	uint64_t dsobj;
643 	objset_t *mos = dp->dp_meta_objset;
644 
645 	if (origin == NULL)
646 		origin = dp->dp_origin_snap;
647 
648 	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
649 	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
650 	ASSERT(dmu_tx_is_syncing(tx));
651 	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
652 
653 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
654 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
655 	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
656 	dmu_buf_will_dirty(dbuf, tx);
657 	dsphys = dbuf->db_data;
658 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
659 	dsphys->ds_dir_obj = dd->dd_object;
660 	dsphys->ds_flags = flags;
661 	dsphys->ds_fsid_guid = unique_create();
662 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
663 	    sizeof (dsphys->ds_guid));
664 	dsphys->ds_snapnames_zapobj =
665 	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
666 	    DMU_OT_NONE, 0, tx);
667 	dsphys->ds_creation_time = gethrestime_sec();
668 	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
669 
670 	if (origin == NULL) {
671 		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
672 	} else {
673 		dsl_dataset_t *ohds; /* head of the origin snapshot */
674 
675 		dsphys->ds_prev_snap_obj = origin->ds_object;
676 		dsphys->ds_prev_snap_txg =
677 		    origin->ds_phys->ds_creation_txg;
678 		dsphys->ds_referenced_bytes =
679 		    origin->ds_phys->ds_referenced_bytes;
680 		dsphys->ds_compressed_bytes =
681 		    origin->ds_phys->ds_compressed_bytes;
682 		dsphys->ds_uncompressed_bytes =
683 		    origin->ds_phys->ds_uncompressed_bytes;
684 		dsphys->ds_bp = origin->ds_phys->ds_bp;
685 		dsphys->ds_flags |= origin->ds_phys->ds_flags;
686 
687 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
688 		origin->ds_phys->ds_num_children++;
689 
690 		VERIFY0(dsl_dataset_hold_obj(dp,
691 		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
692 		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
693 		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
694 		dsl_dataset_rele(ohds, FTAG);
695 
696 		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
697 			if (origin->ds_phys->ds_next_clones_obj == 0) {
698 				origin->ds_phys->ds_next_clones_obj =
699 				    zap_create(mos,
700 				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
701 			}
702 			VERIFY0(zap_add_int(mos,
703 			    origin->ds_phys->ds_next_clones_obj, dsobj, tx));
704 		}
705 
706 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
707 		dd->dd_phys->dd_origin_obj = origin->ds_object;
708 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
709 			if (origin->ds_dir->dd_phys->dd_clones == 0) {
710 				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
711 				origin->ds_dir->dd_phys->dd_clones =
712 				    zap_create(mos,
713 				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
714 			}
715 			VERIFY0(zap_add_int(mos,
716 			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
717 		}
718 	}
719 
720 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
721 		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
722 
723 	dmu_buf_rele(dbuf, FTAG);
724 
725 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
726 	dd->dd_phys->dd_head_dataset_obj = dsobj;
727 
728 	return (dsobj);
729 }
730 
731 static void
732 dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
733 {
734 	objset_t *os;
735 
736 	VERIFY0(dmu_objset_from_ds(ds, &os));
737 	bzero(&os->os_zil_header, sizeof (os->os_zil_header));
738 	dsl_dataset_dirty(ds, tx);
739 }
740 
741 uint64_t
742 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
743     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
744 {
745 	dsl_pool_t *dp = pdd->dd_pool;
746 	uint64_t dsobj, ddobj;
747 	dsl_dir_t *dd;
748 
749 	ASSERT(dmu_tx_is_syncing(tx));
750 	ASSERT(lastname[0] != '@');
751 
752 	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
753 	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
754 
755 	dsobj = dsl_dataset_create_sync_dd(dd, origin,
756 	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
757 
758 	dsl_deleg_set_create_perms(dd, tx, cr);
759 
760 	dsl_dir_rele(dd, FTAG);
761 
762 	/*
763 	 * If we are creating a clone, make sure we zero out any stale
764 	 * data from the origin snapshots zil header.
765 	 */
766 	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
767 		dsl_dataset_t *ds;
768 
769 		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
770 		dsl_dataset_zero_zil(ds, tx);
771 		dsl_dataset_rele(ds, FTAG);
772 	}
773 
774 	return (dsobj);
775 }
776 
777 /*
778  * The unique space in the head dataset can be calculated by subtracting
779  * the space used in the most recent snapshot, that is still being used
780  * in this file system, from the space currently in use.  To figure out
781  * the space in the most recent snapshot still in use, we need to take
782  * the total space used in the snapshot and subtract out the space that
783  * has been freed up since the snapshot was taken.
784  */
785 void
786 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
787 {
788 	uint64_t mrs_used;
789 	uint64_t dlused, dlcomp, dluncomp;
790 
791 	ASSERT(!dsl_dataset_is_snapshot(ds));
792 
793 	if (ds->ds_phys->ds_prev_snap_obj != 0)
794 		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
795 	else
796 		mrs_used = 0;
797 
798 	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
799 
800 	ASSERT3U(dlused, <=, mrs_used);
801 	ds->ds_phys->ds_unique_bytes =
802 	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
803 
804 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
805 	    SPA_VERSION_UNIQUE_ACCURATE)
806 		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
807 }
808 
809 void
810 dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
811     dmu_tx_t *tx)
812 {
813 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
814 	uint64_t count;
815 	int err;
816 
817 	ASSERT(ds->ds_phys->ds_num_children >= 2);
818 	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
819 	/*
820 	 * The err should not be ENOENT, but a bug in a previous version
821 	 * of the code could cause upgrade_clones_cb() to not set
822 	 * ds_next_snap_obj when it should, leading to a missing entry.
823 	 * If we knew that the pool was created after
824 	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
825 	 * ENOENT.  However, at least we can check that we don't have
826 	 * too many entries in the next_clones_obj even after failing to
827 	 * remove this one.
828 	 */
829 	if (err != ENOENT)
830 		VERIFY0(err);
831 	ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
832 	    &count));
833 	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
834 }
835 
836 
837 blkptr_t *
838 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
839 {
840 	return (&ds->ds_phys->ds_bp);
841 }
842 
843 void
844 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
845 {
846 	ASSERT(dmu_tx_is_syncing(tx));
847 	/* If it's the meta-objset, set dp_meta_rootbp */
848 	if (ds == NULL) {
849 		tx->tx_pool->dp_meta_rootbp = *bp;
850 	} else {
851 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
852 		ds->ds_phys->ds_bp = *bp;
853 	}
854 }
855 
856 spa_t *
857 dsl_dataset_get_spa(dsl_dataset_t *ds)
858 {
859 	return (ds->ds_dir->dd_pool->dp_spa);
860 }
861 
862 void
863 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
864 {
865 	dsl_pool_t *dp;
866 
867 	if (ds == NULL) /* this is the meta-objset */
868 		return;
869 
870 	ASSERT(ds->ds_objset != NULL);
871 
872 	if (ds->ds_phys->ds_next_snap_obj != 0)
873 		panic("dirtying snapshot!");
874 
875 	dp = ds->ds_dir->dd_pool;
876 
877 	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
878 		/* up the hold count until we can be written out */
879 		dmu_buf_add_ref(ds->ds_dbuf, ds);
880 	}
881 }
882 
883 boolean_t
884 dsl_dataset_is_dirty(dsl_dataset_t *ds)
885 {
886 	for (int t = 0; t < TXG_SIZE; t++) {
887 		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
888 		    ds, t))
889 			return (B_TRUE);
890 	}
891 	return (B_FALSE);
892 }
893 
894 static int
895 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
896 {
897 	uint64_t asize;
898 
899 	if (!dmu_tx_is_syncing(tx))
900 		return (0);
901 
902 	/*
903 	 * If there's an fs-only reservation, any blocks that might become
904 	 * owned by the snapshot dataset must be accommodated by space
905 	 * outside of the reservation.
906 	 */
907 	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
908 	asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
909 	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
910 		return (SET_ERROR(ENOSPC));
911 
912 	/*
913 	 * Propagate any reserved space for this snapshot to other
914 	 * snapshot checks in this sync group.
915 	 */
916 	if (asize > 0)
917 		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
918 
919 	return (0);
920 }
921 
922 typedef struct dsl_dataset_snapshot_arg {
923 	nvlist_t *ddsa_snaps;
924 	nvlist_t *ddsa_props;
925 	nvlist_t *ddsa_errors;
926 } dsl_dataset_snapshot_arg_t;
927 
928 int
929 dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
930     dmu_tx_t *tx, boolean_t recv)
931 {
932 	int error;
933 	uint64_t value;
934 
935 	ds->ds_trysnap_txg = tx->tx_txg;
936 
937 	if (!dmu_tx_is_syncing(tx))
938 		return (0);
939 
940 	/*
941 	 * We don't allow multiple snapshots of the same txg.  If there
942 	 * is already one, try again.
943 	 */
944 	if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
945 		return (SET_ERROR(EAGAIN));
946 
947 	/*
948 	 * Check for conflicting snapshot name.
949 	 */
950 	error = dsl_dataset_snap_lookup(ds, snapname, &value);
951 	if (error == 0)
952 		return (SET_ERROR(EEXIST));
953 	if (error != ENOENT)
954 		return (error);
955 
956 	/*
957 	 * We don't allow taking snapshots of inconsistent datasets, such as
958 	 * those into which we are currently receiving.  However, if we are
959 	 * creating this snapshot as part of a receive, this check will be
960 	 * executed atomically with respect to the completion of the receive
961 	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
962 	 * case we ignore this, knowing it will be fixed up for us shortly in
963 	 * dmu_recv_end_sync().
964 	 */
965 	if (!recv && DS_IS_INCONSISTENT(ds))
966 		return (SET_ERROR(EBUSY));
967 
968 	error = dsl_dataset_snapshot_reserve_space(ds, tx);
969 	if (error != 0)
970 		return (error);
971 
972 	return (0);
973 }
974 
975 static int
976 dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
977 {
978 	dsl_dataset_snapshot_arg_t *ddsa = arg;
979 	dsl_pool_t *dp = dmu_tx_pool(tx);
980 	nvpair_t *pair;
981 	int rv = 0;
982 
983 	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
984 	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
985 		int error = 0;
986 		dsl_dataset_t *ds;
987 		char *name, *atp;
988 		char dsname[MAXNAMELEN];
989 
990 		name = nvpair_name(pair);
991 		if (strlen(name) >= MAXNAMELEN)
992 			error = SET_ERROR(ENAMETOOLONG);
993 		if (error == 0) {
994 			atp = strchr(name, '@');
995 			if (atp == NULL)
996 				error = SET_ERROR(EINVAL);
997 			if (error == 0)
998 				(void) strlcpy(dsname, name, atp - name + 1);
999 		}
1000 		if (error == 0)
1001 			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
1002 		if (error == 0) {
1003 			error = dsl_dataset_snapshot_check_impl(ds,
1004 			    atp + 1, tx, B_FALSE);
1005 			dsl_dataset_rele(ds, FTAG);
1006 		}
1007 
1008 		if (error != 0) {
1009 			if (ddsa->ddsa_errors != NULL) {
1010 				fnvlist_add_int32(ddsa->ddsa_errors,
1011 				    name, error);
1012 			}
1013 			rv = error;
1014 		}
1015 	}
1016 	return (rv);
1017 }
1018 
1019 void
1020 dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
1021     dmu_tx_t *tx)
1022 {
1023 	static zil_header_t zero_zil;
1024 
1025 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1026 	dmu_buf_t *dbuf;
1027 	dsl_dataset_phys_t *dsphys;
1028 	uint64_t dsobj, crtxg;
1029 	objset_t *mos = dp->dp_meta_objset;
1030 	objset_t *os;
1031 
1032 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
1033 
1034 	/*
1035 	 * If we are on an old pool, the zil must not be active, in which
1036 	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
1037 	 */
1038 	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
1039 	    dmu_objset_from_ds(ds, &os) != 0 ||
1040 	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
1041 	    sizeof (zero_zil)) == 0);
1042 
1043 
1044 	/*
1045 	 * The origin's ds_creation_txg has to be < TXG_INITIAL
1046 	 */
1047 	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
1048 		crtxg = 1;
1049 	else
1050 		crtxg = tx->tx_txg;
1051 
1052 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
1053 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
1054 	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
1055 	dmu_buf_will_dirty(dbuf, tx);
1056 	dsphys = dbuf->db_data;
1057 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
1058 	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
1059 	dsphys->ds_fsid_guid = unique_create();
1060 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
1061 	    sizeof (dsphys->ds_guid));
1062 	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
1063 	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
1064 	dsphys->ds_next_snap_obj = ds->ds_object;
1065 	dsphys->ds_num_children = 1;
1066 	dsphys->ds_creation_time = gethrestime_sec();
1067 	dsphys->ds_creation_txg = crtxg;
1068 	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
1069 	dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
1070 	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
1071 	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
1072 	dsphys->ds_flags = ds->ds_phys->ds_flags;
1073 	dsphys->ds_bp = ds->ds_phys->ds_bp;
1074 	dmu_buf_rele(dbuf, FTAG);
1075 
1076 	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
1077 	if (ds->ds_prev) {
1078 		uint64_t next_clones_obj =
1079 		    ds->ds_prev->ds_phys->ds_next_clones_obj;
1080 		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
1081 		    ds->ds_object ||
1082 		    ds->ds_prev->ds_phys->ds_num_children > 1);
1083 		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
1084 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1085 			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1086 			    ds->ds_prev->ds_phys->ds_creation_txg);
1087 			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
1088 		} else if (next_clones_obj != 0) {
1089 			dsl_dataset_remove_from_next_clones(ds->ds_prev,
1090 			    dsphys->ds_next_snap_obj, tx);
1091 			VERIFY0(zap_add_int(mos,
1092 			    next_clones_obj, dsobj, tx));
1093 		}
1094 	}
1095 
1096 	/*
1097 	 * If we have a reference-reservation on this dataset, we will
1098 	 * need to increase the amount of refreservation being charged
1099 	 * since our unique space is going to zero.
1100 	 */
1101 	if (ds->ds_reserved) {
1102 		int64_t delta;
1103 		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
1104 		delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
1105 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
1106 		    delta, 0, 0, tx);
1107 	}
1108 
1109 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1110 	ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
1111 	    UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
1112 	dsl_deadlist_close(&ds->ds_deadlist);
1113 	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1114 	dsl_deadlist_add_key(&ds->ds_deadlist,
1115 	    ds->ds_phys->ds_prev_snap_txg, tx);
1116 
1117 	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
1118 	ds->ds_phys->ds_prev_snap_obj = dsobj;
1119 	ds->ds_phys->ds_prev_snap_txg = crtxg;
1120 	ds->ds_phys->ds_unique_bytes = 0;
1121 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
1122 		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1123 
1124 	VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
1125 	    snapname, 8, 1, &dsobj, tx));
1126 
1127 	if (ds->ds_prev)
1128 		dsl_dataset_rele(ds->ds_prev, ds);
1129 	VERIFY0(dsl_dataset_hold_obj(dp,
1130 	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
1131 
1132 	dsl_scan_ds_snapshotted(ds, tx);
1133 
1134 	dsl_dir_snap_cmtime_update(ds->ds_dir);
1135 
1136 	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
1137 }
1138 
1139 static void
1140 dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
1141 {
1142 	dsl_dataset_snapshot_arg_t *ddsa = arg;
1143 	dsl_pool_t *dp = dmu_tx_pool(tx);
1144 	nvpair_t *pair;
1145 
1146 	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1147 	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1148 		dsl_dataset_t *ds;
1149 		char *name, *atp;
1150 		char dsname[MAXNAMELEN];
1151 
1152 		name = nvpair_name(pair);
1153 		atp = strchr(name, '@');
1154 		(void) strlcpy(dsname, name, atp - name + 1);
1155 		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
1156 
1157 		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
1158 		if (ddsa->ddsa_props != NULL) {
1159 			dsl_props_set_sync_impl(ds->ds_prev,
1160 			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
1161 		}
1162 		dsl_dataset_rele(ds, FTAG);
1163 	}
1164 }
1165 
1166 /*
1167  * The snapshots must all be in the same pool.
1168  * All-or-nothing: if there are any failures, nothing will be modified.
1169  */
1170 int
1171 dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
1172 {
1173 	dsl_dataset_snapshot_arg_t ddsa;
1174 	nvpair_t *pair;
1175 	boolean_t needsuspend;
1176 	int error;
1177 	spa_t *spa;
1178 	char *firstname;
1179 	nvlist_t *suspended = NULL;
1180 
1181 	pair = nvlist_next_nvpair(snaps, NULL);
1182 	if (pair == NULL)
1183 		return (0);
1184 	firstname = nvpair_name(pair);
1185 
1186 	error = spa_open(firstname, &spa, FTAG);
1187 	if (error != 0)
1188 		return (error);
1189 	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1190 	spa_close(spa, FTAG);
1191 
1192 	if (needsuspend) {
1193 		suspended = fnvlist_alloc();
1194 		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1195 		    pair = nvlist_next_nvpair(snaps, pair)) {
1196 			char fsname[MAXNAMELEN];
1197 			char *snapname = nvpair_name(pair);
1198 			char *atp;
1199 			void *cookie;
1200 
1201 			atp = strchr(snapname, '@');
1202 			if (atp == NULL) {
1203 				error = SET_ERROR(EINVAL);
1204 				break;
1205 			}
1206 			(void) strlcpy(fsname, snapname, atp - snapname + 1);
1207 
1208 			error = zil_suspend(fsname, &cookie);
1209 			if (error != 0)
1210 				break;
1211 			fnvlist_add_uint64(suspended, fsname,
1212 			    (uintptr_t)cookie);
1213 		}
1214 	}
1215 
1216 	ddsa.ddsa_snaps = snaps;
1217 	ddsa.ddsa_props = props;
1218 	ddsa.ddsa_errors = errors;
1219 
1220 	if (error == 0) {
1221 		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
1222 		    dsl_dataset_snapshot_sync, &ddsa,
1223 		    fnvlist_num_pairs(snaps) * 3);
1224 	}
1225 
1226 	if (suspended != NULL) {
1227 		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
1228 		    pair = nvlist_next_nvpair(suspended, pair)) {
1229 			zil_resume((void *)(uintptr_t)
1230 			    fnvpair_value_uint64(pair));
1231 		}
1232 		fnvlist_free(suspended);
1233 	}
1234 
1235 	return (error);
1236 }
1237 
1238 typedef struct dsl_dataset_snapshot_tmp_arg {
1239 	const char *ddsta_fsname;
1240 	const char *ddsta_snapname;
1241 	minor_t ddsta_cleanup_minor;
1242 	const char *ddsta_htag;
1243 } dsl_dataset_snapshot_tmp_arg_t;
1244 
1245 static int
1246 dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
1247 {
1248 	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1249 	dsl_pool_t *dp = dmu_tx_pool(tx);
1250 	dsl_dataset_t *ds;
1251 	int error;
1252 
1253 	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
1254 	if (error != 0)
1255 		return (error);
1256 
1257 	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
1258 	    tx, B_FALSE);
1259 	if (error != 0) {
1260 		dsl_dataset_rele(ds, FTAG);
1261 		return (error);
1262 	}
1263 
1264 	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
1265 		dsl_dataset_rele(ds, FTAG);
1266 		return (SET_ERROR(ENOTSUP));
1267 	}
1268 	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
1269 	    B_TRUE, tx);
1270 	if (error != 0) {
1271 		dsl_dataset_rele(ds, FTAG);
1272 		return (error);
1273 	}
1274 
1275 	dsl_dataset_rele(ds, FTAG);
1276 	return (0);
1277 }
1278 
1279 static void
1280 dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
1281 {
1282 	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1283 	dsl_pool_t *dp = dmu_tx_pool(tx);
1284 	dsl_dataset_t *ds;
1285 
1286 	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
1287 
1288 	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
1289 	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
1290 	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
1291 	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
1292 
1293 	dsl_dataset_rele(ds, FTAG);
1294 }
1295 
1296 int
1297 dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
1298     minor_t cleanup_minor, const char *htag)
1299 {
1300 	dsl_dataset_snapshot_tmp_arg_t ddsta;
1301 	int error;
1302 	spa_t *spa;
1303 	boolean_t needsuspend;
1304 	void *cookie;
1305 
1306 	ddsta.ddsta_fsname = fsname;
1307 	ddsta.ddsta_snapname = snapname;
1308 	ddsta.ddsta_cleanup_minor = cleanup_minor;
1309 	ddsta.ddsta_htag = htag;
1310 
1311 	error = spa_open(fsname, &spa, FTAG);
1312 	if (error != 0)
1313 		return (error);
1314 	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1315 	spa_close(spa, FTAG);
1316 
1317 	if (needsuspend) {
1318 		error = zil_suspend(fsname, &cookie);
1319 		if (error != 0)
1320 			return (error);
1321 	}
1322 
1323 	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
1324 	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3);
1325 
1326 	if (needsuspend)
1327 		zil_resume(cookie);
1328 	return (error);
1329 }
1330 
1331 
1332 void
1333 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
1334 {
1335 	ASSERT(dmu_tx_is_syncing(tx));
1336 	ASSERT(ds->ds_objset != NULL);
1337 	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
1338 
1339 	/*
1340 	 * in case we had to change ds_fsid_guid when we opened it,
1341 	 * sync it out now.
1342 	 */
1343 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1344 	ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
1345 
1346 	dmu_objset_sync(ds->ds_objset, zio, tx);
1347 }
1348 
1349 static void
1350 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
1351 {
1352 	uint64_t count = 0;
1353 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1354 	zap_cursor_t zc;
1355 	zap_attribute_t za;
1356 	nvlist_t *propval = fnvlist_alloc();
1357 	nvlist_t *val = fnvlist_alloc();
1358 
1359 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
1360 
1361 	/*
1362 	 * There may be missing entries in ds_next_clones_obj
1363 	 * due to a bug in a previous version of the code.
1364 	 * Only trust it if it has the right number of entries.
1365 	 */
1366 	if (ds->ds_phys->ds_next_clones_obj != 0) {
1367 		ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
1368 		    &count));
1369 	}
1370 	if (count != ds->ds_phys->ds_num_children - 1)
1371 		goto fail;
1372 	for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
1373 	    zap_cursor_retrieve(&zc, &za) == 0;
1374 	    zap_cursor_advance(&zc)) {
1375 		dsl_dataset_t *clone;
1376 		char buf[ZFS_MAXNAMELEN];
1377 		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1378 		    za.za_first_integer, FTAG, &clone));
1379 		dsl_dir_name(clone->ds_dir, buf);
1380 		fnvlist_add_boolean(val, buf);
1381 		dsl_dataset_rele(clone, FTAG);
1382 	}
1383 	zap_cursor_fini(&zc);
1384 	fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
1385 	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
1386 fail:
1387 	nvlist_free(val);
1388 	nvlist_free(propval);
1389 }
1390 
1391 void
1392 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
1393 {
1394 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1395 	uint64_t refd, avail, uobjs, aobjs, ratio;
1396 
1397 	ASSERT(dsl_pool_config_held(dp));
1398 
1399 	ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
1400 	    (ds->ds_phys->ds_uncompressed_bytes * 100 /
1401 	    ds->ds_phys->ds_compressed_bytes);
1402 
1403 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
1404 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
1405 	    ds->ds_phys->ds_uncompressed_bytes);
1406 
1407 	if (dsl_dataset_is_snapshot(ds)) {
1408 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
1409 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
1410 		    ds->ds_phys->ds_unique_bytes);
1411 		get_clones_stat(ds, nv);
1412 	} else {
1413 		dsl_dir_stats(ds->ds_dir, nv);
1414 	}
1415 
1416 	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
1417 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
1418 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
1419 
1420 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
1421 	    ds->ds_phys->ds_creation_time);
1422 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
1423 	    ds->ds_phys->ds_creation_txg);
1424 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
1425 	    ds->ds_quota);
1426 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
1427 	    ds->ds_reserved);
1428 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
1429 	    ds->ds_phys->ds_guid);
1430 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
1431 	    ds->ds_phys->ds_unique_bytes);
1432 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
1433 	    ds->ds_object);
1434 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
1435 	    ds->ds_userrefs);
1436 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
1437 	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
1438 
1439 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
1440 		uint64_t written, comp, uncomp;
1441 		dsl_pool_t *dp = ds->ds_dir->dd_pool;
1442 		dsl_dataset_t *prev;
1443 
1444 		int err = dsl_dataset_hold_obj(dp,
1445 		    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
1446 		if (err == 0) {
1447 			err = dsl_dataset_space_written(prev, ds, &written,
1448 			    &comp, &uncomp);
1449 			dsl_dataset_rele(prev, FTAG);
1450 			if (err == 0) {
1451 				dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
1452 				    written);
1453 			}
1454 		}
1455 	}
1456 }
1457 
1458 void
1459 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
1460 {
1461 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1462 	ASSERT(dsl_pool_config_held(dp));
1463 
1464 	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
1465 	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
1466 	stat->dds_guid = ds->ds_phys->ds_guid;
1467 	stat->dds_origin[0] = '\0';
1468 	if (dsl_dataset_is_snapshot(ds)) {
1469 		stat->dds_is_snapshot = B_TRUE;
1470 		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
1471 	} else {
1472 		stat->dds_is_snapshot = B_FALSE;
1473 		stat->dds_num_clones = 0;
1474 
1475 		if (dsl_dir_is_clone(ds->ds_dir)) {
1476 			dsl_dataset_t *ods;
1477 
1478 			VERIFY0(dsl_dataset_hold_obj(dp,
1479 			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
1480 			dsl_dataset_name(ods, stat->dds_origin);
1481 			dsl_dataset_rele(ods, FTAG);
1482 		}
1483 	}
1484 }
1485 
1486 uint64_t
1487 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
1488 {
1489 	return (ds->ds_fsid_guid);
1490 }
1491 
1492 void
1493 dsl_dataset_space(dsl_dataset_t *ds,
1494     uint64_t *refdbytesp, uint64_t *availbytesp,
1495     uint64_t *usedobjsp, uint64_t *availobjsp)
1496 {
1497 	*refdbytesp = ds->ds_phys->ds_referenced_bytes;
1498 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
1499 	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
1500 		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
1501 	if (ds->ds_quota != 0) {
1502 		/*
1503 		 * Adjust available bytes according to refquota
1504 		 */
1505 		if (*refdbytesp < ds->ds_quota)
1506 			*availbytesp = MIN(*availbytesp,
1507 			    ds->ds_quota - *refdbytesp);
1508 		else
1509 			*availbytesp = 0;
1510 	}
1511 	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
1512 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
1513 }
1514 
1515 boolean_t
1516 dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
1517 {
1518 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1519 
1520 	ASSERT(dsl_pool_config_held(dp));
1521 	if (snap == NULL)
1522 		return (B_FALSE);
1523 	if (ds->ds_phys->ds_bp.blk_birth >
1524 	    snap->ds_phys->ds_creation_txg) {
1525 		objset_t *os, *os_snap;
1526 		/*
1527 		 * It may be that only the ZIL differs, because it was
1528 		 * reset in the head.  Don't count that as being
1529 		 * modified.
1530 		 */
1531 		if (dmu_objset_from_ds(ds, &os) != 0)
1532 			return (B_TRUE);
1533 		if (dmu_objset_from_ds(snap, &os_snap) != 0)
1534 			return (B_TRUE);
1535 		return (bcmp(&os->os_phys->os_meta_dnode,
1536 		    &os_snap->os_phys->os_meta_dnode,
1537 		    sizeof (os->os_phys->os_meta_dnode)) != 0);
1538 	}
1539 	return (B_FALSE);
1540 }
1541 
1542 typedef struct dsl_dataset_rename_snapshot_arg {
1543 	const char *ddrsa_fsname;
1544 	const char *ddrsa_oldsnapname;
1545 	const char *ddrsa_newsnapname;
1546 	boolean_t ddrsa_recursive;
1547 	dmu_tx_t *ddrsa_tx;
1548 } dsl_dataset_rename_snapshot_arg_t;
1549 
1550 /* ARGSUSED */
1551 static int
1552 dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
1553     dsl_dataset_t *hds, void *arg)
1554 {
1555 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1556 	int error;
1557 	uint64_t val;
1558 
1559 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
1560 	if (error != 0) {
1561 		/* ignore nonexistent snapshots */
1562 		return (error == ENOENT ? 0 : error);
1563 	}
1564 
1565 	/* new name should not exist */
1566 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
1567 	if (error == 0)
1568 		error = SET_ERROR(EEXIST);
1569 	else if (error == ENOENT)
1570 		error = 0;
1571 
1572 	/* dataset name + 1 for the "@" + the new snapshot name must fit */
1573 	if (dsl_dir_namelen(hds->ds_dir) + 1 +
1574 	    strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
1575 		error = SET_ERROR(ENAMETOOLONG);
1576 
1577 	return (error);
1578 }
1579 
1580 static int
1581 dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
1582 {
1583 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1584 	dsl_pool_t *dp = dmu_tx_pool(tx);
1585 	dsl_dataset_t *hds;
1586 	int error;
1587 
1588 	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
1589 	if (error != 0)
1590 		return (error);
1591 
1592 	if (ddrsa->ddrsa_recursive) {
1593 		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
1594 		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
1595 		    DS_FIND_CHILDREN);
1596 	} else {
1597 		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
1598 	}
1599 	dsl_dataset_rele(hds, FTAG);
1600 	return (error);
1601 }
1602 
1603 static int
1604 dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
1605     dsl_dataset_t *hds, void *arg)
1606 {
1607 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1608 	dsl_dataset_t *ds;
1609 	uint64_t val;
1610 	dmu_tx_t *tx = ddrsa->ddrsa_tx;
1611 	int error;
1612 
1613 	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
1614 	ASSERT(error == 0 || error == ENOENT);
1615 	if (error == ENOENT) {
1616 		/* ignore nonexistent snapshots */
1617 		return (0);
1618 	}
1619 
1620 	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
1621 
1622 	/* log before we change the name */
1623 	spa_history_log_internal_ds(ds, "rename", tx,
1624 	    "-> @%s", ddrsa->ddrsa_newsnapname);
1625 
1626 	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx));
1627 	mutex_enter(&ds->ds_lock);
1628 	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
1629 	mutex_exit(&ds->ds_lock);
1630 	VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj,
1631 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
1632 
1633 	dsl_dataset_rele(ds, FTAG);
1634 	return (0);
1635 }
1636 
1637 static void
1638 dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
1639 {
1640 	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1641 	dsl_pool_t *dp = dmu_tx_pool(tx);
1642 	dsl_dataset_t *hds;
1643 
1644 	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
1645 	ddrsa->ddrsa_tx = tx;
1646 	if (ddrsa->ddrsa_recursive) {
1647 		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
1648 		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
1649 		    DS_FIND_CHILDREN));
1650 	} else {
1651 		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
1652 	}
1653 	dsl_dataset_rele(hds, FTAG);
1654 }
1655 
1656 int
1657 dsl_dataset_rename_snapshot(const char *fsname,
1658     const char *oldsnapname, const char *newsnapname, boolean_t recursive)
1659 {
1660 	dsl_dataset_rename_snapshot_arg_t ddrsa;
1661 
1662 	ddrsa.ddrsa_fsname = fsname;
1663 	ddrsa.ddrsa_oldsnapname = oldsnapname;
1664 	ddrsa.ddrsa_newsnapname = newsnapname;
1665 	ddrsa.ddrsa_recursive = recursive;
1666 
1667 	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
1668 	    dsl_dataset_rename_snapshot_sync, &ddrsa, 1));
1669 }
1670 
1671 /*
1672  * If we're doing an ownership handoff, we need to make sure that there is
1673  * only one long hold on the dataset.  We're not allowed to change anything here
1674  * so we don't permanently release the long hold or regular hold here.  We want
1675  * to do this only when syncing to avoid the dataset unexpectedly going away
1676  * when we release the long hold.
1677  */
1678 static int
1679 dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
1680 {
1681 	boolean_t held;
1682 
1683 	if (!dmu_tx_is_syncing(tx))
1684 		return (0);
1685 
1686 	if (owner != NULL) {
1687 		VERIFY3P(ds->ds_owner, ==, owner);
1688 		dsl_dataset_long_rele(ds, owner);
1689 	}
1690 
1691 	held = dsl_dataset_long_held(ds);
1692 
1693 	if (owner != NULL)
1694 		dsl_dataset_long_hold(ds, owner);
1695 
1696 	if (held)
1697 		return (SET_ERROR(EBUSY));
1698 
1699 	return (0);
1700 }
1701 
1702 typedef struct dsl_dataset_rollback_arg {
1703 	const char *ddra_fsname;
1704 	void *ddra_owner;
1705 	nvlist_t *ddra_result;
1706 } dsl_dataset_rollback_arg_t;
1707 
1708 static int
1709 dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
1710 {
1711 	dsl_dataset_rollback_arg_t *ddra = arg;
1712 	dsl_pool_t *dp = dmu_tx_pool(tx);
1713 	dsl_dataset_t *ds;
1714 	int64_t unused_refres_delta;
1715 	int error;
1716 
1717 	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
1718 	if (error != 0)
1719 		return (error);
1720 
1721 	/* must not be a snapshot */
1722 	if (dsl_dataset_is_snapshot(ds)) {
1723 		dsl_dataset_rele(ds, FTAG);
1724 		return (SET_ERROR(EINVAL));
1725 	}
1726 
1727 	/* must have a most recent snapshot */
1728 	if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
1729 		dsl_dataset_rele(ds, FTAG);
1730 		return (SET_ERROR(EINVAL));
1731 	}
1732 
1733 	/* must not have any bookmarks after the most recent snapshot */
1734 	nvlist_t *proprequest = fnvlist_alloc();
1735 	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
1736 	nvlist_t *bookmarks = fnvlist_alloc();
1737 	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
1738 	fnvlist_free(proprequest);
1739 	if (error != 0)
1740 		return (error);
1741 	for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
1742 	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
1743 		nvlist_t *valuenv =
1744 		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
1745 		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
1746 		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
1747 		if (createtxg > ds->ds_phys->ds_prev_snap_txg) {
1748 			fnvlist_free(bookmarks);
1749 			dsl_dataset_rele(ds, FTAG);
1750 			return (SET_ERROR(EEXIST));
1751 		}
1752 	}
1753 	fnvlist_free(bookmarks);
1754 
1755 	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
1756 	if (error != 0) {
1757 		dsl_dataset_rele(ds, FTAG);
1758 		return (error);
1759 	}
1760 
1761 	/*
1762 	 * Check if the snap we are rolling back to uses more than
1763 	 * the refquota.
1764 	 */
1765 	if (ds->ds_quota != 0 &&
1766 	    ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) {
1767 		dsl_dataset_rele(ds, FTAG);
1768 		return (SET_ERROR(EDQUOT));
1769 	}
1770 
1771 	/*
1772 	 * When we do the clone swap, we will temporarily use more space
1773 	 * due to the refreservation (the head will no longer have any
1774 	 * unique space, so the entire amount of the refreservation will need
1775 	 * to be free).  We will immediately destroy the clone, freeing
1776 	 * this space, but the freeing happens over many txg's.
1777 	 */
1778 	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
1779 	    ds->ds_phys->ds_unique_bytes);
1780 
1781 	if (unused_refres_delta > 0 &&
1782 	    unused_refres_delta >
1783 	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
1784 		dsl_dataset_rele(ds, FTAG);
1785 		return (SET_ERROR(ENOSPC));
1786 	}
1787 
1788 	dsl_dataset_rele(ds, FTAG);
1789 	return (0);
1790 }
1791 
1792 static void
1793 dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
1794 {
1795 	dsl_dataset_rollback_arg_t *ddra = arg;
1796 	dsl_pool_t *dp = dmu_tx_pool(tx);
1797 	dsl_dataset_t *ds, *clone;
1798 	uint64_t cloneobj;
1799 	char namebuf[ZFS_MAXNAMELEN];
1800 
1801 	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
1802 
1803 	dsl_dataset_name(ds->ds_prev, namebuf);
1804 	fnvlist_add_string(ddra->ddra_result, "target", namebuf);
1805 
1806 	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
1807 	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
1808 
1809 	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
1810 
1811 	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
1812 	dsl_dataset_zero_zil(ds, tx);
1813 
1814 	dsl_destroy_head_sync_impl(clone, tx);
1815 
1816 	dsl_dataset_rele(clone, FTAG);
1817 	dsl_dataset_rele(ds, FTAG);
1818 }
1819 
1820 /*
1821  * Rolls back the given filesystem or volume to the most recent snapshot.
1822  * The name of the most recent snapshot will be returned under key "target"
1823  * in the result nvlist.
1824  *
1825  * If owner != NULL:
1826  * - The existing dataset MUST be owned by the specified owner at entry
1827  * - Upon return, dataset will still be held by the same owner, whether we
1828  *   succeed or not.
1829  *
1830  * This mode is required any time the existing filesystem is mounted.  See
1831  * notes above zfs_suspend_fs() for further details.
1832  */
1833 int
1834 dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
1835 {
1836 	dsl_dataset_rollback_arg_t ddra;
1837 
1838 	ddra.ddra_fsname = fsname;
1839 	ddra.ddra_owner = owner;
1840 	ddra.ddra_result = result;
1841 
1842 	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
1843 	    dsl_dataset_rollback_sync, &ddra, 1));
1844 }
1845 
1846 struct promotenode {
1847 	list_node_t link;
1848 	dsl_dataset_t *ds;
1849 };
1850 
1851 typedef struct dsl_dataset_promote_arg {
1852 	const char *ddpa_clonename;
1853 	dsl_dataset_t *ddpa_clone;
1854 	list_t shared_snaps, origin_snaps, clone_snaps;
1855 	dsl_dataset_t *origin_origin; /* origin of the origin */
1856 	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
1857 	char *err_ds;
1858 } dsl_dataset_promote_arg_t;
1859 
1860 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
1861 static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
1862     void *tag);
1863 static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
1864 
1865 static int
1866 dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
1867 {
1868 	dsl_dataset_promote_arg_t *ddpa = arg;
1869 	dsl_pool_t *dp = dmu_tx_pool(tx);
1870 	dsl_dataset_t *hds;
1871 	struct promotenode *snap;
1872 	dsl_dataset_t *origin_ds;
1873 	int err;
1874 	uint64_t unused;
1875 
1876 	err = promote_hold(ddpa, dp, FTAG);
1877 	if (err != 0)
1878 		return (err);
1879 
1880 	hds = ddpa->ddpa_clone;
1881 
1882 	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
1883 		promote_rele(ddpa, FTAG);
1884 		return (SET_ERROR(EXDEV));
1885 	}
1886 
1887 	/*
1888 	 * Compute and check the amount of space to transfer.  Since this is
1889 	 * so expensive, don't do the preliminary check.
1890 	 */
1891 	if (!dmu_tx_is_syncing(tx)) {
1892 		promote_rele(ddpa, FTAG);
1893 		return (0);
1894 	}
1895 
1896 	snap = list_head(&ddpa->shared_snaps);
1897 	origin_ds = snap->ds;
1898 
1899 	/* compute origin's new unique space */
1900 	snap = list_tail(&ddpa->clone_snaps);
1901 	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
1902 	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
1903 	    origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1904 	    &ddpa->unique, &unused, &unused);
1905 
1906 	/*
1907 	 * Walk the snapshots that we are moving
1908 	 *
1909 	 * Compute space to transfer.  Consider the incremental changes
1910 	 * to used by each snapshot:
1911 	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
1912 	 * So each snapshot gave birth to:
1913 	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
1914 	 * So a sequence would look like:
1915 	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
1916 	 * Which simplifies to:
1917 	 * uN + kN + kN-1 + ... + k1 + k0
1918 	 * Note however, if we stop before we reach the ORIGIN we get:
1919 	 * uN + kN + kN-1 + ... + kM - uM-1
1920 	 */
1921 	ddpa->used = origin_ds->ds_phys->ds_referenced_bytes;
1922 	ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes;
1923 	ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
1924 	for (snap = list_head(&ddpa->shared_snaps); snap;
1925 	    snap = list_next(&ddpa->shared_snaps, snap)) {
1926 		uint64_t val, dlused, dlcomp, dluncomp;
1927 		dsl_dataset_t *ds = snap->ds;
1928 
1929 		/*
1930 		 * If there are long holds, we won't be able to evict
1931 		 * the objset.
1932 		 */
1933 		if (dsl_dataset_long_held(ds)) {
1934 			err = SET_ERROR(EBUSY);
1935 			goto out;
1936 		}
1937 
1938 		/* Check that the snapshot name does not conflict */
1939 		VERIFY0(dsl_dataset_get_snapname(ds));
1940 		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
1941 		if (err == 0) {
1942 			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
1943 			err = SET_ERROR(EEXIST);
1944 			goto out;
1945 		}
1946 		if (err != ENOENT)
1947 			goto out;
1948 
1949 		/* The very first snapshot does not have a deadlist */
1950 		if (ds->ds_phys->ds_prev_snap_obj == 0)
1951 			continue;
1952 
1953 		dsl_deadlist_space(&ds->ds_deadlist,
1954 		    &dlused, &dlcomp, &dluncomp);
1955 		ddpa->used += dlused;
1956 		ddpa->comp += dlcomp;
1957 		ddpa->uncomp += dluncomp;
1958 	}
1959 
1960 	/*
1961 	 * If we are a clone of a clone then we never reached ORIGIN,
1962 	 * so we need to subtract out the clone origin's used space.
1963 	 */
1964 	if (ddpa->origin_origin) {
1965 		ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes;
1966 		ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes;
1967 		ddpa->uncomp -=
1968 		    ddpa->origin_origin->ds_phys->ds_uncompressed_bytes;
1969 	}
1970 
1971 	/* Check that there is enough space here */
1972 	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
1973 	    ddpa->used);
1974 	if (err != 0)
1975 		goto out;
1976 
1977 	/*
1978 	 * Compute the amounts of space that will be used by snapshots
1979 	 * after the promotion (for both origin and clone).  For each,
1980 	 * it is the amount of space that will be on all of their
1981 	 * deadlists (that was not born before their new origin).
1982 	 */
1983 	if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
1984 		uint64_t space;
1985 
1986 		/*
1987 		 * Note, typically this will not be a clone of a clone,
1988 		 * so dd_origin_txg will be < TXG_INITIAL, so
1989 		 * these snaplist_space() -> dsl_deadlist_space_range()
1990 		 * calls will be fast because they do not have to
1991 		 * iterate over all bps.
1992 		 */
1993 		snap = list_head(&ddpa->origin_snaps);
1994 		err = snaplist_space(&ddpa->shared_snaps,
1995 		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
1996 		if (err != 0)
1997 			goto out;
1998 
1999 		err = snaplist_space(&ddpa->clone_snaps,
2000 		    snap->ds->ds_dir->dd_origin_txg, &space);
2001 		if (err != 0)
2002 			goto out;
2003 		ddpa->cloneusedsnap += space;
2004 	}
2005 	if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2006 		err = snaplist_space(&ddpa->origin_snaps,
2007 		    origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap);
2008 		if (err != 0)
2009 			goto out;
2010 	}
2011 
2012 out:
2013 	promote_rele(ddpa, FTAG);
2014 	return (err);
2015 }
2016 
/*
 * Sync function for dsl_dataset_promote().
 *
 * Re-takes the holds with promote_hold(), then:
 *   - re-links the origin snapshot's next-snap / next-clones pointers,
 *   - swaps the origin relationship between the clone's dsl_dir (dd) and
 *     the origin's dsl_dir (odd),
 *   - updates the dd_clones ZAPs,
 *   - moves every snapshot on ddpa->shared_snaps into the clone's dsl_dir,
 *   - transfers the space accounting, logs a history record, and drops
 *     the holds with promote_rele().
 *
 * arg is the dsl_dataset_promote_arg_t set up by dsl_dataset_promote().
 */
static void
dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_promote_arg_t *ddpa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	struct promotenode *snap;
	dsl_dataset_t *origin_ds;
	dsl_dataset_t *origin_head;
	dsl_dir_t *dd;
	dsl_dir_t *odd = NULL;
	uint64_t oldnext_obj;
	int64_t delta;

	VERIFY0(promote_hold(ddpa, dp, FTAG));
	hds = ddpa->ddpa_clone;

	ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE);

	/*
	 * The head of shared_snaps is the clone's origin snapshot
	 * (promote_hold() builds that list starting from dd_origin_obj).
	 */
	snap = list_head(&ddpa->shared_snaps);
	origin_ds = snap->ds;
	dd = hds->ds_dir;

	/*
	 * The head of origin_snaps is the head dataset of the origin's
	 * dsl_dir (promote_hold() starts that list at dd_head_dataset_obj).
	 */
	snap = list_head(&ddpa->origin_snaps);
	origin_head = snap->ds;

	/*
	 * We need to explicitly open odd, since origin_ds's dd will be
	 * changing.
	 */
	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
	    NULL, FTAG, &odd));

	/* change origin's next snap */
	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
	/* the tail of clone_snaps is the dataset directly after origin_ds */
	snap = list_tail(&ddpa->clone_snaps);
	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
	origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;

	/* change the origin's next clone */
	if (origin_ds->ds_phys->ds_next_clones_obj) {
		dsl_dataset_remove_from_next_clones(origin_ds,
		    snap->ds->ds_object, tx);
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    origin_ds->ds_phys->ds_next_clones_obj,
		    oldnext_obj, tx));
	}

	/* change origin */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
	dmu_buf_will_dirty(odd->dd_dbuf, tx);
	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
	origin_head->ds_dir->dd_origin_txg =
	    origin_ds->ds_phys->ds_creation_txg;

	/* change dd_clone entries */
	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
		/*
		 * NOTE(review): origin_origin is dereferenced unconditionally
		 * here, although promote_hold() only sets it when the origin's
		 * dsl_dir has a non-zero dd_origin_obj -- presumably that is
		 * always the case at this pool version; confirm.
		 */
		VERIFY0(zap_remove_int(dp->dp_meta_objset,
		    odd->dd_phys->dd_clones, hds->ds_object, tx));
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
		    hds->ds_object, tx));

		VERIFY0(zap_remove_int(dp->dp_meta_objset,
		    ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
		    origin_head->ds_object, tx));
		/* create the clone dir's dd_clones ZAP on first use */
		if (dd->dd_phys->dd_clones == 0) {
			dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
		}
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dd->dd_phys->dd_clones, origin_head->ds_object, tx));
	}

	/* move snapshots to this dir */
	for (snap = list_head(&ddpa->shared_snaps); snap;
	    snap = list_next(&ddpa->shared_snaps, snap)) {
		dsl_dataset_t *ds = snap->ds;

		/*
		 * Property callbacks are registered to a particular
		 * dsl_dir.  Since ours is changing, evict the objset
		 * so that they will be unregistered from the old dsl_dir.
		 */
		if (ds->ds_objset) {
			dmu_objset_evict(ds->ds_objset);
			ds->ds_objset = NULL;
		}

		/* move snap name entry */
		VERIFY0(dsl_dataset_get_snapname(ds));
		VERIFY0(dsl_dataset_snap_remove(origin_head,
		    ds->ds_snapname, tx));
		VERIFY0(zap_add(dp->dp_meta_objset,
		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
		    8, 1, &ds->ds_object, tx));

		/* change containing dsl_dir */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
		ds->ds_phys->ds_dir_obj = dd->dd_object;
		ASSERT3P(ds->ds_dir, ==, odd);
		/* swap the dataset's own dsl_dir hold from odd to dd */
		dsl_dir_rele(ds->ds_dir, ds);
		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
		    NULL, ds, &ds->ds_dir));

		/* move any clone references */
		if (ds->ds_phys->ds_next_clones_obj &&
		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			zap_cursor_t zc;
			zap_attribute_t za;

			for (zap_cursor_init(&zc, dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj);
			    zap_cursor_retrieve(&zc, &za) == 0;
			    zap_cursor_advance(&zc)) {
				dsl_dataset_t *cnds;
				uint64_t o;

				if (za.za_first_integer == oldnext_obj) {
					/*
					 * We've already moved the
					 * origin's reference.
					 */
					continue;
				}

				VERIFY0(dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &cnds));
				/* dd_clones is keyed by the clone's head dataset */
				o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;

				VERIFY0(zap_remove_int(dp->dp_meta_objset,
				    odd->dd_phys->dd_clones, o, tx));
				VERIFY0(zap_add_int(dp->dp_meta_objset,
				    dd->dd_phys->dd_clones, o, tx));
				dsl_dataset_rele(cnds, FTAG);
			}
			zap_cursor_fini(&zc);
		}

		ASSERT(!dsl_prop_hascb(ds));
	}

	/*
	 * Change space accounting.
	 * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either
	 * both be valid, or both be 0 (resulting in delta == 0).  This
	 * is true for each of {clone,origin} independently.
	 */

	/* the clone's dir gains snapshot space and the transferred "used" */
	delta = ddpa->cloneusedsnap -
	    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, >=, 0);
	ASSERT3U(ddpa->used, >=, delta);
	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(dd, DD_USED_HEAD,
	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);

	/* the origin's dir loses the same amounts */
	delta = ddpa->originusedsnap -
	    odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, <=, 0);
	ASSERT3U(ddpa->used, >=, -delta);
	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(odd, DD_USED_HEAD,
	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);

	origin_ds->ds_phys->ds_unique_bytes = ddpa->unique;

	/* log history record */
	spa_history_log_internal_ds(hds, "promote", tx, "");

	dsl_dir_rele(odd, FTAG);
	promote_rele(ddpa, FTAG);
}
2195 
2196 /*
2197  * Make a list of dsl_dataset_t's for the snapshots between first_obj
2198  * (exclusive) and last_obj (inclusive).  The list will be in reverse
2199  * order (last_obj will be the list_head()).  If first_obj == 0, do all
2200  * snapshots back to this dataset's origin.
2201  */
2202 static int
2203 snaplist_make(dsl_pool_t *dp,
2204     uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
2205 {
2206 	uint64_t obj = last_obj;
2207 
2208 	list_create(l, sizeof (struct promotenode),
2209 	    offsetof(struct promotenode, link));
2210 
2211 	while (obj != first_obj) {
2212 		dsl_dataset_t *ds;
2213 		struct promotenode *snap;
2214 		int err;
2215 
2216 		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
2217 		ASSERT(err != ENOENT);
2218 		if (err != 0)
2219 			return (err);
2220 
2221 		if (first_obj == 0)
2222 			first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
2223 
2224 		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
2225 		snap->ds = ds;
2226 		list_insert_tail(l, snap);
2227 		obj = ds->ds_phys->ds_prev_snap_obj;
2228 	}
2229 
2230 	return (0);
2231 }
2232 
2233 static int
2234 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2235 {
2236 	struct promotenode *snap;
2237 
2238 	*spacep = 0;
2239 	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2240 		uint64_t used, comp, uncomp;
2241 		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2242 		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
2243 		*spacep += used;
2244 	}
2245 	return (0);
2246 }
2247 
2248 static void
2249 snaplist_destroy(list_t *l, void *tag)
2250 {
2251 	struct promotenode *snap;
2252 
2253 	if (l == NULL || !list_link_active(&l->list_head))
2254 		return;
2255 
2256 	while ((snap = list_tail(l)) != NULL) {
2257 		list_remove(l, snap);
2258 		dsl_dataset_rele(snap->ds, tag);
2259 		kmem_free(snap, sizeof (*snap));
2260 	}
2261 	list_destroy(l);
2262 }
2263 
2264 static int
2265 promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
2266 {
2267 	int error;
2268 	dsl_dir_t *dd;
2269 	struct promotenode *snap;
2270 
2271 	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
2272 	    &ddpa->ddpa_clone);
2273 	if (error != 0)
2274 		return (error);
2275 	dd = ddpa->ddpa_clone->ds_dir;
2276 
2277 	if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
2278 	    !dsl_dir_is_clone(dd)) {
2279 		dsl_dataset_rele(ddpa->ddpa_clone, tag);
2280 		return (SET_ERROR(EINVAL));
2281 	}
2282 
2283 	error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj,
2284 	    &ddpa->shared_snaps, tag);
2285 	if (error != 0)
2286 		goto out;
2287 
2288 	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
2289 	    &ddpa->clone_snaps, tag);
2290 	if (error != 0)
2291 		goto out;
2292 
2293 	snap = list_head(&ddpa->shared_snaps);
2294 	ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
2295 	error = snaplist_make(dp, dd->dd_phys->dd_origin_obj,
2296 	    snap->ds->ds_dir->dd_phys->dd_head_dataset_obj,
2297 	    &ddpa->origin_snaps, tag);
2298 	if (error != 0)
2299 		goto out;
2300 
2301 	if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
2302 		error = dsl_dataset_hold_obj(dp,
2303 		    snap->ds->ds_dir->dd_phys->dd_origin_obj,
2304 		    tag, &ddpa->origin_origin);
2305 		if (error != 0)
2306 			goto out;
2307 	}
2308 out:
2309 	if (error != 0)
2310 		promote_rele(ddpa, tag);
2311 	return (error);
2312 }
2313 
2314 static void
2315 promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
2316 {
2317 	snaplist_destroy(&ddpa->shared_snaps, tag);
2318 	snaplist_destroy(&ddpa->clone_snaps, tag);
2319 	snaplist_destroy(&ddpa->origin_snaps, tag);
2320 	if (ddpa->origin_origin != NULL)
2321 		dsl_dataset_rele(ddpa->origin_origin, tag);
2322 	dsl_dataset_rele(ddpa->ddpa_clone, tag);
2323 }
2324 
2325 /*
2326  * Promote a clone.
2327  *
2328  * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
2329  * in with the name.  (It must be at least MAXNAMELEN bytes long.)
2330  */
2331 int
2332 dsl_dataset_promote(const char *name, char *conflsnap)
2333 {
2334 	dsl_dataset_promote_arg_t ddpa = { 0 };
2335 	uint64_t numsnaps;
2336 	int error;
2337 	objset_t *os;
2338 
2339 	/*
2340 	 * We will modify space proportional to the number of
2341 	 * snapshots.  Compute numsnaps.
2342 	 */
2343 	error = dmu_objset_hold(name, FTAG, &os);
2344 	if (error != 0)
2345 		return (error);
2346 	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
2347 	    dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps);
2348 	dmu_objset_rele(os, FTAG);
2349 	if (error != 0)
2350 		return (error);
2351 
2352 	ddpa.ddpa_clonename = name;
2353 	ddpa.err_ds = conflsnap;
2354 
2355 	return (dsl_sync_task(name, dsl_dataset_promote_check,
2356 	    dsl_dataset_promote_sync, &ddpa, 2 + numsnaps));
2357 }
2358 
2359 int
2360 dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
2361     dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
2362 {
2363 	int64_t unused_refres_delta;
2364 
2365 	/* they should both be heads */
2366 	if (dsl_dataset_is_snapshot(clone) ||
2367 	    dsl_dataset_is_snapshot(origin_head))
2368 		return (SET_ERROR(EINVAL));
2369 
2370 	/* if we are not forcing, the branch point should be just before them */
2371 	if (!force && clone->ds_prev != origin_head->ds_prev)
2372 		return (SET_ERROR(EINVAL));
2373 
2374 	/* clone should be the clone (unless they are unrelated) */
2375 	if (clone->ds_prev != NULL &&
2376 	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
2377 	    origin_head->ds_dir != clone->ds_prev->ds_dir)
2378 		return (SET_ERROR(EINVAL));
2379 
2380 	/* the clone should be a child of the origin */
2381 	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
2382 		return (SET_ERROR(EINVAL));
2383 
2384 	/* origin_head shouldn't be modified unless 'force' */
2385 	if (!force &&
2386 	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
2387 		return (SET_ERROR(ETXTBSY));
2388 
2389 	/* origin_head should have no long holds (e.g. is not mounted) */
2390 	if (dsl_dataset_handoff_check(origin_head, owner, tx))
2391 		return (SET_ERROR(EBUSY));
2392 
2393 	/* check amount of any unconsumed refreservation */
2394 	unused_refres_delta =
2395 	    (int64_t)MIN(origin_head->ds_reserved,
2396 	    origin_head->ds_phys->ds_unique_bytes) -
2397 	    (int64_t)MIN(origin_head->ds_reserved,
2398 	    clone->ds_phys->ds_unique_bytes);
2399 
2400 	if (unused_refres_delta > 0 &&
2401 	    unused_refres_delta >
2402 	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
2403 		return (SET_ERROR(ENOSPC));
2404 
2405 	/* clone can't be over the head's refquota */
2406 	if (origin_head->ds_quota != 0 &&
2407 	    clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota)
2408 		return (SET_ERROR(EDQUOT));
2409 
2410 	return (0);
2411 }
2412 
/*
 * Swap the contents of 'clone' and 'origin_head' in syncing context.
 *
 * In order: dirties both dataset buffers, evicts any cached objsets,
 * recomputes the origin snapshot's unique bytes, swaps the root block
 * pointers, adjusts the dsl_dir space accounting, swaps the ds_*_bytes
 * counters, applies the refreservation delta, swaps the deadlists,
 * notifies the scan code, and logs a history record.
 *
 * The asserts below require that clone has no refreservation and that
 * both datasets share the same previous snapshot.
 */
void
dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
    dsl_dataset_t *origin_head, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int64_t unused_refres_delta;

	ASSERT(clone->ds_reserved == 0);
	ASSERT(origin_head->ds_quota == 0 ||
	    clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota);
	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);

	dmu_buf_will_dirty(clone->ds_dbuf, tx);
	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);

	/* drop cached objsets; the block pointers under them are swapped */
	if (clone->ds_objset != NULL) {
		dmu_objset_evict(clone->ds_objset);
		clone->ds_objset = NULL;
	}

	if (origin_head->ds_objset != NULL) {
		dmu_objset_evict(origin_head->ds_objset);
		origin_head->ds_objset = NULL;
	}

	/*
	 * Computed before the ds_unique_bytes values are swapped below;
	 * applied to DD_USED_REFRSRV after the swap.
	 */
	unused_refres_delta =
	    (int64_t)MIN(origin_head->ds_reserved,
	    origin_head->ds_phys->ds_unique_bytes) -
	    (int64_t)MIN(origin_head->ds_reserved,
	    clone->ds_phys->ds_unique_bytes);

	/*
	 * Reset origin's unique bytes, if it exists.
	 */
	if (clone->ds_prev) {
		dsl_dataset_t *origin = clone->ds_prev;
		uint64_t comp, uncomp;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		/* writes the "used" result straight into ds_unique_bytes */
		dsl_deadlist_space_range(&clone->ds_deadlist,
		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
		    &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
	}

	/* swap blkptrs */
	{
		blkptr_t tmp;
		tmp = origin_head->ds_phys->ds_bp;
		origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp;
		clone->ds_phys->ds_bp = tmp;
	}

	/* set dd_*_bytes */
	{
		int64_t dused, dcomp, duncomp;
		uint64_t cdl_used, cdl_comp, cdl_uncomp;
		uint64_t odl_used, odl_comp, odl_uncomp;

		ASSERT3U(clone->ds_dir->dd_phys->
		    dd_used_breakdown[DD_USED_SNAP], ==, 0);

		dsl_deadlist_space(&clone->ds_deadlist,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space(&origin_head->ds_deadlist,
		    &odl_used, &odl_comp, &odl_uncomp);

		/*
		 * Per-dataset total is referenced bytes plus deadlist
		 * space; the deltas are clone's total minus origin_head's.
		 */
		dused = clone->ds_phys->ds_referenced_bytes + cdl_used -
		    (origin_head->ds_phys->ds_referenced_bytes + odl_used);
		dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp -
		    (origin_head->ds_phys->ds_compressed_bytes + odl_comp);
		duncomp = clone->ds_phys->ds_uncompressed_bytes +
		    cdl_uncomp -
		    (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp);

		/* equal and opposite adjustments to the two dsl_dirs */
		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
		    dused, dcomp, duncomp, tx);
		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
		    -dused, -dcomp, -duncomp, tx);

		/*
		 * The difference in the space used by snapshots is the
		 * difference in snapshot space due to the head's
		 * deadlist (since that's the only thing that's
		 * changing that affects the snapused).
		 */
		dsl_deadlist_space_range(&clone->ds_deadlist,
		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space_range(&origin_head->ds_deadlist,
		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
		    &odl_used, &odl_comp, &odl_uncomp);
		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
		    DD_USED_HEAD, DD_USED_SNAP, tx);
	}

	/* swap ds_*_bytes */
	SWITCH64(origin_head->ds_phys->ds_referenced_bytes,
	    clone->ds_phys->ds_referenced_bytes);
	SWITCH64(origin_head->ds_phys->ds_compressed_bytes,
	    clone->ds_phys->ds_compressed_bytes);
	SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes,
	    clone->ds_phys->ds_uncompressed_bytes);
	SWITCH64(origin_head->ds_phys->ds_unique_bytes,
	    clone->ds_phys->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
	    unused_refres_delta, 0, 0, tx);

	/*
	 * Swap deadlists.  Both are closed first and reopened afterwards
	 * so each in-core deadlist refers to its new on-disk object.
	 */
	dsl_deadlist_close(&clone->ds_deadlist);
	dsl_deadlist_close(&origin_head->ds_deadlist);
	SWITCH64(origin_head->ds_phys->ds_deadlist_obj,
	    clone->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
	    clone->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
	    origin_head->ds_phys->ds_deadlist_obj);

	dsl_scan_ds_clone_swapped(origin_head, clone, tx);

	spa_history_log_internal_ds(clone, "clone swap", tx,
	    "parent=%s", origin_head->ds_dir->dd_myname);
}
2539 
2540 /*
2541  * Given a pool name and a dataset object number in that pool,
2542  * return the name of that dataset.
2543  */
2544 int
2545 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
2546 {
2547 	dsl_pool_t *dp;
2548 	dsl_dataset_t *ds;
2549 	int error;
2550 
2551 	error = dsl_pool_hold(pname, FTAG, &dp);
2552 	if (error != 0)
2553 		return (error);
2554 
2555 	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
2556 	if (error == 0) {
2557 		dsl_dataset_name(ds, buf);
2558 		dsl_dataset_rele(ds, FTAG);
2559 	}
2560 	dsl_pool_rele(dp, FTAG);
2561 
2562 	return (error);
2563 }
2564 
2565 int
2566 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
2567     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
2568 {
2569 	int error = 0;
2570 
2571 	ASSERT3S(asize, >, 0);
2572 
2573 	/*
2574 	 * *ref_rsrv is the portion of asize that will come from any
2575 	 * unconsumed refreservation space.
2576 	 */
2577 	*ref_rsrv = 0;
2578 
2579 	mutex_enter(&ds->ds_lock);
2580 	/*
2581 	 * Make a space adjustment for reserved bytes.
2582 	 */
2583 	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
2584 		ASSERT3U(*used, >=,
2585 		    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
2586 		*used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
2587 		*ref_rsrv =
2588 		    asize - MIN(asize, parent_delta(ds, asize + inflight));
2589 	}
2590 
2591 	if (!check_quota || ds->ds_quota == 0) {
2592 		mutex_exit(&ds->ds_lock);
2593 		return (0);
2594 	}
2595 	/*
2596 	 * If they are requesting more space, and our current estimate
2597 	 * is over quota, they get to try again unless the actual
2598 	 * on-disk is over quota and there are no pending changes (which
2599 	 * may free up space for us).
2600 	 */
2601 	if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
2602 		if (inflight > 0 ||
2603 		    ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
2604 			error = SET_ERROR(ERESTART);
2605 		else
2606 			error = SET_ERROR(EDQUOT);
2607 	}
2608 	mutex_exit(&ds->ds_lock);
2609 
2610 	return (error);
2611 }
2612 
/*
 * Argument bundle shared by the refquota and refreservation sync tasks
 * (the *_check and *_sync functions below).
 */
typedef struct dsl_dataset_set_qr_arg {
	const char *ddsqra_name;	/* dataset name, held by name */
	zprop_source_t ddsqra_source;	/* source passed to the prop code */
	uint64_t ddsqra_value;		/* requested property value */
} dsl_dataset_set_qr_arg_t;
2618 
2619 
2620 /* ARGSUSED */
2621 static int
2622 dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
2623 {
2624 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
2625 	dsl_pool_t *dp = dmu_tx_pool(tx);
2626 	dsl_dataset_t *ds;
2627 	int error;
2628 	uint64_t newval;
2629 
2630 	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
2631 		return (SET_ERROR(ENOTSUP));
2632 
2633 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
2634 	if (error != 0)
2635 		return (error);
2636 
2637 	if (dsl_dataset_is_snapshot(ds)) {
2638 		dsl_dataset_rele(ds, FTAG);
2639 		return (SET_ERROR(EINVAL));
2640 	}
2641 
2642 	error = dsl_prop_predict(ds->ds_dir,
2643 	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
2644 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
2645 	if (error != 0) {
2646 		dsl_dataset_rele(ds, FTAG);
2647 		return (error);
2648 	}
2649 
2650 	if (newval == 0) {
2651 		dsl_dataset_rele(ds, FTAG);
2652 		return (0);
2653 	}
2654 
2655 	if (newval < ds->ds_phys->ds_referenced_bytes ||
2656 	    newval < ds->ds_reserved) {
2657 		dsl_dataset_rele(ds, FTAG);
2658 		return (SET_ERROR(ENOSPC));
2659 	}
2660 
2661 	dsl_dataset_rele(ds, FTAG);
2662 	return (0);
2663 }
2664 
2665 static void
2666 dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
2667 {
2668 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
2669 	dsl_pool_t *dp = dmu_tx_pool(tx);
2670 	dsl_dataset_t *ds;
2671 	uint64_t newval;
2672 
2673 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
2674 
2675 	dsl_prop_set_sync_impl(ds,
2676 	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
2677 	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
2678 	    &ddsqra->ddsqra_value, tx);
2679 
2680 	VERIFY0(dsl_prop_get_int_ds(ds,
2681 	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
2682 
2683 	if (ds->ds_quota != newval) {
2684 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
2685 		ds->ds_quota = newval;
2686 	}
2687 	dsl_dataset_rele(ds, FTAG);
2688 }
2689 
2690 int
2691 dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
2692     uint64_t refquota)
2693 {
2694 	dsl_dataset_set_qr_arg_t ddsqra;
2695 
2696 	ddsqra.ddsqra_name = dsname;
2697 	ddsqra.ddsqra_source = source;
2698 	ddsqra.ddsqra_value = refquota;
2699 
2700 	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
2701 	    dsl_dataset_set_refquota_sync, &ddsqra, 0));
2702 }
2703 
2704 static int
2705 dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
2706 {
2707 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
2708 	dsl_pool_t *dp = dmu_tx_pool(tx);
2709 	dsl_dataset_t *ds;
2710 	int error;
2711 	uint64_t newval, unique;
2712 
2713 	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
2714 		return (SET_ERROR(ENOTSUP));
2715 
2716 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
2717 	if (error != 0)
2718 		return (error);
2719 
2720 	if (dsl_dataset_is_snapshot(ds)) {
2721 		dsl_dataset_rele(ds, FTAG);
2722 		return (SET_ERROR(EINVAL));
2723 	}
2724 
2725 	error = dsl_prop_predict(ds->ds_dir,
2726 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
2727 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
2728 	if (error != 0) {
2729 		dsl_dataset_rele(ds, FTAG);
2730 		return (error);
2731 	}
2732 
2733 	/*
2734 	 * If we are doing the preliminary check in open context, the
2735 	 * space estimates may be inaccurate.
2736 	 */
2737 	if (!dmu_tx_is_syncing(tx)) {
2738 		dsl_dataset_rele(ds, FTAG);
2739 		return (0);
2740 	}
2741 
2742 	mutex_enter(&ds->ds_lock);
2743 	if (!DS_UNIQUE_IS_ACCURATE(ds))
2744 		dsl_dataset_recalc_head_uniq(ds);
2745 	unique = ds->ds_phys->ds_unique_bytes;
2746 	mutex_exit(&ds->ds_lock);
2747 
2748 	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
2749 		uint64_t delta = MAX(unique, newval) -
2750 		    MAX(unique, ds->ds_reserved);
2751 
2752 		if (delta >
2753 		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
2754 		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
2755 			dsl_dataset_rele(ds, FTAG);
2756 			return (SET_ERROR(ENOSPC));
2757 		}
2758 	}
2759 
2760 	dsl_dataset_rele(ds, FTAG);
2761 	return (0);
2762 }
2763 
2764 void
2765 dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
2766     zprop_source_t source, uint64_t value, dmu_tx_t *tx)
2767 {
2768 	uint64_t newval;
2769 	uint64_t unique;
2770 	int64_t delta;
2771 
2772 	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
2773 	    source, sizeof (value), 1, &value, tx);
2774 
2775 	VERIFY0(dsl_prop_get_int_ds(ds,
2776 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
2777 
2778 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
2779 	mutex_enter(&ds->ds_dir->dd_lock);
2780 	mutex_enter(&ds->ds_lock);
2781 	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2782 	unique = ds->ds_phys->ds_unique_bytes;
2783 	delta = MAX(0, (int64_t)(newval - unique)) -
2784 	    MAX(0, (int64_t)(ds->ds_reserved - unique));
2785 	ds->ds_reserved = newval;
2786 	mutex_exit(&ds->ds_lock);
2787 
2788 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
2789 	mutex_exit(&ds->ds_dir->dd_lock);
2790 }
2791 
2792 static void
2793 dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
2794 {
2795 	dsl_dataset_set_qr_arg_t *ddsqra = arg;
2796 	dsl_pool_t *dp = dmu_tx_pool(tx);
2797 	dsl_dataset_t *ds;
2798 
2799 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
2800 	dsl_dataset_set_refreservation_sync_impl(ds,
2801 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
2802 	dsl_dataset_rele(ds, FTAG);
2803 }
2804 
2805 int
2806 dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
2807     uint64_t refreservation)
2808 {
2809 	dsl_dataset_set_qr_arg_t ddsqra;
2810 
2811 	ddsqra.ddsqra_name = dsname;
2812 	ddsqra.ddsqra_source = source;
2813 	ddsqra.ddsqra_value = refreservation;
2814 
2815 	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
2816 	    dsl_dataset_set_refreservation_sync, &ddsqra, 0));
2817 }
2818 
2819 /*
2820  * Return (in *usedp) the amount of space written in new that is not
2821  * present in oldsnap.  New may be a snapshot or the head.  Old must be
2822  * a snapshot before new, in new's filesystem (or its origin).  If not then
2823  * fail and return EINVAL.
2824  *
2825  * The written space is calculated by considering two components:  First, we
2826  * ignore any freed space, and calculate the written as new's used space
2827  * minus old's used space.  Next, we add in the amount of space that was freed
2828  * between the two snapshots, thus reducing new's used space relative to old's.
2829  * Specifically, this is the space that was born before old->ds_creation_txg,
2830  * and freed before new (ie. on new's deadlist or a previous deadlist).
2831  *
2832  * space freed                         [---------------------]
2833  * snapshots                       ---O-------O--------O-------O------
2834  *                                         oldsnap            new
2835  */
2836 int
2837 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
2838     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
2839 {
2840 	int err = 0;
2841 	uint64_t snapobj;
2842 	dsl_pool_t *dp = new->ds_dir->dd_pool;
2843 
2844 	ASSERT(dsl_pool_config_held(dp));
2845 
2846 	*usedp = 0;
2847 	*usedp += new->ds_phys->ds_referenced_bytes;
2848 	*usedp -= oldsnap->ds_phys->ds_referenced_bytes;
2849 
2850 	*compp = 0;
2851 	*compp += new->ds_phys->ds_compressed_bytes;
2852 	*compp -= oldsnap->ds_phys->ds_compressed_bytes;
2853 
2854 	*uncompp = 0;
2855 	*uncompp += new->ds_phys->ds_uncompressed_bytes;
2856 	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
2857 
2858 	snapobj = new->ds_object;
2859 	while (snapobj != oldsnap->ds_object) {
2860 		dsl_dataset_t *snap;
2861 		uint64_t used, comp, uncomp;
2862 
2863 		if (snapobj == new->ds_object) {
2864 			snap = new;
2865 		} else {
2866 			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
2867 			if (err != 0)
2868 				break;
2869 		}
2870 
2871 		if (snap->ds_phys->ds_prev_snap_txg ==
2872 		    oldsnap->ds_phys->ds_creation_txg) {
2873 			/*
2874 			 * The blocks in the deadlist can not be born after
2875 			 * ds_prev_snap_txg, so get the whole deadlist space,
2876 			 * which is more efficient (especially for old-format
2877 			 * deadlists).  Unfortunately the deadlist code
2878 			 * doesn't have enough information to make this
2879 			 * optimization itself.
2880 			 */
2881 			dsl_deadlist_space(&snap->ds_deadlist,
2882 			    &used, &comp, &uncomp);
2883 		} else {
2884 			dsl_deadlist_space_range(&snap->ds_deadlist,
2885 			    0, oldsnap->ds_phys->ds_creation_txg,
2886 			    &used, &comp, &uncomp);
2887 		}
2888 		*usedp += used;
2889 		*compp += comp;
2890 		*uncompp += uncomp;
2891 
2892 		/*
2893 		 * If we get to the beginning of the chain of snapshots
2894 		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
2895 		 * was not a snapshot of/before new.
2896 		 */
2897 		snapobj = snap->ds_phys->ds_prev_snap_obj;
2898 		if (snap != new)
2899 			dsl_dataset_rele(snap, FTAG);
2900 		if (snapobj == 0) {
2901 			err = SET_ERROR(EINVAL);
2902 			break;
2903 		}
2904 
2905 	}
2906 	return (err);
2907 }
2908 
2909 /*
2910  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
2911  * lastsnap, and all snapshots in between are deleted.
2912  *
2913  * blocks that would be freed            [---------------------------]
2914  * snapshots                       ---O-------O--------O-------O--------O
2915  *                                        firstsnap        lastsnap
2916  *
2917  * This is the set of blocks that were born after the snap before firstsnap,
2918  * (birth > firstsnap->prev_snap_txg) and died before the snap after the
2919  * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
2920  * We calculate this by iterating over the relevant deadlists (from the snap
2921  * after lastsnap, backward to the snap after firstsnap), summing up the
2922  * space on the deadlist that was born after the snap before firstsnap.
2923  */
2924 int
2925 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
2926     dsl_dataset_t *lastsnap,
2927     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
2928 {
2929 	int err = 0;
2930 	uint64_t snapobj;
2931 	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
2932 
2933 	ASSERT(dsl_dataset_is_snapshot(firstsnap));
2934 	ASSERT(dsl_dataset_is_snapshot(lastsnap));
2935 
2936 	/*
2937 	 * Check that the snapshots are in the same dsl_dir, and firstsnap
2938 	 * is before lastsnap.
2939 	 */
2940 	if (firstsnap->ds_dir != lastsnap->ds_dir ||
2941 	    firstsnap->ds_phys->ds_creation_txg >
2942 	    lastsnap->ds_phys->ds_creation_txg)
2943 		return (SET_ERROR(EINVAL));
2944 
2945 	*usedp = *compp = *uncompp = 0;
2946 
2947 	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
2948 	while (snapobj != firstsnap->ds_object) {
2949 		dsl_dataset_t *ds;
2950 		uint64_t used, comp, uncomp;
2951 
2952 		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
2953 		if (err != 0)
2954 			break;
2955 
2956 		dsl_deadlist_space_range(&ds->ds_deadlist,
2957 		    firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
2958 		    &used, &comp, &uncomp);
2959 		*usedp += used;
2960 		*compp += comp;
2961 		*uncompp += uncomp;
2962 
2963 		snapobj = ds->ds_phys->ds_prev_snap_obj;
2964 		ASSERT3U(snapobj, !=, 0);
2965 		dsl_dataset_rele(ds, FTAG);
2966 	}
2967 	return (err);
2968 }
2969 
2970 /*
2971  * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
2972  * For example, they could both be snapshots of the same filesystem, and
2973  * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
2974  * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
2975  * filesystem.  Or 'earlier' could be the origin's origin.
2976  *
2977  * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
2978  */
2979 boolean_t
2980 dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
2981 	uint64_t earlier_txg)
2982 {
2983 	dsl_pool_t *dp = later->ds_dir->dd_pool;
2984 	int error;
2985 	boolean_t ret;
2986 
2987 	ASSERT(dsl_pool_config_held(dp));
2988 	ASSERT(dsl_dataset_is_snapshot(earlier) || earlier_txg != 0);
2989 
2990 	if (earlier_txg == 0)
2991 		earlier_txg = earlier->ds_phys->ds_creation_txg;
2992 
2993 	if (dsl_dataset_is_snapshot(later) &&
2994 	    earlier_txg >= later->ds_phys->ds_creation_txg)
2995 		return (B_FALSE);
2996 
2997 	if (later->ds_dir == earlier->ds_dir)
2998 		return (B_TRUE);
2999 	if (!dsl_dir_is_clone(later->ds_dir))
3000 		return (B_FALSE);
3001 
3002 	if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object)
3003 		return (B_TRUE);
3004 	dsl_dataset_t *origin;
3005 	error = dsl_dataset_hold_obj(dp,
3006 	    later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
3007 	if (error != 0)
3008 		return (B_FALSE);
3009 	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
3010 	dsl_dataset_rele(origin, FTAG);
3011 	return (ret);
3012 }
3013 
3014 
3015 void
3016 dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
3017 {
3018 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3019 	dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
3020 }
3021