xref: /illumos-gate/usr/src/uts/common/fs/zfs/dsl_destroy.c (revision 985cc36c07a787e0cb720fcf2fab565aa2a77590)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright (c) 2013 Steven Hartland. All rights reserved.
25  * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/dsl_userhold.h>
31 #include <sys/dsl_dataset.h>
32 #include <sys/dsl_synctask.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/dsl_pool.h>
35 #include <sys/dsl_dir.h>
36 #include <sys/dmu_traverse.h>
37 #include <sys/dsl_scan.h>
38 #include <sys/dmu_objset.h>
39 #include <sys/zap.h>
40 #include <sys/zfeature.h>
41 #include <sys/zfs_ioctl.h>
42 #include <sys/dsl_deleg.h>
43 #include <sys/dmu_impl.h>
44 
/*
 * Argument bundle passed to dsl_destroy_snapshot_check() and
 * dsl_destroy_snapshot_sync() by dsl_destroy_snapshots_nvl().
 */
typedef struct dmu_snapshots_destroy_arg {
	/* in: nvlist whose pair names are the snapshots to destroy */
	nvlist_t *dsda_snaps;
	/* filled by the check func: names that passed; consumed by sync */
	nvlist_t *dsda_successful_snaps;
	/* in: request deferred-destroy semantics */
	boolean_t dsda_defer;
	/* out: snapshot name -> int32 error for each snapshot that failed */
	nvlist_t *dsda_errlist;
} dmu_snapshots_destroy_arg_t;
51 
52 int
53 dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
54 {
55 	if (!ds->ds_is_snapshot)
56 		return (SET_ERROR(EINVAL));
57 
58 	if (dsl_dataset_long_held(ds))
59 		return (SET_ERROR(EBUSY));
60 
61 	/*
62 	 * Only allow deferred destroy on pools that support it.
63 	 * NOTE: deferred destroy is only supported on snapshots.
64 	 */
65 	if (defer) {
66 		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
67 		    SPA_VERSION_USERREFS)
68 			return (SET_ERROR(ENOTSUP));
69 		return (0);
70 	}
71 
72 	/*
73 	 * If this snapshot has an elevated user reference count,
74 	 * we can't destroy it yet.
75 	 */
76 	if (ds->ds_userrefs > 0)
77 		return (SET_ERROR(EBUSY));
78 
79 	/*
80 	 * Can't delete a branch point.
81 	 */
82 	if (dsl_dataset_phys(ds)->ds_num_children > 1)
83 		return (SET_ERROR(EEXIST));
84 
85 	return (0);
86 }
87 
88 static int
89 dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
90 {
91 	dmu_snapshots_destroy_arg_t *dsda = arg;
92 	dsl_pool_t *dp = dmu_tx_pool(tx);
93 	nvpair_t *pair;
94 	int error = 0;
95 
96 	if (!dmu_tx_is_syncing(tx))
97 		return (0);
98 
99 	for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL);
100 	    pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) {
101 		dsl_dataset_t *ds;
102 
103 		error = dsl_dataset_hold(dp, nvpair_name(pair),
104 		    FTAG, &ds);
105 
106 		/*
107 		 * If the snapshot does not exist, silently ignore it
108 		 * (it's "already destroyed").
109 		 */
110 		if (error == ENOENT)
111 			continue;
112 
113 		if (error == 0) {
114 			error = dsl_destroy_snapshot_check_impl(ds,
115 			    dsda->dsda_defer);
116 			dsl_dataset_rele(ds, FTAG);
117 		}
118 
119 		if (error == 0) {
120 			fnvlist_add_boolean(dsda->dsda_successful_snaps,
121 			    nvpair_name(pair));
122 		} else {
123 			fnvlist_add_int32(dsda->dsda_errlist,
124 			    nvpair_name(pair), error);
125 		}
126 	}
127 
128 	pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL);
129 	if (pair != NULL)
130 		return (fnvpair_value_int32(pair));
131 
132 	return (0);
133 }
134 
135 struct process_old_arg {
136 	dsl_dataset_t *ds;
137 	dsl_dataset_t *ds_prev;
138 	boolean_t after_branch_point;
139 	zio_t *pio;
140 	uint64_t used, comp, uncomp;
141 };
142 
143 static int
144 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
145 {
146 	struct process_old_arg *poa = arg;
147 	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
148 
149 	ASSERT(!BP_IS_HOLE(bp));
150 
151 	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
152 		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
153 		if (poa->ds_prev && !poa->after_branch_point &&
154 		    bp->blk_birth >
155 		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
156 			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
157 			    bp_get_dsize_sync(dp->dp_spa, bp);
158 		}
159 	} else {
160 		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
161 		poa->comp += BP_GET_PSIZE(bp);
162 		poa->uncomp += BP_GET_UCSIZE(bp);
163 		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
164 	}
165 	return (0);
166 }
167 
168 static void
169 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
170     dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
171 {
172 	struct process_old_arg poa = { 0 };
173 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
174 	objset_t *mos = dp->dp_meta_objset;
175 	uint64_t deadlist_obj;
176 
177 	ASSERT(ds->ds_deadlist.dl_oldfmt);
178 	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
179 
180 	poa.ds = ds;
181 	poa.ds_prev = ds_prev;
182 	poa.after_branch_point = after_branch_point;
183 	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
184 	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
185 	    process_old_cb, &poa, tx));
186 	VERIFY0(zio_wait(poa.pio));
187 	ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
188 
189 	/* change snapused */
190 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
191 	    -poa.used, -poa.comp, -poa.uncomp, tx);
192 
193 	/* swap next's deadlist to our deadlist */
194 	dsl_deadlist_close(&ds->ds_deadlist);
195 	dsl_deadlist_close(&ds_next->ds_deadlist);
196 	deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
197 	dsl_dataset_phys(ds)->ds_deadlist_obj =
198 	    dsl_dataset_phys(ds_next)->ds_deadlist_obj;
199 	dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
200 	dsl_deadlist_open(&ds->ds_deadlist, mos,
201 	    dsl_dataset_phys(ds)->ds_deadlist_obj);
202 	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
203 	    dsl_dataset_phys(ds_next)->ds_deadlist_obj);
204 }
205 
206 static void
207 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
208 {
209 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
210 	zap_cursor_t zc;
211 	zap_attribute_t za;
212 
213 	/*
214 	 * If it is the old version, dd_clones doesn't exist so we can't
215 	 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
216 	 * doesn't matter.
217 	 */
218 	if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
219 		return;
220 
221 	for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
222 	    zap_cursor_retrieve(&zc, &za) == 0;
223 	    zap_cursor_advance(&zc)) {
224 		dsl_dataset_t *clone;
225 
226 		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
227 		    za.za_first_integer, FTAG, &clone));
228 		if (clone->ds_dir->dd_origin_txg > mintxg) {
229 			dsl_deadlist_remove_key(&clone->ds_deadlist,
230 			    mintxg, tx);
231 			dsl_dataset_remove_clones_key(clone, mintxg, tx);
232 		}
233 		dsl_dataset_rele(clone, FTAG);
234 	}
235 	zap_cursor_fini(&zc);
236 }
237 
238 void
239 dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
240 {
241 	int err;
242 	int after_branch_point = FALSE;
243 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
244 	objset_t *mos = dp->dp_meta_objset;
245 	dsl_dataset_t *ds_prev = NULL;
246 	uint64_t obj;
247 
248 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
249 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
250 	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
251 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
252 	ASSERT(refcount_is_zero(&ds->ds_longholds));
253 
254 	if (defer &&
255 	    (ds->ds_userrefs > 0 ||
256 	    dsl_dataset_phys(ds)->ds_num_children > 1)) {
257 		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
258 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
259 		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
260 		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
261 		return;
262 	}
263 
264 	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
265 
266 	/* We need to log before removing it from the namespace. */
267 	spa_history_log_internal_ds(ds, "destroy", tx, "");
268 
269 	dsl_scan_ds_destroyed(ds, tx);
270 
271 	obj = ds->ds_object;
272 
273 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
274 		if (ds->ds_feature_inuse[f]) {
275 			dsl_dataset_deactivate_feature(obj, f, tx);
276 			ds->ds_feature_inuse[f] = B_FALSE;
277 		}
278 	}
279 	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
280 		ASSERT3P(ds->ds_prev, ==, NULL);
281 		VERIFY0(dsl_dataset_hold_obj(dp,
282 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
283 		after_branch_point =
284 		    (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
285 
286 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
287 		if (after_branch_point &&
288 		    dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
289 			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
290 			if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
291 				VERIFY0(zap_add_int(mos,
292 				    dsl_dataset_phys(ds_prev)->
293 				    ds_next_clones_obj,
294 				    dsl_dataset_phys(ds)->ds_next_snap_obj,
295 				    tx));
296 			}
297 		}
298 		if (!after_branch_point) {
299 			dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
300 			    dsl_dataset_phys(ds)->ds_next_snap_obj;
301 		}
302 	}
303 
304 	dsl_dataset_t *ds_next;
305 	uint64_t old_unique;
306 	uint64_t used = 0, comp = 0, uncomp = 0;
307 
308 	VERIFY0(dsl_dataset_hold_obj(dp,
309 	    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
310 	ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
311 
312 	old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
313 
314 	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
315 	dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
316 	    dsl_dataset_phys(ds)->ds_prev_snap_obj;
317 	dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
318 	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
319 	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
320 	    ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
321 
322 	if (ds_next->ds_deadlist.dl_oldfmt) {
323 		process_old_deadlist(ds, ds_prev, ds_next,
324 		    after_branch_point, tx);
325 	} else {
326 		/* Adjust prev's unique space. */
327 		if (ds_prev && !after_branch_point) {
328 			dsl_deadlist_space_range(&ds_next->ds_deadlist,
329 			    dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
330 			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
331 			    &used, &comp, &uncomp);
332 			dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
333 		}
334 
335 		/* Adjust snapused. */
336 		dsl_deadlist_space_range(&ds_next->ds_deadlist,
337 		    dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
338 		    &used, &comp, &uncomp);
339 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
340 		    -used, -comp, -uncomp, tx);
341 
342 		/* Move blocks to be freed to pool's free list. */
343 		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
344 		    &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
345 		    tx);
346 		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
347 		    DD_USED_HEAD, used, comp, uncomp, tx);
348 
349 		/* Merge our deadlist into next's and free it. */
350 		dsl_deadlist_merge(&ds_next->ds_deadlist,
351 		    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
352 	}
353 	dsl_deadlist_close(&ds->ds_deadlist);
354 	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
355 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
356 	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
357 
358 	/* Collapse range in clone heads */
359 	dsl_dataset_remove_clones_key(ds,
360 	    dsl_dataset_phys(ds)->ds_creation_txg, tx);
361 
362 	if (ds_next->ds_is_snapshot) {
363 		dsl_dataset_t *ds_nextnext;
364 
365 		/*
366 		 * Update next's unique to include blocks which
367 		 * were previously shared by only this snapshot
368 		 * and it.  Those blocks will be born after the
369 		 * prev snap and before this snap, and will have
370 		 * died after the next snap and before the one
371 		 * after that (ie. be on the snap after next's
372 		 * deadlist).
373 		 */
374 		VERIFY0(dsl_dataset_hold_obj(dp,
375 		    dsl_dataset_phys(ds_next)->ds_next_snap_obj,
376 		    FTAG, &ds_nextnext));
377 		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
378 		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
379 		    dsl_dataset_phys(ds)->ds_creation_txg,
380 		    &used, &comp, &uncomp);
381 		dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
382 		dsl_dataset_rele(ds_nextnext, FTAG);
383 		ASSERT3P(ds_next->ds_prev, ==, NULL);
384 
385 		/* Collapse range in this head. */
386 		dsl_dataset_t *hds;
387 		VERIFY0(dsl_dataset_hold_obj(dp,
388 		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
389 		dsl_deadlist_remove_key(&hds->ds_deadlist,
390 		    dsl_dataset_phys(ds)->ds_creation_txg, tx);
391 		dsl_dataset_rele(hds, FTAG);
392 
393 	} else {
394 		ASSERT3P(ds_next->ds_prev, ==, ds);
395 		dsl_dataset_rele(ds_next->ds_prev, ds_next);
396 		ds_next->ds_prev = NULL;
397 		if (ds_prev) {
398 			VERIFY0(dsl_dataset_hold_obj(dp,
399 			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
400 			    ds_next, &ds_next->ds_prev));
401 		}
402 
403 		dsl_dataset_recalc_head_uniq(ds_next);
404 
405 		/*
406 		 * Reduce the amount of our unconsumed refreservation
407 		 * being charged to our parent by the amount of
408 		 * new unique data we have gained.
409 		 */
410 		if (old_unique < ds_next->ds_reserved) {
411 			int64_t mrsdelta;
412 			uint64_t new_unique =
413 			    dsl_dataset_phys(ds_next)->ds_unique_bytes;
414 
415 			ASSERT(old_unique <= new_unique);
416 			mrsdelta = MIN(new_unique - old_unique,
417 			    ds_next->ds_reserved - old_unique);
418 			dsl_dir_diduse_space(ds->ds_dir,
419 			    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
420 		}
421 	}
422 	dsl_dataset_rele(ds_next, FTAG);
423 
424 	/*
425 	 * This must be done after the dsl_traverse(), because it will
426 	 * re-open the objset.
427 	 */
428 	if (ds->ds_objset) {
429 		dmu_objset_evict(ds->ds_objset);
430 		ds->ds_objset = NULL;
431 	}
432 
433 	/* remove from snapshot namespace */
434 	dsl_dataset_t *ds_head;
435 	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
436 	VERIFY0(dsl_dataset_hold_obj(dp,
437 	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
438 	VERIFY0(dsl_dataset_get_snapname(ds));
439 #ifdef ZFS_DEBUG
440 	{
441 		uint64_t val;
442 
443 		err = dsl_dataset_snap_lookup(ds_head,
444 		    ds->ds_snapname, &val);
445 		ASSERT0(err);
446 		ASSERT3U(val, ==, obj);
447 	}
448 #endif
449 	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
450 	dsl_dataset_rele(ds_head, FTAG);
451 
452 	if (ds_prev != NULL)
453 		dsl_dataset_rele(ds_prev, FTAG);
454 
455 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
456 
457 	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
458 		uint64_t count;
459 		ASSERT0(zap_count(mos,
460 		    dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
461 		    count == 0);
462 		VERIFY0(dmu_object_free(mos,
463 		    dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
464 	}
465 	if (dsl_dataset_phys(ds)->ds_props_obj != 0)
466 		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
467 		    tx));
468 	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
469 		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
470 		    tx));
471 	dsl_dir_rele(ds->ds_dir, ds);
472 	ds->ds_dir = NULL;
473 	dmu_object_free_zapified(mos, obj, tx);
474 }
475 
476 static void
477 dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
478 {
479 	dmu_snapshots_destroy_arg_t *dsda = arg;
480 	dsl_pool_t *dp = dmu_tx_pool(tx);
481 	nvpair_t *pair;
482 
483 	for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL);
484 	    pair != NULL;
485 	    pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) {
486 		dsl_dataset_t *ds;
487 
488 		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
489 
490 		dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx);
491 		dsl_dataset_rele(ds, FTAG);
492 	}
493 }
494 
495 /*
496  * The semantics of this function are described in the comment above
497  * lzc_destroy_snaps().  To summarize:
498  *
499  * The snapshots must all be in the same pool.
500  *
501  * Snapshots that don't exist will be silently ignored (considered to be
502  * "already deleted").
503  *
504  * On success, all snaps will be destroyed and this will return 0.
505  * On failure, no snaps will be destroyed, the errlist will be filled in,
506  * and this will return an errno.
507  */
508 int
509 dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
510     nvlist_t *errlist)
511 {
512 	dmu_snapshots_destroy_arg_t dsda;
513 	int error;
514 	nvpair_t *pair;
515 
516 	pair = nvlist_next_nvpair(snaps, NULL);
517 	if (pair == NULL)
518 		return (0);
519 
520 	dsda.dsda_snaps = snaps;
521 	dsda.dsda_successful_snaps = fnvlist_alloc();
522 	dsda.dsda_defer = defer;
523 	dsda.dsda_errlist = errlist;
524 
525 	error = dsl_sync_task(nvpair_name(pair),
526 	    dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
527 	    &dsda, 0, ZFS_SPACE_CHECK_NONE);
528 	fnvlist_free(dsda.dsda_successful_snaps);
529 
530 	return (error);
531 }
532 
533 int
534 dsl_destroy_snapshot(const char *name, boolean_t defer)
535 {
536 	int error;
537 	nvlist_t *nvl = fnvlist_alloc();
538 	nvlist_t *errlist = fnvlist_alloc();
539 
540 	fnvlist_add_boolean(nvl, name);
541 	error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
542 	fnvlist_free(errlist);
543 	fnvlist_free(nvl);
544 	return (error);
545 }
546 
/* Callback state handed to kill_blkptr() through traverse_dataset(). */
struct killarg {
	dsl_dataset_t *ds;	/* dataset whose blocks are being killed */
	dmu_tx_t *tx;		/* transaction the frees are performed in */
};
551 
552 /* ARGSUSED */
553 static int
554 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
555     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
556 {
557 	struct killarg *ka = arg;
558 	dmu_tx_t *tx = ka->tx;
559 
560 	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
561 		return (0);
562 
563 	if (zb->zb_level == ZB_ZIL_LEVEL) {
564 		ASSERT(zilog != NULL);
565 		/*
566 		 * It's a block in the intent log.  It has no
567 		 * accounting, so just free it.
568 		 */
569 		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
570 	} else {
571 		ASSERT(zilog == NULL);
572 		ASSERT3U(bp->blk_birth, >,
573 		    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
574 		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
575 	}
576 
577 	return (0);
578 }
579 
580 static void
581 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
582 {
583 	struct killarg ka;
584 
585 	/*
586 	 * Free everything that we point to (that's born after
587 	 * the previous snapshot, if we are a clone)
588 	 *
589 	 * NB: this should be very quick, because we already
590 	 * freed all the objects in open context.
591 	 */
592 	ka.ds = ds;
593 	ka.tx = tx;
594 	VERIFY0(traverse_dataset(ds,
595 	    dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
596 	    kill_blkptr, &ka));
597 	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
598 	    dsl_dataset_phys(ds)->ds_unique_bytes == 0);
599 }
600 
/* Argument for the dsl_destroy_head_*() sync-task callbacks. */
typedef struct dsl_destroy_head_arg {
	const char *ddha_name;	/* name of the head dataset to destroy */
} dsl_destroy_head_arg_t;
604 
605 int
606 dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
607 {
608 	int error;
609 	uint64_t count;
610 	objset_t *mos;
611 
612 	ASSERT(!ds->ds_is_snapshot);
613 	if (ds->ds_is_snapshot)
614 		return (SET_ERROR(EINVAL));
615 
616 	if (refcount_count(&ds->ds_longholds) != expected_holds)
617 		return (SET_ERROR(EBUSY));
618 
619 	mos = ds->ds_dir->dd_pool->dp_meta_objset;
620 
621 	/*
622 	 * Can't delete a head dataset if there are snapshots of it.
623 	 * (Except if the only snapshots are from the branch we cloned
624 	 * from.)
625 	 */
626 	if (ds->ds_prev != NULL &&
627 	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
628 		return (SET_ERROR(EBUSY));
629 
630 	/*
631 	 * Can't delete if there are children of this fs.
632 	 */
633 	error = zap_count(mos,
634 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
635 	if (error != 0)
636 		return (error);
637 	if (count != 0)
638 		return (SET_ERROR(EEXIST));
639 
640 	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
641 	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
642 	    ds->ds_prev->ds_userrefs == 0) {
643 		/* We need to remove the origin snapshot as well. */
644 		if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
645 			return (SET_ERROR(EBUSY));
646 	}
647 	return (0);
648 }
649 
650 static int
651 dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
652 {
653 	dsl_destroy_head_arg_t *ddha = arg;
654 	dsl_pool_t *dp = dmu_tx_pool(tx);
655 	dsl_dataset_t *ds;
656 	int error;
657 
658 	error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
659 	if (error != 0)
660 		return (error);
661 
662 	error = dsl_destroy_head_check_impl(ds, 0);
663 	dsl_dataset_rele(ds, FTAG);
664 	return (error);
665 }
666 
667 static void
668 dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
669 {
670 	dsl_dir_t *dd;
671 	dsl_pool_t *dp = dmu_tx_pool(tx);
672 	objset_t *mos = dp->dp_meta_objset;
673 	dd_used_t t;
674 
675 	ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
676 
677 	VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
678 
679 	ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
680 
681 	/*
682 	 * Decrement the filesystem count for all parent filesystems.
683 	 *
684 	 * When we receive an incremental stream into a filesystem that already
685 	 * exists, a temporary clone is created.  We never count this temporary
686 	 * clone, whose name begins with a '%'.
687 	 */
688 	if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
689 		dsl_fs_ss_count_adjust(dd->dd_parent, -1,
690 		    DD_FIELD_FILESYSTEM_COUNT, tx);
691 
692 	/*
693 	 * Remove our reservation. The impl() routine avoids setting the
694 	 * actual property, which would require the (already destroyed) ds.
695 	 */
696 	dsl_dir_set_reservation_sync_impl(dd, 0, tx);
697 
698 	ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
699 	ASSERT0(dsl_dir_phys(dd)->dd_reserved);
700 	for (t = 0; t < DD_USED_NUM; t++)
701 		ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
702 
703 	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
704 	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
705 	VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
706 	VERIFY0(zap_remove(mos,
707 	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
708 	    dd->dd_myname, tx));
709 
710 	dsl_dir_rele(dd, FTAG);
711 	dmu_object_free_zapified(mos, ddobj, tx);
712 }
713 
714 void
715 dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
716 {
717 	dsl_pool_t *dp = dmu_tx_pool(tx);
718 	objset_t *mos = dp->dp_meta_objset;
719 	uint64_t obj, ddobj, prevobj = 0;
720 	boolean_t rmorigin;
721 
722 	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
723 	ASSERT(ds->ds_prev == NULL ||
724 	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
725 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
726 	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
727 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
728 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
729 
730 	/* We need to log before removing it from the namespace. */
731 	spa_history_log_internal_ds(ds, "destroy", tx, "");
732 
733 	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
734 	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
735 	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
736 	    ds->ds_prev->ds_userrefs == 0);
737 
738 	/* Remove our reservation. */
739 	if (ds->ds_reserved != 0) {
740 		dsl_dataset_set_refreservation_sync_impl(ds,
741 		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
742 		    0, tx);
743 		ASSERT0(ds->ds_reserved);
744 	}
745 
746 	obj = ds->ds_object;
747 
748 	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
749 		if (ds->ds_feature_inuse[f]) {
750 			dsl_dataset_deactivate_feature(obj, f, tx);
751 			ds->ds_feature_inuse[f] = B_FALSE;
752 		}
753 	}
754 
755 	dsl_scan_ds_destroyed(ds, tx);
756 
757 	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
758 		/* This is a clone */
759 		ASSERT(ds->ds_prev != NULL);
760 		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
761 		    obj);
762 		ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
763 
764 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
765 		if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
766 			dsl_dataset_remove_from_next_clones(ds->ds_prev,
767 			    obj, tx);
768 		}
769 
770 		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
771 		dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
772 	}
773 
774 	/*
775 	 * Destroy the deadlist.  Unless it's a clone, the
776 	 * deadlist should be empty.  (If it's a clone, it's
777 	 * safe to ignore the deadlist contents.)
778 	 */
779 	dsl_deadlist_close(&ds->ds_deadlist);
780 	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
781 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
782 	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
783 
784 	objset_t *os;
785 	VERIFY0(dmu_objset_from_ds(ds, &os));
786 
787 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
788 		old_synchronous_dataset_destroy(ds, tx);
789 	} else {
790 		/*
791 		 * Move the bptree into the pool's list of trees to
792 		 * clean up and update space accounting information.
793 		 */
794 		uint64_t used, comp, uncomp;
795 
796 		zil_destroy_sync(dmu_objset_zil(os), tx);
797 
798 		if (!spa_feature_is_active(dp->dp_spa,
799 		    SPA_FEATURE_ASYNC_DESTROY)) {
800 			dsl_scan_t *scn = dp->dp_scan;
801 			spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
802 			    tx);
803 			dp->dp_bptree_obj = bptree_alloc(mos, tx);
804 			VERIFY0(zap_add(mos,
805 			    DMU_POOL_DIRECTORY_OBJECT,
806 			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
807 			    &dp->dp_bptree_obj, tx));
808 			ASSERT(!scn->scn_async_destroying);
809 			scn->scn_async_destroying = B_TRUE;
810 		}
811 
812 		used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
813 		comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
814 		uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
815 
816 		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
817 		    dsl_dataset_phys(ds)->ds_unique_bytes == used);
818 
819 		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
820 		bptree_add(mos, dp->dp_bptree_obj,
821 		    &dsl_dataset_phys(ds)->ds_bp,
822 		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
823 		    used, comp, uncomp, tx);
824 		rrw_exit(&ds->ds_bp_rwlock, FTAG);
825 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
826 		    -used, -comp, -uncomp, tx);
827 		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
828 		    used, comp, uncomp, tx);
829 	}
830 
831 	if (ds->ds_prev != NULL) {
832 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
833 			VERIFY0(zap_remove_int(mos,
834 			    dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
835 			    ds->ds_object, tx));
836 		}
837 		prevobj = ds->ds_prev->ds_object;
838 		dsl_dataset_rele(ds->ds_prev, ds);
839 		ds->ds_prev = NULL;
840 	}
841 
842 	/*
843 	 * This must be done after the dsl_traverse(), because it will
844 	 * re-open the objset.
845 	 */
846 	if (ds->ds_objset) {
847 		dmu_objset_evict(ds->ds_objset);
848 		ds->ds_objset = NULL;
849 	}
850 
851 	/* Erase the link in the dir */
852 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
853 	dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
854 	ddobj = ds->ds_dir->dd_object;
855 	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
856 	VERIFY0(zap_destroy(mos,
857 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
858 
859 	if (ds->ds_bookmarks != 0) {
860 		VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
861 		spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
862 	}
863 
864 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
865 
866 	ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
867 	ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
868 	ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
869 	dsl_dir_rele(ds->ds_dir, ds);
870 	ds->ds_dir = NULL;
871 	dmu_object_free_zapified(mos, obj, tx);
872 
873 	dsl_dir_destroy_sync(ddobj, tx);
874 
875 	if (rmorigin) {
876 		dsl_dataset_t *prev;
877 		VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
878 		dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
879 		dsl_dataset_rele(prev, FTAG);
880 	}
881 }
882 
883 static void
884 dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
885 {
886 	dsl_destroy_head_arg_t *ddha = arg;
887 	dsl_pool_t *dp = dmu_tx_pool(tx);
888 	dsl_dataset_t *ds;
889 
890 	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
891 	dsl_destroy_head_sync_impl(ds, tx);
892 	dsl_dataset_rele(ds, FTAG);
893 }
894 
895 static void
896 dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
897 {
898 	dsl_destroy_head_arg_t *ddha = arg;
899 	dsl_pool_t *dp = dmu_tx_pool(tx);
900 	dsl_dataset_t *ds;
901 
902 	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
903 
904 	/* Mark it as inconsistent on-disk, in case we crash */
905 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
906 	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
907 
908 	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
909 	dsl_dataset_rele(ds, FTAG);
910 }
911 
912 int
913 dsl_destroy_head(const char *name)
914 {
915 	dsl_destroy_head_arg_t ddha;
916 	int error;
917 	spa_t *spa;
918 	boolean_t isenabled;
919 
920 #ifdef _KERNEL
921 	zfs_destroy_unmount_origin(name);
922 #endif
923 
924 	error = spa_open(name, &spa, FTAG);
925 	if (error != 0)
926 		return (error);
927 	isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
928 	spa_close(spa, FTAG);
929 
930 	ddha.ddha_name = name;
931 
932 	if (!isenabled) {
933 		objset_t *os;
934 
935 		error = dsl_sync_task(name, dsl_destroy_head_check,
936 		    dsl_destroy_head_begin_sync, &ddha,
937 		    0, ZFS_SPACE_CHECK_NONE);
938 		if (error != 0)
939 			return (error);
940 
941 		/*
942 		 * Head deletion is processed in one txg on old pools;
943 		 * remove the objects from open context so that the txg sync
944 		 * is not too long.
945 		 */
946 		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
947 		if (error == 0) {
948 			uint64_t prev_snap_txg =
949 			    dsl_dataset_phys(dmu_objset_ds(os))->
950 			    ds_prev_snap_txg;
951 			for (uint64_t obj = 0; error == 0;
952 			    error = dmu_object_next(os, &obj, FALSE,
953 			    prev_snap_txg))
954 				(void) dmu_free_long_object(os, obj);
955 			/* sync out all frees */
956 			txg_wait_synced(dmu_objset_pool(os), 0);
957 			dmu_objset_disown(os, FTAG);
958 		}
959 	}
960 
961 	return (dsl_sync_task(name, dsl_destroy_head_check,
962 	    dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
963 }
964 
965 /*
966  * Note, this function is used as the callback for dmu_objset_find().  We
967  * always return 0 so that we will continue to find and process
968  * inconsistent datasets, even if we encounter an error trying to
969  * process one of them.
970  */
971 /* ARGSUSED */
972 int
973 dsl_destroy_inconsistent(const char *dsname, void *arg)
974 {
975 	objset_t *os;
976 
977 	if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
978 		boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
979 
980 		/*
981 		 * If the dataset is inconsistent because a resumable receive
982 		 * has failed, then do not destroy it.
983 		 */
984 		if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
985 			need_destroy = B_FALSE;
986 
987 		dmu_objset_rele(os, FTAG);
988 		if (need_destroy)
989 			(void) dsl_destroy_head(dsname);
990 	}
991 	return (0);
992 }
993