xref: /freebsd/sys/contrib/openzfs/module/zfs/dsl_dir.c (revision eda14cbc264d6969b02f2b1994cef11148e914f1)
1*eda14cbcSMatt Macy /*
2*eda14cbcSMatt Macy  * CDDL HEADER START
3*eda14cbcSMatt Macy  *
4*eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5*eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6*eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7*eda14cbcSMatt Macy  *
8*eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*eda14cbcSMatt Macy  * or http://www.opensolaris.org/os/licensing.
10*eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11*eda14cbcSMatt Macy  * and limitations under the License.
12*eda14cbcSMatt Macy  *
13*eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14*eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16*eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17*eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18*eda14cbcSMatt Macy  *
19*eda14cbcSMatt Macy  * CDDL HEADER END
20*eda14cbcSMatt Macy  */
21*eda14cbcSMatt Macy /*
22*eda14cbcSMatt Macy  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23*eda14cbcSMatt Macy  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24*eda14cbcSMatt Macy  * Copyright (c) 2013 Martin Matuska. All rights reserved.
25*eda14cbcSMatt Macy  * Copyright (c) 2014 Joyent, Inc. All rights reserved.
26*eda14cbcSMatt Macy  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27*eda14cbcSMatt Macy  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
28*eda14cbcSMatt Macy  * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
29*eda14cbcSMatt Macy  */
30*eda14cbcSMatt Macy 
31*eda14cbcSMatt Macy #include <sys/dmu.h>
32*eda14cbcSMatt Macy #include <sys/dmu_objset.h>
33*eda14cbcSMatt Macy #include <sys/dmu_tx.h>
34*eda14cbcSMatt Macy #include <sys/dsl_dataset.h>
35*eda14cbcSMatt Macy #include <sys/dsl_dir.h>
36*eda14cbcSMatt Macy #include <sys/dsl_prop.h>
37*eda14cbcSMatt Macy #include <sys/dsl_synctask.h>
38*eda14cbcSMatt Macy #include <sys/dsl_deleg.h>
39*eda14cbcSMatt Macy #include <sys/dmu_impl.h>
40*eda14cbcSMatt Macy #include <sys/spa.h>
41*eda14cbcSMatt Macy #include <sys/spa_impl.h>
42*eda14cbcSMatt Macy #include <sys/metaslab.h>
43*eda14cbcSMatt Macy #include <sys/zap.h>
44*eda14cbcSMatt Macy #include <sys/zio.h>
45*eda14cbcSMatt Macy #include <sys/arc.h>
46*eda14cbcSMatt Macy #include <sys/sunddi.h>
47*eda14cbcSMatt Macy #include <sys/zfeature.h>
48*eda14cbcSMatt Macy #include <sys/policy.h>
49*eda14cbcSMatt Macy #include <sys/zfs_znode.h>
50*eda14cbcSMatt Macy #include <sys/zvol.h>
51*eda14cbcSMatt Macy #include <sys/zthr.h>
52*eda14cbcSMatt Macy #include "zfs_namecheck.h"
53*eda14cbcSMatt Macy #include "zfs_prop.h"
54*eda14cbcSMatt Macy #ifdef _KERNEL
55*eda14cbcSMatt Macy #include <sys/zfs_vfsops.h>
56*eda14cbcSMatt Macy #endif
57*eda14cbcSMatt Macy 
58*eda14cbcSMatt Macy /*
59*eda14cbcSMatt Macy  * Filesystem and Snapshot Limits
60*eda14cbcSMatt Macy  * ------------------------------
61*eda14cbcSMatt Macy  *
62*eda14cbcSMatt Macy  * These limits are used to restrict the number of filesystems and/or snapshots
63*eda14cbcSMatt Macy  * that can be created at a given level in the tree or below. A typical
64*eda14cbcSMatt Macy  * use-case is with a delegated dataset where the administrator wants to ensure
65*eda14cbcSMatt Macy  * that a user within the zone is not creating too many additional filesystems
66*eda14cbcSMatt Macy  * or snapshots, even though they're not exceeding their space quota.
67*eda14cbcSMatt Macy  *
68*eda14cbcSMatt Macy  * The filesystem and snapshot counts are stored as extensible properties. This
69*eda14cbcSMatt Macy  * capability is controlled by a feature flag and must be enabled to be used.
70*eda14cbcSMatt Macy  * Once enabled, the feature is not active until the first limit is set. At
71*eda14cbcSMatt Macy  * that point, future operations to create/destroy filesystems or snapshots
72*eda14cbcSMatt Macy  * will validate and update the counts.
73*eda14cbcSMatt Macy  *
74*eda14cbcSMatt Macy  * Because the count properties will not exist before the feature is active,
75*eda14cbcSMatt Macy  * the counts are updated when a limit is first set on an uninitialized
76*eda14cbcSMatt Macy  * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
77*eda14cbcSMatt Macy  * all of the nested filesystems/snapshots. Thus, a new leaf node has a
78*eda14cbcSMatt Macy  * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
79*eda14cbcSMatt Macy  * snapshot count properties on a node indicate uninitialized counts on that
80*eda14cbcSMatt Macy  * node.) When first setting a limit on an uninitialized node, the code starts
81*eda14cbcSMatt Macy  * at the filesystem with the new limit and descends into all sub-filesystems
82*eda14cbcSMatt Macy  * to add the count properties.
83*eda14cbcSMatt Macy  *
84*eda14cbcSMatt Macy  * In practice this is lightweight since a limit is typically set when the
85*eda14cbcSMatt Macy  * filesystem is created and thus has no children. Once valid, changing the
86*eda14cbcSMatt Macy  * limit value won't require a re-traversal since the counts are already valid.
87*eda14cbcSMatt Macy  * When recursively fixing the counts, if a node with a limit is encountered
88*eda14cbcSMatt Macy  * during the descent, the counts are known to be valid and there is no need to
89*eda14cbcSMatt Macy  * descend into that filesystem's children. The counts on filesystems above the
90*eda14cbcSMatt Macy  * one with the new limit will still be uninitialized, unless a limit is
91*eda14cbcSMatt Macy  * eventually set on one of those filesystems. The counts are always recursively
92*eda14cbcSMatt Macy  * updated when a limit is set on a dataset, unless there is already a limit.
93*eda14cbcSMatt Macy  * When a new limit value is set on a filesystem with an existing limit, it is
94*eda14cbcSMatt Macy  * possible for the new limit to be less than the current count at that level
95*eda14cbcSMatt Macy  * since a user who can change the limit is also allowed to exceed the limit.
96*eda14cbcSMatt Macy  *
97*eda14cbcSMatt Macy  * Once the feature is active, then whenever a filesystem or snapshot is
98*eda14cbcSMatt Macy  * created, the code recurses up the tree, validating the new count against the
99*eda14cbcSMatt Macy  * limit at each initialized level. In practice, most levels will not have a
100*eda14cbcSMatt Macy  * limit set. If there is a limit at any initialized level up the tree, the
101*eda14cbcSMatt Macy  * check must pass or the creation will fail. Likewise, when a filesystem or
102*eda14cbcSMatt Macy  * snapshot is destroyed, the counts are recursively adjusted all the way up
103*eda14cbcSMatt Macy  * the initialized nodes in the tree. Renaming a filesystem to a different
104*eda14cbcSMatt Macy  * point in the tree will first validate, then update the counts on each branch
105*eda14cbcSMatt Macy  * up to the common ancestor. A receive will also validate the counts and then
106*eda14cbcSMatt Macy  * update them.
107*eda14cbcSMatt Macy  *
108*eda14cbcSMatt Macy  * An exception to the above behavior is that the limit is not enforced if the
109*eda14cbcSMatt Macy  * user has permission to modify the limit. This is primarily so that
110*eda14cbcSMatt Macy  * recursive snapshots in the global zone always work. We want to prevent a
111*eda14cbcSMatt Macy  * denial-of-service in which a lower level delegated dataset could max out its
112*eda14cbcSMatt Macy  * limit and thus block recursive snapshots from being taken in the global zone.
113*eda14cbcSMatt Macy  * Because of this, it is possible for the snapshot count to be over the limit
114*eda14cbcSMatt Macy  * and snapshots taken in the global zone could cause a lower level dataset to
115*eda14cbcSMatt Macy  * hit or exceed its limit. The administrator taking the global zone recursive
116*eda14cbcSMatt Macy  * snapshot should be aware of this side-effect and behave accordingly.
117*eda14cbcSMatt Macy  * For consistency, the filesystem limit is also not enforced if the user can
118*eda14cbcSMatt Macy  * modify the limit.
119*eda14cbcSMatt Macy  *
120*eda14cbcSMatt Macy  * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
121*eda14cbcSMatt Macy  * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
122*eda14cbcSMatt Macy  * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
123*eda14cbcSMatt Macy  * dsl_dir_init_fs_ss_count().
124*eda14cbcSMatt Macy  *
125*eda14cbcSMatt Macy  * There is a special case when we receive a filesystem that already exists. In
126*eda14cbcSMatt Macy  * this case a temporary clone name of %X is created (see dmu_recv_begin). We
127*eda14cbcSMatt Macy  * never update the filesystem counts for temporary clones.
128*eda14cbcSMatt Macy  *
129*eda14cbcSMatt Macy  * Likewise, we do not update the snapshot counts for temporary snapshots,
130*eda14cbcSMatt Macy  * such as those created by zfs diff.
131*eda14cbcSMatt Macy  */
132*eda14cbcSMatt Macy 
133*eda14cbcSMatt Macy extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
134*eda14cbcSMatt Macy 
135*eda14cbcSMatt Macy static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
136*eda14cbcSMatt Macy 
/*
 * Argument bundle pairing a dsl_dir_t pointer with a 64-bit txg value.
 *
 * NOTE(review): the two members use different prefixes ("ddulrta_" vs.
 * "ddlrta_").  This looks like a typo in the second name, but the members
 * are referenced by name, so renaming must be done together with every
 * user of this struct (none are visible in this part of the file).
 */
137*eda14cbcSMatt Macy typedef struct ddulrt_arg {
138*eda14cbcSMatt Macy 	dsl_dir_t	*ddulrta_dd;	/* dsl_dir the argument refers to */
139*eda14cbcSMatt Macy 	uint64_t	ddlrta_txg;	/* presumably a transaction group number */
140*eda14cbcSMatt Macy } ddulrt_arg_t;
141*eda14cbcSMatt Macy 
142*eda14cbcSMatt Macy static void
143*eda14cbcSMatt Macy dsl_dir_evict_async(void *dbu)
144*eda14cbcSMatt Macy {
145*eda14cbcSMatt Macy 	dsl_dir_t *dd = dbu;
146*eda14cbcSMatt Macy 	int t;
147*eda14cbcSMatt Macy 	dsl_pool_t *dp __maybe_unused = dd->dd_pool;
148*eda14cbcSMatt Macy 
149*eda14cbcSMatt Macy 	dd->dd_dbuf = NULL;
150*eda14cbcSMatt Macy 
151*eda14cbcSMatt Macy 	for (t = 0; t < TXG_SIZE; t++) {
152*eda14cbcSMatt Macy 		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
153*eda14cbcSMatt Macy 		ASSERT(dd->dd_tempreserved[t] == 0);
154*eda14cbcSMatt Macy 		ASSERT(dd->dd_space_towrite[t] == 0);
155*eda14cbcSMatt Macy 	}
156*eda14cbcSMatt Macy 
157*eda14cbcSMatt Macy 	if (dd->dd_parent)
158*eda14cbcSMatt Macy 		dsl_dir_async_rele(dd->dd_parent, dd);
159*eda14cbcSMatt Macy 
160*eda14cbcSMatt Macy 	spa_async_close(dd->dd_pool->dp_spa, dd);
161*eda14cbcSMatt Macy 
162*eda14cbcSMatt Macy 	if (dsl_deadlist_is_open(&dd->dd_livelist))
163*eda14cbcSMatt Macy 		dsl_dir_livelist_close(dd);
164*eda14cbcSMatt Macy 
165*eda14cbcSMatt Macy 	dsl_prop_fini(dd);
166*eda14cbcSMatt Macy 	cv_destroy(&dd->dd_activity_cv);
167*eda14cbcSMatt Macy 	mutex_destroy(&dd->dd_activity_lock);
168*eda14cbcSMatt Macy 	mutex_destroy(&dd->dd_lock);
169*eda14cbcSMatt Macy 	kmem_free(dd, sizeof (dsl_dir_t));
170*eda14cbcSMatt Macy }
171*eda14cbcSMatt Macy 
172*eda14cbcSMatt Macy int
173*eda14cbcSMatt Macy dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
174*eda14cbcSMatt Macy     const char *tail, void *tag, dsl_dir_t **ddp)
175*eda14cbcSMatt Macy {
176*eda14cbcSMatt Macy 	dmu_buf_t *dbuf;
177*eda14cbcSMatt Macy 	dsl_dir_t *dd;
178*eda14cbcSMatt Macy 	dmu_object_info_t doi;
179*eda14cbcSMatt Macy 	int err;
180*eda14cbcSMatt Macy 
181*eda14cbcSMatt Macy 	ASSERT(dsl_pool_config_held(dp));
182*eda14cbcSMatt Macy 
183*eda14cbcSMatt Macy 	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
184*eda14cbcSMatt Macy 	if (err != 0)
185*eda14cbcSMatt Macy 		return (err);
186*eda14cbcSMatt Macy 	dd = dmu_buf_get_user(dbuf);
187*eda14cbcSMatt Macy 
188*eda14cbcSMatt Macy 	dmu_object_info_from_db(dbuf, &doi);
189*eda14cbcSMatt Macy 	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
190*eda14cbcSMatt Macy 	ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
191*eda14cbcSMatt Macy 
192*eda14cbcSMatt Macy 	if (dd == NULL) {
193*eda14cbcSMatt Macy 		dsl_dir_t *winner;
194*eda14cbcSMatt Macy 
195*eda14cbcSMatt Macy 		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
196*eda14cbcSMatt Macy 		dd->dd_object = ddobj;
197*eda14cbcSMatt Macy 		dd->dd_dbuf = dbuf;
198*eda14cbcSMatt Macy 		dd->dd_pool = dp;
199*eda14cbcSMatt Macy 
200*eda14cbcSMatt Macy 		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
201*eda14cbcSMatt Macy 		mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
202*eda14cbcSMatt Macy 		cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);
203*eda14cbcSMatt Macy 		dsl_prop_init(dd);
204*eda14cbcSMatt Macy 
205*eda14cbcSMatt Macy 		if (dsl_dir_is_zapified(dd)) {
206*eda14cbcSMatt Macy 			err = zap_lookup(dp->dp_meta_objset,
207*eda14cbcSMatt Macy 			    ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
208*eda14cbcSMatt Macy 			    sizeof (uint64_t), 1, &dd->dd_crypto_obj);
209*eda14cbcSMatt Macy 			if (err == 0) {
210*eda14cbcSMatt Macy 				/* check for on-disk format errata */
211*eda14cbcSMatt Macy 				if (dsl_dir_incompatible_encryption_version(
212*eda14cbcSMatt Macy 				    dd)) {
213*eda14cbcSMatt Macy 					dp->dp_spa->spa_errata =
214*eda14cbcSMatt Macy 					    ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
215*eda14cbcSMatt Macy 				}
216*eda14cbcSMatt Macy 			} else if (err != ENOENT) {
217*eda14cbcSMatt Macy 				goto errout;
218*eda14cbcSMatt Macy 			}
219*eda14cbcSMatt Macy 		}
220*eda14cbcSMatt Macy 
221*eda14cbcSMatt Macy 		dsl_dir_snap_cmtime_update(dd);
222*eda14cbcSMatt Macy 
223*eda14cbcSMatt Macy 		if (dsl_dir_phys(dd)->dd_parent_obj) {
224*eda14cbcSMatt Macy 			err = dsl_dir_hold_obj(dp,
225*eda14cbcSMatt Macy 			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
226*eda14cbcSMatt Macy 			    &dd->dd_parent);
227*eda14cbcSMatt Macy 			if (err != 0)
228*eda14cbcSMatt Macy 				goto errout;
229*eda14cbcSMatt Macy 			if (tail) {
230*eda14cbcSMatt Macy #ifdef ZFS_DEBUG
231*eda14cbcSMatt Macy 				uint64_t foundobj;
232*eda14cbcSMatt Macy 
233*eda14cbcSMatt Macy 				err = zap_lookup(dp->dp_meta_objset,
234*eda14cbcSMatt Macy 				    dsl_dir_phys(dd->dd_parent)->
235*eda14cbcSMatt Macy 				    dd_child_dir_zapobj, tail,
236*eda14cbcSMatt Macy 				    sizeof (foundobj), 1, &foundobj);
237*eda14cbcSMatt Macy 				ASSERT(err || foundobj == ddobj);
238*eda14cbcSMatt Macy #endif
239*eda14cbcSMatt Macy 				(void) strlcpy(dd->dd_myname, tail,
240*eda14cbcSMatt Macy 				    sizeof (dd->dd_myname));
241*eda14cbcSMatt Macy 			} else {
242*eda14cbcSMatt Macy 				err = zap_value_search(dp->dp_meta_objset,
243*eda14cbcSMatt Macy 				    dsl_dir_phys(dd->dd_parent)->
244*eda14cbcSMatt Macy 				    dd_child_dir_zapobj,
245*eda14cbcSMatt Macy 				    ddobj, 0, dd->dd_myname);
246*eda14cbcSMatt Macy 			}
247*eda14cbcSMatt Macy 			if (err != 0)
248*eda14cbcSMatt Macy 				goto errout;
249*eda14cbcSMatt Macy 		} else {
250*eda14cbcSMatt Macy 			(void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa),
251*eda14cbcSMatt Macy 			    sizeof (dd->dd_myname));
252*eda14cbcSMatt Macy 		}
253*eda14cbcSMatt Macy 
254*eda14cbcSMatt Macy 		if (dsl_dir_is_clone(dd)) {
255*eda14cbcSMatt Macy 			dmu_buf_t *origin_bonus;
256*eda14cbcSMatt Macy 			dsl_dataset_phys_t *origin_phys;
257*eda14cbcSMatt Macy 
258*eda14cbcSMatt Macy 			/*
259*eda14cbcSMatt Macy 			 * We can't open the origin dataset, because
260*eda14cbcSMatt Macy 			 * that would require opening this dsl_dir.
261*eda14cbcSMatt Macy 			 * Just look at its phys directly instead.
262*eda14cbcSMatt Macy 			 */
263*eda14cbcSMatt Macy 			err = dmu_bonus_hold(dp->dp_meta_objset,
264*eda14cbcSMatt Macy 			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
265*eda14cbcSMatt Macy 			    &origin_bonus);
266*eda14cbcSMatt Macy 			if (err != 0)
267*eda14cbcSMatt Macy 				goto errout;
268*eda14cbcSMatt Macy 			origin_phys = origin_bonus->db_data;
269*eda14cbcSMatt Macy 			dd->dd_origin_txg =
270*eda14cbcSMatt Macy 			    origin_phys->ds_creation_txg;
271*eda14cbcSMatt Macy 			dmu_buf_rele(origin_bonus, FTAG);
272*eda14cbcSMatt Macy 			if (dsl_dir_is_zapified(dd)) {
273*eda14cbcSMatt Macy 				uint64_t obj;
274*eda14cbcSMatt Macy 				err = zap_lookup(dp->dp_meta_objset,
275*eda14cbcSMatt Macy 				    dd->dd_object, DD_FIELD_LIVELIST,
276*eda14cbcSMatt Macy 				    sizeof (uint64_t), 1, &obj);
277*eda14cbcSMatt Macy 				if (err == 0)
278*eda14cbcSMatt Macy 					dsl_dir_livelist_open(dd, obj);
279*eda14cbcSMatt Macy 				else if (err != ENOENT)
280*eda14cbcSMatt Macy 					goto errout;
281*eda14cbcSMatt Macy 			}
282*eda14cbcSMatt Macy 		}
283*eda14cbcSMatt Macy 
284*eda14cbcSMatt Macy 		dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
285*eda14cbcSMatt Macy 		    &dd->dd_dbuf);
286*eda14cbcSMatt Macy 		winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
287*eda14cbcSMatt Macy 		if (winner != NULL) {
288*eda14cbcSMatt Macy 			if (dd->dd_parent)
289*eda14cbcSMatt Macy 				dsl_dir_rele(dd->dd_parent, dd);
290*eda14cbcSMatt Macy 			if (dsl_deadlist_is_open(&dd->dd_livelist))
291*eda14cbcSMatt Macy 				dsl_dir_livelist_close(dd);
292*eda14cbcSMatt Macy 			dsl_prop_fini(dd);
293*eda14cbcSMatt Macy 			cv_destroy(&dd->dd_activity_cv);
294*eda14cbcSMatt Macy 			mutex_destroy(&dd->dd_activity_lock);
295*eda14cbcSMatt Macy 			mutex_destroy(&dd->dd_lock);
296*eda14cbcSMatt Macy 			kmem_free(dd, sizeof (dsl_dir_t));
297*eda14cbcSMatt Macy 			dd = winner;
298*eda14cbcSMatt Macy 		} else {
299*eda14cbcSMatt Macy 			spa_open_ref(dp->dp_spa, dd);
300*eda14cbcSMatt Macy 		}
301*eda14cbcSMatt Macy 	}
302*eda14cbcSMatt Macy 
303*eda14cbcSMatt Macy 	/*
304*eda14cbcSMatt Macy 	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
305*eda14cbcSMatt Macy 	 * holds on the spa.  We need the open-to-close holds because
306*eda14cbcSMatt Macy 	 * otherwise the spa_refcnt wouldn't change when we open a
307*eda14cbcSMatt Macy 	 * dir which the spa also has open, so we could incorrectly
308*eda14cbcSMatt Macy 	 * think it was OK to unload/export/destroy the pool.  We need
309*eda14cbcSMatt Macy 	 * the instantiate-to-evict hold because the dsl_dir_t has a
310*eda14cbcSMatt Macy 	 * pointer to the dd_pool, which has a pointer to the spa_t.
311*eda14cbcSMatt Macy 	 */
312*eda14cbcSMatt Macy 	spa_open_ref(dp->dp_spa, tag);
313*eda14cbcSMatt Macy 	ASSERT3P(dd->dd_pool, ==, dp);
314*eda14cbcSMatt Macy 	ASSERT3U(dd->dd_object, ==, ddobj);
315*eda14cbcSMatt Macy 	ASSERT3P(dd->dd_dbuf, ==, dbuf);
316*eda14cbcSMatt Macy 	*ddp = dd;
317*eda14cbcSMatt Macy 	return (0);
318*eda14cbcSMatt Macy 
319*eda14cbcSMatt Macy errout:
320*eda14cbcSMatt Macy 	if (dd->dd_parent)
321*eda14cbcSMatt Macy 		dsl_dir_rele(dd->dd_parent, dd);
322*eda14cbcSMatt Macy 	if (dsl_deadlist_is_open(&dd->dd_livelist))
323*eda14cbcSMatt Macy 		dsl_dir_livelist_close(dd);
324*eda14cbcSMatt Macy 	dsl_prop_fini(dd);
325*eda14cbcSMatt Macy 	cv_destroy(&dd->dd_activity_cv);
326*eda14cbcSMatt Macy 	mutex_destroy(&dd->dd_activity_lock);
327*eda14cbcSMatt Macy 	mutex_destroy(&dd->dd_lock);
328*eda14cbcSMatt Macy 	kmem_free(dd, sizeof (dsl_dir_t));
329*eda14cbcSMatt Macy 	dmu_buf_rele(dbuf, tag);
330*eda14cbcSMatt Macy 	return (err);
331*eda14cbcSMatt Macy }
332*eda14cbcSMatt Macy 
333*eda14cbcSMatt Macy void
334*eda14cbcSMatt Macy dsl_dir_rele(dsl_dir_t *dd, void *tag)
335*eda14cbcSMatt Macy {
336*eda14cbcSMatt Macy 	dprintf_dd(dd, "%s\n", "");
337*eda14cbcSMatt Macy 	spa_close(dd->dd_pool->dp_spa, tag);
338*eda14cbcSMatt Macy 	dmu_buf_rele(dd->dd_dbuf, tag);
339*eda14cbcSMatt Macy }
340*eda14cbcSMatt Macy 
341*eda14cbcSMatt Macy /*
342*eda14cbcSMatt Macy  * Remove a reference to the given dsl dir that is being asynchronously
343*eda14cbcSMatt Macy  * released.  Async releases occur from a taskq performing eviction of
344*eda14cbcSMatt Macy  * dsl datasets and dirs.  This process is identical to a normal release
345*eda14cbcSMatt Macy  * with the exception of using the async API for releasing the reference on
346*eda14cbcSMatt Macy  * the spa.
347*eda14cbcSMatt Macy  */
348*eda14cbcSMatt Macy void
349*eda14cbcSMatt Macy dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
350*eda14cbcSMatt Macy {
351*eda14cbcSMatt Macy 	dprintf_dd(dd, "%s\n", "");
352*eda14cbcSMatt Macy 	spa_async_close(dd->dd_pool->dp_spa, tag);
353*eda14cbcSMatt Macy 	dmu_buf_rele(dd->dd_dbuf, tag);
354*eda14cbcSMatt Macy }
355*eda14cbcSMatt Macy 
356*eda14cbcSMatt Macy /* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
357*eda14cbcSMatt Macy void
358*eda14cbcSMatt Macy dsl_dir_name(dsl_dir_t *dd, char *buf)
359*eda14cbcSMatt Macy {
360*eda14cbcSMatt Macy 	if (dd->dd_parent) {
361*eda14cbcSMatt Macy 		dsl_dir_name(dd->dd_parent, buf);
362*eda14cbcSMatt Macy 		VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
363*eda14cbcSMatt Macy 		    ZFS_MAX_DATASET_NAME_LEN);
364*eda14cbcSMatt Macy 	} else {
365*eda14cbcSMatt Macy 		buf[0] = '\0';
366*eda14cbcSMatt Macy 	}
367*eda14cbcSMatt Macy 	if (!MUTEX_HELD(&dd->dd_lock)) {
368*eda14cbcSMatt Macy 		/*
369*eda14cbcSMatt Macy 		 * recursive mutex so that we can use
370*eda14cbcSMatt Macy 		 * dprintf_dd() with dd_lock held
371*eda14cbcSMatt Macy 		 */
372*eda14cbcSMatt Macy 		mutex_enter(&dd->dd_lock);
373*eda14cbcSMatt Macy 		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
374*eda14cbcSMatt Macy 		    <, ZFS_MAX_DATASET_NAME_LEN);
375*eda14cbcSMatt Macy 		mutex_exit(&dd->dd_lock);
376*eda14cbcSMatt Macy 	} else {
377*eda14cbcSMatt Macy 		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
378*eda14cbcSMatt Macy 		    <, ZFS_MAX_DATASET_NAME_LEN);
379*eda14cbcSMatt Macy 	}
380*eda14cbcSMatt Macy }
381*eda14cbcSMatt Macy 
382*eda14cbcSMatt Macy /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
383*eda14cbcSMatt Macy int
384*eda14cbcSMatt Macy dsl_dir_namelen(dsl_dir_t *dd)
385*eda14cbcSMatt Macy {
386*eda14cbcSMatt Macy 	int result = 0;
387*eda14cbcSMatt Macy 
388*eda14cbcSMatt Macy 	if (dd->dd_parent) {
389*eda14cbcSMatt Macy 		/* parent's name + 1 for the "/" */
390*eda14cbcSMatt Macy 		result = dsl_dir_namelen(dd->dd_parent) + 1;
391*eda14cbcSMatt Macy 	}
392*eda14cbcSMatt Macy 
393*eda14cbcSMatt Macy 	if (!MUTEX_HELD(&dd->dd_lock)) {
394*eda14cbcSMatt Macy 		/* see dsl_dir_name */
395*eda14cbcSMatt Macy 		mutex_enter(&dd->dd_lock);
396*eda14cbcSMatt Macy 		result += strlen(dd->dd_myname);
397*eda14cbcSMatt Macy 		mutex_exit(&dd->dd_lock);
398*eda14cbcSMatt Macy 	} else {
399*eda14cbcSMatt Macy 		result += strlen(dd->dd_myname);
400*eda14cbcSMatt Macy 	}
401*eda14cbcSMatt Macy 
402*eda14cbcSMatt Macy 	return (result);
403*eda14cbcSMatt Macy }
404*eda14cbcSMatt Macy 
405*eda14cbcSMatt Macy static int
406*eda14cbcSMatt Macy getcomponent(const char *path, char *component, const char **nextp)
407*eda14cbcSMatt Macy {
408*eda14cbcSMatt Macy 	char *p;
409*eda14cbcSMatt Macy 
410*eda14cbcSMatt Macy 	if ((path == NULL) || (path[0] == '\0'))
411*eda14cbcSMatt Macy 		return (SET_ERROR(ENOENT));
412*eda14cbcSMatt Macy 	/* This would be a good place to reserve some namespace... */
413*eda14cbcSMatt Macy 	p = strpbrk(path, "/@");
414*eda14cbcSMatt Macy 	if (p && (p[1] == '/' || p[1] == '@')) {
415*eda14cbcSMatt Macy 		/* two separators in a row */
416*eda14cbcSMatt Macy 		return (SET_ERROR(EINVAL));
417*eda14cbcSMatt Macy 	}
418*eda14cbcSMatt Macy 	if (p == NULL || p == path) {
419*eda14cbcSMatt Macy 		/*
420*eda14cbcSMatt Macy 		 * if the first thing is an @ or /, it had better be an
421*eda14cbcSMatt Macy 		 * @ and it had better not have any more ats or slashes,
422*eda14cbcSMatt Macy 		 * and it had better have something after the @.
423*eda14cbcSMatt Macy 		 */
424*eda14cbcSMatt Macy 		if (p != NULL &&
425*eda14cbcSMatt Macy 		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
426*eda14cbcSMatt Macy 			return (SET_ERROR(EINVAL));
427*eda14cbcSMatt Macy 		if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
428*eda14cbcSMatt Macy 			return (SET_ERROR(ENAMETOOLONG));
429*eda14cbcSMatt Macy 		(void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN);
430*eda14cbcSMatt Macy 		p = NULL;
431*eda14cbcSMatt Macy 	} else if (p[0] == '/') {
432*eda14cbcSMatt Macy 		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
433*eda14cbcSMatt Macy 			return (SET_ERROR(ENAMETOOLONG));
434*eda14cbcSMatt Macy 		(void) strncpy(component, path, p - path);
435*eda14cbcSMatt Macy 		component[p - path] = '\0';
436*eda14cbcSMatt Macy 		p++;
437*eda14cbcSMatt Macy 	} else if (p[0] == '@') {
438*eda14cbcSMatt Macy 		/*
439*eda14cbcSMatt Macy 		 * if the next separator is an @, there better not be
440*eda14cbcSMatt Macy 		 * any more slashes.
441*eda14cbcSMatt Macy 		 */
442*eda14cbcSMatt Macy 		if (strchr(path, '/'))
443*eda14cbcSMatt Macy 			return (SET_ERROR(EINVAL));
444*eda14cbcSMatt Macy 		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
445*eda14cbcSMatt Macy 			return (SET_ERROR(ENAMETOOLONG));
446*eda14cbcSMatt Macy 		(void) strncpy(component, path, p - path);
447*eda14cbcSMatt Macy 		component[p - path] = '\0';
448*eda14cbcSMatt Macy 	} else {
449*eda14cbcSMatt Macy 		panic("invalid p=%p", (void *)p);
450*eda14cbcSMatt Macy 	}
451*eda14cbcSMatt Macy 	*nextp = p;
452*eda14cbcSMatt Macy 	return (0);
453*eda14cbcSMatt Macy }
454*eda14cbcSMatt Macy 
455*eda14cbcSMatt Macy /*
456*eda14cbcSMatt Macy  * Return the dsl_dir_t, and possibly the last component which couldn't
457*eda14cbcSMatt Macy  * be found in *tail.  The name must be in the specified dsl_pool_t.  This
458*eda14cbcSMatt Macy  * thread must hold the dp_config_rwlock for the pool.  Returns NULL if the
459*eda14cbcSMatt Macy  * path is bogus, or if tail==NULL and we couldn't parse the whole name.
460*eda14cbcSMatt Macy  * (*tail)[0] == '@' means that the last component is a snapshot.
461*eda14cbcSMatt Macy  */
462*eda14cbcSMatt Macy int
463*eda14cbcSMatt Macy dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
464*eda14cbcSMatt Macy     dsl_dir_t **ddp, const char **tailp)
465*eda14cbcSMatt Macy {
466*eda14cbcSMatt Macy 	char *buf;
467*eda14cbcSMatt Macy 	const char *spaname, *next, *nextnext = NULL;
468*eda14cbcSMatt Macy 	int err;
469*eda14cbcSMatt Macy 	dsl_dir_t *dd;
470*eda14cbcSMatt Macy 	uint64_t ddobj;
471*eda14cbcSMatt Macy 
472*eda14cbcSMatt Macy 	buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
473*eda14cbcSMatt Macy 	err = getcomponent(name, buf, &next);
474*eda14cbcSMatt Macy 	if (err != 0)
475*eda14cbcSMatt Macy 		goto error;
476*eda14cbcSMatt Macy 
477*eda14cbcSMatt Macy 	/* Make sure the name is in the specified pool. */
478*eda14cbcSMatt Macy 	spaname = spa_name(dp->dp_spa);
479*eda14cbcSMatt Macy 	if (strcmp(buf, spaname) != 0) {
480*eda14cbcSMatt Macy 		err = SET_ERROR(EXDEV);
481*eda14cbcSMatt Macy 		goto error;
482*eda14cbcSMatt Macy 	}
483*eda14cbcSMatt Macy 
484*eda14cbcSMatt Macy 	ASSERT(dsl_pool_config_held(dp));
485*eda14cbcSMatt Macy 
486*eda14cbcSMatt Macy 	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
487*eda14cbcSMatt Macy 	if (err != 0) {
488*eda14cbcSMatt Macy 		goto error;
489*eda14cbcSMatt Macy 	}
490*eda14cbcSMatt Macy 
491*eda14cbcSMatt Macy 	while (next != NULL) {
492*eda14cbcSMatt Macy 		dsl_dir_t *child_dd;
493*eda14cbcSMatt Macy 		err = getcomponent(next, buf, &nextnext);
494*eda14cbcSMatt Macy 		if (err != 0)
495*eda14cbcSMatt Macy 			break;
496*eda14cbcSMatt Macy 		ASSERT(next[0] != '\0');
497*eda14cbcSMatt Macy 		if (next[0] == '@')
498*eda14cbcSMatt Macy 			break;
499*eda14cbcSMatt Macy 		dprintf("looking up %s in obj%lld\n",
500*eda14cbcSMatt Macy 		    buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
501*eda14cbcSMatt Macy 
502*eda14cbcSMatt Macy 		err = zap_lookup(dp->dp_meta_objset,
503*eda14cbcSMatt Macy 		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
504*eda14cbcSMatt Macy 		    buf, sizeof (ddobj), 1, &ddobj);
505*eda14cbcSMatt Macy 		if (err != 0) {
506*eda14cbcSMatt Macy 			if (err == ENOENT)
507*eda14cbcSMatt Macy 				err = 0;
508*eda14cbcSMatt Macy 			break;
509*eda14cbcSMatt Macy 		}
510*eda14cbcSMatt Macy 
511*eda14cbcSMatt Macy 		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
512*eda14cbcSMatt Macy 		if (err != 0)
513*eda14cbcSMatt Macy 			break;
514*eda14cbcSMatt Macy 		dsl_dir_rele(dd, tag);
515*eda14cbcSMatt Macy 		dd = child_dd;
516*eda14cbcSMatt Macy 		next = nextnext;
517*eda14cbcSMatt Macy 	}
518*eda14cbcSMatt Macy 
519*eda14cbcSMatt Macy 	if (err != 0) {
520*eda14cbcSMatt Macy 		dsl_dir_rele(dd, tag);
521*eda14cbcSMatt Macy 		goto error;
522*eda14cbcSMatt Macy 	}
523*eda14cbcSMatt Macy 
524*eda14cbcSMatt Macy 	/*
525*eda14cbcSMatt Macy 	 * It's an error if there's more than one component left, or
526*eda14cbcSMatt Macy 	 * tailp==NULL and there's any component left.
527*eda14cbcSMatt Macy 	 */
528*eda14cbcSMatt Macy 	if (next != NULL &&
529*eda14cbcSMatt Macy 	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
530*eda14cbcSMatt Macy 		/* bad path name */
531*eda14cbcSMatt Macy 		dsl_dir_rele(dd, tag);
532*eda14cbcSMatt Macy 		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
533*eda14cbcSMatt Macy 		err = SET_ERROR(ENOENT);
534*eda14cbcSMatt Macy 	}
535*eda14cbcSMatt Macy 	if (tailp != NULL)
536*eda14cbcSMatt Macy 		*tailp = next;
537*eda14cbcSMatt Macy 	if (err == 0)
538*eda14cbcSMatt Macy 		*ddp = dd;
539*eda14cbcSMatt Macy error:
540*eda14cbcSMatt Macy 	kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN);
541*eda14cbcSMatt Macy 	return (err);
542*eda14cbcSMatt Macy }
543*eda14cbcSMatt Macy 
544*eda14cbcSMatt Macy /*
545*eda14cbcSMatt Macy  * If the counts are already initialized for this filesystem and its
546*eda14cbcSMatt Macy  * descendants then do nothing, otherwise initialize the counts.
547*eda14cbcSMatt Macy  *
548*eda14cbcSMatt Macy  * The counts on this filesystem, and those below, may be uninitialized due to
549*eda14cbcSMatt Macy  * either the use of a pre-existing pool which did not support the
550*eda14cbcSMatt Macy  * filesystem/snapshot limit feature, or one in which the feature had not yet
551*eda14cbcSMatt Macy  * been enabled.
552*eda14cbcSMatt Macy  *
553*eda14cbcSMatt Macy  * Recursively descend the filesystem tree and update the filesystem/snapshot
554*eda14cbcSMatt Macy  * counts on each filesystem below, then update the cumulative count on the
555*eda14cbcSMatt Macy  * current filesystem. If the filesystem already has a count set on it,
556*eda14cbcSMatt Macy  * then we know that its counts, and the counts on the filesystems below it,
557*eda14cbcSMatt Macy  * are already correct, so we don't have to update this filesystem.
558*eda14cbcSMatt Macy  */
559*eda14cbcSMatt Macy static void
560*eda14cbcSMatt Macy dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
561*eda14cbcSMatt Macy {
562*eda14cbcSMatt Macy 	uint64_t my_fs_cnt = 0;
563*eda14cbcSMatt Macy 	uint64_t my_ss_cnt = 0;
564*eda14cbcSMatt Macy 	dsl_pool_t *dp = dd->dd_pool;
565*eda14cbcSMatt Macy 	objset_t *os = dp->dp_meta_objset;
566*eda14cbcSMatt Macy 	zap_cursor_t *zc;
567*eda14cbcSMatt Macy 	zap_attribute_t *za;
568*eda14cbcSMatt Macy 	dsl_dataset_t *ds;
569*eda14cbcSMatt Macy 
570*eda14cbcSMatt Macy 	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
571*eda14cbcSMatt Macy 	ASSERT(dsl_pool_config_held(dp));
572*eda14cbcSMatt Macy 	ASSERT(dmu_tx_is_syncing(tx));
573*eda14cbcSMatt Macy 
574*eda14cbcSMatt Macy 	dsl_dir_zapify(dd, tx);
575*eda14cbcSMatt Macy 
576*eda14cbcSMatt Macy 	/*
577*eda14cbcSMatt Macy 	 * If the filesystem count has already been initialized then we
578*eda14cbcSMatt Macy 	 * don't need to recurse down any further.
579*eda14cbcSMatt Macy 	 */
580*eda14cbcSMatt Macy 	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
581*eda14cbcSMatt Macy 		return;
582*eda14cbcSMatt Macy 
583*eda14cbcSMatt Macy 	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
584*eda14cbcSMatt Macy 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
585*eda14cbcSMatt Macy 
586*eda14cbcSMatt Macy 	/* Iterate my child dirs */
587*eda14cbcSMatt Macy 	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
588*eda14cbcSMatt Macy 	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
589*eda14cbcSMatt Macy 		dsl_dir_t *chld_dd;
590*eda14cbcSMatt Macy 		uint64_t count;
591*eda14cbcSMatt Macy 
592*eda14cbcSMatt Macy 		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
593*eda14cbcSMatt Macy 		    &chld_dd));
594*eda14cbcSMatt Macy 
595*eda14cbcSMatt Macy 		/*
596*eda14cbcSMatt Macy 		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
597*eda14cbcSMatt Macy 		 * temporary datasets.
598*eda14cbcSMatt Macy 		 */
599*eda14cbcSMatt Macy 		if (chld_dd->dd_myname[0] == '$' ||
600*eda14cbcSMatt Macy 		    chld_dd->dd_myname[0] == '%') {
601*eda14cbcSMatt Macy 			dsl_dir_rele(chld_dd, FTAG);
602*eda14cbcSMatt Macy 			continue;
603*eda14cbcSMatt Macy 		}
604*eda14cbcSMatt Macy 
605*eda14cbcSMatt Macy 		my_fs_cnt++;	/* count this child */
606*eda14cbcSMatt Macy 
607*eda14cbcSMatt Macy 		dsl_dir_init_fs_ss_count(chld_dd, tx);
608*eda14cbcSMatt Macy 
609*eda14cbcSMatt Macy 		VERIFY0(zap_lookup(os, chld_dd->dd_object,
610*eda14cbcSMatt Macy 		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
611*eda14cbcSMatt Macy 		my_fs_cnt += count;
612*eda14cbcSMatt Macy 		VERIFY0(zap_lookup(os, chld_dd->dd_object,
613*eda14cbcSMatt Macy 		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
614*eda14cbcSMatt Macy 		my_ss_cnt += count;
615*eda14cbcSMatt Macy 
616*eda14cbcSMatt Macy 		dsl_dir_rele(chld_dd, FTAG);
617*eda14cbcSMatt Macy 	}
618*eda14cbcSMatt Macy 	zap_cursor_fini(zc);
619*eda14cbcSMatt Macy 	/* Count my snapshots (we counted children's snapshots above) */
620*eda14cbcSMatt Macy 	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
621*eda14cbcSMatt Macy 	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
622*eda14cbcSMatt Macy 
623*eda14cbcSMatt Macy 	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
624*eda14cbcSMatt Macy 	    zap_cursor_retrieve(zc, za) == 0;
625*eda14cbcSMatt Macy 	    zap_cursor_advance(zc)) {
626*eda14cbcSMatt Macy 		/* Don't count temporary snapshots */
627*eda14cbcSMatt Macy 		if (za->za_name[0] != '%')
628*eda14cbcSMatt Macy 			my_ss_cnt++;
629*eda14cbcSMatt Macy 	}
630*eda14cbcSMatt Macy 	zap_cursor_fini(zc);
631*eda14cbcSMatt Macy 
632*eda14cbcSMatt Macy 	dsl_dataset_rele(ds, FTAG);
633*eda14cbcSMatt Macy 
634*eda14cbcSMatt Macy 	kmem_free(zc, sizeof (zap_cursor_t));
635*eda14cbcSMatt Macy 	kmem_free(za, sizeof (zap_attribute_t));
636*eda14cbcSMatt Macy 
637*eda14cbcSMatt Macy 	/* we're in a sync task, update counts */
638*eda14cbcSMatt Macy 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
639*eda14cbcSMatt Macy 	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
640*eda14cbcSMatt Macy 	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
641*eda14cbcSMatt Macy 	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
642*eda14cbcSMatt Macy 	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
643*eda14cbcSMatt Macy }
644*eda14cbcSMatt Macy 
645*eda14cbcSMatt Macy static int
646*eda14cbcSMatt Macy dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
647*eda14cbcSMatt Macy {
648*eda14cbcSMatt Macy 	char *ddname = (char *)arg;
649*eda14cbcSMatt Macy 	dsl_pool_t *dp = dmu_tx_pool(tx);
650*eda14cbcSMatt Macy 	dsl_dataset_t *ds;
651*eda14cbcSMatt Macy 	dsl_dir_t *dd;
652*eda14cbcSMatt Macy 	int error;
653*eda14cbcSMatt Macy 
654*eda14cbcSMatt Macy 	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
655*eda14cbcSMatt Macy 	if (error != 0)
656*eda14cbcSMatt Macy 		return (error);
657*eda14cbcSMatt Macy 
658*eda14cbcSMatt Macy 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
659*eda14cbcSMatt Macy 		dsl_dataset_rele(ds, FTAG);
660*eda14cbcSMatt Macy 		return (SET_ERROR(ENOTSUP));
661*eda14cbcSMatt Macy 	}
662*eda14cbcSMatt Macy 
663*eda14cbcSMatt Macy 	dd = ds->ds_dir;
664*eda14cbcSMatt Macy 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
665*eda14cbcSMatt Macy 	    dsl_dir_is_zapified(dd) &&
666*eda14cbcSMatt Macy 	    zap_contains(dp->dp_meta_objset, dd->dd_object,
667*eda14cbcSMatt Macy 	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
668*eda14cbcSMatt Macy 		dsl_dataset_rele(ds, FTAG);
669*eda14cbcSMatt Macy 		return (SET_ERROR(EALREADY));
670*eda14cbcSMatt Macy 	}
671*eda14cbcSMatt Macy 
672*eda14cbcSMatt Macy 	dsl_dataset_rele(ds, FTAG);
673*eda14cbcSMatt Macy 	return (0);
674*eda14cbcSMatt Macy }
675*eda14cbcSMatt Macy 
676*eda14cbcSMatt Macy static void
677*eda14cbcSMatt Macy dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
678*eda14cbcSMatt Macy {
679*eda14cbcSMatt Macy 	char *ddname = (char *)arg;
680*eda14cbcSMatt Macy 	dsl_pool_t *dp = dmu_tx_pool(tx);
681*eda14cbcSMatt Macy 	dsl_dataset_t *ds;
682*eda14cbcSMatt Macy 	spa_t *spa;
683*eda14cbcSMatt Macy 
684*eda14cbcSMatt Macy 	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
685*eda14cbcSMatt Macy 
686*eda14cbcSMatt Macy 	spa = dsl_dataset_get_spa(ds);
687*eda14cbcSMatt Macy 
688*eda14cbcSMatt Macy 	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
689*eda14cbcSMatt Macy 		/*
690*eda14cbcSMatt Macy 		 * Since the feature was not active and we're now setting a
691*eda14cbcSMatt Macy 		 * limit, increment the feature-active counter so that the
692*eda14cbcSMatt Macy 		 * feature becomes active for the first time.
693*eda14cbcSMatt Macy 		 *
694*eda14cbcSMatt Macy 		 * We are already in a sync task so we can update the MOS.
695*eda14cbcSMatt Macy 		 */
696*eda14cbcSMatt Macy 		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
697*eda14cbcSMatt Macy 	}
698*eda14cbcSMatt Macy 
699*eda14cbcSMatt Macy 	/*
700*eda14cbcSMatt Macy 	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
701*eda14cbcSMatt Macy 	 * we need to ensure the counts are correct. Descend down the tree from
702*eda14cbcSMatt Macy 	 * this point and update all of the counts to be accurate.
703*eda14cbcSMatt Macy 	 */
704*eda14cbcSMatt Macy 	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
705*eda14cbcSMatt Macy 
706*eda14cbcSMatt Macy 	dsl_dataset_rele(ds, FTAG);
707*eda14cbcSMatt Macy }
708*eda14cbcSMatt Macy 
709*eda14cbcSMatt Macy /*
710*eda14cbcSMatt Macy  * Make sure the feature is enabled and activate it if necessary.
711*eda14cbcSMatt Macy  * Since we're setting a limit, ensure the on-disk counts are valid.
712*eda14cbcSMatt Macy  * This is only called by the ioctl path when setting a limit value.
713*eda14cbcSMatt Macy  *
714*eda14cbcSMatt Macy  * We do not need to validate the new limit, since users who can change the
715*eda14cbcSMatt Macy  * limit are also allowed to exceed the limit.
716*eda14cbcSMatt Macy  */
717*eda14cbcSMatt Macy int
718*eda14cbcSMatt Macy dsl_dir_activate_fs_ss_limit(const char *ddname)
719*eda14cbcSMatt Macy {
720*eda14cbcSMatt Macy 	int error;
721*eda14cbcSMatt Macy 
722*eda14cbcSMatt Macy 	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
723*eda14cbcSMatt Macy 	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
724*eda14cbcSMatt Macy 	    ZFS_SPACE_CHECK_RESERVED);
725*eda14cbcSMatt Macy 
726*eda14cbcSMatt Macy 	if (error == EALREADY)
727*eda14cbcSMatt Macy 		error = 0;
728*eda14cbcSMatt Macy 
729*eda14cbcSMatt Macy 	return (error);
730*eda14cbcSMatt Macy }
731*eda14cbcSMatt Macy 
732*eda14cbcSMatt Macy /*
733*eda14cbcSMatt Macy  * Used to determine if the filesystem_limit or snapshot_limit should be
734*eda14cbcSMatt Macy  * enforced. We allow the limit to be exceeded if the user has permission to
735*eda14cbcSMatt Macy  * write the property value. We pass in the creds that we got in the open
736*eda14cbcSMatt Macy  * context since we will always be the GZ root in syncing context. We also have
737*eda14cbcSMatt Macy  * to handle the case where we are allowed to change the limit on the current
738*eda14cbcSMatt Macy  * dataset, but there may be another limit in the tree above.
739*eda14cbcSMatt Macy  *
740*eda14cbcSMatt Macy  * We can never modify these two properties within a non-global zone. In
741*eda14cbcSMatt Macy  * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
742*eda14cbcSMatt Macy  * can't use that function since we are already holding the dp_config_rwlock.
743*eda14cbcSMatt Macy  * In addition, we already have the dd and dealing with snapshots is simplified
744*eda14cbcSMatt Macy  * in this code.
745*eda14cbcSMatt Macy  */
746*eda14cbcSMatt Macy 
typedef enum {
	/* The limit on the dir being examined is compared against. */
	ENFORCE_ALWAYS,
	/* dsl_fs_ss_limit_check() returns success without any comparison. */
	ENFORCE_NEVER,
	/*
	 * The limit on the dir being examined is not compared against, but
	 * dsl_fs_ss_limit_check() still continues up to the parent dir.
	 */
	ENFORCE_ABOVE
} enforce_res_t;
752*eda14cbcSMatt Macy 
753*eda14cbcSMatt Macy static enforce_res_t
754*eda14cbcSMatt Macy dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop,
755*eda14cbcSMatt Macy     cred_t *cr, proc_t *proc)
756*eda14cbcSMatt Macy {
757*eda14cbcSMatt Macy 	enforce_res_t enforce = ENFORCE_ALWAYS;
758*eda14cbcSMatt Macy 	uint64_t obj;
759*eda14cbcSMatt Macy 	dsl_dataset_t *ds;
760*eda14cbcSMatt Macy 	uint64_t zoned;
761*eda14cbcSMatt Macy 	const char *zonedstr;
762*eda14cbcSMatt Macy 
763*eda14cbcSMatt Macy 	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
764*eda14cbcSMatt Macy 	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
765*eda14cbcSMatt Macy 
766*eda14cbcSMatt Macy #ifdef _KERNEL
767*eda14cbcSMatt Macy 	if (crgetzoneid(cr) != GLOBAL_ZONEID)
768*eda14cbcSMatt Macy 		return (ENFORCE_ALWAYS);
769*eda14cbcSMatt Macy 
770*eda14cbcSMatt Macy 	/*
771*eda14cbcSMatt Macy 	 * We are checking the saved credentials of the user process, which is
772*eda14cbcSMatt Macy 	 * not the current process.  Note that we can't use secpolicy_zfs(),
773*eda14cbcSMatt Macy 	 * because it only works if the cred is that of the current process (on
774*eda14cbcSMatt Macy 	 * Linux).
775*eda14cbcSMatt Macy 	 */
776*eda14cbcSMatt Macy 	if (secpolicy_zfs_proc(cr, proc) == 0)
777*eda14cbcSMatt Macy 		return (ENFORCE_NEVER);
778*eda14cbcSMatt Macy #endif
779*eda14cbcSMatt Macy 
780*eda14cbcSMatt Macy 	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
781*eda14cbcSMatt Macy 		return (ENFORCE_ALWAYS);
782*eda14cbcSMatt Macy 
783*eda14cbcSMatt Macy 	ASSERT(dsl_pool_config_held(dd->dd_pool));
784*eda14cbcSMatt Macy 
785*eda14cbcSMatt Macy 	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
786*eda14cbcSMatt Macy 		return (ENFORCE_ALWAYS);
787*eda14cbcSMatt Macy 
788*eda14cbcSMatt Macy 	zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED);
789*eda14cbcSMatt Macy 	if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) {
790*eda14cbcSMatt Macy 		/* Only root can access zoned fs's from the GZ */
791*eda14cbcSMatt Macy 		enforce = ENFORCE_ALWAYS;
792*eda14cbcSMatt Macy 	} else {
793*eda14cbcSMatt Macy 		if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
794*eda14cbcSMatt Macy 			enforce = ENFORCE_ABOVE;
795*eda14cbcSMatt Macy 	}
796*eda14cbcSMatt Macy 
797*eda14cbcSMatt Macy 	dsl_dataset_rele(ds, FTAG);
798*eda14cbcSMatt Macy 	return (enforce);
799*eda14cbcSMatt Macy }
800*eda14cbcSMatt Macy 
801*eda14cbcSMatt Macy /*
802*eda14cbcSMatt Macy  * Check if adding additional child filesystem(s) would exceed any filesystem
803*eda14cbcSMatt Macy  * limits or adding additional snapshot(s) would exceed any snapshot limits.
804*eda14cbcSMatt Macy  * The prop argument indicates which limit to check.
805*eda14cbcSMatt Macy  *
806*eda14cbcSMatt Macy  * Note that all filesystem limits up to the root (or the highest
807*eda14cbcSMatt Macy  * initialized) filesystem or the given ancestor must be satisfied.
808*eda14cbcSMatt Macy  */
809*eda14cbcSMatt Macy int
810*eda14cbcSMatt Macy dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
811*eda14cbcSMatt Macy     dsl_dir_t *ancestor, cred_t *cr, proc_t *proc)
812*eda14cbcSMatt Macy {
813*eda14cbcSMatt Macy 	objset_t *os = dd->dd_pool->dp_meta_objset;
814*eda14cbcSMatt Macy 	uint64_t limit, count;
815*eda14cbcSMatt Macy 	char *count_prop;
816*eda14cbcSMatt Macy 	enforce_res_t enforce;
817*eda14cbcSMatt Macy 	int err = 0;
818*eda14cbcSMatt Macy 
819*eda14cbcSMatt Macy 	ASSERT(dsl_pool_config_held(dd->dd_pool));
820*eda14cbcSMatt Macy 	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
821*eda14cbcSMatt Macy 	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
822*eda14cbcSMatt Macy 
823*eda14cbcSMatt Macy 	/*
824*eda14cbcSMatt Macy 	 * If we're allowed to change the limit, don't enforce the limit
825*eda14cbcSMatt Macy 	 * e.g. this can happen if a snapshot is taken by an administrative
826*eda14cbcSMatt Macy 	 * user in the global zone (i.e. a recursive snapshot by root).
827*eda14cbcSMatt Macy 	 * However, we must handle the case of delegated permissions where we
828*eda14cbcSMatt Macy 	 * are allowed to change the limit on the current dataset, but there
829*eda14cbcSMatt Macy 	 * is another limit in the tree above.
830*eda14cbcSMatt Macy 	 */
831*eda14cbcSMatt Macy 	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc);
832*eda14cbcSMatt Macy 	if (enforce == ENFORCE_NEVER)
833*eda14cbcSMatt Macy 		return (0);
834*eda14cbcSMatt Macy 
835*eda14cbcSMatt Macy 	/*
836*eda14cbcSMatt Macy 	 * e.g. if renaming a dataset with no snapshots, count adjustment
837*eda14cbcSMatt Macy 	 * is 0.
838*eda14cbcSMatt Macy 	 */
839*eda14cbcSMatt Macy 	if (delta == 0)
840*eda14cbcSMatt Macy 		return (0);
841*eda14cbcSMatt Macy 
842*eda14cbcSMatt Macy 	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
843*eda14cbcSMatt Macy 		/*
844*eda14cbcSMatt Macy 		 * We don't enforce the limit for temporary snapshots. This is
845*eda14cbcSMatt Macy 		 * indicated by a NULL cred_t argument.
846*eda14cbcSMatt Macy 		 */
847*eda14cbcSMatt Macy 		if (cr == NULL)
848*eda14cbcSMatt Macy 			return (0);
849*eda14cbcSMatt Macy 
850*eda14cbcSMatt Macy 		count_prop = DD_FIELD_SNAPSHOT_COUNT;
851*eda14cbcSMatt Macy 	} else {
852*eda14cbcSMatt Macy 		count_prop = DD_FIELD_FILESYSTEM_COUNT;
853*eda14cbcSMatt Macy 	}
854*eda14cbcSMatt Macy 
855*eda14cbcSMatt Macy 	/*
856*eda14cbcSMatt Macy 	 * If an ancestor has been provided, stop checking the limit once we
857*eda14cbcSMatt Macy 	 * hit that dir. We need this during rename so that we don't overcount
858*eda14cbcSMatt Macy 	 * the check once we recurse up to the common ancestor.
859*eda14cbcSMatt Macy 	 */
860*eda14cbcSMatt Macy 	if (ancestor == dd)
861*eda14cbcSMatt Macy 		return (0);
862*eda14cbcSMatt Macy 
863*eda14cbcSMatt Macy 	/*
864*eda14cbcSMatt Macy 	 * If we hit an uninitialized node while recursing up the tree, we can
865*eda14cbcSMatt Macy 	 * stop since we know there is no limit here (or above). The counts are
866*eda14cbcSMatt Macy 	 * not valid on this node and we know we won't touch this node's counts.
867*eda14cbcSMatt Macy 	 */
868*eda14cbcSMatt Macy 	if (!dsl_dir_is_zapified(dd))
869*eda14cbcSMatt Macy 		return (0);
870*eda14cbcSMatt Macy 	err = zap_lookup(os, dd->dd_object,
871*eda14cbcSMatt Macy 	    count_prop, sizeof (count), 1, &count);
872*eda14cbcSMatt Macy 	if (err == ENOENT)
873*eda14cbcSMatt Macy 		return (0);
874*eda14cbcSMatt Macy 	if (err != 0)
875*eda14cbcSMatt Macy 		return (err);
876*eda14cbcSMatt Macy 
877*eda14cbcSMatt Macy 	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
878*eda14cbcSMatt Macy 	    B_FALSE);
879*eda14cbcSMatt Macy 	if (err != 0)
880*eda14cbcSMatt Macy 		return (err);
881*eda14cbcSMatt Macy 
882*eda14cbcSMatt Macy 	/* Is there a limit which we've hit? */
883*eda14cbcSMatt Macy 	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
884*eda14cbcSMatt Macy 		return (SET_ERROR(EDQUOT));
885*eda14cbcSMatt Macy 
886*eda14cbcSMatt Macy 	if (dd->dd_parent != NULL)
887*eda14cbcSMatt Macy 		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
888*eda14cbcSMatt Macy 		    ancestor, cr, proc);
889*eda14cbcSMatt Macy 
890*eda14cbcSMatt Macy 	return (err);
891*eda14cbcSMatt Macy }
892*eda14cbcSMatt Macy 
893*eda14cbcSMatt Macy /*
894*eda14cbcSMatt Macy  * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
895*eda14cbcSMatt Macy  * parents. When a new filesystem/snapshot is created, increment the count on
896*eda14cbcSMatt Macy  * all parents, and when a filesystem/snapshot is destroyed, decrement the
897*eda14cbcSMatt Macy  * count.
898*eda14cbcSMatt Macy  */
899*eda14cbcSMatt Macy void
900*eda14cbcSMatt Macy dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
901*eda14cbcSMatt Macy     dmu_tx_t *tx)
902*eda14cbcSMatt Macy {
903*eda14cbcSMatt Macy 	int err;
904*eda14cbcSMatt Macy 	objset_t *os = dd->dd_pool->dp_meta_objset;
905*eda14cbcSMatt Macy 	uint64_t count;
906*eda14cbcSMatt Macy 
907*eda14cbcSMatt Macy 	ASSERT(dsl_pool_config_held(dd->dd_pool));
908*eda14cbcSMatt Macy 	ASSERT(dmu_tx_is_syncing(tx));
909*eda14cbcSMatt Macy 	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
910*eda14cbcSMatt Macy 	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
911*eda14cbcSMatt Macy 
912*eda14cbcSMatt Macy 	/*
913*eda14cbcSMatt Macy 	 * When we receive an incremental stream into a filesystem that already
914*eda14cbcSMatt Macy 	 * exists, a temporary clone is created.  We don't count this temporary
915*eda14cbcSMatt Macy 	 * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
916*eda14cbcSMatt Macy 	 * $MOS & $ORIGIN) objsets.
917*eda14cbcSMatt Macy 	 */
918*eda14cbcSMatt Macy 	if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
919*eda14cbcSMatt Macy 	    strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
920*eda14cbcSMatt Macy 		return;
921*eda14cbcSMatt Macy 
922*eda14cbcSMatt Macy 	/*
923*eda14cbcSMatt Macy 	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
924*eda14cbcSMatt Macy 	 */
925*eda14cbcSMatt Macy 	if (delta == 0)
926*eda14cbcSMatt Macy 		return;
927*eda14cbcSMatt Macy 
928*eda14cbcSMatt Macy 	/*
929*eda14cbcSMatt Macy 	 * If we hit an uninitialized node while recursing up the tree, we can
930*eda14cbcSMatt Macy 	 * stop since we know the counts are not valid on this node and we
931*eda14cbcSMatt Macy 	 * know we shouldn't touch this node's counts. An uninitialized count
932*eda14cbcSMatt Macy 	 * on the node indicates that either the feature has not yet been
933*eda14cbcSMatt Macy 	 * activated or there are no limits on this part of the tree.
934*eda14cbcSMatt Macy 	 */
935*eda14cbcSMatt Macy 	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
936*eda14cbcSMatt Macy 	    prop, sizeof (count), 1, &count)) == ENOENT)
937*eda14cbcSMatt Macy 		return;
938*eda14cbcSMatt Macy 	VERIFY0(err);
939*eda14cbcSMatt Macy 
940*eda14cbcSMatt Macy 	count += delta;
941*eda14cbcSMatt Macy 	/* Use a signed verify to make sure we're not neg. */
942*eda14cbcSMatt Macy 	VERIFY3S(count, >=, 0);
943*eda14cbcSMatt Macy 
944*eda14cbcSMatt Macy 	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
945*eda14cbcSMatt Macy 	    tx));
946*eda14cbcSMatt Macy 
947*eda14cbcSMatt Macy 	/* Roll up this additional count into our ancestors */
948*eda14cbcSMatt Macy 	if (dd->dd_parent != NULL)
949*eda14cbcSMatt Macy 		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
950*eda14cbcSMatt Macy }
951*eda14cbcSMatt Macy 
952*eda14cbcSMatt Macy uint64_t
953*eda14cbcSMatt Macy dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
954*eda14cbcSMatt Macy     dmu_tx_t *tx)
955*eda14cbcSMatt Macy {
956*eda14cbcSMatt Macy 	objset_t *mos = dp->dp_meta_objset;
957*eda14cbcSMatt Macy 	uint64_t ddobj;
958*eda14cbcSMatt Macy 	dsl_dir_phys_t *ddphys;
959*eda14cbcSMatt Macy 	dmu_buf_t *dbuf;
960*eda14cbcSMatt Macy 
961*eda14cbcSMatt Macy 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
962*eda14cbcSMatt Macy 	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
963*eda14cbcSMatt Macy 	if (pds) {
964*eda14cbcSMatt Macy 		VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
965*eda14cbcSMatt Macy 		    name, sizeof (uint64_t), 1, &ddobj, tx));
966*eda14cbcSMatt Macy 	} else {
967*eda14cbcSMatt Macy 		/* it's the root dir */
968*eda14cbcSMatt Macy 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
969*eda14cbcSMatt Macy 		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
970*eda14cbcSMatt Macy 	}
971*eda14cbcSMatt Macy 	VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
972*eda14cbcSMatt Macy 	dmu_buf_will_dirty(dbuf, tx);
973*eda14cbcSMatt Macy 	ddphys = dbuf->db_data;
974*eda14cbcSMatt Macy 
975*eda14cbcSMatt Macy 	ddphys->dd_creation_time = gethrestime_sec();
976*eda14cbcSMatt Macy 	if (pds) {
977*eda14cbcSMatt Macy 		ddphys->dd_parent_obj = pds->dd_object;
978*eda14cbcSMatt Macy 
979*eda14cbcSMatt Macy 		/* update the filesystem counts */
980*eda14cbcSMatt Macy 		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
981*eda14cbcSMatt Macy 	}
982*eda14cbcSMatt Macy 	ddphys->dd_props_zapobj = zap_create(mos,
983*eda14cbcSMatt Macy 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
984*eda14cbcSMatt Macy 	ddphys->dd_child_dir_zapobj = zap_create(mos,
985*eda14cbcSMatt Macy 	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
986*eda14cbcSMatt Macy 	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
987*eda14cbcSMatt Macy 		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
988*eda14cbcSMatt Macy 
989*eda14cbcSMatt Macy 	dmu_buf_rele(dbuf, FTAG);
990*eda14cbcSMatt Macy 
991*eda14cbcSMatt Macy 	return (ddobj);
992*eda14cbcSMatt Macy }
993*eda14cbcSMatt Macy 
994*eda14cbcSMatt Macy boolean_t
995*eda14cbcSMatt Macy dsl_dir_is_clone(dsl_dir_t *dd)
996*eda14cbcSMatt Macy {
997*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_origin_obj &&
998*eda14cbcSMatt Macy 	    (dd->dd_pool->dp_origin_snap == NULL ||
999*eda14cbcSMatt Macy 	    dsl_dir_phys(dd)->dd_origin_obj !=
1000*eda14cbcSMatt Macy 	    dd->dd_pool->dp_origin_snap->ds_object));
1001*eda14cbcSMatt Macy }
1002*eda14cbcSMatt Macy 
1003*eda14cbcSMatt Macy uint64_t
1004*eda14cbcSMatt Macy dsl_dir_get_used(dsl_dir_t *dd)
1005*eda14cbcSMatt Macy {
1006*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_used_bytes);
1007*eda14cbcSMatt Macy }
1008*eda14cbcSMatt Macy 
1009*eda14cbcSMatt Macy uint64_t
1010*eda14cbcSMatt Macy dsl_dir_get_compressed(dsl_dir_t *dd)
1011*eda14cbcSMatt Macy {
1012*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_compressed_bytes);
1013*eda14cbcSMatt Macy }
1014*eda14cbcSMatt Macy 
1015*eda14cbcSMatt Macy uint64_t
1016*eda14cbcSMatt Macy dsl_dir_get_quota(dsl_dir_t *dd)
1017*eda14cbcSMatt Macy {
1018*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_quota);
1019*eda14cbcSMatt Macy }
1020*eda14cbcSMatt Macy 
1021*eda14cbcSMatt Macy uint64_t
1022*eda14cbcSMatt Macy dsl_dir_get_reservation(dsl_dir_t *dd)
1023*eda14cbcSMatt Macy {
1024*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_reserved);
1025*eda14cbcSMatt Macy }
1026*eda14cbcSMatt Macy 
1027*eda14cbcSMatt Macy uint64_t
1028*eda14cbcSMatt Macy dsl_dir_get_compressratio(dsl_dir_t *dd)
1029*eda14cbcSMatt Macy {
1030*eda14cbcSMatt Macy 	/* a fixed point number, 100x the ratio */
1031*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
1032*eda14cbcSMatt Macy 	    (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
1033*eda14cbcSMatt Macy 	    dsl_dir_phys(dd)->dd_compressed_bytes));
1034*eda14cbcSMatt Macy }
1035*eda14cbcSMatt Macy 
1036*eda14cbcSMatt Macy uint64_t
1037*eda14cbcSMatt Macy dsl_dir_get_logicalused(dsl_dir_t *dd)
1038*eda14cbcSMatt Macy {
1039*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
1040*eda14cbcSMatt Macy }
1041*eda14cbcSMatt Macy 
1042*eda14cbcSMatt Macy uint64_t
1043*eda14cbcSMatt Macy dsl_dir_get_usedsnap(dsl_dir_t *dd)
1044*eda14cbcSMatt Macy {
1045*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
1046*eda14cbcSMatt Macy }
1047*eda14cbcSMatt Macy 
1048*eda14cbcSMatt Macy uint64_t
1049*eda14cbcSMatt Macy dsl_dir_get_usedds(dsl_dir_t *dd)
1050*eda14cbcSMatt Macy {
1051*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
1052*eda14cbcSMatt Macy }
1053*eda14cbcSMatt Macy 
1054*eda14cbcSMatt Macy uint64_t
1055*eda14cbcSMatt Macy dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
1056*eda14cbcSMatt Macy {
1057*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
1058*eda14cbcSMatt Macy }
1059*eda14cbcSMatt Macy 
1060*eda14cbcSMatt Macy uint64_t
1061*eda14cbcSMatt Macy dsl_dir_get_usedchild(dsl_dir_t *dd)
1062*eda14cbcSMatt Macy {
1063*eda14cbcSMatt Macy 	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
1064*eda14cbcSMatt Macy 	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
1065*eda14cbcSMatt Macy }
1066*eda14cbcSMatt Macy 
1067*eda14cbcSMatt Macy void
1068*eda14cbcSMatt Macy dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
1069*eda14cbcSMatt Macy {
1070*eda14cbcSMatt Macy 	dsl_dataset_t *ds;
1071*eda14cbcSMatt Macy 	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
1072*eda14cbcSMatt Macy 	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
1073*eda14cbcSMatt Macy 
1074*eda14cbcSMatt Macy 	dsl_dataset_name(ds, buf);
1075*eda14cbcSMatt Macy 
1076*eda14cbcSMatt Macy 	dsl_dataset_rele(ds, FTAG);
1077*eda14cbcSMatt Macy }
1078*eda14cbcSMatt Macy 
1079*eda14cbcSMatt Macy int
1080*eda14cbcSMatt Macy dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
1081*eda14cbcSMatt Macy {
1082*eda14cbcSMatt Macy 	if (dsl_dir_is_zapified(dd)) {
1083*eda14cbcSMatt Macy 		objset_t *os = dd->dd_pool->dp_meta_objset;
1084*eda14cbcSMatt Macy 		return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
1085*eda14cbcSMatt Macy 		    sizeof (*count), 1, count));
1086*eda14cbcSMatt Macy 	} else {
1087*eda14cbcSMatt Macy 		return (SET_ERROR(ENOENT));
1088*eda14cbcSMatt Macy 	}
1089*eda14cbcSMatt Macy }
1090*eda14cbcSMatt Macy 
1091*eda14cbcSMatt Macy int
1092*eda14cbcSMatt Macy dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
1093*eda14cbcSMatt Macy {
1094*eda14cbcSMatt Macy 	if (dsl_dir_is_zapified(dd)) {
1095*eda14cbcSMatt Macy 		objset_t *os = dd->dd_pool->dp_meta_objset;
1096*eda14cbcSMatt Macy 		return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
1097*eda14cbcSMatt Macy 		    sizeof (*count), 1, count));
1098*eda14cbcSMatt Macy 	} else {
1099*eda14cbcSMatt Macy 		return (SET_ERROR(ENOENT));
1100*eda14cbcSMatt Macy 	}
1101*eda14cbcSMatt Macy }
1102*eda14cbcSMatt Macy 
1103*eda14cbcSMatt Macy void
1104*eda14cbcSMatt Macy dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
1105*eda14cbcSMatt Macy {
1106*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_lock);
1107*eda14cbcSMatt Macy 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
1108*eda14cbcSMatt Macy 	    dsl_dir_get_quota(dd));
1109*eda14cbcSMatt Macy 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
1110*eda14cbcSMatt Macy 	    dsl_dir_get_reservation(dd));
1111*eda14cbcSMatt Macy 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
1112*eda14cbcSMatt Macy 	    dsl_dir_get_logicalused(dd));
1113*eda14cbcSMatt Macy 	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
1114*eda14cbcSMatt Macy 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
1115*eda14cbcSMatt Macy 		    dsl_dir_get_usedsnap(dd));
1116*eda14cbcSMatt Macy 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
1117*eda14cbcSMatt Macy 		    dsl_dir_get_usedds(dd));
1118*eda14cbcSMatt Macy 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
1119*eda14cbcSMatt Macy 		    dsl_dir_get_usedrefreserv(dd));
1120*eda14cbcSMatt Macy 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
1121*eda14cbcSMatt Macy 		    dsl_dir_get_usedchild(dd));
1122*eda14cbcSMatt Macy 	}
1123*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_lock);
1124*eda14cbcSMatt Macy 
1125*eda14cbcSMatt Macy 	uint64_t count;
1126*eda14cbcSMatt Macy 	if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
1127*eda14cbcSMatt Macy 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
1128*eda14cbcSMatt Macy 		    count);
1129*eda14cbcSMatt Macy 	}
1130*eda14cbcSMatt Macy 	if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
1131*eda14cbcSMatt Macy 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
1132*eda14cbcSMatt Macy 		    count);
1133*eda14cbcSMatt Macy 	}
1134*eda14cbcSMatt Macy 
1135*eda14cbcSMatt Macy 	if (dsl_dir_is_clone(dd)) {
1136*eda14cbcSMatt Macy 		char buf[ZFS_MAX_DATASET_NAME_LEN];
1137*eda14cbcSMatt Macy 		dsl_dir_get_origin(dd, buf);
1138*eda14cbcSMatt Macy 		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
1139*eda14cbcSMatt Macy 	}
1140*eda14cbcSMatt Macy 
1141*eda14cbcSMatt Macy }
1142*eda14cbcSMatt Macy 
1143*eda14cbcSMatt Macy void
1144*eda14cbcSMatt Macy dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
1145*eda14cbcSMatt Macy {
1146*eda14cbcSMatt Macy 	dsl_pool_t *dp = dd->dd_pool;
1147*eda14cbcSMatt Macy 
1148*eda14cbcSMatt Macy 	ASSERT(dsl_dir_phys(dd));
1149*eda14cbcSMatt Macy 
1150*eda14cbcSMatt Macy 	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
1151*eda14cbcSMatt Macy 		/* up the hold count until we can be written out */
1152*eda14cbcSMatt Macy 		dmu_buf_add_ref(dd->dd_dbuf, dd);
1153*eda14cbcSMatt Macy 	}
1154*eda14cbcSMatt Macy }
1155*eda14cbcSMatt Macy 
1156*eda14cbcSMatt Macy static int64_t
1157*eda14cbcSMatt Macy parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
1158*eda14cbcSMatt Macy {
1159*eda14cbcSMatt Macy 	uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
1160*eda14cbcSMatt Macy 	uint64_t new_accounted =
1161*eda14cbcSMatt Macy 	    MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
1162*eda14cbcSMatt Macy 	return (new_accounted - old_accounted);
1163*eda14cbcSMatt Macy }
1164*eda14cbcSMatt Macy 
1165*eda14cbcSMatt Macy void
1166*eda14cbcSMatt Macy dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
1167*eda14cbcSMatt Macy {
1168*eda14cbcSMatt Macy 	ASSERT(dmu_tx_is_syncing(tx));
1169*eda14cbcSMatt Macy 
1170*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_lock);
1171*eda14cbcSMatt Macy 	ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]);
1172*eda14cbcSMatt Macy 	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
1173*eda14cbcSMatt Macy 	    dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024);
1174*eda14cbcSMatt Macy 	dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0;
1175*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_lock);
1176*eda14cbcSMatt Macy 
1177*eda14cbcSMatt Macy 	/* release the hold from dsl_dir_dirty */
1178*eda14cbcSMatt Macy 	dmu_buf_rele(dd->dd_dbuf, dd);
1179*eda14cbcSMatt Macy }
1180*eda14cbcSMatt Macy 
1181*eda14cbcSMatt Macy static uint64_t
1182*eda14cbcSMatt Macy dsl_dir_space_towrite(dsl_dir_t *dd)
1183*eda14cbcSMatt Macy {
1184*eda14cbcSMatt Macy 	uint64_t space = 0;
1185*eda14cbcSMatt Macy 
1186*eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&dd->dd_lock));
1187*eda14cbcSMatt Macy 
1188*eda14cbcSMatt Macy 	for (int i = 0; i < TXG_SIZE; i++) {
1189*eda14cbcSMatt Macy 		space += dd->dd_space_towrite[i & TXG_MASK];
1190*eda14cbcSMatt Macy 		ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
1191*eda14cbcSMatt Macy 	}
1192*eda14cbcSMatt Macy 	return (space);
1193*eda14cbcSMatt Macy }
1194*eda14cbcSMatt Macy 
1195*eda14cbcSMatt Macy /*
1196*eda14cbcSMatt Macy  * How much space would dd have available if ancestor had delta applied
1197*eda14cbcSMatt Macy  * to it?  If ondiskonly is set, we're only interested in what's
1198*eda14cbcSMatt Macy  * on-disk, not estimated pending changes.
1199*eda14cbcSMatt Macy  */
1200*eda14cbcSMatt Macy uint64_t
1201*eda14cbcSMatt Macy dsl_dir_space_available(dsl_dir_t *dd,
1202*eda14cbcSMatt Macy     dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
1203*eda14cbcSMatt Macy {
1204*eda14cbcSMatt Macy 	uint64_t parentspace, myspace, quota, used;
1205*eda14cbcSMatt Macy 
1206*eda14cbcSMatt Macy 	/*
1207*eda14cbcSMatt Macy 	 * If there are no restrictions otherwise, assume we have
1208*eda14cbcSMatt Macy 	 * unlimited space available.
1209*eda14cbcSMatt Macy 	 */
1210*eda14cbcSMatt Macy 	quota = UINT64_MAX;
1211*eda14cbcSMatt Macy 	parentspace = UINT64_MAX;
1212*eda14cbcSMatt Macy 
1213*eda14cbcSMatt Macy 	if (dd->dd_parent != NULL) {
1214*eda14cbcSMatt Macy 		parentspace = dsl_dir_space_available(dd->dd_parent,
1215*eda14cbcSMatt Macy 		    ancestor, delta, ondiskonly);
1216*eda14cbcSMatt Macy 	}
1217*eda14cbcSMatt Macy 
1218*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_lock);
1219*eda14cbcSMatt Macy 	if (dsl_dir_phys(dd)->dd_quota != 0)
1220*eda14cbcSMatt Macy 		quota = dsl_dir_phys(dd)->dd_quota;
1221*eda14cbcSMatt Macy 	used = dsl_dir_phys(dd)->dd_used_bytes;
1222*eda14cbcSMatt Macy 	if (!ondiskonly)
1223*eda14cbcSMatt Macy 		used += dsl_dir_space_towrite(dd);
1224*eda14cbcSMatt Macy 
1225*eda14cbcSMatt Macy 	if (dd->dd_parent == NULL) {
1226*eda14cbcSMatt Macy 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
1227*eda14cbcSMatt Macy 		    ZFS_SPACE_CHECK_NORMAL);
1228*eda14cbcSMatt Macy 		quota = MIN(quota, poolsize);
1229*eda14cbcSMatt Macy 	}
1230*eda14cbcSMatt Macy 
1231*eda14cbcSMatt Macy 	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
1232*eda14cbcSMatt Macy 		/*
1233*eda14cbcSMatt Macy 		 * We have some space reserved, in addition to what our
1234*eda14cbcSMatt Macy 		 * parent gave us.
1235*eda14cbcSMatt Macy 		 */
1236*eda14cbcSMatt Macy 		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
1237*eda14cbcSMatt Macy 	}
1238*eda14cbcSMatt Macy 
1239*eda14cbcSMatt Macy 	if (dd == ancestor) {
1240*eda14cbcSMatt Macy 		ASSERT(delta <= 0);
1241*eda14cbcSMatt Macy 		ASSERT(used >= -delta);
1242*eda14cbcSMatt Macy 		used += delta;
1243*eda14cbcSMatt Macy 		if (parentspace != UINT64_MAX)
1244*eda14cbcSMatt Macy 			parentspace -= delta;
1245*eda14cbcSMatt Macy 	}
1246*eda14cbcSMatt Macy 
1247*eda14cbcSMatt Macy 	if (used > quota) {
1248*eda14cbcSMatt Macy 		/* over quota */
1249*eda14cbcSMatt Macy 		myspace = 0;
1250*eda14cbcSMatt Macy 	} else {
1251*eda14cbcSMatt Macy 		/*
1252*eda14cbcSMatt Macy 		 * the lesser of the space provided by our parent and
1253*eda14cbcSMatt Macy 		 * the space left in our quota
1254*eda14cbcSMatt Macy 		 */
1255*eda14cbcSMatt Macy 		myspace = MIN(parentspace, quota - used);
1256*eda14cbcSMatt Macy 	}
1257*eda14cbcSMatt Macy 
1258*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_lock);
1259*eda14cbcSMatt Macy 
1260*eda14cbcSMatt Macy 	return (myspace);
1261*eda14cbcSMatt Macy }
1262*eda14cbcSMatt Macy 
/*
 * One entry on the list handed back by dsl_dir_tempreserve_space() as its
 * cookie and torn down by dsl_dir_tempreserve_clear().
 */
struct tempreserve {
	/* linkage on the reservation list (see list_create() offset) */
	list_node_t tr_node;
	/*
	 * dsl_dir whose dd_tempreserved[] was charged.  Left NULL for the
	 * entry that records the arc_tempreserve_space() reservation.
	 */
	dsl_dir_t *tr_ds;
	/* number of bytes that were reserved and must be released */
	uint64_t tr_size;
};
1268*eda14cbcSMatt Macy 
1269*eda14cbcSMatt Macy static int
1270*eda14cbcSMatt Macy dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
1271*eda14cbcSMatt Macy     boolean_t ignorequota, list_t *tr_list,
1272*eda14cbcSMatt Macy     dmu_tx_t *tx, boolean_t first)
1273*eda14cbcSMatt Macy {
1274*eda14cbcSMatt Macy 	uint64_t txg;
1275*eda14cbcSMatt Macy 	uint64_t quota;
1276*eda14cbcSMatt Macy 	struct tempreserve *tr;
1277*eda14cbcSMatt Macy 	int retval;
1278*eda14cbcSMatt Macy 	uint64_t ref_rsrv;
1279*eda14cbcSMatt Macy 
1280*eda14cbcSMatt Macy top_of_function:
1281*eda14cbcSMatt Macy 	txg = tx->tx_txg;
1282*eda14cbcSMatt Macy 	retval = EDQUOT;
1283*eda14cbcSMatt Macy 	ref_rsrv = 0;
1284*eda14cbcSMatt Macy 
1285*eda14cbcSMatt Macy 	ASSERT3U(txg, !=, 0);
1286*eda14cbcSMatt Macy 	ASSERT3S(asize, >, 0);
1287*eda14cbcSMatt Macy 
1288*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_lock);
1289*eda14cbcSMatt Macy 
1290*eda14cbcSMatt Macy 	/*
1291*eda14cbcSMatt Macy 	 * Check against the dsl_dir's quota.  We don't add in the delta
1292*eda14cbcSMatt Macy 	 * when checking for over-quota because they get one free hit.
1293*eda14cbcSMatt Macy 	 */
1294*eda14cbcSMatt Macy 	uint64_t est_inflight = dsl_dir_space_towrite(dd);
1295*eda14cbcSMatt Macy 	for (int i = 0; i < TXG_SIZE; i++)
1296*eda14cbcSMatt Macy 		est_inflight += dd->dd_tempreserved[i];
1297*eda14cbcSMatt Macy 	uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
1298*eda14cbcSMatt Macy 
1299*eda14cbcSMatt Macy 	/*
1300*eda14cbcSMatt Macy 	 * On the first iteration, fetch the dataset's used-on-disk and
1301*eda14cbcSMatt Macy 	 * refreservation values. Also, if checkrefquota is set, test if
1302*eda14cbcSMatt Macy 	 * allocating this space would exceed the dataset's refquota.
1303*eda14cbcSMatt Macy 	 */
1304*eda14cbcSMatt Macy 	if (first && tx->tx_objset) {
1305*eda14cbcSMatt Macy 		int error;
1306*eda14cbcSMatt Macy 		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
1307*eda14cbcSMatt Macy 
1308*eda14cbcSMatt Macy 		error = dsl_dataset_check_quota(ds, !netfree,
1309*eda14cbcSMatt Macy 		    asize, est_inflight, &used_on_disk, &ref_rsrv);
1310*eda14cbcSMatt Macy 		if (error != 0) {
1311*eda14cbcSMatt Macy 			mutex_exit(&dd->dd_lock);
1312*eda14cbcSMatt Macy 			DMU_TX_STAT_BUMP(dmu_tx_quota);
1313*eda14cbcSMatt Macy 			return (error);
1314*eda14cbcSMatt Macy 		}
1315*eda14cbcSMatt Macy 	}
1316*eda14cbcSMatt Macy 
1317*eda14cbcSMatt Macy 	/*
1318*eda14cbcSMatt Macy 	 * If this transaction will result in a net free of space,
1319*eda14cbcSMatt Macy 	 * we want to let it through.
1320*eda14cbcSMatt Macy 	 */
1321*eda14cbcSMatt Macy 	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
1322*eda14cbcSMatt Macy 		quota = UINT64_MAX;
1323*eda14cbcSMatt Macy 	else
1324*eda14cbcSMatt Macy 		quota = dsl_dir_phys(dd)->dd_quota;
1325*eda14cbcSMatt Macy 
1326*eda14cbcSMatt Macy 	/*
1327*eda14cbcSMatt Macy 	 * Adjust the quota against the actual pool size at the root
1328*eda14cbcSMatt Macy 	 * minus any outstanding deferred frees.
1329*eda14cbcSMatt Macy 	 * To ensure that it's possible to remove files from a full
1330*eda14cbcSMatt Macy 	 * pool without inducing transient overcommits, we throttle
1331*eda14cbcSMatt Macy 	 * netfree transactions against a quota that is slightly larger,
1332*eda14cbcSMatt Macy 	 * but still within the pool's allocation slop.  In cases where
1333*eda14cbcSMatt Macy 	 * we're very close to full, this will allow a steady trickle of
1334*eda14cbcSMatt Macy 	 * removes to get through.
1335*eda14cbcSMatt Macy 	 */
1336*eda14cbcSMatt Macy 	uint64_t deferred = 0;
1337*eda14cbcSMatt Macy 	if (dd->dd_parent == NULL) {
1338*eda14cbcSMatt Macy 		uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
1339*eda14cbcSMatt Macy 		    (netfree) ?
1340*eda14cbcSMatt Macy 		    ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);
1341*eda14cbcSMatt Macy 
1342*eda14cbcSMatt Macy 		if (avail < quota) {
1343*eda14cbcSMatt Macy 			quota = avail;
1344*eda14cbcSMatt Macy 			retval = SET_ERROR(ENOSPC);
1345*eda14cbcSMatt Macy 		}
1346*eda14cbcSMatt Macy 	}
1347*eda14cbcSMatt Macy 
1348*eda14cbcSMatt Macy 	/*
1349*eda14cbcSMatt Macy 	 * If they are requesting more space, and our current estimate
1350*eda14cbcSMatt Macy 	 * is over quota, they get to try again unless the actual
1351*eda14cbcSMatt Macy 	 * on-disk is over quota and there are no pending changes (which
1352*eda14cbcSMatt Macy 	 * may free up space for us).
1353*eda14cbcSMatt Macy 	 */
1354*eda14cbcSMatt Macy 	if (used_on_disk + est_inflight >= quota) {
1355*eda14cbcSMatt Macy 		if (est_inflight > 0 || used_on_disk < quota ||
1356*eda14cbcSMatt Macy 		    (retval == ENOSPC && used_on_disk < quota + deferred))
1357*eda14cbcSMatt Macy 			retval = ERESTART;
1358*eda14cbcSMatt Macy 		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
1359*eda14cbcSMatt Macy 		    "quota=%lluK tr=%lluK err=%d\n",
1360*eda14cbcSMatt Macy 		    used_on_disk>>10, est_inflight>>10,
1361*eda14cbcSMatt Macy 		    quota>>10, asize>>10, retval);
1362*eda14cbcSMatt Macy 		mutex_exit(&dd->dd_lock);
1363*eda14cbcSMatt Macy 		DMU_TX_STAT_BUMP(dmu_tx_quota);
1364*eda14cbcSMatt Macy 		return (SET_ERROR(retval));
1365*eda14cbcSMatt Macy 	}
1366*eda14cbcSMatt Macy 
1367*eda14cbcSMatt Macy 	/* We need to up our estimated delta before dropping dd_lock */
1368*eda14cbcSMatt Macy 	dd->dd_tempreserved[txg & TXG_MASK] += asize;
1369*eda14cbcSMatt Macy 
1370*eda14cbcSMatt Macy 	uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
1371*eda14cbcSMatt Macy 	    asize - ref_rsrv);
1372*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_lock);
1373*eda14cbcSMatt Macy 
1374*eda14cbcSMatt Macy 	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
1375*eda14cbcSMatt Macy 	tr->tr_ds = dd;
1376*eda14cbcSMatt Macy 	tr->tr_size = asize;
1377*eda14cbcSMatt Macy 	list_insert_tail(tr_list, tr);
1378*eda14cbcSMatt Macy 
1379*eda14cbcSMatt Macy 	/* see if it's OK with our parent */
1380*eda14cbcSMatt Macy 	if (dd->dd_parent != NULL && parent_rsrv != 0) {
1381*eda14cbcSMatt Macy 		/*
1382*eda14cbcSMatt Macy 		 * Recurse on our parent without recursion. This has been
1383*eda14cbcSMatt Macy 		 * observed to be potentially large stack usage even within
1384*eda14cbcSMatt Macy 		 * the test suite. Largest seen stack was 7632 bytes on linux.
1385*eda14cbcSMatt Macy 		 */
1386*eda14cbcSMatt Macy 
1387*eda14cbcSMatt Macy 		dd = dd->dd_parent;
1388*eda14cbcSMatt Macy 		asize = parent_rsrv;
1389*eda14cbcSMatt Macy 		ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
1390*eda14cbcSMatt Macy 		first = B_FALSE;
1391*eda14cbcSMatt Macy 		goto top_of_function;
1392*eda14cbcSMatt Macy 
1393*eda14cbcSMatt Macy 	} else {
1394*eda14cbcSMatt Macy 		return (0);
1395*eda14cbcSMatt Macy 	}
1396*eda14cbcSMatt Macy }
1397*eda14cbcSMatt Macy 
1398*eda14cbcSMatt Macy /*
1399*eda14cbcSMatt Macy  * Reserve space in this dsl_dir, to be used in this tx's txg.
1400*eda14cbcSMatt Macy  * After the space has been dirtied (and dsl_dir_willuse_space()
1401*eda14cbcSMatt Macy  * has been called), the reservation should be canceled, using
1402*eda14cbcSMatt Macy  * dsl_dir_tempreserve_clear().
1403*eda14cbcSMatt Macy  */
1404*eda14cbcSMatt Macy int
1405*eda14cbcSMatt Macy dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
1406*eda14cbcSMatt Macy     boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
1407*eda14cbcSMatt Macy {
1408*eda14cbcSMatt Macy 	int err;
1409*eda14cbcSMatt Macy 	list_t *tr_list;
1410*eda14cbcSMatt Macy 
1411*eda14cbcSMatt Macy 	if (asize == 0) {
1412*eda14cbcSMatt Macy 		*tr_cookiep = NULL;
1413*eda14cbcSMatt Macy 		return (0);
1414*eda14cbcSMatt Macy 	}
1415*eda14cbcSMatt Macy 
1416*eda14cbcSMatt Macy 	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
1417*eda14cbcSMatt Macy 	list_create(tr_list, sizeof (struct tempreserve),
1418*eda14cbcSMatt Macy 	    offsetof(struct tempreserve, tr_node));
1419*eda14cbcSMatt Macy 	ASSERT3S(asize, >, 0);
1420*eda14cbcSMatt Macy 
1421*eda14cbcSMatt Macy 	err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
1422*eda14cbcSMatt Macy 	if (err == 0) {
1423*eda14cbcSMatt Macy 		struct tempreserve *tr;
1424*eda14cbcSMatt Macy 
1425*eda14cbcSMatt Macy 		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
1426*eda14cbcSMatt Macy 		tr->tr_size = lsize;
1427*eda14cbcSMatt Macy 		list_insert_tail(tr_list, tr);
1428*eda14cbcSMatt Macy 	} else {
1429*eda14cbcSMatt Macy 		if (err == EAGAIN) {
1430*eda14cbcSMatt Macy 			/*
1431*eda14cbcSMatt Macy 			 * If arc_memory_throttle() detected that pageout
1432*eda14cbcSMatt Macy 			 * is running and we are low on memory, we delay new
1433*eda14cbcSMatt Macy 			 * non-pageout transactions to give pageout an
1434*eda14cbcSMatt Macy 			 * advantage.
1435*eda14cbcSMatt Macy 			 *
1436*eda14cbcSMatt Macy 			 * It is unfortunate to be delaying while the caller's
1437*eda14cbcSMatt Macy 			 * locks are held.
1438*eda14cbcSMatt Macy 			 */
1439*eda14cbcSMatt Macy 			txg_delay(dd->dd_pool, tx->tx_txg,
1440*eda14cbcSMatt Macy 			    MSEC2NSEC(10), MSEC2NSEC(10));
1441*eda14cbcSMatt Macy 			err = SET_ERROR(ERESTART);
1442*eda14cbcSMatt Macy 		}
1443*eda14cbcSMatt Macy 	}
1444*eda14cbcSMatt Macy 
1445*eda14cbcSMatt Macy 	if (err == 0) {
1446*eda14cbcSMatt Macy 		err = dsl_dir_tempreserve_impl(dd, asize, netfree,
1447*eda14cbcSMatt Macy 		    B_FALSE, tr_list, tx, B_TRUE);
1448*eda14cbcSMatt Macy 	}
1449*eda14cbcSMatt Macy 
1450*eda14cbcSMatt Macy 	if (err != 0)
1451*eda14cbcSMatt Macy 		dsl_dir_tempreserve_clear(tr_list, tx);
1452*eda14cbcSMatt Macy 	else
1453*eda14cbcSMatt Macy 		*tr_cookiep = tr_list;
1454*eda14cbcSMatt Macy 
1455*eda14cbcSMatt Macy 	return (err);
1456*eda14cbcSMatt Macy }
1457*eda14cbcSMatt Macy 
1458*eda14cbcSMatt Macy /*
1459*eda14cbcSMatt Macy  * Clear a temporary reservation that we previously made with
1460*eda14cbcSMatt Macy  * dsl_dir_tempreserve_space().
1461*eda14cbcSMatt Macy  */
1462*eda14cbcSMatt Macy void
1463*eda14cbcSMatt Macy dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
1464*eda14cbcSMatt Macy {
1465*eda14cbcSMatt Macy 	int txgidx = tx->tx_txg & TXG_MASK;
1466*eda14cbcSMatt Macy 	list_t *tr_list = tr_cookie;
1467*eda14cbcSMatt Macy 	struct tempreserve *tr;
1468*eda14cbcSMatt Macy 
1469*eda14cbcSMatt Macy 	ASSERT3U(tx->tx_txg, !=, 0);
1470*eda14cbcSMatt Macy 
1471*eda14cbcSMatt Macy 	if (tr_cookie == NULL)
1472*eda14cbcSMatt Macy 		return;
1473*eda14cbcSMatt Macy 
1474*eda14cbcSMatt Macy 	while ((tr = list_head(tr_list)) != NULL) {
1475*eda14cbcSMatt Macy 		if (tr->tr_ds) {
1476*eda14cbcSMatt Macy 			mutex_enter(&tr->tr_ds->dd_lock);
1477*eda14cbcSMatt Macy 			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
1478*eda14cbcSMatt Macy 			    tr->tr_size);
1479*eda14cbcSMatt Macy 			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
1480*eda14cbcSMatt Macy 			mutex_exit(&tr->tr_ds->dd_lock);
1481*eda14cbcSMatt Macy 		} else {
1482*eda14cbcSMatt Macy 			arc_tempreserve_clear(tr->tr_size);
1483*eda14cbcSMatt Macy 		}
1484*eda14cbcSMatt Macy 		list_remove(tr_list, tr);
1485*eda14cbcSMatt Macy 		kmem_free(tr, sizeof (struct tempreserve));
1486*eda14cbcSMatt Macy 	}
1487*eda14cbcSMatt Macy 
1488*eda14cbcSMatt Macy 	kmem_free(tr_list, sizeof (list_t));
1489*eda14cbcSMatt Macy }
1490*eda14cbcSMatt Macy 
1491*eda14cbcSMatt Macy /*
1492*eda14cbcSMatt Macy  * This should be called from open context when we think we're going to write
1493*eda14cbcSMatt Macy  * or free space, for example when dirtying data. Be conservative; it's okay
1494*eda14cbcSMatt Macy  * to write less space or free more, but we don't want to write more or free
1495*eda14cbcSMatt Macy  * less than the amount specified.
1496*eda14cbcSMatt Macy  *
1497*eda14cbcSMatt Macy  * NOTE: The behavior of this function is identical to the Illumos / FreeBSD
1498*eda14cbcSMatt Macy  * version however it has been adjusted to use an iterative rather than
1499*eda14cbcSMatt Macy  * recursive algorithm to minimize stack usage.
1500*eda14cbcSMatt Macy  */
1501*eda14cbcSMatt Macy void
1502*eda14cbcSMatt Macy dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
1503*eda14cbcSMatt Macy {
1504*eda14cbcSMatt Macy 	int64_t parent_space;
1505*eda14cbcSMatt Macy 	uint64_t est_used;
1506*eda14cbcSMatt Macy 
1507*eda14cbcSMatt Macy 	do {
1508*eda14cbcSMatt Macy 		mutex_enter(&dd->dd_lock);
1509*eda14cbcSMatt Macy 		if (space > 0)
1510*eda14cbcSMatt Macy 			dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
1511*eda14cbcSMatt Macy 
1512*eda14cbcSMatt Macy 		est_used = dsl_dir_space_towrite(dd) +
1513*eda14cbcSMatt Macy 		    dsl_dir_phys(dd)->dd_used_bytes;
1514*eda14cbcSMatt Macy 		parent_space = parent_delta(dd, est_used, space);
1515*eda14cbcSMatt Macy 		mutex_exit(&dd->dd_lock);
1516*eda14cbcSMatt Macy 
1517*eda14cbcSMatt Macy 		/* Make sure that we clean up dd_space_to* */
1518*eda14cbcSMatt Macy 		dsl_dir_dirty(dd, tx);
1519*eda14cbcSMatt Macy 
1520*eda14cbcSMatt Macy 		dd = dd->dd_parent;
1521*eda14cbcSMatt Macy 		space = parent_space;
1522*eda14cbcSMatt Macy 	} while (space && dd);
1523*eda14cbcSMatt Macy }
1524*eda14cbcSMatt Macy 
1525*eda14cbcSMatt Macy /* call from syncing context when we actually write/free space for this dd */
1526*eda14cbcSMatt Macy void
1527*eda14cbcSMatt Macy dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
1528*eda14cbcSMatt Macy     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
1529*eda14cbcSMatt Macy {
1530*eda14cbcSMatt Macy 	int64_t accounted_delta;
1531*eda14cbcSMatt Macy 
1532*eda14cbcSMatt Macy 	/*
1533*eda14cbcSMatt Macy 	 * dsl_dataset_set_refreservation_sync_impl() calls this with
1534*eda14cbcSMatt Macy 	 * dd_lock held, so that it can atomically update
1535*eda14cbcSMatt Macy 	 * ds->ds_reserved and the dsl_dir accounting, so that
1536*eda14cbcSMatt Macy 	 * dsl_dataset_check_quota() can see dataset and dir accounting
1537*eda14cbcSMatt Macy 	 * consistently.
1538*eda14cbcSMatt Macy 	 */
1539*eda14cbcSMatt Macy 	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
1540*eda14cbcSMatt Macy 
1541*eda14cbcSMatt Macy 	ASSERT(dmu_tx_is_syncing(tx));
1542*eda14cbcSMatt Macy 	ASSERT(type < DD_USED_NUM);
1543*eda14cbcSMatt Macy 
1544*eda14cbcSMatt Macy 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1545*eda14cbcSMatt Macy 
1546*eda14cbcSMatt Macy 	if (needlock)
1547*eda14cbcSMatt Macy 		mutex_enter(&dd->dd_lock);
1548*eda14cbcSMatt Macy 	accounted_delta =
1549*eda14cbcSMatt Macy 	    parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
1550*eda14cbcSMatt Macy 	ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
1551*eda14cbcSMatt Macy 	ASSERT(compressed >= 0 ||
1552*eda14cbcSMatt Macy 	    dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
1553*eda14cbcSMatt Macy 	ASSERT(uncompressed >= 0 ||
1554*eda14cbcSMatt Macy 	    dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
1555*eda14cbcSMatt Macy 	dsl_dir_phys(dd)->dd_used_bytes += used;
1556*eda14cbcSMatt Macy 	dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
1557*eda14cbcSMatt Macy 	dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
1558*eda14cbcSMatt Macy 
1559*eda14cbcSMatt Macy 	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
1560*eda14cbcSMatt Macy 		ASSERT(used > 0 ||
1561*eda14cbcSMatt Macy 		    dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
1562*eda14cbcSMatt Macy 		dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
1563*eda14cbcSMatt Macy #ifdef ZFS_DEBUG
1564*eda14cbcSMatt Macy 		{
1565*eda14cbcSMatt Macy 			dd_used_t t;
1566*eda14cbcSMatt Macy 			uint64_t u = 0;
1567*eda14cbcSMatt Macy 			for (t = 0; t < DD_USED_NUM; t++)
1568*eda14cbcSMatt Macy 				u += dsl_dir_phys(dd)->dd_used_breakdown[t];
1569*eda14cbcSMatt Macy 			ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
1570*eda14cbcSMatt Macy 		}
1571*eda14cbcSMatt Macy #endif
1572*eda14cbcSMatt Macy 	}
1573*eda14cbcSMatt Macy 	if (needlock)
1574*eda14cbcSMatt Macy 		mutex_exit(&dd->dd_lock);
1575*eda14cbcSMatt Macy 
1576*eda14cbcSMatt Macy 	if (dd->dd_parent != NULL) {
1577*eda14cbcSMatt Macy 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1578*eda14cbcSMatt Macy 		    accounted_delta, compressed, uncompressed, tx);
1579*eda14cbcSMatt Macy 		dsl_dir_transfer_space(dd->dd_parent,
1580*eda14cbcSMatt Macy 		    used - accounted_delta,
1581*eda14cbcSMatt Macy 		    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
1582*eda14cbcSMatt Macy 	}
1583*eda14cbcSMatt Macy }
1584*eda14cbcSMatt Macy 
1585*eda14cbcSMatt Macy void
1586*eda14cbcSMatt Macy dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
1587*eda14cbcSMatt Macy     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
1588*eda14cbcSMatt Macy {
1589*eda14cbcSMatt Macy 	ASSERT(dmu_tx_is_syncing(tx));
1590*eda14cbcSMatt Macy 	ASSERT(oldtype < DD_USED_NUM);
1591*eda14cbcSMatt Macy 	ASSERT(newtype < DD_USED_NUM);
1592*eda14cbcSMatt Macy 
1593*eda14cbcSMatt Macy 	if (delta == 0 ||
1594*eda14cbcSMatt Macy 	    !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
1595*eda14cbcSMatt Macy 		return;
1596*eda14cbcSMatt Macy 
1597*eda14cbcSMatt Macy 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1598*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_lock);
1599*eda14cbcSMatt Macy 	ASSERT(delta > 0 ?
1600*eda14cbcSMatt Macy 	    dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
1601*eda14cbcSMatt Macy 	    dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
1602*eda14cbcSMatt Macy 	ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
1603*eda14cbcSMatt Macy 	dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
1604*eda14cbcSMatt Macy 	dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
1605*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_lock);
1606*eda14cbcSMatt Macy }
1607*eda14cbcSMatt Macy 
/*
 * Argument bundle passed to the check and sync callbacks of the sync
 * tasks started by dsl_dir_set_quota() and dsl_dir_set_reservation().
 */
typedef struct dsl_dir_set_qr_arg {
	/* name handed to dsl_dataset_hold() to find the target */
	const char *ddsqra_name;
	/* property source passed on to dsl_prop_predict() / set_sync_impl */
	zprop_source_t ddsqra_source;
	/* requested quota or reservation value */
	uint64_t ddsqra_value;
} dsl_dir_set_qr_arg_t;
1613*eda14cbcSMatt Macy 
1614*eda14cbcSMatt Macy static int
1615*eda14cbcSMatt Macy dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
1616*eda14cbcSMatt Macy {
1617*eda14cbcSMatt Macy 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1618*eda14cbcSMatt Macy 	dsl_pool_t *dp = dmu_tx_pool(tx);
1619*eda14cbcSMatt Macy 	dsl_dataset_t *ds;
1620*eda14cbcSMatt Macy 	int error;
1621*eda14cbcSMatt Macy 	uint64_t towrite, newval;
1622*eda14cbcSMatt Macy 
1623*eda14cbcSMatt Macy 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1624*eda14cbcSMatt Macy 	if (error != 0)
1625*eda14cbcSMatt Macy 		return (error);
1626*eda14cbcSMatt Macy 
1627*eda14cbcSMatt Macy 	error = dsl_prop_predict(ds->ds_dir, "quota",
1628*eda14cbcSMatt Macy 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1629*eda14cbcSMatt Macy 	if (error != 0) {
1630*eda14cbcSMatt Macy 		dsl_dataset_rele(ds, FTAG);
1631*eda14cbcSMatt Macy 		return (error);
1632*eda14cbcSMatt Macy 	}
1633*eda14cbcSMatt Macy 
1634*eda14cbcSMatt Macy 	if (newval == 0) {
1635*eda14cbcSMatt Macy 		dsl_dataset_rele(ds, FTAG);
1636*eda14cbcSMatt Macy 		return (0);
1637*eda14cbcSMatt Macy 	}
1638*eda14cbcSMatt Macy 
1639*eda14cbcSMatt Macy 	mutex_enter(&ds->ds_dir->dd_lock);
1640*eda14cbcSMatt Macy 	/*
1641*eda14cbcSMatt Macy 	 * If we are doing the preliminary check in open context, and
1642*eda14cbcSMatt Macy 	 * there are pending changes, then don't fail it, since the
1643*eda14cbcSMatt Macy 	 * pending changes could under-estimate the amount of space to be
1644*eda14cbcSMatt Macy 	 * freed up.
1645*eda14cbcSMatt Macy 	 */
1646*eda14cbcSMatt Macy 	towrite = dsl_dir_space_towrite(ds->ds_dir);
1647*eda14cbcSMatt Macy 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1648*eda14cbcSMatt Macy 	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
1649*eda14cbcSMatt Macy 	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
1650*eda14cbcSMatt Macy 		error = SET_ERROR(ENOSPC);
1651*eda14cbcSMatt Macy 	}
1652*eda14cbcSMatt Macy 	mutex_exit(&ds->ds_dir->dd_lock);
1653*eda14cbcSMatt Macy 	dsl_dataset_rele(ds, FTAG);
1654*eda14cbcSMatt Macy 	return (error);
1655*eda14cbcSMatt Macy }
1656*eda14cbcSMatt Macy 
1657*eda14cbcSMatt Macy static void
1658*eda14cbcSMatt Macy dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
1659*eda14cbcSMatt Macy {
1660*eda14cbcSMatt Macy 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1661*eda14cbcSMatt Macy 	dsl_pool_t *dp = dmu_tx_pool(tx);
1662*eda14cbcSMatt Macy 	dsl_dataset_t *ds;
1663*eda14cbcSMatt Macy 	uint64_t newval;
1664*eda14cbcSMatt Macy 
1665*eda14cbcSMatt Macy 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1666*eda14cbcSMatt Macy 
1667*eda14cbcSMatt Macy 	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1668*eda14cbcSMatt Macy 		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
1669*eda14cbcSMatt Macy 		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1670*eda14cbcSMatt Macy 		    &ddsqra->ddsqra_value, tx);
1671*eda14cbcSMatt Macy 
1672*eda14cbcSMatt Macy 		VERIFY0(dsl_prop_get_int_ds(ds,
1673*eda14cbcSMatt Macy 		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
1674*eda14cbcSMatt Macy 	} else {
1675*eda14cbcSMatt Macy 		newval = ddsqra->ddsqra_value;
1676*eda14cbcSMatt Macy 		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1677*eda14cbcSMatt Macy 		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
1678*eda14cbcSMatt Macy 	}
1679*eda14cbcSMatt Macy 
1680*eda14cbcSMatt Macy 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1681*eda14cbcSMatt Macy 	mutex_enter(&ds->ds_dir->dd_lock);
1682*eda14cbcSMatt Macy 	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
1683*eda14cbcSMatt Macy 	mutex_exit(&ds->ds_dir->dd_lock);
1684*eda14cbcSMatt Macy 	dsl_dataset_rele(ds, FTAG);
1685*eda14cbcSMatt Macy }
1686*eda14cbcSMatt Macy 
1687*eda14cbcSMatt Macy int
1688*eda14cbcSMatt Macy dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1689*eda14cbcSMatt Macy {
1690*eda14cbcSMatt Macy 	dsl_dir_set_qr_arg_t ddsqra;
1691*eda14cbcSMatt Macy 
1692*eda14cbcSMatt Macy 	ddsqra.ddsqra_name = ddname;
1693*eda14cbcSMatt Macy 	ddsqra.ddsqra_source = source;
1694*eda14cbcSMatt Macy 	ddsqra.ddsqra_value = quota;
1695*eda14cbcSMatt Macy 
1696*eda14cbcSMatt Macy 	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
1697*eda14cbcSMatt Macy 	    dsl_dir_set_quota_sync, &ddsqra, 0,
1698*eda14cbcSMatt Macy 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
1699*eda14cbcSMatt Macy }
1700*eda14cbcSMatt Macy 
1701*eda14cbcSMatt Macy static int
1702*eda14cbcSMatt Macy dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
1703*eda14cbcSMatt Macy {
1704*eda14cbcSMatt Macy 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1705*eda14cbcSMatt Macy 	dsl_pool_t *dp = dmu_tx_pool(tx);
1706*eda14cbcSMatt Macy 	dsl_dataset_t *ds;
1707*eda14cbcSMatt Macy 	dsl_dir_t *dd;
1708*eda14cbcSMatt Macy 	uint64_t newval, used, avail;
1709*eda14cbcSMatt Macy 	int error;
1710*eda14cbcSMatt Macy 
1711*eda14cbcSMatt Macy 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1712*eda14cbcSMatt Macy 	if (error != 0)
1713*eda14cbcSMatt Macy 		return (error);
1714*eda14cbcSMatt Macy 	dd = ds->ds_dir;
1715*eda14cbcSMatt Macy 
1716*eda14cbcSMatt Macy 	/*
1717*eda14cbcSMatt Macy 	 * If we are doing the preliminary check in open context, the
1718*eda14cbcSMatt Macy 	 * space estimates may be inaccurate.
1719*eda14cbcSMatt Macy 	 */
1720*eda14cbcSMatt Macy 	if (!dmu_tx_is_syncing(tx)) {
1721*eda14cbcSMatt Macy 		dsl_dataset_rele(ds, FTAG);
1722*eda14cbcSMatt Macy 		return (0);
1723*eda14cbcSMatt Macy 	}
1724*eda14cbcSMatt Macy 
1725*eda14cbcSMatt Macy 	error = dsl_prop_predict(ds->ds_dir,
1726*eda14cbcSMatt Macy 	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1727*eda14cbcSMatt Macy 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1728*eda14cbcSMatt Macy 	if (error != 0) {
1729*eda14cbcSMatt Macy 		dsl_dataset_rele(ds, FTAG);
1730*eda14cbcSMatt Macy 		return (error);
1731*eda14cbcSMatt Macy 	}
1732*eda14cbcSMatt Macy 
1733*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_lock);
1734*eda14cbcSMatt Macy 	used = dsl_dir_phys(dd)->dd_used_bytes;
1735*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_lock);
1736*eda14cbcSMatt Macy 
1737*eda14cbcSMatt Macy 	if (dd->dd_parent) {
1738*eda14cbcSMatt Macy 		avail = dsl_dir_space_available(dd->dd_parent,
1739*eda14cbcSMatt Macy 		    NULL, 0, FALSE);
1740*eda14cbcSMatt Macy 	} else {
1741*eda14cbcSMatt Macy 		avail = dsl_pool_adjustedsize(dd->dd_pool,
1742*eda14cbcSMatt Macy 		    ZFS_SPACE_CHECK_NORMAL) - used;
1743*eda14cbcSMatt Macy 	}
1744*eda14cbcSMatt Macy 
1745*eda14cbcSMatt Macy 	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
1746*eda14cbcSMatt Macy 		uint64_t delta = MAX(used, newval) -
1747*eda14cbcSMatt Macy 		    MAX(used, dsl_dir_phys(dd)->dd_reserved);
1748*eda14cbcSMatt Macy 
1749*eda14cbcSMatt Macy 		if (delta > avail ||
1750*eda14cbcSMatt Macy 		    (dsl_dir_phys(dd)->dd_quota > 0 &&
1751*eda14cbcSMatt Macy 		    newval > dsl_dir_phys(dd)->dd_quota))
1752*eda14cbcSMatt Macy 			error = SET_ERROR(ENOSPC);
1753*eda14cbcSMatt Macy 	}
1754*eda14cbcSMatt Macy 
1755*eda14cbcSMatt Macy 	dsl_dataset_rele(ds, FTAG);
1756*eda14cbcSMatt Macy 	return (error);
1757*eda14cbcSMatt Macy }
1758*eda14cbcSMatt Macy 
1759*eda14cbcSMatt Macy void
1760*eda14cbcSMatt Macy dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
1761*eda14cbcSMatt Macy {
1762*eda14cbcSMatt Macy 	uint64_t used;
1763*eda14cbcSMatt Macy 	int64_t delta;
1764*eda14cbcSMatt Macy 
1765*eda14cbcSMatt Macy 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1766*eda14cbcSMatt Macy 
1767*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_lock);
1768*eda14cbcSMatt Macy 	used = dsl_dir_phys(dd)->dd_used_bytes;
1769*eda14cbcSMatt Macy 	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
1770*eda14cbcSMatt Macy 	dsl_dir_phys(dd)->dd_reserved = value;
1771*eda14cbcSMatt Macy 
1772*eda14cbcSMatt Macy 	if (dd->dd_parent != NULL) {
1773*eda14cbcSMatt Macy 		/* Roll up this additional usage into our ancestors */
1774*eda14cbcSMatt Macy 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1775*eda14cbcSMatt Macy 		    delta, 0, 0, tx);
1776*eda14cbcSMatt Macy 	}
1777*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_lock);
1778*eda14cbcSMatt Macy }
1779*eda14cbcSMatt Macy 
1780*eda14cbcSMatt Macy static void
1781*eda14cbcSMatt Macy dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
1782*eda14cbcSMatt Macy {
1783*eda14cbcSMatt Macy 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1784*eda14cbcSMatt Macy 	dsl_pool_t *dp = dmu_tx_pool(tx);
1785*eda14cbcSMatt Macy 	dsl_dataset_t *ds;
1786*eda14cbcSMatt Macy 	uint64_t newval;
1787*eda14cbcSMatt Macy 
1788*eda14cbcSMatt Macy 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1789*eda14cbcSMatt Macy 
1790*eda14cbcSMatt Macy 	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1791*eda14cbcSMatt Macy 		dsl_prop_set_sync_impl(ds,
1792*eda14cbcSMatt Macy 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1793*eda14cbcSMatt Macy 		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1794*eda14cbcSMatt Macy 		    &ddsqra->ddsqra_value, tx);
1795*eda14cbcSMatt Macy 
1796*eda14cbcSMatt Macy 		VERIFY0(dsl_prop_get_int_ds(ds,
1797*eda14cbcSMatt Macy 		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
1798*eda14cbcSMatt Macy 	} else {
1799*eda14cbcSMatt Macy 		newval = ddsqra->ddsqra_value;
1800*eda14cbcSMatt Macy 		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1801*eda14cbcSMatt Macy 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1802*eda14cbcSMatt Macy 		    (longlong_t)newval);
1803*eda14cbcSMatt Macy 	}
1804*eda14cbcSMatt Macy 
1805*eda14cbcSMatt Macy 	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
1806*eda14cbcSMatt Macy 	dsl_dataset_rele(ds, FTAG);
1807*eda14cbcSMatt Macy }
1808*eda14cbcSMatt Macy 
1809*eda14cbcSMatt Macy int
1810*eda14cbcSMatt Macy dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1811*eda14cbcSMatt Macy     uint64_t reservation)
1812*eda14cbcSMatt Macy {
1813*eda14cbcSMatt Macy 	dsl_dir_set_qr_arg_t ddsqra;
1814*eda14cbcSMatt Macy 
1815*eda14cbcSMatt Macy 	ddsqra.ddsqra_name = ddname;
1816*eda14cbcSMatt Macy 	ddsqra.ddsqra_source = source;
1817*eda14cbcSMatt Macy 	ddsqra.ddsqra_value = reservation;
1818*eda14cbcSMatt Macy 
1819*eda14cbcSMatt Macy 	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
1820*eda14cbcSMatt Macy 	    dsl_dir_set_reservation_sync, &ddsqra, 0,
1821*eda14cbcSMatt Macy 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
1822*eda14cbcSMatt Macy }
1823*eda14cbcSMatt Macy 
1824*eda14cbcSMatt Macy static dsl_dir_t *
1825*eda14cbcSMatt Macy closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1826*eda14cbcSMatt Macy {
1827*eda14cbcSMatt Macy 	for (; ds1; ds1 = ds1->dd_parent) {
1828*eda14cbcSMatt Macy 		dsl_dir_t *dd;
1829*eda14cbcSMatt Macy 		for (dd = ds2; dd; dd = dd->dd_parent) {
1830*eda14cbcSMatt Macy 			if (ds1 == dd)
1831*eda14cbcSMatt Macy 				return (dd);
1832*eda14cbcSMatt Macy 		}
1833*eda14cbcSMatt Macy 	}
1834*eda14cbcSMatt Macy 	return (NULL);
1835*eda14cbcSMatt Macy }
1836*eda14cbcSMatt Macy 
1837*eda14cbcSMatt Macy /*
1838*eda14cbcSMatt Macy  * If delta is applied to dd, how much of that delta would be applied to
1839*eda14cbcSMatt Macy  * ancestor?  Syncing context only.
1840*eda14cbcSMatt Macy  */
1841*eda14cbcSMatt Macy static int64_t
1842*eda14cbcSMatt Macy would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1843*eda14cbcSMatt Macy {
1844*eda14cbcSMatt Macy 	if (dd == ancestor)
1845*eda14cbcSMatt Macy 		return (delta);
1846*eda14cbcSMatt Macy 
1847*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_lock);
1848*eda14cbcSMatt Macy 	delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
1849*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_lock);
1850*eda14cbcSMatt Macy 	return (would_change(dd->dd_parent, delta, ancestor));
1851*eda14cbcSMatt Macy }
1852*eda14cbcSMatt Macy 
1853*eda14cbcSMatt Macy typedef struct dsl_dir_rename_arg {
1854*eda14cbcSMatt Macy 	const char *ddra_oldname;
1855*eda14cbcSMatt Macy 	const char *ddra_newname;
1856*eda14cbcSMatt Macy 	cred_t *ddra_cred;
1857*eda14cbcSMatt Macy 	proc_t *ddra_proc;
1858*eda14cbcSMatt Macy } dsl_dir_rename_arg_t;
1859*eda14cbcSMatt Macy 
/*
 * Per-rename deltas handed to dsl_valid_rename() for every dataset that
 * is affected by a rename.
 */
typedef struct dsl_valid_rename_arg {
	int char_delta;		/* strlen(new name) - strlen(old name) */
	int nest_delta;		/* depth(new name) - depth(old name) */
} dsl_valid_rename_arg_t;
1864*eda14cbcSMatt Macy 
1865*eda14cbcSMatt Macy /* ARGSUSED */
1866*eda14cbcSMatt Macy static int
1867*eda14cbcSMatt Macy dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1868*eda14cbcSMatt Macy {
1869*eda14cbcSMatt Macy 	dsl_valid_rename_arg_t *dvra = arg;
1870*eda14cbcSMatt Macy 	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
1871*eda14cbcSMatt Macy 
1872*eda14cbcSMatt Macy 	dsl_dataset_name(ds, namebuf);
1873*eda14cbcSMatt Macy 
1874*eda14cbcSMatt Macy 	ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
1875*eda14cbcSMatt Macy 	    <, ZFS_MAX_DATASET_NAME_LEN);
1876*eda14cbcSMatt Macy 	int namelen = strlen(namebuf) + dvra->char_delta;
1877*eda14cbcSMatt Macy 	int depth = get_dataset_depth(namebuf) + dvra->nest_delta;
1878*eda14cbcSMatt Macy 
1879*eda14cbcSMatt Macy 	if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
1880*eda14cbcSMatt Macy 		return (SET_ERROR(ENAMETOOLONG));
1881*eda14cbcSMatt Macy 	if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
1882*eda14cbcSMatt Macy 		return (SET_ERROR(ENAMETOOLONG));
1883*eda14cbcSMatt Macy 	return (0);
1884*eda14cbcSMatt Macy }
1885*eda14cbcSMatt Macy 
1886*eda14cbcSMatt Macy static int
1887*eda14cbcSMatt Macy dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
1888*eda14cbcSMatt Macy {
1889*eda14cbcSMatt Macy 	dsl_dir_rename_arg_t *ddra = arg;
1890*eda14cbcSMatt Macy 	dsl_pool_t *dp = dmu_tx_pool(tx);
1891*eda14cbcSMatt Macy 	dsl_dir_t *dd, *newparent;
1892*eda14cbcSMatt Macy 	dsl_valid_rename_arg_t dvra;
1893*eda14cbcSMatt Macy 	dsl_dataset_t *parentds;
1894*eda14cbcSMatt Macy 	objset_t *parentos;
1895*eda14cbcSMatt Macy 	const char *mynewname;
1896*eda14cbcSMatt Macy 	int error;
1897*eda14cbcSMatt Macy 
1898*eda14cbcSMatt Macy 	/* target dir should exist */
1899*eda14cbcSMatt Macy 	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
1900*eda14cbcSMatt Macy 	if (error != 0)
1901*eda14cbcSMatt Macy 		return (error);
1902*eda14cbcSMatt Macy 
1903*eda14cbcSMatt Macy 	/* new parent should exist */
1904*eda14cbcSMatt Macy 	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
1905*eda14cbcSMatt Macy 	    &newparent, &mynewname);
1906*eda14cbcSMatt Macy 	if (error != 0) {
1907*eda14cbcSMatt Macy 		dsl_dir_rele(dd, FTAG);
1908*eda14cbcSMatt Macy 		return (error);
1909*eda14cbcSMatt Macy 	}
1910*eda14cbcSMatt Macy 
1911*eda14cbcSMatt Macy 	/* can't rename to different pool */
1912*eda14cbcSMatt Macy 	if (dd->dd_pool != newparent->dd_pool) {
1913*eda14cbcSMatt Macy 		dsl_dir_rele(newparent, FTAG);
1914*eda14cbcSMatt Macy 		dsl_dir_rele(dd, FTAG);
1915*eda14cbcSMatt Macy 		return (SET_ERROR(EXDEV));
1916*eda14cbcSMatt Macy 	}
1917*eda14cbcSMatt Macy 
1918*eda14cbcSMatt Macy 	/* new name should not already exist */
1919*eda14cbcSMatt Macy 	if (mynewname == NULL) {
1920*eda14cbcSMatt Macy 		dsl_dir_rele(newparent, FTAG);
1921*eda14cbcSMatt Macy 		dsl_dir_rele(dd, FTAG);
1922*eda14cbcSMatt Macy 		return (SET_ERROR(EEXIST));
1923*eda14cbcSMatt Macy 	}
1924*eda14cbcSMatt Macy 
1925*eda14cbcSMatt Macy 	/* can't rename below anything but filesystems (eg. no ZVOLs) */
1926*eda14cbcSMatt Macy 	error = dsl_dataset_hold_obj(newparent->dd_pool,
1927*eda14cbcSMatt Macy 	    dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
1928*eda14cbcSMatt Macy 	if (error != 0) {
1929*eda14cbcSMatt Macy 		dsl_dir_rele(newparent, FTAG);
1930*eda14cbcSMatt Macy 		dsl_dir_rele(dd, FTAG);
1931*eda14cbcSMatt Macy 		return (error);
1932*eda14cbcSMatt Macy 	}
1933*eda14cbcSMatt Macy 	error = dmu_objset_from_ds(parentds, &parentos);
1934*eda14cbcSMatt Macy 	if (error != 0) {
1935*eda14cbcSMatt Macy 		dsl_dataset_rele(parentds, FTAG);
1936*eda14cbcSMatt Macy 		dsl_dir_rele(newparent, FTAG);
1937*eda14cbcSMatt Macy 		dsl_dir_rele(dd, FTAG);
1938*eda14cbcSMatt Macy 		return (error);
1939*eda14cbcSMatt Macy 	}
1940*eda14cbcSMatt Macy 	if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
1941*eda14cbcSMatt Macy 		dsl_dataset_rele(parentds, FTAG);
1942*eda14cbcSMatt Macy 		dsl_dir_rele(newparent, FTAG);
1943*eda14cbcSMatt Macy 		dsl_dir_rele(dd, FTAG);
1944*eda14cbcSMatt Macy 		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
1945*eda14cbcSMatt Macy 	}
1946*eda14cbcSMatt Macy 	dsl_dataset_rele(parentds, FTAG);
1947*eda14cbcSMatt Macy 
1948*eda14cbcSMatt Macy 	ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
1949*eda14cbcSMatt Macy 	    <, ZFS_MAX_DATASET_NAME_LEN);
1950*eda14cbcSMatt Macy 	ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
1951*eda14cbcSMatt Macy 	    <, ZFS_MAX_DATASET_NAME_LEN);
1952*eda14cbcSMatt Macy 	dvra.char_delta = strlen(ddra->ddra_newname)
1953*eda14cbcSMatt Macy 	    - strlen(ddra->ddra_oldname);
1954*eda14cbcSMatt Macy 	dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
1955*eda14cbcSMatt Macy 	    - get_dataset_depth(ddra->ddra_oldname);
1956*eda14cbcSMatt Macy 
1957*eda14cbcSMatt Macy 	/* if the name length is growing, validate child name lengths */
1958*eda14cbcSMatt Macy 	if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
1959*eda14cbcSMatt Macy 		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
1960*eda14cbcSMatt Macy 		    &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
1961*eda14cbcSMatt Macy 		if (error != 0) {
1962*eda14cbcSMatt Macy 			dsl_dir_rele(newparent, FTAG);
1963*eda14cbcSMatt Macy 			dsl_dir_rele(dd, FTAG);
1964*eda14cbcSMatt Macy 			return (error);
1965*eda14cbcSMatt Macy 		}
1966*eda14cbcSMatt Macy 	}
1967*eda14cbcSMatt Macy 
1968*eda14cbcSMatt Macy 	if (dmu_tx_is_syncing(tx)) {
1969*eda14cbcSMatt Macy 		if (spa_feature_is_active(dp->dp_spa,
1970*eda14cbcSMatt Macy 		    SPA_FEATURE_FS_SS_LIMIT)) {
1971*eda14cbcSMatt Macy 			/*
1972*eda14cbcSMatt Macy 			 * Although this is the check function and we don't
1973*eda14cbcSMatt Macy 			 * normally make on-disk changes in check functions,
1974*eda14cbcSMatt Macy 			 * we need to do that here.
1975*eda14cbcSMatt Macy 			 *
1976*eda14cbcSMatt Macy 			 * Ensure this portion of the tree's counts have been
1977*eda14cbcSMatt Macy 			 * initialized in case the new parent has limits set.
1978*eda14cbcSMatt Macy 			 */
1979*eda14cbcSMatt Macy 			dsl_dir_init_fs_ss_count(dd, tx);
1980*eda14cbcSMatt Macy 		}
1981*eda14cbcSMatt Macy 	}
1982*eda14cbcSMatt Macy 
1983*eda14cbcSMatt Macy 	if (newparent != dd->dd_parent) {
1984*eda14cbcSMatt Macy 		/* is there enough space? */
1985*eda14cbcSMatt Macy 		uint64_t myspace =
1986*eda14cbcSMatt Macy 		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
1987*eda14cbcSMatt Macy 		    dsl_dir_phys(dd)->dd_reserved);
1988*eda14cbcSMatt Macy 		objset_t *os = dd->dd_pool->dp_meta_objset;
1989*eda14cbcSMatt Macy 		uint64_t fs_cnt = 0;
1990*eda14cbcSMatt Macy 		uint64_t ss_cnt = 0;
1991*eda14cbcSMatt Macy 
1992*eda14cbcSMatt Macy 		if (dsl_dir_is_zapified(dd)) {
1993*eda14cbcSMatt Macy 			int err;
1994*eda14cbcSMatt Macy 
1995*eda14cbcSMatt Macy 			err = zap_lookup(os, dd->dd_object,
1996*eda14cbcSMatt Macy 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
1997*eda14cbcSMatt Macy 			    &fs_cnt);
1998*eda14cbcSMatt Macy 			if (err != ENOENT && err != 0) {
1999*eda14cbcSMatt Macy 				dsl_dir_rele(newparent, FTAG);
2000*eda14cbcSMatt Macy 				dsl_dir_rele(dd, FTAG);
2001*eda14cbcSMatt Macy 				return (err);
2002*eda14cbcSMatt Macy 			}
2003*eda14cbcSMatt Macy 
2004*eda14cbcSMatt Macy 			/*
2005*eda14cbcSMatt Macy 			 * have to add 1 for the filesystem itself that we're
2006*eda14cbcSMatt Macy 			 * moving
2007*eda14cbcSMatt Macy 			 */
2008*eda14cbcSMatt Macy 			fs_cnt++;
2009*eda14cbcSMatt Macy 
2010*eda14cbcSMatt Macy 			err = zap_lookup(os, dd->dd_object,
2011*eda14cbcSMatt Macy 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
2012*eda14cbcSMatt Macy 			    &ss_cnt);
2013*eda14cbcSMatt Macy 			if (err != ENOENT && err != 0) {
2014*eda14cbcSMatt Macy 				dsl_dir_rele(newparent, FTAG);
2015*eda14cbcSMatt Macy 				dsl_dir_rele(dd, FTAG);
2016*eda14cbcSMatt Macy 				return (err);
2017*eda14cbcSMatt Macy 			}
2018*eda14cbcSMatt Macy 		}
2019*eda14cbcSMatt Macy 
2020*eda14cbcSMatt Macy 		/* check for encryption errors */
2021*eda14cbcSMatt Macy 		error = dsl_dir_rename_crypt_check(dd, newparent);
2022*eda14cbcSMatt Macy 		if (error != 0) {
2023*eda14cbcSMatt Macy 			dsl_dir_rele(newparent, FTAG);
2024*eda14cbcSMatt Macy 			dsl_dir_rele(dd, FTAG);
2025*eda14cbcSMatt Macy 			return (SET_ERROR(EACCES));
2026*eda14cbcSMatt Macy 		}
2027*eda14cbcSMatt Macy 
2028*eda14cbcSMatt Macy 		/* no rename into our descendant */
2029*eda14cbcSMatt Macy 		if (closest_common_ancestor(dd, newparent) == dd) {
2030*eda14cbcSMatt Macy 			dsl_dir_rele(newparent, FTAG);
2031*eda14cbcSMatt Macy 			dsl_dir_rele(dd, FTAG);
2032*eda14cbcSMatt Macy 			return (SET_ERROR(EINVAL));
2033*eda14cbcSMatt Macy 		}
2034*eda14cbcSMatt Macy 
2035*eda14cbcSMatt Macy 		error = dsl_dir_transfer_possible(dd->dd_parent,
2036*eda14cbcSMatt Macy 		    newparent, fs_cnt, ss_cnt, myspace,
2037*eda14cbcSMatt Macy 		    ddra->ddra_cred, ddra->ddra_proc);
2038*eda14cbcSMatt Macy 		if (error != 0) {
2039*eda14cbcSMatt Macy 			dsl_dir_rele(newparent, FTAG);
2040*eda14cbcSMatt Macy 			dsl_dir_rele(dd, FTAG);
2041*eda14cbcSMatt Macy 			return (error);
2042*eda14cbcSMatt Macy 		}
2043*eda14cbcSMatt Macy 	}
2044*eda14cbcSMatt Macy 
2045*eda14cbcSMatt Macy 	dsl_dir_rele(newparent, FTAG);
2046*eda14cbcSMatt Macy 	dsl_dir_rele(dd, FTAG);
2047*eda14cbcSMatt Macy 	return (0);
2048*eda14cbcSMatt Macy }
2049*eda14cbcSMatt Macy 
2050*eda14cbcSMatt Macy static void
2051*eda14cbcSMatt Macy dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
2052*eda14cbcSMatt Macy {
2053*eda14cbcSMatt Macy 	dsl_dir_rename_arg_t *ddra = arg;
2054*eda14cbcSMatt Macy 	dsl_pool_t *dp = dmu_tx_pool(tx);
2055*eda14cbcSMatt Macy 	dsl_dir_t *dd, *newparent;
2056*eda14cbcSMatt Macy 	const char *mynewname;
2057*eda14cbcSMatt Macy 	objset_t *mos = dp->dp_meta_objset;
2058*eda14cbcSMatt Macy 
2059*eda14cbcSMatt Macy 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
2060*eda14cbcSMatt Macy 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
2061*eda14cbcSMatt Macy 	    &mynewname));
2062*eda14cbcSMatt Macy 
2063*eda14cbcSMatt Macy 	/* Log this before we change the name. */
2064*eda14cbcSMatt Macy 	spa_history_log_internal_dd(dd, "rename", tx,
2065*eda14cbcSMatt Macy 	    "-> %s", ddra->ddra_newname);
2066*eda14cbcSMatt Macy 
2067*eda14cbcSMatt Macy 	if (newparent != dd->dd_parent) {
2068*eda14cbcSMatt Macy 		objset_t *os = dd->dd_pool->dp_meta_objset;
2069*eda14cbcSMatt Macy 		uint64_t fs_cnt = 0;
2070*eda14cbcSMatt Macy 		uint64_t ss_cnt = 0;
2071*eda14cbcSMatt Macy 
2072*eda14cbcSMatt Macy 		/*
2073*eda14cbcSMatt Macy 		 * We already made sure the dd counts were initialized in the
2074*eda14cbcSMatt Macy 		 * check function.
2075*eda14cbcSMatt Macy 		 */
2076*eda14cbcSMatt Macy 		if (spa_feature_is_active(dp->dp_spa,
2077*eda14cbcSMatt Macy 		    SPA_FEATURE_FS_SS_LIMIT)) {
2078*eda14cbcSMatt Macy 			VERIFY0(zap_lookup(os, dd->dd_object,
2079*eda14cbcSMatt Macy 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
2080*eda14cbcSMatt Macy 			    &fs_cnt));
2081*eda14cbcSMatt Macy 			/* add 1 for the filesystem itself that we're moving */
2082*eda14cbcSMatt Macy 			fs_cnt++;
2083*eda14cbcSMatt Macy 
2084*eda14cbcSMatt Macy 			VERIFY0(zap_lookup(os, dd->dd_object,
2085*eda14cbcSMatt Macy 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
2086*eda14cbcSMatt Macy 			    &ss_cnt));
2087*eda14cbcSMatt Macy 		}
2088*eda14cbcSMatt Macy 
2089*eda14cbcSMatt Macy 		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
2090*eda14cbcSMatt Macy 		    DD_FIELD_FILESYSTEM_COUNT, tx);
2091*eda14cbcSMatt Macy 		dsl_fs_ss_count_adjust(newparent, fs_cnt,
2092*eda14cbcSMatt Macy 		    DD_FIELD_FILESYSTEM_COUNT, tx);
2093*eda14cbcSMatt Macy 
2094*eda14cbcSMatt Macy 		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
2095*eda14cbcSMatt Macy 		    DD_FIELD_SNAPSHOT_COUNT, tx);
2096*eda14cbcSMatt Macy 		dsl_fs_ss_count_adjust(newparent, ss_cnt,
2097*eda14cbcSMatt Macy 		    DD_FIELD_SNAPSHOT_COUNT, tx);
2098*eda14cbcSMatt Macy 
2099*eda14cbcSMatt Macy 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
2100*eda14cbcSMatt Macy 		    -dsl_dir_phys(dd)->dd_used_bytes,
2101*eda14cbcSMatt Macy 		    -dsl_dir_phys(dd)->dd_compressed_bytes,
2102*eda14cbcSMatt Macy 		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
2103*eda14cbcSMatt Macy 		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
2104*eda14cbcSMatt Macy 		    dsl_dir_phys(dd)->dd_used_bytes,
2105*eda14cbcSMatt Macy 		    dsl_dir_phys(dd)->dd_compressed_bytes,
2106*eda14cbcSMatt Macy 		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
2107*eda14cbcSMatt Macy 
2108*eda14cbcSMatt Macy 		if (dsl_dir_phys(dd)->dd_reserved >
2109*eda14cbcSMatt Macy 		    dsl_dir_phys(dd)->dd_used_bytes) {
2110*eda14cbcSMatt Macy 			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
2111*eda14cbcSMatt Macy 			    dsl_dir_phys(dd)->dd_used_bytes;
2112*eda14cbcSMatt Macy 
2113*eda14cbcSMatt Macy 			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
2114*eda14cbcSMatt Macy 			    -unused_rsrv, 0, 0, tx);
2115*eda14cbcSMatt Macy 			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
2116*eda14cbcSMatt Macy 			    unused_rsrv, 0, 0, tx);
2117*eda14cbcSMatt Macy 		}
2118*eda14cbcSMatt Macy 	}
2119*eda14cbcSMatt Macy 
2120*eda14cbcSMatt Macy 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
2121*eda14cbcSMatt Macy 
2122*eda14cbcSMatt Macy 	/* remove from old parent zapobj */
2123*eda14cbcSMatt Macy 	VERIFY0(zap_remove(mos,
2124*eda14cbcSMatt Macy 	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
2125*eda14cbcSMatt Macy 	    dd->dd_myname, tx));
2126*eda14cbcSMatt Macy 
2127*eda14cbcSMatt Macy 	(void) strlcpy(dd->dd_myname, mynewname,
2128*eda14cbcSMatt Macy 	    sizeof (dd->dd_myname));
2129*eda14cbcSMatt Macy 	dsl_dir_rele(dd->dd_parent, dd);
2130*eda14cbcSMatt Macy 	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
2131*eda14cbcSMatt Macy 	VERIFY0(dsl_dir_hold_obj(dp,
2132*eda14cbcSMatt Macy 	    newparent->dd_object, NULL, dd, &dd->dd_parent));
2133*eda14cbcSMatt Macy 
2134*eda14cbcSMatt Macy 	/* add to new parent zapobj */
2135*eda14cbcSMatt Macy 	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
2136*eda14cbcSMatt Macy 	    dd->dd_myname, 8, 1, &dd->dd_object, tx));
2137*eda14cbcSMatt Macy 
2138*eda14cbcSMatt Macy 	zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
2139*eda14cbcSMatt Macy 	    ddra->ddra_newname, B_TRUE);
2140*eda14cbcSMatt Macy 
2141*eda14cbcSMatt Macy 	dsl_prop_notify_all(dd);
2142*eda14cbcSMatt Macy 
2143*eda14cbcSMatt Macy 	dsl_dir_rele(newparent, FTAG);
2144*eda14cbcSMatt Macy 	dsl_dir_rele(dd, FTAG);
2145*eda14cbcSMatt Macy }
2146*eda14cbcSMatt Macy 
2147*eda14cbcSMatt Macy int
2148*eda14cbcSMatt Macy dsl_dir_rename(const char *oldname, const char *newname)
2149*eda14cbcSMatt Macy {
2150*eda14cbcSMatt Macy 	dsl_dir_rename_arg_t ddra;
2151*eda14cbcSMatt Macy 
2152*eda14cbcSMatt Macy 	ddra.ddra_oldname = oldname;
2153*eda14cbcSMatt Macy 	ddra.ddra_newname = newname;
2154*eda14cbcSMatt Macy 	ddra.ddra_cred = CRED();
2155*eda14cbcSMatt Macy 	ddra.ddra_proc = curproc;
2156*eda14cbcSMatt Macy 
2157*eda14cbcSMatt Macy 	return (dsl_sync_task(oldname,
2158*eda14cbcSMatt Macy 	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
2159*eda14cbcSMatt Macy 	    3, ZFS_SPACE_CHECK_RESERVED));
2160*eda14cbcSMatt Macy }
2161*eda14cbcSMatt Macy 
2162*eda14cbcSMatt Macy int
2163*eda14cbcSMatt Macy dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
2164*eda14cbcSMatt Macy     uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space,
2165*eda14cbcSMatt Macy     cred_t *cr, proc_t *proc)
2166*eda14cbcSMatt Macy {
2167*eda14cbcSMatt Macy 	dsl_dir_t *ancestor;
2168*eda14cbcSMatt Macy 	int64_t adelta;
2169*eda14cbcSMatt Macy 	uint64_t avail;
2170*eda14cbcSMatt Macy 	int err;
2171*eda14cbcSMatt Macy 
2172*eda14cbcSMatt Macy 	ancestor = closest_common_ancestor(sdd, tdd);
2173*eda14cbcSMatt Macy 	adelta = would_change(sdd, -space, ancestor);
2174*eda14cbcSMatt Macy 	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
2175*eda14cbcSMatt Macy 	if (avail < space)
2176*eda14cbcSMatt Macy 		return (SET_ERROR(ENOSPC));
2177*eda14cbcSMatt Macy 
2178*eda14cbcSMatt Macy 	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
2179*eda14cbcSMatt Macy 	    ancestor, cr, proc);
2180*eda14cbcSMatt Macy 	if (err != 0)
2181*eda14cbcSMatt Macy 		return (err);
2182*eda14cbcSMatt Macy 	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
2183*eda14cbcSMatt Macy 	    ancestor, cr, proc);
2184*eda14cbcSMatt Macy 	if (err != 0)
2185*eda14cbcSMatt Macy 		return (err);
2186*eda14cbcSMatt Macy 
2187*eda14cbcSMatt Macy 	return (0);
2188*eda14cbcSMatt Macy }
2189*eda14cbcSMatt Macy 
2190*eda14cbcSMatt Macy inode_timespec_t
2191*eda14cbcSMatt Macy dsl_dir_snap_cmtime(dsl_dir_t *dd)
2192*eda14cbcSMatt Macy {
2193*eda14cbcSMatt Macy 	inode_timespec_t t;
2194*eda14cbcSMatt Macy 
2195*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_lock);
2196*eda14cbcSMatt Macy 	t = dd->dd_snap_cmtime;
2197*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_lock);
2198*eda14cbcSMatt Macy 
2199*eda14cbcSMatt Macy 	return (t);
2200*eda14cbcSMatt Macy }
2201*eda14cbcSMatt Macy 
2202*eda14cbcSMatt Macy void
2203*eda14cbcSMatt Macy dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
2204*eda14cbcSMatt Macy {
2205*eda14cbcSMatt Macy 	inode_timespec_t t;
2206*eda14cbcSMatt Macy 
2207*eda14cbcSMatt Macy 	gethrestime(&t);
2208*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_lock);
2209*eda14cbcSMatt Macy 	dd->dd_snap_cmtime = t;
2210*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_lock);
2211*eda14cbcSMatt Macy }
2212*eda14cbcSMatt Macy 
2213*eda14cbcSMatt Macy void
2214*eda14cbcSMatt Macy dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
2215*eda14cbcSMatt Macy {
2216*eda14cbcSMatt Macy 	objset_t *mos = dd->dd_pool->dp_meta_objset;
2217*eda14cbcSMatt Macy 	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
2218*eda14cbcSMatt Macy }
2219*eda14cbcSMatt Macy 
2220*eda14cbcSMatt Macy boolean_t
2221*eda14cbcSMatt Macy dsl_dir_is_zapified(dsl_dir_t *dd)
2222*eda14cbcSMatt Macy {
2223*eda14cbcSMatt Macy 	dmu_object_info_t doi;
2224*eda14cbcSMatt Macy 
2225*eda14cbcSMatt Macy 	dmu_object_info_from_db(dd->dd_dbuf, &doi);
2226*eda14cbcSMatt Macy 	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
2227*eda14cbcSMatt Macy }
2228*eda14cbcSMatt Macy 
2229*eda14cbcSMatt Macy void
2230*eda14cbcSMatt Macy dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
2231*eda14cbcSMatt Macy {
2232*eda14cbcSMatt Macy 	objset_t *mos = dd->dd_pool->dp_meta_objset;
2233*eda14cbcSMatt Macy 	ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
2234*eda14cbcSMatt Macy 	    SPA_FEATURE_LIVELIST));
2235*eda14cbcSMatt Macy 	dsl_deadlist_open(&dd->dd_livelist, mos, obj);
2236*eda14cbcSMatt Macy 	bplist_create(&dd->dd_pending_allocs);
2237*eda14cbcSMatt Macy 	bplist_create(&dd->dd_pending_frees);
2238*eda14cbcSMatt Macy }
2239*eda14cbcSMatt Macy 
2240*eda14cbcSMatt Macy void
2241*eda14cbcSMatt Macy dsl_dir_livelist_close(dsl_dir_t *dd)
2242*eda14cbcSMatt Macy {
2243*eda14cbcSMatt Macy 	dsl_deadlist_close(&dd->dd_livelist);
2244*eda14cbcSMatt Macy 	bplist_destroy(&dd->dd_pending_allocs);
2245*eda14cbcSMatt Macy 	bplist_destroy(&dd->dd_pending_frees);
2246*eda14cbcSMatt Macy }
2247*eda14cbcSMatt Macy 
2248*eda14cbcSMatt Macy void
2249*eda14cbcSMatt Macy dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
2250*eda14cbcSMatt Macy {
2251*eda14cbcSMatt Macy 	uint64_t obj;
2252*eda14cbcSMatt Macy 	dsl_pool_t *dp = dmu_tx_pool(tx);
2253*eda14cbcSMatt Macy 	spa_t *spa = dp->dp_spa;
2254*eda14cbcSMatt Macy 	livelist_condense_entry_t to_condense = spa->spa_to_condense;
2255*eda14cbcSMatt Macy 
2256*eda14cbcSMatt Macy 	if (!dsl_deadlist_is_open(&dd->dd_livelist))
2257*eda14cbcSMatt Macy 		return;
2258*eda14cbcSMatt Macy 
2259*eda14cbcSMatt Macy 	/*
2260*eda14cbcSMatt Macy 	 * If the livelist being removed is set to be condensed, stop the
2261*eda14cbcSMatt Macy 	 * condense zthr and indicate the cancellation in the spa_to_condense
2262*eda14cbcSMatt Macy 	 * struct in case the condense no-wait synctask has already started
2263*eda14cbcSMatt Macy 	 */
2264*eda14cbcSMatt Macy 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
2265*eda14cbcSMatt Macy 	if (ll_condense_thread != NULL &&
2266*eda14cbcSMatt Macy 	    (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
2267*eda14cbcSMatt Macy 		/*
2268*eda14cbcSMatt Macy 		 * We use zthr_wait_cycle_done instead of zthr_cancel
2269*eda14cbcSMatt Macy 		 * because we don't want to destroy the zthr, just have
2270*eda14cbcSMatt Macy 		 * it skip its current task.
2271*eda14cbcSMatt Macy 		 */
2272*eda14cbcSMatt Macy 		spa->spa_to_condense.cancelled = B_TRUE;
2273*eda14cbcSMatt Macy 		zthr_wait_cycle_done(ll_condense_thread);
2274*eda14cbcSMatt Macy 		/*
2275*eda14cbcSMatt Macy 		 * If we've returned from zthr_wait_cycle_done without
2276*eda14cbcSMatt Macy 		 * clearing the to_condense data structure it's either
2277*eda14cbcSMatt Macy 		 * because the no-wait synctask has started (which is
2278*eda14cbcSMatt Macy 		 * indicated by 'syncing' field of to_condense) and we
2279*eda14cbcSMatt Macy 		 * can expect it to clear to_condense on its own.
2280*eda14cbcSMatt Macy 		 * Otherwise, we returned before the zthr ran. The
2281*eda14cbcSMatt Macy 		 * checkfunc will now fail as cancelled == B_TRUE so we
2282*eda14cbcSMatt Macy 		 * can safely NULL out ds, allowing a different dir's
2283*eda14cbcSMatt Macy 		 * livelist to be condensed.
2284*eda14cbcSMatt Macy 		 *
2285*eda14cbcSMatt Macy 		 * We can be sure that the to_condense struct will not
2286*eda14cbcSMatt Macy 		 * be repopulated at this stage because both this
2287*eda14cbcSMatt Macy 		 * function and dsl_livelist_try_condense execute in
2288*eda14cbcSMatt Macy 		 * syncing context.
2289*eda14cbcSMatt Macy 		 */
2290*eda14cbcSMatt Macy 		if ((spa->spa_to_condense.ds != NULL) &&
2291*eda14cbcSMatt Macy 		    !spa->spa_to_condense.syncing) {
2292*eda14cbcSMatt Macy 			dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
2293*eda14cbcSMatt Macy 			    spa);
2294*eda14cbcSMatt Macy 			spa->spa_to_condense.ds = NULL;
2295*eda14cbcSMatt Macy 		}
2296*eda14cbcSMatt Macy 	}
2297*eda14cbcSMatt Macy 
2298*eda14cbcSMatt Macy 	dsl_dir_livelist_close(dd);
2299*eda14cbcSMatt Macy 	VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
2300*eda14cbcSMatt Macy 	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
2301*eda14cbcSMatt Macy 	VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
2302*eda14cbcSMatt Macy 	    DD_FIELD_LIVELIST, tx));
2303*eda14cbcSMatt Macy 	if (total) {
2304*eda14cbcSMatt Macy 		dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
2305*eda14cbcSMatt Macy 		spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
2306*eda14cbcSMatt Macy 	}
2307*eda14cbcSMatt Macy }
2308*eda14cbcSMatt Macy 
2309*eda14cbcSMatt Macy static int
2310*eda14cbcSMatt Macy dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds,
2311*eda14cbcSMatt Macy     zfs_wait_activity_t activity, boolean_t *in_progress)
2312*eda14cbcSMatt Macy {
2313*eda14cbcSMatt Macy 	int error = 0;
2314*eda14cbcSMatt Macy 
2315*eda14cbcSMatt Macy 	ASSERT(MUTEX_HELD(&dd->dd_activity_lock));
2316*eda14cbcSMatt Macy 
2317*eda14cbcSMatt Macy 	switch (activity) {
2318*eda14cbcSMatt Macy 	case ZFS_WAIT_DELETEQ: {
2319*eda14cbcSMatt Macy #ifdef _KERNEL
2320*eda14cbcSMatt Macy 		objset_t *os;
2321*eda14cbcSMatt Macy 		error = dmu_objset_from_ds(ds, &os);
2322*eda14cbcSMatt Macy 		if (error != 0)
2323*eda14cbcSMatt Macy 			break;
2324*eda14cbcSMatt Macy 
2325*eda14cbcSMatt Macy 		mutex_enter(&os->os_user_ptr_lock);
2326*eda14cbcSMatt Macy 		void *user = dmu_objset_get_user(os);
2327*eda14cbcSMatt Macy 		mutex_exit(&os->os_user_ptr_lock);
2328*eda14cbcSMatt Macy 		if (dmu_objset_type(os) != DMU_OST_ZFS ||
2329*eda14cbcSMatt Macy 		    user == NULL || zfs_get_vfs_flag_unmounted(os)) {
2330*eda14cbcSMatt Macy 			*in_progress = B_FALSE;
2331*eda14cbcSMatt Macy 			return (0);
2332*eda14cbcSMatt Macy 		}
2333*eda14cbcSMatt Macy 
2334*eda14cbcSMatt Macy 		uint64_t readonly = B_FALSE;
2335*eda14cbcSMatt Macy 		error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly,
2336*eda14cbcSMatt Macy 		    NULL);
2337*eda14cbcSMatt Macy 
2338*eda14cbcSMatt Macy 		if (error != 0)
2339*eda14cbcSMatt Macy 			break;
2340*eda14cbcSMatt Macy 
2341*eda14cbcSMatt Macy 		if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) {
2342*eda14cbcSMatt Macy 			*in_progress = B_FALSE;
2343*eda14cbcSMatt Macy 			return (0);
2344*eda14cbcSMatt Macy 		}
2345*eda14cbcSMatt Macy 
2346*eda14cbcSMatt Macy 		uint64_t count, unlinked_obj;
2347*eda14cbcSMatt Macy 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
2348*eda14cbcSMatt Macy 		    &unlinked_obj);
2349*eda14cbcSMatt Macy 		if (error != 0) {
2350*eda14cbcSMatt Macy 			dsl_dataset_rele(ds, FTAG);
2351*eda14cbcSMatt Macy 			break;
2352*eda14cbcSMatt Macy 		}
2353*eda14cbcSMatt Macy 		error = zap_count(os, unlinked_obj, &count);
2354*eda14cbcSMatt Macy 
2355*eda14cbcSMatt Macy 		if (error == 0)
2356*eda14cbcSMatt Macy 			*in_progress = (count != 0);
2357*eda14cbcSMatt Macy 		break;
2358*eda14cbcSMatt Macy #else
2359*eda14cbcSMatt Macy 		/*
2360*eda14cbcSMatt Macy 		 * The delete queue is ZPL specific, and libzpool doesn't have
2361*eda14cbcSMatt Macy 		 * it. It doesn't make sense to wait for it.
2362*eda14cbcSMatt Macy 		 */
2363*eda14cbcSMatt Macy 		*in_progress = B_FALSE;
2364*eda14cbcSMatt Macy 		break;
2365*eda14cbcSMatt Macy #endif
2366*eda14cbcSMatt Macy 	}
2367*eda14cbcSMatt Macy 	default:
2368*eda14cbcSMatt Macy 		panic("unrecognized value for activity %d", activity);
2369*eda14cbcSMatt Macy 	}
2370*eda14cbcSMatt Macy 
2371*eda14cbcSMatt Macy 	return (error);
2372*eda14cbcSMatt Macy }
2373*eda14cbcSMatt Macy 
2374*eda14cbcSMatt Macy int
2375*eda14cbcSMatt Macy dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity,
2376*eda14cbcSMatt Macy     boolean_t *waited)
2377*eda14cbcSMatt Macy {
2378*eda14cbcSMatt Macy 	int error = 0;
2379*eda14cbcSMatt Macy 	boolean_t in_progress;
2380*eda14cbcSMatt Macy 	dsl_pool_t *dp = dd->dd_pool;
2381*eda14cbcSMatt Macy 	for (;;) {
2382*eda14cbcSMatt Macy 		dsl_pool_config_enter(dp, FTAG);
2383*eda14cbcSMatt Macy 		error = dsl_dir_activity_in_progress(dd, ds, activity,
2384*eda14cbcSMatt Macy 		    &in_progress);
2385*eda14cbcSMatt Macy 		dsl_pool_config_exit(dp, FTAG);
2386*eda14cbcSMatt Macy 		if (error != 0 || !in_progress)
2387*eda14cbcSMatt Macy 			break;
2388*eda14cbcSMatt Macy 
2389*eda14cbcSMatt Macy 		*waited = B_TRUE;
2390*eda14cbcSMatt Macy 
2391*eda14cbcSMatt Macy 		if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) ==
2392*eda14cbcSMatt Macy 		    0 || dd->dd_activity_cancelled) {
2393*eda14cbcSMatt Macy 			error = SET_ERROR(EINTR);
2394*eda14cbcSMatt Macy 			break;
2395*eda14cbcSMatt Macy 		}
2396*eda14cbcSMatt Macy 	}
2397*eda14cbcSMatt Macy 	return (error);
2398*eda14cbcSMatt Macy }
2399*eda14cbcSMatt Macy 
2400*eda14cbcSMatt Macy void
2401*eda14cbcSMatt Macy dsl_dir_cancel_waiters(dsl_dir_t *dd)
2402*eda14cbcSMatt Macy {
2403*eda14cbcSMatt Macy 	mutex_enter(&dd->dd_activity_lock);
2404*eda14cbcSMatt Macy 	dd->dd_activity_cancelled = B_TRUE;
2405*eda14cbcSMatt Macy 	cv_broadcast(&dd->dd_activity_cv);
2406*eda14cbcSMatt Macy 	while (dd->dd_activity_waiters > 0)
2407*eda14cbcSMatt Macy 		cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock);
2408*eda14cbcSMatt Macy 	mutex_exit(&dd->dd_activity_lock);
2409*eda14cbcSMatt Macy }
2410*eda14cbcSMatt Macy 
/*
 * Kernel builds only: export the quota and reservation setters via
 * EXPORT_SYMBOL (presumably so other kernel modules can call them).
 */
#if defined(_KERNEL)
EXPORT_SYMBOL(dsl_dir_set_quota);
EXPORT_SYMBOL(dsl_dir_set_reservation);
#endif
2415