xref: /freebsd/sys/contrib/openzfs/module/zfs/dsl_dir.c (revision 78cd75393ec79565c63927bf200f06f839a1dc05)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24  * Copyright (c) 2013 Martin Matuska. All rights reserved.
25  * Copyright (c) 2014 Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
28  * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
29  * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
30  */
31 
32 #include <sys/dmu.h>
33 #include <sys/dmu_objset.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/dsl_dataset.h>
36 #include <sys/dsl_dir.h>
37 #include <sys/dsl_prop.h>
38 #include <sys/dsl_synctask.h>
39 #include <sys/dsl_deleg.h>
40 #include <sys/dmu_impl.h>
41 #include <sys/spa.h>
42 #include <sys/spa_impl.h>
43 #include <sys/metaslab.h>
44 #include <sys/zap.h>
45 #include <sys/zio.h>
46 #include <sys/arc.h>
47 #include <sys/sunddi.h>
48 #include <sys/zfeature.h>
49 #include <sys/policy.h>
50 #include <sys/zfs_vfsops.h>
51 #include <sys/zfs_znode.h>
52 #include <sys/zvol.h>
53 #include <sys/zthr.h>
54 #include "zfs_namecheck.h"
55 #include "zfs_prop.h"
56 
/*
 * This controls whether the quota is verified for ZVOLs or not.
 * Quotas are currently not implemented for ZVOLs: the size of the volume
 * already acts as its quota, so a separate quota check adds nothing, while
 * the quota mechanism itself can introduce a significant performance drop.
 * The default (B_TRUE) keeps the check enabled.
 */
static int zvol_enforce_quotas = B_TRUE;
65 
66 /*
67  * Filesystem and Snapshot Limits
68  * ------------------------------
69  *
70  * These limits are used to restrict the number of filesystems and/or snapshots
71  * that can be created at a given level in the tree or below. A typical
72  * use-case is with a delegated dataset where the administrator wants to ensure
73  * that a user within the zone is not creating too many additional filesystems
74  * or snapshots, even though they're not exceeding their space quota.
75  *
76  * The filesystem and snapshot counts are stored as extensible properties. This
77  * capability is controlled by a feature flag and must be enabled to be used.
78  * Once enabled, the feature is not active until the first limit is set. At
79  * that point, future operations to create/destroy filesystems or snapshots
80  * will validate and update the counts.
81  *
82  * Because the count properties will not exist before the feature is active,
83  * the counts are updated when a limit is first set on an uninitialized
84  * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
85  * all of the nested filesystems/snapshots. Thus, a new leaf node has a
86  * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
87  * snapshot count properties on a node indicate uninitialized counts on that
88  * node.) When first setting a limit on an uninitialized node, the code starts
89  * at the filesystem with the new limit and descends into all sub-filesystems
90  * to add the count properties.
91  *
92  * In practice this is lightweight since a limit is typically set when the
93  * filesystem is created and thus has no children. Once valid, changing the
94  * limit value won't require a re-traversal since the counts are already valid.
95  * When recursively fixing the counts, if a node with a limit is encountered
96  * during the descent, the counts are known to be valid and there is no need to
97  * descend into that filesystem's children. The counts on filesystems above the
98  * one with the new limit will still be uninitialized, unless a limit is
99  * eventually set on one of those filesystems. The counts are always recursively
100  * updated when a limit is set on a dataset, unless there is already a limit.
101  * When a new limit value is set on a filesystem with an existing limit, it is
102  * possible for the new limit to be less than the current count at that level
103  * since a user who can change the limit is also allowed to exceed the limit.
104  *
105  * Once the feature is active, then whenever a filesystem or snapshot is
106  * created, the code recurses up the tree, validating the new count against the
107  * limit at each initialized level. In practice, most levels will not have a
108  * limit set. If there is a limit at any initialized level up the tree, the
109  * check must pass or the creation will fail. Likewise, when a filesystem or
110  * snapshot is destroyed, the counts are recursively adjusted all the way up
 * the initialized nodes in the tree. Renaming a filesystem into a different
 * point in the tree will first validate, then update the counts on each branch
 * up to the common ancestor. A receive will also validate the counts and then
 * update them.
115  *
116  * An exception to the above behavior is that the limit is not enforced if the
117  * user has permission to modify the limit. This is primarily so that
118  * recursive snapshots in the global zone always work. We want to prevent a
119  * denial-of-service in which a lower level delegated dataset could max out its
120  * limit and thus block recursive snapshots from being taken in the global zone.
121  * Because of this, it is possible for the snapshot count to be over the limit
122  * and snapshots taken in the global zone could cause a lower level dataset to
123  * hit or exceed its limit. The administrator taking the global zone recursive
124  * snapshot should be aware of this side-effect and behave accordingly.
125  * For consistency, the filesystem limit is also not enforced if the user can
126  * modify the limit.
127  *
 * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
 * and updated by dsl_fs_ss_count_adjust(). A new limit value is set up in
 * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
 * dsl_dir_init_fs_ss_count().
132  */
133 
/* Forward declaration; being static, it is defined elsewhere in this file. */
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);

/*
 * Argument bundle pairing a dsl_dir_t with a txg.
 * NOTE(review): its users are not visible in this part of the file; the
 * "ddulrt" prefix presumably abbreviates the sync task that consumes it --
 * confirm against the callers.  Note that the two members do not share the
 * same prefix (ddulrta_ vs. ddlrta_).
 */
typedef struct ddulrt_arg {
	dsl_dir_t	*ddulrta_dd;
	uint64_t	ddlrta_txg;
} ddulrt_arg_t;
140 
141 static void
142 dsl_dir_evict_async(void *dbu)
143 {
144 	dsl_dir_t *dd = dbu;
145 	int t;
146 	dsl_pool_t *dp __maybe_unused = dd->dd_pool;
147 
148 	dd->dd_dbuf = NULL;
149 
150 	for (t = 0; t < TXG_SIZE; t++) {
151 		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
152 		ASSERT(dd->dd_tempreserved[t] == 0);
153 		ASSERT(dd->dd_space_towrite[t] == 0);
154 	}
155 
156 	if (dd->dd_parent)
157 		dsl_dir_async_rele(dd->dd_parent, dd);
158 
159 	spa_async_close(dd->dd_pool->dp_spa, dd);
160 
161 	if (dsl_deadlist_is_open(&dd->dd_livelist))
162 		dsl_dir_livelist_close(dd);
163 
164 	dsl_prop_fini(dd);
165 	cv_destroy(&dd->dd_activity_cv);
166 	mutex_destroy(&dd->dd_activity_lock);
167 	mutex_destroy(&dd->dd_lock);
168 	kmem_free(dd, sizeof (dsl_dir_t));
169 }
170 
171 int
172 dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
173     const char *tail, const void *tag, dsl_dir_t **ddp)
174 {
175 	dmu_buf_t *dbuf;
176 	dsl_dir_t *dd;
177 	dmu_object_info_t doi;
178 	int err;
179 
180 	ASSERT(dsl_pool_config_held(dp));
181 
182 	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
183 	if (err != 0)
184 		return (err);
185 	dd = dmu_buf_get_user(dbuf);
186 
187 	dmu_object_info_from_db(dbuf, &doi);
188 	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
189 	ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
190 
191 	if (dd == NULL) {
192 		dsl_dir_t *winner;
193 
194 		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
195 		dd->dd_object = ddobj;
196 		dd->dd_dbuf = dbuf;
197 		dd->dd_pool = dp;
198 
199 		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
200 		mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
201 		cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);
202 		dsl_prop_init(dd);
203 
204 		if (dsl_dir_is_zapified(dd)) {
205 			err = zap_lookup(dp->dp_meta_objset,
206 			    ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
207 			    sizeof (uint64_t), 1, &dd->dd_crypto_obj);
208 			if (err == 0) {
209 				/* check for on-disk format errata */
210 				if (dsl_dir_incompatible_encryption_version(
211 				    dd)) {
212 					dp->dp_spa->spa_errata =
213 					    ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
214 				}
215 			} else if (err != ENOENT) {
216 				goto errout;
217 			}
218 		}
219 
220 		if (dsl_dir_phys(dd)->dd_parent_obj) {
221 			err = dsl_dir_hold_obj(dp,
222 			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
223 			    &dd->dd_parent);
224 			if (err != 0)
225 				goto errout;
226 			if (tail) {
227 #ifdef ZFS_DEBUG
228 				uint64_t foundobj;
229 
230 				err = zap_lookup(dp->dp_meta_objset,
231 				    dsl_dir_phys(dd->dd_parent)->
232 				    dd_child_dir_zapobj, tail,
233 				    sizeof (foundobj), 1, &foundobj);
234 				ASSERT(err || foundobj == ddobj);
235 #endif
236 				(void) strlcpy(dd->dd_myname, tail,
237 				    sizeof (dd->dd_myname));
238 			} else {
239 				err = zap_value_search(dp->dp_meta_objset,
240 				    dsl_dir_phys(dd->dd_parent)->
241 				    dd_child_dir_zapobj,
242 				    ddobj, 0, dd->dd_myname);
243 			}
244 			if (err != 0)
245 				goto errout;
246 		} else {
247 			(void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa),
248 			    sizeof (dd->dd_myname));
249 		}
250 
251 		if (dsl_dir_is_clone(dd)) {
252 			dmu_buf_t *origin_bonus;
253 			dsl_dataset_phys_t *origin_phys;
254 
255 			/*
256 			 * We can't open the origin dataset, because
257 			 * that would require opening this dsl_dir.
258 			 * Just look at its phys directly instead.
259 			 */
260 			err = dmu_bonus_hold(dp->dp_meta_objset,
261 			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
262 			    &origin_bonus);
263 			if (err != 0)
264 				goto errout;
265 			origin_phys = origin_bonus->db_data;
266 			dd->dd_origin_txg =
267 			    origin_phys->ds_creation_txg;
268 			dmu_buf_rele(origin_bonus, FTAG);
269 			if (dsl_dir_is_zapified(dd)) {
270 				uint64_t obj;
271 				err = zap_lookup(dp->dp_meta_objset,
272 				    dd->dd_object, DD_FIELD_LIVELIST,
273 				    sizeof (uint64_t), 1, &obj);
274 				if (err == 0)
275 					dsl_dir_livelist_open(dd, obj);
276 				else if (err != ENOENT)
277 					goto errout;
278 			}
279 		}
280 
281 		if (dsl_dir_is_zapified(dd)) {
282 			inode_timespec_t t = {0};
283 			(void) zap_lookup(dp->dp_meta_objset, ddobj,
284 			    DD_FIELD_SNAPSHOTS_CHANGED,
285 			    sizeof (uint64_t),
286 			    sizeof (inode_timespec_t) / sizeof (uint64_t),
287 			    &t);
288 			dd->dd_snap_cmtime = t;
289 		}
290 
291 		dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
292 		    &dd->dd_dbuf);
293 		winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
294 		if (winner != NULL) {
295 			if (dd->dd_parent)
296 				dsl_dir_rele(dd->dd_parent, dd);
297 			if (dsl_deadlist_is_open(&dd->dd_livelist))
298 				dsl_dir_livelist_close(dd);
299 			dsl_prop_fini(dd);
300 			cv_destroy(&dd->dd_activity_cv);
301 			mutex_destroy(&dd->dd_activity_lock);
302 			mutex_destroy(&dd->dd_lock);
303 			kmem_free(dd, sizeof (dsl_dir_t));
304 			dd = winner;
305 		} else {
306 			spa_open_ref(dp->dp_spa, dd);
307 		}
308 	}
309 
310 	/*
311 	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
312 	 * holds on the spa.  We need the open-to-close holds because
313 	 * otherwise the spa_refcnt wouldn't change when we open a
314 	 * dir which the spa also has open, so we could incorrectly
315 	 * think it was OK to unload/export/destroy the pool.  We need
316 	 * the instantiate-to-evict hold because the dsl_dir_t has a
317 	 * pointer to the dd_pool, which has a pointer to the spa_t.
318 	 */
319 	spa_open_ref(dp->dp_spa, tag);
320 	ASSERT3P(dd->dd_pool, ==, dp);
321 	ASSERT3U(dd->dd_object, ==, ddobj);
322 	ASSERT3P(dd->dd_dbuf, ==, dbuf);
323 	*ddp = dd;
324 	return (0);
325 
326 errout:
327 	if (dd->dd_parent)
328 		dsl_dir_rele(dd->dd_parent, dd);
329 	if (dsl_deadlist_is_open(&dd->dd_livelist))
330 		dsl_dir_livelist_close(dd);
331 	dsl_prop_fini(dd);
332 	cv_destroy(&dd->dd_activity_cv);
333 	mutex_destroy(&dd->dd_activity_lock);
334 	mutex_destroy(&dd->dd_lock);
335 	kmem_free(dd, sizeof (dsl_dir_t));
336 	dmu_buf_rele(dbuf, tag);
337 	return (err);
338 }
339 
340 void
341 dsl_dir_rele(dsl_dir_t *dd, const void *tag)
342 {
343 	dprintf_dd(dd, "%s\n", "");
344 	spa_close(dd->dd_pool->dp_spa, tag);
345 	dmu_buf_rele(dd->dd_dbuf, tag);
346 }
347 
348 /*
349  * Remove a reference to the given dsl dir that is being asynchronously
350  * released.  Async releases occur from a taskq performing eviction of
351  * dsl datasets and dirs.  This process is identical to a normal release
352  * with the exception of using the async API for releasing the reference on
353  * the spa.
354  */
355 void
356 dsl_dir_async_rele(dsl_dir_t *dd, const void *tag)
357 {
358 	dprintf_dd(dd, "%s\n", "");
359 	spa_async_close(dd->dd_pool->dp_spa, tag);
360 	dmu_buf_rele(dd->dd_dbuf, tag);
361 }
362 
363 /* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
364 void
365 dsl_dir_name(dsl_dir_t *dd, char *buf)
366 {
367 	if (dd->dd_parent) {
368 		dsl_dir_name(dd->dd_parent, buf);
369 		VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
370 		    ZFS_MAX_DATASET_NAME_LEN);
371 	} else {
372 		buf[0] = '\0';
373 	}
374 	if (!MUTEX_HELD(&dd->dd_lock)) {
375 		/*
376 		 * recursive mutex so that we can use
377 		 * dprintf_dd() with dd_lock held
378 		 */
379 		mutex_enter(&dd->dd_lock);
380 		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
381 		    <, ZFS_MAX_DATASET_NAME_LEN);
382 		mutex_exit(&dd->dd_lock);
383 	} else {
384 		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
385 		    <, ZFS_MAX_DATASET_NAME_LEN);
386 	}
387 }
388 
389 /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
390 int
391 dsl_dir_namelen(dsl_dir_t *dd)
392 {
393 	int result = 0;
394 
395 	if (dd->dd_parent) {
396 		/* parent's name + 1 for the "/" */
397 		result = dsl_dir_namelen(dd->dd_parent) + 1;
398 	}
399 
400 	if (!MUTEX_HELD(&dd->dd_lock)) {
401 		/* see dsl_dir_name */
402 		mutex_enter(&dd->dd_lock);
403 		result += strlen(dd->dd_myname);
404 		mutex_exit(&dd->dd_lock);
405 	} else {
406 		result += strlen(dd->dd_myname);
407 	}
408 
409 	return (result);
410 }
411 
412 static int
413 getcomponent(const char *path, char *component, const char **nextp)
414 {
415 	char *p;
416 
417 	if ((path == NULL) || (path[0] == '\0'))
418 		return (SET_ERROR(ENOENT));
419 	/* This would be a good place to reserve some namespace... */
420 	p = strpbrk(path, "/@");
421 	if (p && (p[1] == '/' || p[1] == '@')) {
422 		/* two separators in a row */
423 		return (SET_ERROR(EINVAL));
424 	}
425 	if (p == NULL || p == path) {
426 		/*
427 		 * if the first thing is an @ or /, it had better be an
428 		 * @ and it had better not have any more ats or slashes,
429 		 * and it had better have something after the @.
430 		 */
431 		if (p != NULL &&
432 		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
433 			return (SET_ERROR(EINVAL));
434 		if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
435 			return (SET_ERROR(ENAMETOOLONG));
436 		(void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN);
437 		p = NULL;
438 	} else if (p[0] == '/') {
439 		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
440 			return (SET_ERROR(ENAMETOOLONG));
441 		(void) strlcpy(component, path, p - path + 1);
442 		p++;
443 	} else if (p[0] == '@') {
444 		/*
445 		 * if the next separator is an @, there better not be
446 		 * any more slashes.
447 		 */
448 		if (strchr(path, '/'))
449 			return (SET_ERROR(EINVAL));
450 		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
451 			return (SET_ERROR(ENAMETOOLONG));
452 		(void) strlcpy(component, path, p - path + 1);
453 	} else {
454 		panic("invalid p=%p", (void *)p);
455 	}
456 	*nextp = p;
457 	return (0);
458 }
459 
460 /*
461  * Return the dsl_dir_t, and possibly the last component which couldn't
462  * be found in *tail.  The name must be in the specified dsl_pool_t.  This
463  * thread must hold the dp_config_rwlock for the pool.  Returns NULL if the
464  * path is bogus, or if tail==NULL and we couldn't parse the whole name.
465  * (*tail)[0] == '@' means that the last component is a snapshot.
466  */
467 int
468 dsl_dir_hold(dsl_pool_t *dp, const char *name, const void *tag,
469     dsl_dir_t **ddp, const char **tailp)
470 {
471 	char *buf;
472 	const char *spaname, *next, *nextnext = NULL;
473 	int err;
474 	dsl_dir_t *dd;
475 	uint64_t ddobj;
476 
477 	buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
478 	err = getcomponent(name, buf, &next);
479 	if (err != 0)
480 		goto error;
481 
482 	/* Make sure the name is in the specified pool. */
483 	spaname = spa_name(dp->dp_spa);
484 	if (strcmp(buf, spaname) != 0) {
485 		err = SET_ERROR(EXDEV);
486 		goto error;
487 	}
488 
489 	ASSERT(dsl_pool_config_held(dp));
490 
491 	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
492 	if (err != 0) {
493 		goto error;
494 	}
495 
496 	while (next != NULL) {
497 		dsl_dir_t *child_dd;
498 		err = getcomponent(next, buf, &nextnext);
499 		if (err != 0)
500 			break;
501 		ASSERT(next[0] != '\0');
502 		if (next[0] == '@')
503 			break;
504 		dprintf("looking up %s in obj%lld\n",
505 		    buf, (longlong_t)dsl_dir_phys(dd)->dd_child_dir_zapobj);
506 
507 		err = zap_lookup(dp->dp_meta_objset,
508 		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
509 		    buf, sizeof (ddobj), 1, &ddobj);
510 		if (err != 0) {
511 			if (err == ENOENT)
512 				err = 0;
513 			break;
514 		}
515 
516 		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
517 		if (err != 0)
518 			break;
519 		dsl_dir_rele(dd, tag);
520 		dd = child_dd;
521 		next = nextnext;
522 	}
523 
524 	if (err != 0) {
525 		dsl_dir_rele(dd, tag);
526 		goto error;
527 	}
528 
529 	/*
530 	 * It's an error if there's more than one component left, or
531 	 * tailp==NULL and there's any component left.
532 	 */
533 	if (next != NULL &&
534 	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
535 		/* bad path name */
536 		dsl_dir_rele(dd, tag);
537 		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
538 		err = SET_ERROR(ENOENT);
539 	}
540 	if (tailp != NULL)
541 		*tailp = next;
542 	if (err == 0)
543 		*ddp = dd;
544 error:
545 	kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN);
546 	return (err);
547 }
548 
549 /*
550  * If the counts are already initialized for this filesystem and its
551  * descendants then do nothing, otherwise initialize the counts.
552  *
553  * The counts on this filesystem, and those below, may be uninitialized due to
554  * either the use of a pre-existing pool which did not support the
555  * filesystem/snapshot limit feature, or one in which the feature had not yet
556  * been enabled.
557  *
558  * Recursively descend the filesystem tree and update the filesystem/snapshot
559  * counts on each filesystem below, then update the cumulative count on the
560  * current filesystem. If the filesystem already has a count set on it,
561  * then we know that its counts, and the counts on the filesystems below it,
562  * are already correct, so we don't have to update this filesystem.
563  */
564 static void
565 dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
566 {
567 	uint64_t my_fs_cnt = 0;
568 	uint64_t my_ss_cnt = 0;
569 	dsl_pool_t *dp = dd->dd_pool;
570 	objset_t *os = dp->dp_meta_objset;
571 	zap_cursor_t *zc;
572 	zap_attribute_t *za;
573 	dsl_dataset_t *ds;
574 
575 	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
576 	ASSERT(dsl_pool_config_held(dp));
577 	ASSERT(dmu_tx_is_syncing(tx));
578 
579 	dsl_dir_zapify(dd, tx);
580 
581 	/*
582 	 * If the filesystem count has already been initialized then we
583 	 * don't need to recurse down any further.
584 	 */
585 	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
586 		return;
587 
588 	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
589 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
590 
591 	/* Iterate my child dirs */
592 	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
593 	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
594 		dsl_dir_t *chld_dd;
595 		uint64_t count;
596 
597 		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
598 		    &chld_dd));
599 
600 		/*
601 		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets.
602 		 */
603 		if (chld_dd->dd_myname[0] == '$') {
604 			dsl_dir_rele(chld_dd, FTAG);
605 			continue;
606 		}
607 
608 		my_fs_cnt++;	/* count this child */
609 
610 		dsl_dir_init_fs_ss_count(chld_dd, tx);
611 
612 		VERIFY0(zap_lookup(os, chld_dd->dd_object,
613 		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
614 		my_fs_cnt += count;
615 		VERIFY0(zap_lookup(os, chld_dd->dd_object,
616 		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
617 		my_ss_cnt += count;
618 
619 		dsl_dir_rele(chld_dd, FTAG);
620 	}
621 	zap_cursor_fini(zc);
622 	/* Count my snapshots (we counted children's snapshots above) */
623 	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
624 	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
625 
626 	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
627 	    zap_cursor_retrieve(zc, za) == 0;
628 	    zap_cursor_advance(zc)) {
629 		/* Don't count temporary snapshots */
630 		if (za->za_name[0] != '%')
631 			my_ss_cnt++;
632 	}
633 	zap_cursor_fini(zc);
634 
635 	dsl_dataset_rele(ds, FTAG);
636 
637 	kmem_free(zc, sizeof (zap_cursor_t));
638 	kmem_free(za, sizeof (zap_attribute_t));
639 
640 	/* we're in a sync task, update counts */
641 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
642 	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
643 	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
644 	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
645 	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
646 }
647 
648 static int
649 dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
650 {
651 	char *ddname = (char *)arg;
652 	dsl_pool_t *dp = dmu_tx_pool(tx);
653 	dsl_dataset_t *ds;
654 	dsl_dir_t *dd;
655 	int error;
656 
657 	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
658 	if (error != 0)
659 		return (error);
660 
661 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
662 		dsl_dataset_rele(ds, FTAG);
663 		return (SET_ERROR(ENOTSUP));
664 	}
665 
666 	dd = ds->ds_dir;
667 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
668 	    dsl_dir_is_zapified(dd) &&
669 	    zap_contains(dp->dp_meta_objset, dd->dd_object,
670 	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
671 		dsl_dataset_rele(ds, FTAG);
672 		return (SET_ERROR(EALREADY));
673 	}
674 
675 	dsl_dataset_rele(ds, FTAG);
676 	return (0);
677 }
678 
679 static void
680 dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
681 {
682 	char *ddname = (char *)arg;
683 	dsl_pool_t *dp = dmu_tx_pool(tx);
684 	dsl_dataset_t *ds;
685 	spa_t *spa;
686 
687 	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
688 
689 	spa = dsl_dataset_get_spa(ds);
690 
691 	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
692 		/*
693 		 * Since the feature was not active and we're now setting a
694 		 * limit, increment the feature-active counter so that the
695 		 * feature becomes active for the first time.
696 		 *
697 		 * We are already in a sync task so we can update the MOS.
698 		 */
699 		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
700 	}
701 
702 	/*
703 	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
704 	 * we need to ensure the counts are correct. Descend down the tree from
705 	 * this point and update all of the counts to be accurate.
706 	 */
707 	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
708 
709 	dsl_dataset_rele(ds, FTAG);
710 }
711 
712 /*
713  * Make sure the feature is enabled and activate it if necessary.
714  * Since we're setting a limit, ensure the on-disk counts are valid.
715  * This is only called by the ioctl path when setting a limit value.
716  *
717  * We do not need to validate the new limit, since users who can change the
718  * limit are also allowed to exceed the limit.
719  */
720 int
721 dsl_dir_activate_fs_ss_limit(const char *ddname)
722 {
723 	int error;
724 
725 	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
726 	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
727 	    ZFS_SPACE_CHECK_RESERVED);
728 
729 	if (error == EALREADY)
730 		error = 0;
731 
732 	return (error);
733 }
734 
735 /*
736  * Used to determine if the filesystem_limit or snapshot_limit should be
737  * enforced. We allow the limit to be exceeded if the user has permission to
738  * write the property value. We pass in the creds that we got in the open
739  * context since we will always be the GZ root in syncing context. We also have
740  * to handle the case where we are allowed to change the limit on the current
741  * dataset, but there may be another limit in the tree above.
742  *
743  * We can never modify these two properties within a non-global zone. In
744  * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
745  * can't use that function since we are already holding the dp_config_rwlock.
746  * In addition, we already have the dd and dealing with snapshots is simplified
747  * in this code.
748  */
749 
/*
 * Result of dsl_enforce_ds_ss_limits(), telling dsl_fs_ss_limit_check() how
 * to treat the limit on one directory.
 */
typedef enum {
	/* Compare the count against this dir's limit, then check ancestors. */
	ENFORCE_ALWAYS,
	/* Do not check any limit; the check succeeds immediately. */
	ENFORCE_NEVER,
	/* Skip this dir's own limit, but keep checking its ancestors. */
	ENFORCE_ABOVE
} enforce_res_t;
755 
756 static enforce_res_t
757 dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop,
758     cred_t *cr, proc_t *proc)
759 {
760 	enforce_res_t enforce = ENFORCE_ALWAYS;
761 	uint64_t obj;
762 	dsl_dataset_t *ds;
763 	uint64_t zoned;
764 	const char *zonedstr;
765 
766 	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
767 	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
768 
769 #ifdef _KERNEL
770 	if (crgetzoneid(cr) != GLOBAL_ZONEID)
771 		return (ENFORCE_ALWAYS);
772 
773 	/*
774 	 * We are checking the saved credentials of the user process, which is
775 	 * not the current process.  Note that we can't use secpolicy_zfs(),
776 	 * because it only works if the cred is that of the current process (on
777 	 * Linux).
778 	 */
779 	if (secpolicy_zfs_proc(cr, proc) == 0)
780 		return (ENFORCE_NEVER);
781 #else
782 	(void) proc;
783 #endif
784 
785 	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
786 		return (ENFORCE_ALWAYS);
787 
788 	ASSERT(dsl_pool_config_held(dd->dd_pool));
789 
790 	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
791 		return (ENFORCE_ALWAYS);
792 
793 	zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED);
794 	if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) {
795 		/* Only root can access zoned fs's from the GZ */
796 		enforce = ENFORCE_ALWAYS;
797 	} else {
798 		if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
799 			enforce = ENFORCE_ABOVE;
800 	}
801 
802 	dsl_dataset_rele(ds, FTAG);
803 	return (enforce);
804 }
805 
806 /*
807  * Check if adding additional child filesystem(s) would exceed any filesystem
808  * limits or adding additional snapshot(s) would exceed any snapshot limits.
809  * The prop argument indicates which limit to check.
810  *
811  * Note that all filesystem limits up to the root (or the highest
812  * initialized) filesystem or the given ancestor must be satisfied.
813  */
814 int
815 dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
816     dsl_dir_t *ancestor, cred_t *cr, proc_t *proc)
817 {
818 	objset_t *os = dd->dd_pool->dp_meta_objset;
819 	uint64_t limit, count;
820 	const char *count_prop;
821 	enforce_res_t enforce;
822 	int err = 0;
823 
824 	ASSERT(dsl_pool_config_held(dd->dd_pool));
825 	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
826 	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
827 
828 	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
829 		/*
830 		 * We don't enforce the limit for temporary snapshots. This is
831 		 * indicated by a NULL cred_t argument.
832 		 */
833 		if (cr == NULL)
834 			return (0);
835 
836 		count_prop = DD_FIELD_SNAPSHOT_COUNT;
837 	} else {
838 		count_prop = DD_FIELD_FILESYSTEM_COUNT;
839 	}
840 	/*
841 	 * If we're allowed to change the limit, don't enforce the limit
842 	 * e.g. this can happen if a snapshot is taken by an administrative
843 	 * user in the global zone (i.e. a recursive snapshot by root).
844 	 * However, we must handle the case of delegated permissions where we
845 	 * are allowed to change the limit on the current dataset, but there
846 	 * is another limit in the tree above.
847 	 */
848 	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc);
849 	if (enforce == ENFORCE_NEVER)
850 		return (0);
851 
852 	/*
853 	 * e.g. if renaming a dataset with no snapshots, count adjustment
854 	 * is 0.
855 	 */
856 	if (delta == 0)
857 		return (0);
858 
859 	/*
860 	 * If an ancestor has been provided, stop checking the limit once we
861 	 * hit that dir. We need this during rename so that we don't overcount
862 	 * the check once we recurse up to the common ancestor.
863 	 */
864 	if (ancestor == dd)
865 		return (0);
866 
867 	/*
868 	 * If we hit an uninitialized node while recursing up the tree, we can
869 	 * stop since we know there is no limit here (or above). The counts are
870 	 * not valid on this node and we know we won't touch this node's counts.
871 	 */
872 	if (!dsl_dir_is_zapified(dd))
873 		return (0);
874 	err = zap_lookup(os, dd->dd_object,
875 	    count_prop, sizeof (count), 1, &count);
876 	if (err == ENOENT)
877 		return (0);
878 	if (err != 0)
879 		return (err);
880 
881 	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
882 	    B_FALSE);
883 	if (err != 0)
884 		return (err);
885 
886 	/* Is there a limit which we've hit? */
887 	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
888 		return (SET_ERROR(EDQUOT));
889 
890 	if (dd->dd_parent != NULL)
891 		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
892 		    ancestor, cr, proc);
893 
894 	return (err);
895 }
896 
897 /*
898  * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
899  * parents. When a new filesystem/snapshot is created, increment the count on
900  * all parents, and when a filesystem/snapshot is destroyed, decrement the
901  * count.
902  */
903 void
904 dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
905     dmu_tx_t *tx)
906 {
907 	int err;
908 	objset_t *os = dd->dd_pool->dp_meta_objset;
909 	uint64_t count;
910 
911 	ASSERT(dsl_pool_config_held(dd->dd_pool));
912 	ASSERT(dmu_tx_is_syncing(tx));
913 	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
914 	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
915 
916 	/*
917 	 * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets.
918 	 */
919 	if (dd->dd_myname[0] == '$' && strcmp(prop,
920 	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
921 		return;
922 	}
923 
924 	/*
925 	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
926 	 */
927 	if (delta == 0)
928 		return;
929 
930 	/*
931 	 * If we hit an uninitialized node while recursing up the tree, we can
932 	 * stop since we know the counts are not valid on this node and we
933 	 * know we shouldn't touch this node's counts. An uninitialized count
934 	 * on the node indicates that either the feature has not yet been
935 	 * activated or there are no limits on this part of the tree.
936 	 */
937 	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
938 	    prop, sizeof (count), 1, &count)) == ENOENT)
939 		return;
940 	VERIFY0(err);
941 
942 	count += delta;
943 	/* Use a signed verify to make sure we're not neg. */
944 	VERIFY3S(count, >=, 0);
945 
946 	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
947 	    tx));
948 
949 	/* Roll up this additional count into our ancestors */
950 	if (dd->dd_parent != NULL)
951 		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
952 }
953 
954 uint64_t
955 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
956     dmu_tx_t *tx)
957 {
958 	objset_t *mos = dp->dp_meta_objset;
959 	uint64_t ddobj;
960 	dsl_dir_phys_t *ddphys;
961 	dmu_buf_t *dbuf;
962 
963 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
964 	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
965 	if (pds) {
966 		VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
967 		    name, sizeof (uint64_t), 1, &ddobj, tx));
968 	} else {
969 		/* it's the root dir */
970 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
971 		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
972 	}
973 	VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
974 	dmu_buf_will_dirty(dbuf, tx);
975 	ddphys = dbuf->db_data;
976 
977 	ddphys->dd_creation_time = gethrestime_sec();
978 	if (pds) {
979 		ddphys->dd_parent_obj = pds->dd_object;
980 
981 		/* update the filesystem counts */
982 		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
983 	}
984 	ddphys->dd_props_zapobj = zap_create(mos,
985 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
986 	ddphys->dd_child_dir_zapobj = zap_create(mos,
987 	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
988 	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
989 		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
990 
991 	dmu_buf_rele(dbuf, FTAG);
992 
993 	return (ddobj);
994 }
995 
996 boolean_t
997 dsl_dir_is_clone(dsl_dir_t *dd)
998 {
999 	return (dsl_dir_phys(dd)->dd_origin_obj &&
1000 	    (dd->dd_pool->dp_origin_snap == NULL ||
1001 	    dsl_dir_phys(dd)->dd_origin_obj !=
1002 	    dd->dd_pool->dp_origin_snap->ds_object));
1003 }
1004 
1005 uint64_t
1006 dsl_dir_get_used(dsl_dir_t *dd)
1007 {
1008 	return (dsl_dir_phys(dd)->dd_used_bytes);
1009 }
1010 
1011 uint64_t
1012 dsl_dir_get_compressed(dsl_dir_t *dd)
1013 {
1014 	return (dsl_dir_phys(dd)->dd_compressed_bytes);
1015 }
1016 
1017 uint64_t
1018 dsl_dir_get_quota(dsl_dir_t *dd)
1019 {
1020 	return (dsl_dir_phys(dd)->dd_quota);
1021 }
1022 
1023 uint64_t
1024 dsl_dir_get_reservation(dsl_dir_t *dd)
1025 {
1026 	return (dsl_dir_phys(dd)->dd_reserved);
1027 }
1028 
1029 uint64_t
1030 dsl_dir_get_compressratio(dsl_dir_t *dd)
1031 {
1032 	/* a fixed point number, 100x the ratio */
1033 	return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
1034 	    (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
1035 	    dsl_dir_phys(dd)->dd_compressed_bytes));
1036 }
1037 
1038 uint64_t
1039 dsl_dir_get_logicalused(dsl_dir_t *dd)
1040 {
1041 	return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
1042 }
1043 
1044 uint64_t
1045 dsl_dir_get_usedsnap(dsl_dir_t *dd)
1046 {
1047 	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
1048 }
1049 
1050 uint64_t
1051 dsl_dir_get_usedds(dsl_dir_t *dd)
1052 {
1053 	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
1054 }
1055 
1056 uint64_t
1057 dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
1058 {
1059 	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
1060 }
1061 
1062 uint64_t
1063 dsl_dir_get_usedchild(dsl_dir_t *dd)
1064 {
1065 	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
1066 	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
1067 }
1068 
1069 void
1070 dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
1071 {
1072 	dsl_dataset_t *ds;
1073 	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
1074 	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
1075 
1076 	dsl_dataset_name(ds, buf);
1077 
1078 	dsl_dataset_rele(ds, FTAG);
1079 }
1080 
1081 int
1082 dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
1083 {
1084 	if (dsl_dir_is_zapified(dd)) {
1085 		objset_t *os = dd->dd_pool->dp_meta_objset;
1086 		return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
1087 		    sizeof (*count), 1, count));
1088 	} else {
1089 		return (SET_ERROR(ENOENT));
1090 	}
1091 }
1092 
1093 int
1094 dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
1095 {
1096 	if (dsl_dir_is_zapified(dd)) {
1097 		objset_t *os = dd->dd_pool->dp_meta_objset;
1098 		return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
1099 		    sizeof (*count), 1, count));
1100 	} else {
1101 		return (SET_ERROR(ENOENT));
1102 	}
1103 }
1104 
1105 void
1106 dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
1107 {
1108 	mutex_enter(&dd->dd_lock);
1109 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
1110 	    dsl_dir_get_quota(dd));
1111 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
1112 	    dsl_dir_get_reservation(dd));
1113 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
1114 	    dsl_dir_get_logicalused(dd));
1115 	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
1116 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
1117 		    dsl_dir_get_usedsnap(dd));
1118 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
1119 		    dsl_dir_get_usedds(dd));
1120 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
1121 		    dsl_dir_get_usedrefreserv(dd));
1122 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
1123 		    dsl_dir_get_usedchild(dd));
1124 	}
1125 	mutex_exit(&dd->dd_lock);
1126 
1127 	uint64_t count;
1128 	if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
1129 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
1130 		    count);
1131 	}
1132 	if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
1133 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
1134 		    count);
1135 	}
1136 
1137 	if (dsl_dir_is_clone(dd)) {
1138 		char buf[ZFS_MAX_DATASET_NAME_LEN];
1139 		dsl_dir_get_origin(dd, buf);
1140 		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
1141 	}
1142 
1143 }
1144 
1145 void
1146 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
1147 {
1148 	dsl_pool_t *dp = dd->dd_pool;
1149 
1150 	ASSERT(dsl_dir_phys(dd));
1151 
1152 	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
1153 		/* up the hold count until we can be written out */
1154 		dmu_buf_add_ref(dd->dd_dbuf, dd);
1155 	}
1156 }
1157 
1158 static int64_t
1159 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
1160 {
1161 	uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
1162 	uint64_t new_accounted =
1163 	    MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
1164 	return (new_accounted - old_accounted);
1165 }
1166 
1167 void
1168 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
1169 {
1170 	ASSERT(dmu_tx_is_syncing(tx));
1171 
1172 	mutex_enter(&dd->dd_lock);
1173 	ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]);
1174 	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", (u_longlong_t)tx->tx_txg,
1175 	    (u_longlong_t)dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024);
1176 	dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0;
1177 	mutex_exit(&dd->dd_lock);
1178 
1179 	/* release the hold from dsl_dir_dirty */
1180 	dmu_buf_rele(dd->dd_dbuf, dd);
1181 }
1182 
1183 static uint64_t
1184 dsl_dir_space_towrite(dsl_dir_t *dd)
1185 {
1186 	uint64_t space = 0;
1187 
1188 	ASSERT(MUTEX_HELD(&dd->dd_lock));
1189 
1190 	for (int i = 0; i < TXG_SIZE; i++)
1191 		space += dd->dd_space_towrite[i & TXG_MASK];
1192 
1193 	return (space);
1194 }
1195 
1196 /*
1197  * How much space would dd have available if ancestor had delta applied
1198  * to it?  If ondiskonly is set, we're only interested in what's
1199  * on-disk, not estimated pending changes.
1200  */
1201 uint64_t
1202 dsl_dir_space_available(dsl_dir_t *dd,
1203     dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
1204 {
1205 	uint64_t parentspace, myspace, quota, used;
1206 
1207 	/*
1208 	 * If there are no restrictions otherwise, assume we have
1209 	 * unlimited space available.
1210 	 */
1211 	quota = UINT64_MAX;
1212 	parentspace = UINT64_MAX;
1213 
1214 	if (dd->dd_parent != NULL) {
1215 		parentspace = dsl_dir_space_available(dd->dd_parent,
1216 		    ancestor, delta, ondiskonly);
1217 	}
1218 
1219 	mutex_enter(&dd->dd_lock);
1220 	if (dsl_dir_phys(dd)->dd_quota != 0)
1221 		quota = dsl_dir_phys(dd)->dd_quota;
1222 	used = dsl_dir_phys(dd)->dd_used_bytes;
1223 	if (!ondiskonly)
1224 		used += dsl_dir_space_towrite(dd);
1225 
1226 	if (dd->dd_parent == NULL) {
1227 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
1228 		    ZFS_SPACE_CHECK_NORMAL);
1229 		quota = MIN(quota, poolsize);
1230 	}
1231 
1232 	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
1233 		/*
1234 		 * We have some space reserved, in addition to what our
1235 		 * parent gave us.
1236 		 */
1237 		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
1238 	}
1239 
1240 	if (dd == ancestor) {
1241 		ASSERT(delta <= 0);
1242 		ASSERT(used >= -delta);
1243 		used += delta;
1244 		if (parentspace != UINT64_MAX)
1245 			parentspace -= delta;
1246 	}
1247 
1248 	if (used > quota) {
1249 		/* over quota */
1250 		myspace = 0;
1251 	} else {
1252 		/*
1253 		 * the lesser of the space provided by our parent and
1254 		 * the space left in our quota
1255 		 */
1256 		myspace = MIN(parentspace, quota - used);
1257 	}
1258 
1259 	mutex_exit(&dd->dd_lock);
1260 
1261 	return (myspace);
1262 }
1263 
/*
 * One entry in the list built by dsl_dir_tempreserve_space().  Each entry
 * records a single temporary reservation so that
 * dsl_dir_tempreserve_clear() can undo it.
 */
struct tempreserve {
	/* linkage in the tr_list */
	list_node_t tr_node;
	/*
	 * dsl_dir whose dd_tempreserved was charged; left NULL for the entry
	 * that records the ARC reservation.
	 */
	dsl_dir_t *tr_ds;
	/* amount that was reserved and must be given back */
	uint64_t tr_size;
};
1269 
/*
 * Charge a temporary reservation of asize bytes against dd for tx's txg and
 * then, while the resulting parent_delta() is nonzero, against each
 * successive parent.  The walk is implemented with a goto back to
 * top_of_function instead of recursion.
 *
 * For every directory visited, the quota checks are done under dd_lock,
 * asize is added to dd_tempreserved[txg & TXG_MASK], and a struct
 * tempreserve describing the charge is appended to tr_list.
 *
 * `first` is B_TRUE only for the starting directory; it enables the
 * dsl_dataset_check_quota() call for tx's objset.  `ignorequota` disables
 * the dd_quota check for the current directory; for parents it is
 * recomputed as (dd_head_dataset_obj == 0).
 *
 * Returns 0 on success.  On failure returns the error from
 * dsl_dataset_check_quota(), or EDQUOT, ENOSPC or ERESTART.  Entries already
 * appended to tr_list are left in place on failure; the caller is expected
 * to undo them.
 */
static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
    boolean_t ignorequota, list_t *tr_list,
    dmu_tx_t *tx, boolean_t first)
{
	uint64_t txg;
	uint64_t quota;
	struct tempreserve *tr;
	int retval;
	uint64_t ext_quota;
	uint64_t ref_rsrv;

top_of_function:
	/* Per-directory state is reset on every pass through the loop. */
	txg = tx->tx_txg;
	retval = EDQUOT;
	ref_rsrv = 0;

	ASSERT3U(txg, !=, 0);
	ASSERT3S(asize, >, 0);

	mutex_enter(&dd->dd_lock);

	/*
	 * Check against the dsl_dir's quota.  We don't add in the delta
	 * when checking for over-quota because they get one free hit.
	 */
	uint64_t est_inflight = dsl_dir_space_towrite(dd);
	for (int i = 0; i < TXG_SIZE; i++)
		est_inflight += dd->dd_tempreserved[i];
	uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;

	/*
	 * On the first iteration, let dsl_dataset_check_quota() update the
	 * used-on-disk and refreservation values for the dataset. It is
	 * passed !netfree as its second argument, so the dataset-level quota
	 * test is only requested when this is not a net free.
	 */
	if (first && tx->tx_objset) {
		int error;
		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

		error = dsl_dataset_check_quota(ds, !netfree,
		    asize, est_inflight, &used_on_disk, &ref_rsrv);
		if (error != 0) {
			mutex_exit(&dd->dd_lock);
			DMU_TX_STAT_BUMP(dmu_tx_quota);
			return (error);
		}
	}

	/*
	 * If this transaction will result in a net free of space,
	 * we want to let it through.
	 */
	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0 ||
	    (tx->tx_objset && dmu_objset_type(tx->tx_objset) == DMU_OST_ZVOL &&
	    zvol_enforce_quotas == B_FALSE))
		quota = UINT64_MAX;
	else
		quota = dsl_dir_phys(dd)->dd_quota;

	/*
	 * Adjust the quota against the actual pool size at the root
	 * minus any outstanding deferred frees.
	 * To ensure that it's possible to remove files from a full
	 * pool without inducing transient overcommits, we throttle
	 * netfree transactions against a quota that is slightly larger,
	 * but still within the pool's allocation slop.  In cases where
	 * we're very close to full, this will allow a steady trickle of
	 * removes to get through.
	 */
	if (dd->dd_parent == NULL) {
		uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
		    (netfree) ?
		    ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);

		if (avail < quota) {
			/* The pool, not a quota, is now the limiting factor. */
			quota = avail;
			retval = SET_ERROR(ENOSPC);
		}
	}

	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes
	 * or deferred frees (which may free up space for us).
	 */
	/* Extra headroom of quota/32; none when there is no quota. */
	ext_quota = quota >> 5;
	if (quota == UINT64_MAX)
		ext_quota = 0;

	if (used_on_disk >= quota) {
		/*
		 * Out of pool space, but the overage is smaller than the
		 * pool's deferred space: ask the caller to retry.
		 */
		if (retval == ENOSPC && (used_on_disk - quota) <
		    dsl_pool_deferred_space(dd->dd_pool)) {
			retval = SET_ERROR(ERESTART);
		}
		/* Quota exceeded */
		mutex_exit(&dd->dd_lock);
		DMU_TX_STAT_BUMP(dmu_tx_quota);
		return (retval);
	} else if (used_on_disk + est_inflight >= quota + ext_quota) {
		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
		    "quota=%lluK tr=%lluK\n",
		    (u_longlong_t)used_on_disk>>10,
		    (u_longlong_t)est_inflight>>10,
		    (u_longlong_t)quota>>10, (u_longlong_t)asize>>10);
		mutex_exit(&dd->dd_lock);
		DMU_TX_STAT_BUMP(dmu_tx_quota);
		return (SET_ERROR(ERESTART));
	}

	/* We need to up our estimated delta before dropping dd_lock */
	dd->dd_tempreserved[txg & TXG_MASK] += asize;

	/* Portion of this charge that would carry over to the parent. */
	uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
	    asize - ref_rsrv);
	mutex_exit(&dd->dd_lock);

	/* Record the charge so dsl_dir_tempreserve_clear() can undo it. */
	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
	tr->tr_ds = dd;
	tr->tr_size = asize;
	list_insert_tail(tr_list, tr);

	/* see if it's OK with our parent */
	if (dd->dd_parent != NULL && parent_rsrv != 0) {
		/*
		 * Recurse on our parent without recursion. This has been
		 * observed to be potentially large stack usage even within
		 * the test suite. Largest seen stack was 7632 bytes on linux.
		 */

		dd = dd->dd_parent;
		asize = parent_rsrv;
		ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
		first = B_FALSE;
		goto top_of_function;
	}

	return (0);
}
1410 
1411 /*
1412  * Reserve space in this dsl_dir, to be used in this tx's txg.
1413  * After the space has been dirtied (and dsl_dir_willuse_space()
1414  * has been called), the reservation should be canceled, using
1415  * dsl_dir_tempreserve_clear().
1416  */
1417 int
1418 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
1419     boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
1420 {
1421 	int err;
1422 	list_t *tr_list;
1423 
1424 	if (asize == 0) {
1425 		*tr_cookiep = NULL;
1426 		return (0);
1427 	}
1428 
1429 	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
1430 	list_create(tr_list, sizeof (struct tempreserve),
1431 	    offsetof(struct tempreserve, tr_node));
1432 	ASSERT3S(asize, >, 0);
1433 
1434 	err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
1435 	if (err == 0) {
1436 		struct tempreserve *tr;
1437 
1438 		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
1439 		tr->tr_size = lsize;
1440 		list_insert_tail(tr_list, tr);
1441 	} else {
1442 		if (err == EAGAIN) {
1443 			/*
1444 			 * If arc_memory_throttle() detected that pageout
1445 			 * is running and we are low on memory, we delay new
1446 			 * non-pageout transactions to give pageout an
1447 			 * advantage.
1448 			 *
1449 			 * It is unfortunate to be delaying while the caller's
1450 			 * locks are held.
1451 			 */
1452 			txg_delay(dd->dd_pool, tx->tx_txg,
1453 			    MSEC2NSEC(10), MSEC2NSEC(10));
1454 			err = SET_ERROR(ERESTART);
1455 		}
1456 	}
1457 
1458 	if (err == 0) {
1459 		err = dsl_dir_tempreserve_impl(dd, asize, netfree,
1460 		    B_FALSE, tr_list, tx, B_TRUE);
1461 	}
1462 
1463 	if (err != 0)
1464 		dsl_dir_tempreserve_clear(tr_list, tx);
1465 	else
1466 		*tr_cookiep = tr_list;
1467 
1468 	return (err);
1469 }
1470 
1471 /*
1472  * Clear a temporary reservation that we previously made with
1473  * dsl_dir_tempreserve_space().
1474  */
1475 void
1476 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
1477 {
1478 	int txgidx = tx->tx_txg & TXG_MASK;
1479 	list_t *tr_list = tr_cookie;
1480 	struct tempreserve *tr;
1481 
1482 	ASSERT3U(tx->tx_txg, !=, 0);
1483 
1484 	if (tr_cookie == NULL)
1485 		return;
1486 
1487 	while ((tr = list_remove_head(tr_list)) != NULL) {
1488 		if (tr->tr_ds) {
1489 			mutex_enter(&tr->tr_ds->dd_lock);
1490 			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
1491 			    tr->tr_size);
1492 			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
1493 			mutex_exit(&tr->tr_ds->dd_lock);
1494 		} else {
1495 			arc_tempreserve_clear(tr->tr_size);
1496 		}
1497 		kmem_free(tr, sizeof (struct tempreserve));
1498 	}
1499 
1500 	kmem_free(tr_list, sizeof (list_t));
1501 }
1502 
1503 /*
1504  * This should be called from open context when we think we're going to write
1505  * or free space, for example when dirtying data. Be conservative; it's okay
1506  * to write less space or free more, but we don't want to write more or free
1507  * less than the amount specified.
1508  *
1509  * NOTE: The behavior of this function is identical to the Illumos / FreeBSD
1510  * version however it has been adjusted to use an iterative rather than
1511  * recursive algorithm to minimize stack usage.
1512  */
1513 void
1514 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
1515 {
1516 	int64_t parent_space;
1517 	uint64_t est_used;
1518 
1519 	do {
1520 		mutex_enter(&dd->dd_lock);
1521 		if (space > 0)
1522 			dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
1523 
1524 		est_used = dsl_dir_space_towrite(dd) +
1525 		    dsl_dir_phys(dd)->dd_used_bytes;
1526 		parent_space = parent_delta(dd, est_used, space);
1527 		mutex_exit(&dd->dd_lock);
1528 
1529 		/* Make sure that we clean up dd_space_to* */
1530 		dsl_dir_dirty(dd, tx);
1531 
1532 		dd = dd->dd_parent;
1533 		space = parent_space;
1534 	} while (space && dd);
1535 }
1536 
1537 /* call from syncing context when we actually write/free space for this dd */
1538 void
1539 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
1540     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
1541 {
1542 	int64_t accounted_delta;
1543 
1544 	ASSERT(dmu_tx_is_syncing(tx));
1545 	ASSERT(type < DD_USED_NUM);
1546 
1547 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1548 
1549 	/*
1550 	 * dsl_dataset_set_refreservation_sync_impl() calls this with
1551 	 * dd_lock held, so that it can atomically update
1552 	 * ds->ds_reserved and the dsl_dir accounting, so that
1553 	 * dsl_dataset_check_quota() can see dataset and dir accounting
1554 	 * consistently.
1555 	 */
1556 	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
1557 	if (needlock)
1558 		mutex_enter(&dd->dd_lock);
1559 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
1560 	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
1561 	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
1562 	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
1563 	ASSERT(uncompressed >= 0 ||
1564 	    ddp->dd_uncompressed_bytes >= -uncompressed);
1565 	ddp->dd_used_bytes += used;
1566 	ddp->dd_uncompressed_bytes += uncompressed;
1567 	ddp->dd_compressed_bytes += compressed;
1568 
1569 	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
1570 		ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used);
1571 		ddp->dd_used_breakdown[type] += used;
1572 #ifdef ZFS_DEBUG
1573 		{
1574 			dd_used_t t;
1575 			uint64_t u = 0;
1576 			for (t = 0; t < DD_USED_NUM; t++)
1577 				u += ddp->dd_used_breakdown[t];
1578 			ASSERT3U(u, ==, ddp->dd_used_bytes);
1579 		}
1580 #endif
1581 	}
1582 	if (needlock)
1583 		mutex_exit(&dd->dd_lock);
1584 
1585 	if (dd->dd_parent != NULL) {
1586 		dsl_dir_diduse_transfer_space(dd->dd_parent,
1587 		    accounted_delta, compressed, uncompressed,
1588 		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
1589 	}
1590 }
1591 
1592 void
1593 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
1594     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
1595 {
1596 	ASSERT(dmu_tx_is_syncing(tx));
1597 	ASSERT(oldtype < DD_USED_NUM);
1598 	ASSERT(newtype < DD_USED_NUM);
1599 
1600 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
1601 	if (delta == 0 ||
1602 	    !(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN))
1603 		return;
1604 
1605 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1606 	mutex_enter(&dd->dd_lock);
1607 	ASSERT(delta > 0 ?
1608 	    ddp->dd_used_breakdown[oldtype] >= delta :
1609 	    ddp->dd_used_breakdown[newtype] >= -delta);
1610 	ASSERT(ddp->dd_used_bytes >= ABS(delta));
1611 	ddp->dd_used_breakdown[oldtype] -= delta;
1612 	ddp->dd_used_breakdown[newtype] += delta;
1613 	mutex_exit(&dd->dd_lock);
1614 }
1615 
1616 void
1617 dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
1618     int64_t compressed, int64_t uncompressed, int64_t tonew,
1619     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
1620 {
1621 	int64_t accounted_delta;
1622 
1623 	ASSERT(dmu_tx_is_syncing(tx));
1624 	ASSERT(oldtype < DD_USED_NUM);
1625 	ASSERT(newtype < DD_USED_NUM);
1626 
1627 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1628 
1629 	mutex_enter(&dd->dd_lock);
1630 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
1631 	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
1632 	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
1633 	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
1634 	ASSERT(uncompressed >= 0 ||
1635 	    ddp->dd_uncompressed_bytes >= -uncompressed);
1636 	ddp->dd_used_bytes += used;
1637 	ddp->dd_uncompressed_bytes += uncompressed;
1638 	ddp->dd_compressed_bytes += compressed;
1639 
1640 	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
1641 		ASSERT(tonew - used <= 0 ||
1642 		    ddp->dd_used_breakdown[oldtype] >= tonew - used);
1643 		ASSERT(tonew >= 0 ||
1644 		    ddp->dd_used_breakdown[newtype] >= -tonew);
1645 		ddp->dd_used_breakdown[oldtype] -= tonew - used;
1646 		ddp->dd_used_breakdown[newtype] += tonew;
1647 #ifdef ZFS_DEBUG
1648 		{
1649 			dd_used_t t;
1650 			uint64_t u = 0;
1651 			for (t = 0; t < DD_USED_NUM; t++)
1652 				u += ddp->dd_used_breakdown[t];
1653 			ASSERT3U(u, ==, ddp->dd_used_bytes);
1654 		}
1655 #endif
1656 	}
1657 	mutex_exit(&dd->dd_lock);
1658 
1659 	if (dd->dd_parent != NULL) {
1660 		dsl_dir_diduse_transfer_space(dd->dd_parent,
1661 		    accounted_delta, compressed, uncompressed,
1662 		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
1663 	}
1664 }
1665 
/*
 * Argument bundle handed to the check and sync callbacks of the quota and
 * reservation sync tasks.
 */
typedef struct dsl_dir_set_qr_arg {
	/* name of the dataset whose property is being set */
	const char *ddsqra_name;
	/* source of the new property value */
	zprop_source_t ddsqra_source;
	/* requested quota or reservation value */
	uint64_t ddsqra_value;
} dsl_dir_set_qr_arg_t;
1671 
1672 static int
1673 dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
1674 {
1675 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1676 	dsl_pool_t *dp = dmu_tx_pool(tx);
1677 	dsl_dataset_t *ds;
1678 	int error;
1679 	uint64_t towrite, newval;
1680 
1681 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1682 	if (error != 0)
1683 		return (error);
1684 
1685 	error = dsl_prop_predict(ds->ds_dir, "quota",
1686 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1687 	if (error != 0) {
1688 		dsl_dataset_rele(ds, FTAG);
1689 		return (error);
1690 	}
1691 
1692 	if (newval == 0) {
1693 		dsl_dataset_rele(ds, FTAG);
1694 		return (0);
1695 	}
1696 
1697 	mutex_enter(&ds->ds_dir->dd_lock);
1698 	/*
1699 	 * If we are doing the preliminary check in open context, and
1700 	 * there are pending changes, then don't fail it, since the
1701 	 * pending changes could under-estimate the amount of space to be
1702 	 * freed up.
1703 	 */
1704 	towrite = dsl_dir_space_towrite(ds->ds_dir);
1705 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1706 	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
1707 	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
1708 		error = SET_ERROR(ENOSPC);
1709 	}
1710 	mutex_exit(&ds->ds_dir->dd_lock);
1711 	dsl_dataset_rele(ds, FTAG);
1712 	return (error);
1713 }
1714 
1715 static void
1716 dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
1717 {
1718 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1719 	dsl_pool_t *dp = dmu_tx_pool(tx);
1720 	dsl_dataset_t *ds;
1721 	uint64_t newval;
1722 
1723 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1724 
1725 	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1726 		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
1727 		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1728 		    &ddsqra->ddsqra_value, tx);
1729 
1730 		VERIFY0(dsl_prop_get_int_ds(ds,
1731 		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
1732 	} else {
1733 		newval = ddsqra->ddsqra_value;
1734 		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1735 		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
1736 	}
1737 
1738 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1739 	mutex_enter(&ds->ds_dir->dd_lock);
1740 	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
1741 	mutex_exit(&ds->ds_dir->dd_lock);
1742 	dsl_dataset_rele(ds, FTAG);
1743 }
1744 
1745 int
1746 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1747 {
1748 	dsl_dir_set_qr_arg_t ddsqra;
1749 
1750 	ddsqra.ddsqra_name = ddname;
1751 	ddsqra.ddsqra_source = source;
1752 	ddsqra.ddsqra_value = quota;
1753 
1754 	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
1755 	    dsl_dir_set_quota_sync, &ddsqra, 0,
1756 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
1757 }
1758 
1759 static int
1760 dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
1761 {
1762 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1763 	dsl_pool_t *dp = dmu_tx_pool(tx);
1764 	dsl_dataset_t *ds;
1765 	dsl_dir_t *dd;
1766 	uint64_t newval, used, avail;
1767 	int error;
1768 
1769 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1770 	if (error != 0)
1771 		return (error);
1772 	dd = ds->ds_dir;
1773 
1774 	/*
1775 	 * If we are doing the preliminary check in open context, the
1776 	 * space estimates may be inaccurate.
1777 	 */
1778 	if (!dmu_tx_is_syncing(tx)) {
1779 		dsl_dataset_rele(ds, FTAG);
1780 		return (0);
1781 	}
1782 
1783 	error = dsl_prop_predict(ds->ds_dir,
1784 	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1785 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1786 	if (error != 0) {
1787 		dsl_dataset_rele(ds, FTAG);
1788 		return (error);
1789 	}
1790 
1791 	mutex_enter(&dd->dd_lock);
1792 	used = dsl_dir_phys(dd)->dd_used_bytes;
1793 	mutex_exit(&dd->dd_lock);
1794 
1795 	if (dd->dd_parent) {
1796 		avail = dsl_dir_space_available(dd->dd_parent,
1797 		    NULL, 0, FALSE);
1798 	} else {
1799 		avail = dsl_pool_adjustedsize(dd->dd_pool,
1800 		    ZFS_SPACE_CHECK_NORMAL) - used;
1801 	}
1802 
1803 	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
1804 		uint64_t delta = MAX(used, newval) -
1805 		    MAX(used, dsl_dir_phys(dd)->dd_reserved);
1806 
1807 		if (delta > avail ||
1808 		    (dsl_dir_phys(dd)->dd_quota > 0 &&
1809 		    newval > dsl_dir_phys(dd)->dd_quota))
1810 			error = SET_ERROR(ENOSPC);
1811 	}
1812 
1813 	dsl_dataset_rele(ds, FTAG);
1814 	return (error);
1815 }
1816 
1817 void
1818 dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
1819 {
1820 	uint64_t used;
1821 	int64_t delta;
1822 
1823 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1824 
1825 	mutex_enter(&dd->dd_lock);
1826 	used = dsl_dir_phys(dd)->dd_used_bytes;
1827 	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
1828 	dsl_dir_phys(dd)->dd_reserved = value;
1829 
1830 	if (dd->dd_parent != NULL) {
1831 		/* Roll up this additional usage into our ancestors */
1832 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1833 		    delta, 0, 0, tx);
1834 	}
1835 	mutex_exit(&dd->dd_lock);
1836 }
1837 
1838 static void
1839 dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
1840 {
1841 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1842 	dsl_pool_t *dp = dmu_tx_pool(tx);
1843 	dsl_dataset_t *ds;
1844 	uint64_t newval;
1845 
1846 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1847 
1848 	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1849 		dsl_prop_set_sync_impl(ds,
1850 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1851 		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1852 		    &ddsqra->ddsqra_value, tx);
1853 
1854 		VERIFY0(dsl_prop_get_int_ds(ds,
1855 		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
1856 	} else {
1857 		newval = ddsqra->ddsqra_value;
1858 		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1859 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1860 		    (longlong_t)newval);
1861 	}
1862 
1863 	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
1864 	dsl_dataset_rele(ds, FTAG);
1865 }
1866 
1867 int
1868 dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1869     uint64_t reservation)
1870 {
1871 	dsl_dir_set_qr_arg_t ddsqra;
1872 
1873 	ddsqra.ddsqra_name = ddname;
1874 	ddsqra.ddsqra_source = source;
1875 	ddsqra.ddsqra_value = reservation;
1876 
1877 	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
1878 	    dsl_dir_set_reservation_sync, &ddsqra, 0,
1879 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
1880 }
1881 
1882 static dsl_dir_t *
1883 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1884 {
1885 	for (; ds1; ds1 = ds1->dd_parent) {
1886 		dsl_dir_t *dd;
1887 		for (dd = ds2; dd; dd = dd->dd_parent) {
1888 			if (ds1 == dd)
1889 				return (dd);
1890 		}
1891 	}
1892 	return (NULL);
1893 }
1894 
1895 /*
1896  * If delta is applied to dd, how much of that delta would be applied to
1897  * ancestor?  Syncing context only.
1898  */
1899 static int64_t
1900 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1901 {
1902 	if (dd == ancestor)
1903 		return (delta);
1904 
1905 	mutex_enter(&dd->dd_lock);
1906 	delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
1907 	mutex_exit(&dd->dd_lock);
1908 	return (would_change(dd->dd_parent, delta, ancestor));
1909 }
1910 
/* Argument bundle for the dsl_dir rename check callback. */
typedef struct dsl_dir_rename_arg {
	/* current name of the directory being renamed */
	const char *ddra_oldname;
	/* requested new name */
	const char *ddra_newname;
	/* NOTE(review): presumably the caller's credentials - confirm */
	cred_t *ddra_cred;
	/* NOTE(review): presumably the calling process - confirm */
	proc_t *ddra_proc;
} dsl_dir_rename_arg_t;
1917 
/* Argument for dsl_valid_rename(): how a rename changes each dataset name. */
typedef struct dsl_valid_rename_arg {
	/* added to the current name length to get the length after rename */
	int char_delta;
	/* added to the current nesting depth to get the depth after rename */
	int nest_delta;
} dsl_valid_rename_arg_t;
1922 
1923 static int
1924 dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1925 {
1926 	(void) dp;
1927 	dsl_valid_rename_arg_t *dvra = arg;
1928 	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
1929 
1930 	dsl_dataset_name(ds, namebuf);
1931 
1932 	ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
1933 	    <, ZFS_MAX_DATASET_NAME_LEN);
1934 	int namelen = strlen(namebuf) + dvra->char_delta;
1935 	int depth = get_dataset_depth(namebuf) + dvra->nest_delta;
1936 
1937 	if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
1938 		return (SET_ERROR(ENAMETOOLONG));
1939 	if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
1940 		return (SET_ERROR(ENAMETOOLONG));
1941 	return (0);
1942 }
1943 
1944 static int
1945 dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
1946 {
1947 	dsl_dir_rename_arg_t *ddra = arg;
1948 	dsl_pool_t *dp = dmu_tx_pool(tx);
1949 	dsl_dir_t *dd, *newparent;
1950 	dsl_valid_rename_arg_t dvra;
1951 	dsl_dataset_t *parentds;
1952 	objset_t *parentos;
1953 	const char *mynewname;
1954 	int error;
1955 
1956 	/* target dir should exist */
1957 	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
1958 	if (error != 0)
1959 		return (error);
1960 
1961 	/* new parent should exist */
1962 	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
1963 	    &newparent, &mynewname);
1964 	if (error != 0) {
1965 		dsl_dir_rele(dd, FTAG);
1966 		return (error);
1967 	}
1968 
1969 	/* can't rename to different pool */
1970 	if (dd->dd_pool != newparent->dd_pool) {
1971 		dsl_dir_rele(newparent, FTAG);
1972 		dsl_dir_rele(dd, FTAG);
1973 		return (SET_ERROR(EXDEV));
1974 	}
1975 
1976 	/* new name should not already exist */
1977 	if (mynewname == NULL) {
1978 		dsl_dir_rele(newparent, FTAG);
1979 		dsl_dir_rele(dd, FTAG);
1980 		return (SET_ERROR(EEXIST));
1981 	}
1982 
1983 	/* can't rename below anything but filesystems (eg. no ZVOLs) */
1984 	error = dsl_dataset_hold_obj(newparent->dd_pool,
1985 	    dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
1986 	if (error != 0) {
1987 		dsl_dir_rele(newparent, FTAG);
1988 		dsl_dir_rele(dd, FTAG);
1989 		return (error);
1990 	}
1991 	error = dmu_objset_from_ds(parentds, &parentos);
1992 	if (error != 0) {
1993 		dsl_dataset_rele(parentds, FTAG);
1994 		dsl_dir_rele(newparent, FTAG);
1995 		dsl_dir_rele(dd, FTAG);
1996 		return (error);
1997 	}
1998 	if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
1999 		dsl_dataset_rele(parentds, FTAG);
2000 		dsl_dir_rele(newparent, FTAG);
2001 		dsl_dir_rele(dd, FTAG);
2002 		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
2003 	}
2004 	dsl_dataset_rele(parentds, FTAG);
2005 
2006 	ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
2007 	    <, ZFS_MAX_DATASET_NAME_LEN);
2008 	ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
2009 	    <, ZFS_MAX_DATASET_NAME_LEN);
2010 	dvra.char_delta = strlen(ddra->ddra_newname)
2011 	    - strlen(ddra->ddra_oldname);
2012 	dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
2013 	    - get_dataset_depth(ddra->ddra_oldname);
2014 
2015 	/* if the name length is growing, validate child name lengths */
2016 	if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
2017 		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
2018 		    &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
2019 		if (error != 0) {
2020 			dsl_dir_rele(newparent, FTAG);
2021 			dsl_dir_rele(dd, FTAG);
2022 			return (error);
2023 		}
2024 	}
2025 
2026 	if (dmu_tx_is_syncing(tx)) {
2027 		if (spa_feature_is_active(dp->dp_spa,
2028 		    SPA_FEATURE_FS_SS_LIMIT)) {
2029 			/*
2030 			 * Although this is the check function and we don't
2031 			 * normally make on-disk changes in check functions,
2032 			 * we need to do that here.
2033 			 *
2034 			 * Ensure this portion of the tree's counts have been
2035 			 * initialized in case the new parent has limits set.
2036 			 */
2037 			dsl_dir_init_fs_ss_count(dd, tx);
2038 		}
2039 	}
2040 
2041 	if (newparent != dd->dd_parent) {
2042 		/* is there enough space? */
2043 		uint64_t myspace =
2044 		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
2045 		    dsl_dir_phys(dd)->dd_reserved);
2046 		objset_t *os = dd->dd_pool->dp_meta_objset;
2047 		uint64_t fs_cnt = 0;
2048 		uint64_t ss_cnt = 0;
2049 
2050 		if (dsl_dir_is_zapified(dd)) {
2051 			int err;
2052 
2053 			err = zap_lookup(os, dd->dd_object,
2054 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
2055 			    &fs_cnt);
2056 			if (err != ENOENT && err != 0) {
2057 				dsl_dir_rele(newparent, FTAG);
2058 				dsl_dir_rele(dd, FTAG);
2059 				return (err);
2060 			}
2061 
2062 			/*
2063 			 * have to add 1 for the filesystem itself that we're
2064 			 * moving
2065 			 */
2066 			fs_cnt++;
2067 
2068 			err = zap_lookup(os, dd->dd_object,
2069 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
2070 			    &ss_cnt);
2071 			if (err != ENOENT && err != 0) {
2072 				dsl_dir_rele(newparent, FTAG);
2073 				dsl_dir_rele(dd, FTAG);
2074 				return (err);
2075 			}
2076 		}
2077 
2078 		/* check for encryption errors */
2079 		error = dsl_dir_rename_crypt_check(dd, newparent);
2080 		if (error != 0) {
2081 			dsl_dir_rele(newparent, FTAG);
2082 			dsl_dir_rele(dd, FTAG);
2083 			return (SET_ERROR(EACCES));
2084 		}
2085 
2086 		/* no rename into our descendant */
2087 		if (closest_common_ancestor(dd, newparent) == dd) {
2088 			dsl_dir_rele(newparent, FTAG);
2089 			dsl_dir_rele(dd, FTAG);
2090 			return (SET_ERROR(EINVAL));
2091 		}
2092 
2093 		error = dsl_dir_transfer_possible(dd->dd_parent,
2094 		    newparent, fs_cnt, ss_cnt, myspace,
2095 		    ddra->ddra_cred, ddra->ddra_proc);
2096 		if (error != 0) {
2097 			dsl_dir_rele(newparent, FTAG);
2098 			dsl_dir_rele(dd, FTAG);
2099 			return (error);
2100 		}
2101 	}
2102 
2103 	dsl_dir_rele(newparent, FTAG);
2104 	dsl_dir_rele(dd, FTAG);
2105 	return (0);
2106 }
2107 
2108 static void
2109 dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
2110 {
2111 	dsl_dir_rename_arg_t *ddra = arg;
2112 	dsl_pool_t *dp = dmu_tx_pool(tx);
2113 	dsl_dir_t *dd, *newparent;
2114 	const char *mynewname;
2115 	objset_t *mos = dp->dp_meta_objset;
2116 
2117 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
2118 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
2119 	    &mynewname));
2120 
2121 	ASSERT3P(mynewname, !=, NULL);
2122 
2123 	/* Log this before we change the name. */
2124 	spa_history_log_internal_dd(dd, "rename", tx,
2125 	    "-> %s", ddra->ddra_newname);
2126 
2127 	if (newparent != dd->dd_parent) {
2128 		objset_t *os = dd->dd_pool->dp_meta_objset;
2129 		uint64_t fs_cnt = 0;
2130 		uint64_t ss_cnt = 0;
2131 
2132 		/*
2133 		 * We already made sure the dd counts were initialized in the
2134 		 * check function.
2135 		 */
2136 		if (spa_feature_is_active(dp->dp_spa,
2137 		    SPA_FEATURE_FS_SS_LIMIT)) {
2138 			VERIFY0(zap_lookup(os, dd->dd_object,
2139 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
2140 			    &fs_cnt));
2141 			/* add 1 for the filesystem itself that we're moving */
2142 			fs_cnt++;
2143 
2144 			VERIFY0(zap_lookup(os, dd->dd_object,
2145 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
2146 			    &ss_cnt));
2147 		}
2148 
2149 		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
2150 		    DD_FIELD_FILESYSTEM_COUNT, tx);
2151 		dsl_fs_ss_count_adjust(newparent, fs_cnt,
2152 		    DD_FIELD_FILESYSTEM_COUNT, tx);
2153 
2154 		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
2155 		    DD_FIELD_SNAPSHOT_COUNT, tx);
2156 		dsl_fs_ss_count_adjust(newparent, ss_cnt,
2157 		    DD_FIELD_SNAPSHOT_COUNT, tx);
2158 
2159 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
2160 		    -dsl_dir_phys(dd)->dd_used_bytes,
2161 		    -dsl_dir_phys(dd)->dd_compressed_bytes,
2162 		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
2163 		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
2164 		    dsl_dir_phys(dd)->dd_used_bytes,
2165 		    dsl_dir_phys(dd)->dd_compressed_bytes,
2166 		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
2167 
2168 		if (dsl_dir_phys(dd)->dd_reserved >
2169 		    dsl_dir_phys(dd)->dd_used_bytes) {
2170 			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
2171 			    dsl_dir_phys(dd)->dd_used_bytes;
2172 
2173 			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
2174 			    -unused_rsrv, 0, 0, tx);
2175 			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
2176 			    unused_rsrv, 0, 0, tx);
2177 		}
2178 	}
2179 
2180 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
2181 
2182 	/* remove from old parent zapobj */
2183 	VERIFY0(zap_remove(mos,
2184 	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
2185 	    dd->dd_myname, tx));
2186 
2187 	(void) strlcpy(dd->dd_myname, mynewname,
2188 	    sizeof (dd->dd_myname));
2189 	dsl_dir_rele(dd->dd_parent, dd);
2190 	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
2191 	VERIFY0(dsl_dir_hold_obj(dp,
2192 	    newparent->dd_object, NULL, dd, &dd->dd_parent));
2193 
2194 	/* add to new parent zapobj */
2195 	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
2196 	    dd->dd_myname, 8, 1, &dd->dd_object, tx));
2197 
2198 	/* TODO: A rename callback to avoid these layering violations. */
2199 	zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
2200 	zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
2201 	    ddra->ddra_newname, B_TRUE);
2202 
2203 	dsl_prop_notify_all(dd);
2204 
2205 	dsl_dir_rele(newparent, FTAG);
2206 	dsl_dir_rele(dd, FTAG);
2207 }
2208 
2209 int
2210 dsl_dir_rename(const char *oldname, const char *newname)
2211 {
2212 	dsl_dir_rename_arg_t ddra;
2213 
2214 	ddra.ddra_oldname = oldname;
2215 	ddra.ddra_newname = newname;
2216 	ddra.ddra_cred = CRED();
2217 	ddra.ddra_proc = curproc;
2218 
2219 	return (dsl_sync_task(oldname,
2220 	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
2221 	    3, ZFS_SPACE_CHECK_RESERVED));
2222 }
2223 
2224 int
2225 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
2226     uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space,
2227     cred_t *cr, proc_t *proc)
2228 {
2229 	dsl_dir_t *ancestor;
2230 	int64_t adelta;
2231 	uint64_t avail;
2232 	int err;
2233 
2234 	ancestor = closest_common_ancestor(sdd, tdd);
2235 	adelta = would_change(sdd, -space, ancestor);
2236 	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
2237 	if (avail < space)
2238 		return (SET_ERROR(ENOSPC));
2239 
2240 	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
2241 	    ancestor, cr, proc);
2242 	if (err != 0)
2243 		return (err);
2244 	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
2245 	    ancestor, cr, proc);
2246 	if (err != 0)
2247 		return (err);
2248 
2249 	return (0);
2250 }
2251 
2252 inode_timespec_t
2253 dsl_dir_snap_cmtime(dsl_dir_t *dd)
2254 {
2255 	inode_timespec_t t;
2256 
2257 	mutex_enter(&dd->dd_lock);
2258 	t = dd->dd_snap_cmtime;
2259 	mutex_exit(&dd->dd_lock);
2260 
2261 	return (t);
2262 }
2263 
2264 void
2265 dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx)
2266 {
2267 	dsl_pool_t *dp = dmu_tx_pool(tx);
2268 	inode_timespec_t t;
2269 	gethrestime(&t);
2270 
2271 	mutex_enter(&dd->dd_lock);
2272 	dd->dd_snap_cmtime = t;
2273 	if (spa_feature_is_enabled(dp->dp_spa,
2274 	    SPA_FEATURE_EXTENSIBLE_DATASET)) {
2275 		objset_t *mos = dd->dd_pool->dp_meta_objset;
2276 		uint64_t ddobj = dd->dd_object;
2277 		dsl_dir_zapify(dd, tx);
2278 		VERIFY0(zap_update(mos, ddobj,
2279 		    DD_FIELD_SNAPSHOTS_CHANGED,
2280 		    sizeof (uint64_t),
2281 		    sizeof (inode_timespec_t) / sizeof (uint64_t),
2282 		    &t, tx));
2283 	}
2284 	mutex_exit(&dd->dd_lock);
2285 }
2286 
2287 void
2288 dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
2289 {
2290 	objset_t *mos = dd->dd_pool->dp_meta_objset;
2291 	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
2292 }
2293 
2294 boolean_t
2295 dsl_dir_is_zapified(dsl_dir_t *dd)
2296 {
2297 	dmu_object_info_t doi;
2298 
2299 	dmu_object_info_from_db(dd->dd_dbuf, &doi);
2300 	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
2301 }
2302 
2303 void
2304 dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
2305 {
2306 	objset_t *mos = dd->dd_pool->dp_meta_objset;
2307 	ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
2308 	    SPA_FEATURE_LIVELIST));
2309 	dsl_deadlist_open(&dd->dd_livelist, mos, obj);
2310 	bplist_create(&dd->dd_pending_allocs);
2311 	bplist_create(&dd->dd_pending_frees);
2312 }
2313 
/*
 * Counterpart of dsl_dir_livelist_open(): close dd's livelist and destroy
 * the pending-allocs and pending-frees bplists created there.
 */
void
dsl_dir_livelist_close(dsl_dir_t *dd)
{
	dsl_deadlist_close(&dd->dd_livelist);
	bplist_destroy(&dd->dd_pending_allocs);
	bplist_destroy(&dd->dd_pending_frees);
}
2321 
2322 void
2323 dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
2324 {
2325 	uint64_t obj;
2326 	dsl_pool_t *dp = dmu_tx_pool(tx);
2327 	spa_t *spa = dp->dp_spa;
2328 	livelist_condense_entry_t to_condense = spa->spa_to_condense;
2329 
2330 	if (!dsl_deadlist_is_open(&dd->dd_livelist))
2331 		return;
2332 
2333 	/*
2334 	 * If the livelist being removed is set to be condensed, stop the
2335 	 * condense zthr and indicate the cancellation in the spa_to_condense
2336 	 * struct in case the condense no-wait synctask has already started
2337 	 */
2338 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
2339 	if (ll_condense_thread != NULL &&
2340 	    (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
2341 		/*
2342 		 * We use zthr_wait_cycle_done instead of zthr_cancel
2343 		 * because we don't want to destroy the zthr, just have
2344 		 * it skip its current task.
2345 		 */
2346 		spa->spa_to_condense.cancelled = B_TRUE;
2347 		zthr_wait_cycle_done(ll_condense_thread);
2348 		/*
2349 		 * If we've returned from zthr_wait_cycle_done without
2350 		 * clearing the to_condense data structure it's either
2351 		 * because the no-wait synctask has started (which is
2352 		 * indicated by 'syncing' field of to_condense) and we
2353 		 * can expect it to clear to_condense on its own.
2354 		 * Otherwise, we returned before the zthr ran. The
2355 		 * checkfunc will now fail as cancelled == B_TRUE so we
2356 		 * can safely NULL out ds, allowing a different dir's
2357 		 * livelist to be condensed.
2358 		 *
2359 		 * We can be sure that the to_condense struct will not
2360 		 * be repopulated at this stage because both this
2361 		 * function and dsl_livelist_try_condense execute in
2362 		 * syncing context.
2363 		 */
2364 		if ((spa->spa_to_condense.ds != NULL) &&
2365 		    !spa->spa_to_condense.syncing) {
2366 			dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
2367 			    spa);
2368 			spa->spa_to_condense.ds = NULL;
2369 		}
2370 	}
2371 
2372 	dsl_dir_livelist_close(dd);
2373 	VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
2374 	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
2375 	VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
2376 	    DD_FIELD_LIVELIST, tx));
2377 	if (total) {
2378 		dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
2379 		spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
2380 	}
2381 }
2382 
2383 static int
2384 dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds,
2385     zfs_wait_activity_t activity, boolean_t *in_progress)
2386 {
2387 	int error = 0;
2388 
2389 	ASSERT(MUTEX_HELD(&dd->dd_activity_lock));
2390 
2391 	switch (activity) {
2392 	case ZFS_WAIT_DELETEQ: {
2393 #ifdef _KERNEL
2394 		objset_t *os;
2395 		error = dmu_objset_from_ds(ds, &os);
2396 		if (error != 0)
2397 			break;
2398 
2399 		mutex_enter(&os->os_user_ptr_lock);
2400 		void *user = dmu_objset_get_user(os);
2401 		mutex_exit(&os->os_user_ptr_lock);
2402 		if (dmu_objset_type(os) != DMU_OST_ZFS ||
2403 		    user == NULL || zfs_get_vfs_flag_unmounted(os)) {
2404 			*in_progress = B_FALSE;
2405 			return (0);
2406 		}
2407 
2408 		uint64_t readonly = B_FALSE;
2409 		error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly,
2410 		    NULL);
2411 
2412 		if (error != 0)
2413 			break;
2414 
2415 		if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) {
2416 			*in_progress = B_FALSE;
2417 			return (0);
2418 		}
2419 
2420 		uint64_t count, unlinked_obj;
2421 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
2422 		    &unlinked_obj);
2423 		if (error != 0) {
2424 			dsl_dataset_rele(ds, FTAG);
2425 			break;
2426 		}
2427 		error = zap_count(os, unlinked_obj, &count);
2428 
2429 		if (error == 0)
2430 			*in_progress = (count != 0);
2431 		break;
2432 #else
2433 		/*
2434 		 * The delete queue is ZPL specific, and libzpool doesn't have
2435 		 * it. It doesn't make sense to wait for it.
2436 		 */
2437 		(void) ds;
2438 		*in_progress = B_FALSE;
2439 		break;
2440 #endif
2441 	}
2442 	default:
2443 		panic("unrecognized value for activity %d", activity);
2444 	}
2445 
2446 	return (error);
2447 }
2448 
2449 int
2450 dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity,
2451     boolean_t *waited)
2452 {
2453 	int error = 0;
2454 	boolean_t in_progress;
2455 	dsl_pool_t *dp = dd->dd_pool;
2456 	for (;;) {
2457 		dsl_pool_config_enter(dp, FTAG);
2458 		error = dsl_dir_activity_in_progress(dd, ds, activity,
2459 		    &in_progress);
2460 		dsl_pool_config_exit(dp, FTAG);
2461 		if (error != 0 || !in_progress)
2462 			break;
2463 
2464 		*waited = B_TRUE;
2465 
2466 		if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) ==
2467 		    0 || dd->dd_activity_cancelled) {
2468 			error = SET_ERROR(EINTR);
2469 			break;
2470 		}
2471 	}
2472 	return (error);
2473 }
2474 
/*
 * Cancel all threads waiting on dd's activity condvar: mark the directory's
 * activity as cancelled, wake every waiter, and do not return until
 * dd_activity_waiters has dropped to zero.
 */
void
dsl_dir_cancel_waiters(dsl_dir_t *dd)
{
	mutex_enter(&dd->dd_activity_lock);
	dd->dd_activity_cancelled = B_TRUE;
	cv_broadcast(&dd->dd_activity_cv);
	/* The same condvar is waited on here until the waiter count is 0. */
	while (dd->dd_activity_waiters > 0)
		cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock);
	mutex_exit(&dd->dd_activity_lock);
}
2485 
2486 #if defined(_KERNEL)
2487 EXPORT_SYMBOL(dsl_dir_set_quota);
2488 EXPORT_SYMBOL(dsl_dir_set_reservation);
2489 #endif
2490 
2491 /* CSTYLED */
2492 ZFS_MODULE_PARAM(zfs, , zvol_enforce_quotas, INT, ZMOD_RW,
2493 	"Enable strict ZVOL quota enforcment");
2494