xref: /titanic_41/usr/src/uts/common/fs/zfs/dsl_dir.c (revision ebaf8c4561997ef317f9b720badae0dc0905ec77)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
24  * Copyright (c) 2013 Martin Matuska. All rights reserved.
25  * Copyright (c) 2014 Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
28  */
29 
30 #include <sys/dmu.h>
31 #include <sys/dmu_objset.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/dsl_dataset.h>
34 #include <sys/dsl_dir.h>
35 #include <sys/dsl_prop.h>
36 #include <sys/dsl_synctask.h>
37 #include <sys/dsl_deleg.h>
38 #include <sys/dmu_impl.h>
39 #include <sys/spa.h>
40 #include <sys/metaslab.h>
41 #include <sys/zap.h>
42 #include <sys/zio.h>
43 #include <sys/arc.h>
44 #include <sys/sunddi.h>
45 #include <sys/zfeature.h>
46 #include <sys/policy.h>
47 #include <sys/zfs_znode.h>
48 #include "zfs_namecheck.h"
49 #include "zfs_prop.h"
50 
51 /*
52  * Filesystem and Snapshot Limits
53  * ------------------------------
54  *
55  * These limits are used to restrict the number of filesystems and/or snapshots
56  * that can be created at a given level in the tree or below. A typical
57  * use-case is with a delegated dataset where the administrator wants to ensure
58  * that a user within the zone is not creating too many additional filesystems
59  * or snapshots, even though they're not exceeding their space quota.
60  *
61  * The filesystem and snapshot counts are stored as extensible properties. This
62  * capability is controlled by a feature flag and must be enabled to be used.
63  * Once enabled, the feature is not active until the first limit is set. At
64  * that point, future operations to create/destroy filesystems or snapshots
65  * will validate and update the counts.
66  *
67  * Because the count properties will not exist before the feature is active,
68  * the counts are updated when a limit is first set on an uninitialized
69  * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
70  * all of the nested filesystems/snapshots. Thus, a new leaf node has a
71  * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
72  * snapshot count properties on a node indicate uninitialized counts on that
73  * node.) When first setting a limit on an uninitialized node, the code starts
74  * at the filesystem with the new limit and descends into all sub-filesystems
75  * to add the count properties.
76  *
77  * In practice this is lightweight since a limit is typically set when the
78  * filesystem is created and thus has no children. Once valid, changing the
79  * limit value won't require a re-traversal since the counts are already valid.
80  * When recursively fixing the counts, if a node with a limit is encountered
81  * during the descent, the counts are known to be valid and there is no need to
82  * descend into that filesystem's children. The counts on filesystems above the
83  * one with the new limit will still be uninitialized, unless a limit is
84  * eventually set on one of those filesystems. The counts are always recursively
85  * updated when a limit is set on a dataset, unless there is already a limit.
86  * When a new limit value is set on a filesystem with an existing limit, it is
87  * possible for the new limit to be less than the current count at that level
88  * since a user who can change the limit is also allowed to exceed the limit.
89  *
90  * Once the feature is active, then whenever a filesystem or snapshot is
91  * created, the code recurses up the tree, validating the new count against the
92  * limit at each initialized level. In practice, most levels will not have a
93  * limit set. If there is a limit at any initialized level up the tree, the
94  * check must pass or the creation will fail. Likewise, when a filesystem or
95  * snapshot is destroyed, the counts are recursively adjusted all the way up
 * the initialized nodes in the tree. Renaming a filesystem into a different
 * point in the tree will first validate, then update the counts on each branch
 * up to the common ancestor. A receive will also validate the counts and then
 * update them.
100  *
101  * An exception to the above behavior is that the limit is not enforced if the
102  * user has permission to modify the limit. This is primarily so that
103  * recursive snapshots in the global zone always work. We want to prevent a
104  * denial-of-service in which a lower level delegated dataset could max out its
105  * limit and thus block recursive snapshots from being taken in the global zone.
106  * Because of this, it is possible for the snapshot count to be over the limit
107  * and snapshots taken in the global zone could cause a lower level dataset to
108  * hit or exceed its limit. The administrator taking the global zone recursive
109  * snapshot should be aware of this side-effect and behave accordingly.
110  * For consistency, the filesystem limit is also not enforced if the user can
111  * modify the limit.
112  *
113  * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
 * and updated by dsl_fs_ss_count_adjust(). A new limit value is set up in
115  * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
116  * dsl_dir_init_fs_ss_count().
117  *
118  * There is a special case when we receive a filesystem that already exists. In
119  * this case a temporary clone name of %X is created (see dmu_recv_begin). We
120  * never update the filesystem counts for temporary clones.
121  *
122  * Likewise, we do not update the snapshot counts for temporary snapshots,
123  * such as those created by zfs diff.
124  */
125 
/*
 * early_edquot_threshold: tunable controlling EDQUOT behaviour.
 *
 * When this is non-zero, zfs does not always wait for a dirty txg to
 * complete when an operation cannot proceed because of space exhaustion.
 * Instead, the operation fails early when it is within a range of this
 * tunable around the quota.
 *
 * This greatly reduces the number of threads waiting for the txg to commit
 * when a busy filesystem is near its quota, especially in combination with
 * NFS, where each waiter occupies a server thread.
 *
 * The default is 32MB, expressed in bytes.
 */
uint64_t early_edquot_threshold = 32ULL << 20;	/* tunable */
136 
137 extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
138 
139 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
140 
141 static void
142 dsl_dir_evict_async(void *dbu)
143 {
144 	dsl_dir_t *dd = dbu;
145 	dsl_pool_t *dp = dd->dd_pool;
146 	int t;
147 
148 	dd->dd_dbuf = NULL;
149 
150 	for (t = 0; t < TXG_SIZE; t++) {
151 		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
152 		ASSERT(dd->dd_tempreserved[t] == 0);
153 		ASSERT(dd->dd_space_towrite[t] == 0);
154 	}
155 
156 	if (dd->dd_parent)
157 		dsl_dir_async_rele(dd->dd_parent, dd);
158 
159 	spa_async_close(dd->dd_pool->dp_spa, dd);
160 
161 	/*
162 	 * The props callback list should have been cleaned up by
163 	 * objset_evict().
164 	 */
165 	list_destroy(&dd->dd_prop_cbs);
166 	mutex_destroy(&dd->dd_lock);
167 	kmem_free(dd, sizeof (dsl_dir_t));
168 }
169 
170 int
171 dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
172     const char *tail, void *tag, dsl_dir_t **ddp)
173 {
174 	dmu_buf_t *dbuf;
175 	dsl_dir_t *dd;
176 	int err;
177 
178 	ASSERT(dsl_pool_config_held(dp));
179 
180 	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
181 	if (err != 0)
182 		return (err);
183 	dd = dmu_buf_get_user(dbuf);
184 #ifdef ZFS_DEBUG
185 	{
186 		dmu_object_info_t doi;
187 		dmu_object_info_from_db(dbuf, &doi);
188 		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
189 		ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
190 	}
191 #endif
192 	if (dd == NULL) {
193 		dsl_dir_t *winner;
194 
195 		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
196 		dd->dd_object = ddobj;
197 		dd->dd_dbuf = dbuf;
198 		dd->dd_pool = dp;
199 		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
200 
201 		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
202 		    offsetof(dsl_prop_cb_record_t, cbr_node));
203 
204 		dsl_dir_snap_cmtime_update(dd);
205 
206 		if (dsl_dir_phys(dd)->dd_parent_obj) {
207 			err = dsl_dir_hold_obj(dp,
208 			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
209 			    &dd->dd_parent);
210 			if (err != 0)
211 				goto errout;
212 			if (tail) {
213 #ifdef ZFS_DEBUG
214 				uint64_t foundobj;
215 
216 				err = zap_lookup(dp->dp_meta_objset,
217 				    dsl_dir_phys(dd->dd_parent)->
218 				    dd_child_dir_zapobj, tail,
219 				    sizeof (foundobj), 1, &foundobj);
220 				ASSERT(err || foundobj == ddobj);
221 #endif
222 				(void) strcpy(dd->dd_myname, tail);
223 			} else {
224 				err = zap_value_search(dp->dp_meta_objset,
225 				    dsl_dir_phys(dd->dd_parent)->
226 				    dd_child_dir_zapobj,
227 				    ddobj, 0, dd->dd_myname);
228 			}
229 			if (err != 0)
230 				goto errout;
231 		} else {
232 			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
233 		}
234 
235 		if (dsl_dir_is_clone(dd)) {
236 			dmu_buf_t *origin_bonus;
237 			dsl_dataset_phys_t *origin_phys;
238 
239 			/*
240 			 * We can't open the origin dataset, because
241 			 * that would require opening this dsl_dir.
242 			 * Just look at its phys directly instead.
243 			 */
244 			err = dmu_bonus_hold(dp->dp_meta_objset,
245 			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
246 			    &origin_bonus);
247 			if (err != 0)
248 				goto errout;
249 			origin_phys = origin_bonus->db_data;
250 			dd->dd_origin_txg =
251 			    origin_phys->ds_creation_txg;
252 			dmu_buf_rele(origin_bonus, FTAG);
253 		}
254 
255 		dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
256 		    &dd->dd_dbuf);
257 		winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
258 		if (winner != NULL) {
259 			if (dd->dd_parent)
260 				dsl_dir_rele(dd->dd_parent, dd);
261 			mutex_destroy(&dd->dd_lock);
262 			kmem_free(dd, sizeof (dsl_dir_t));
263 			dd = winner;
264 		} else {
265 			spa_open_ref(dp->dp_spa, dd);
266 		}
267 	}
268 
269 	/*
270 	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
271 	 * holds on the spa.  We need the open-to-close holds because
272 	 * otherwise the spa_refcnt wouldn't change when we open a
273 	 * dir which the spa also has open, so we could incorrectly
274 	 * think it was OK to unload/export/destroy the pool.  We need
275 	 * the instantiate-to-evict hold because the dsl_dir_t has a
276 	 * pointer to the dd_pool, which has a pointer to the spa_t.
277 	 */
278 	spa_open_ref(dp->dp_spa, tag);
279 	ASSERT3P(dd->dd_pool, ==, dp);
280 	ASSERT3U(dd->dd_object, ==, ddobj);
281 	ASSERT3P(dd->dd_dbuf, ==, dbuf);
282 	*ddp = dd;
283 	return (0);
284 
285 errout:
286 	if (dd->dd_parent)
287 		dsl_dir_rele(dd->dd_parent, dd);
288 	mutex_destroy(&dd->dd_lock);
289 	kmem_free(dd, sizeof (dsl_dir_t));
290 	dmu_buf_rele(dbuf, tag);
291 	return (err);
292 }
293 
294 void
295 dsl_dir_rele(dsl_dir_t *dd, void *tag)
296 {
297 	dprintf_dd(dd, "%s\n", "");
298 	spa_close(dd->dd_pool->dp_spa, tag);
299 	dmu_buf_rele(dd->dd_dbuf, tag);
300 }
301 
302 /*
303  * Remove a reference to the given dsl dir that is being asynchronously
304  * released.  Async releases occur from a taskq performing eviction of
305  * dsl datasets and dirs.  This process is identical to a normal release
306  * with the exception of using the async API for releasing the reference on
307  * the spa.
308  */
309 void
310 dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
311 {
312 	dprintf_dd(dd, "%s\n", "");
313 	spa_async_close(dd->dd_pool->dp_spa, tag);
314 	dmu_buf_rele(dd->dd_dbuf, tag);
315 }
316 
317 /* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
318 void
319 dsl_dir_name(dsl_dir_t *dd, char *buf)
320 {
321 	if (dd->dd_parent) {
322 		dsl_dir_name(dd->dd_parent, buf);
323 		VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
324 		    ZFS_MAX_DATASET_NAME_LEN);
325 	} else {
326 		buf[0] = '\0';
327 	}
328 	if (!MUTEX_HELD(&dd->dd_lock)) {
329 		/*
330 		 * recursive mutex so that we can use
331 		 * dprintf_dd() with dd_lock held
332 		 */
333 		mutex_enter(&dd->dd_lock);
334 		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
335 		    <, ZFS_MAX_DATASET_NAME_LEN);
336 		mutex_exit(&dd->dd_lock);
337 	} else {
338 		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
339 		    <, ZFS_MAX_DATASET_NAME_LEN);
340 	}
341 }
342 
343 /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
344 int
345 dsl_dir_namelen(dsl_dir_t *dd)
346 {
347 	int result = 0;
348 
349 	if (dd->dd_parent) {
350 		/* parent's name + 1 for the "/" */
351 		result = dsl_dir_namelen(dd->dd_parent) + 1;
352 	}
353 
354 	if (!MUTEX_HELD(&dd->dd_lock)) {
355 		/* see dsl_dir_name */
356 		mutex_enter(&dd->dd_lock);
357 		result += strlen(dd->dd_myname);
358 		mutex_exit(&dd->dd_lock);
359 	} else {
360 		result += strlen(dd->dd_myname);
361 	}
362 
363 	return (result);
364 }
365 
366 static int
367 getcomponent(const char *path, char *component, const char **nextp)
368 {
369 	char *p;
370 
371 	if ((path == NULL) || (path[0] == '\0'))
372 		return (SET_ERROR(ENOENT));
373 	/* This would be a good place to reserve some namespace... */
374 	p = strpbrk(path, "/@");
375 	if (p && (p[1] == '/' || p[1] == '@')) {
376 		/* two separators in a row */
377 		return (SET_ERROR(EINVAL));
378 	}
379 	if (p == NULL || p == path) {
380 		/*
381 		 * if the first thing is an @ or /, it had better be an
382 		 * @ and it had better not have any more ats or slashes,
383 		 * and it had better have something after the @.
384 		 */
385 		if (p != NULL &&
386 		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
387 			return (SET_ERROR(EINVAL));
388 		if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
389 			return (SET_ERROR(ENAMETOOLONG));
390 		(void) strcpy(component, path);
391 		p = NULL;
392 	} else if (p[0] == '/') {
393 		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
394 			return (SET_ERROR(ENAMETOOLONG));
395 		(void) strncpy(component, path, p - path);
396 		component[p - path] = '\0';
397 		p++;
398 	} else if (p[0] == '@') {
399 		/*
400 		 * if the next separator is an @, there better not be
401 		 * any more slashes.
402 		 */
403 		if (strchr(path, '/'))
404 			return (SET_ERROR(EINVAL));
405 		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
406 			return (SET_ERROR(ENAMETOOLONG));
407 		(void) strncpy(component, path, p - path);
408 		component[p - path] = '\0';
409 	} else {
410 		panic("invalid p=%p", (void *)p);
411 	}
412 	*nextp = p;
413 	return (0);
414 }
415 
416 /*
417  * Return the dsl_dir_t, and possibly the last component which couldn't
418  * be found in *tail.  The name must be in the specified dsl_pool_t.  This
419  * thread must hold the dp_config_rwlock for the pool.  Returns NULL if the
420  * path is bogus, or if tail==NULL and we couldn't parse the whole name.
421  * (*tail)[0] == '@' means that the last component is a snapshot.
422  */
423 int
424 dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
425     dsl_dir_t **ddp, const char **tailp)
426 {
427 	char buf[ZFS_MAX_DATASET_NAME_LEN];
428 	const char *spaname, *next, *nextnext = NULL;
429 	int err;
430 	dsl_dir_t *dd;
431 	uint64_t ddobj;
432 
433 	err = getcomponent(name, buf, &next);
434 	if (err != 0)
435 		return (err);
436 
437 	/* Make sure the name is in the specified pool. */
438 	spaname = spa_name(dp->dp_spa);
439 	if (strcmp(buf, spaname) != 0)
440 		return (SET_ERROR(EXDEV));
441 
442 	ASSERT(dsl_pool_config_held(dp));
443 
444 	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
445 	if (err != 0) {
446 		return (err);
447 	}
448 
449 	while (next != NULL) {
450 		dsl_dir_t *child_dd;
451 		err = getcomponent(next, buf, &nextnext);
452 		if (err != 0)
453 			break;
454 		ASSERT(next[0] != '\0');
455 		if (next[0] == '@')
456 			break;
457 		dprintf("looking up %s in obj%lld\n",
458 		    buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
459 
460 		err = zap_lookup(dp->dp_meta_objset,
461 		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
462 		    buf, sizeof (ddobj), 1, &ddobj);
463 		if (err != 0) {
464 			if (err == ENOENT)
465 				err = 0;
466 			break;
467 		}
468 
469 		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
470 		if (err != 0)
471 			break;
472 		dsl_dir_rele(dd, tag);
473 		dd = child_dd;
474 		next = nextnext;
475 	}
476 
477 	if (err != 0) {
478 		dsl_dir_rele(dd, tag);
479 		return (err);
480 	}
481 
482 	/*
483 	 * It's an error if there's more than one component left, or
484 	 * tailp==NULL and there's any component left.
485 	 */
486 	if (next != NULL &&
487 	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
488 		/* bad path name */
489 		dsl_dir_rele(dd, tag);
490 		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
491 		err = SET_ERROR(ENOENT);
492 	}
493 	if (tailp != NULL)
494 		*tailp = next;
495 	*ddp = dd;
496 	return (err);
497 }
498 
499 /*
500  * If the counts are already initialized for this filesystem and its
501  * descendants then do nothing, otherwise initialize the counts.
502  *
503  * The counts on this filesystem, and those below, may be uninitialized due to
504  * either the use of a pre-existing pool which did not support the
505  * filesystem/snapshot limit feature, or one in which the feature had not yet
506  * been enabled.
507  *
508  * Recursively descend the filesystem tree and update the filesystem/snapshot
509  * counts on each filesystem below, then update the cumulative count on the
510  * current filesystem. If the filesystem already has a count set on it,
511  * then we know that its counts, and the counts on the filesystems below it,
512  * are already correct, so we don't have to update this filesystem.
513  */
514 static void
515 dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
516 {
517 	uint64_t my_fs_cnt = 0;
518 	uint64_t my_ss_cnt = 0;
519 	dsl_pool_t *dp = dd->dd_pool;
520 	objset_t *os = dp->dp_meta_objset;
521 	zap_cursor_t *zc;
522 	zap_attribute_t *za;
523 	dsl_dataset_t *ds;
524 
525 	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
526 	ASSERT(dsl_pool_config_held(dp));
527 	ASSERT(dmu_tx_is_syncing(tx));
528 
529 	dsl_dir_zapify(dd, tx);
530 
531 	/*
532 	 * If the filesystem count has already been initialized then we
533 	 * don't need to recurse down any further.
534 	 */
535 	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
536 		return;
537 
538 	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
539 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
540 
541 	/* Iterate my child dirs */
542 	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
543 	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
544 		dsl_dir_t *chld_dd;
545 		uint64_t count;
546 
547 		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
548 		    &chld_dd));
549 
550 		/*
551 		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
552 		 * temporary datasets.
553 		 */
554 		if (chld_dd->dd_myname[0] == '$' ||
555 		    chld_dd->dd_myname[0] == '%') {
556 			dsl_dir_rele(chld_dd, FTAG);
557 			continue;
558 		}
559 
560 		my_fs_cnt++;	/* count this child */
561 
562 		dsl_dir_init_fs_ss_count(chld_dd, tx);
563 
564 		VERIFY0(zap_lookup(os, chld_dd->dd_object,
565 		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
566 		my_fs_cnt += count;
567 		VERIFY0(zap_lookup(os, chld_dd->dd_object,
568 		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
569 		my_ss_cnt += count;
570 
571 		dsl_dir_rele(chld_dd, FTAG);
572 	}
573 	zap_cursor_fini(zc);
574 	/* Count my snapshots (we counted children's snapshots above) */
575 	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
576 	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
577 
578 	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
579 	    zap_cursor_retrieve(zc, za) == 0;
580 	    zap_cursor_advance(zc)) {
581 		/* Don't count temporary snapshots */
582 		if (za->za_name[0] != '%')
583 			my_ss_cnt++;
584 	}
585 	zap_cursor_fini(zc);
586 
587 	dsl_dataset_rele(ds, FTAG);
588 
589 	kmem_free(zc, sizeof (zap_cursor_t));
590 	kmem_free(za, sizeof (zap_attribute_t));
591 
592 	/* we're in a sync task, update counts */
593 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
594 	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
595 	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
596 	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
597 	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
598 }
599 
600 static int
601 dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
602 {
603 	char *ddname = (char *)arg;
604 	dsl_pool_t *dp = dmu_tx_pool(tx);
605 	dsl_dataset_t *ds;
606 	dsl_dir_t *dd;
607 	int error;
608 
609 	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
610 	if (error != 0)
611 		return (error);
612 
613 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
614 		dsl_dataset_rele(ds, FTAG);
615 		return (SET_ERROR(ENOTSUP));
616 	}
617 
618 	dd = ds->ds_dir;
619 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
620 	    dsl_dir_is_zapified(dd) &&
621 	    zap_contains(dp->dp_meta_objset, dd->dd_object,
622 	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
623 		dsl_dataset_rele(ds, FTAG);
624 		return (SET_ERROR(EALREADY));
625 	}
626 
627 	dsl_dataset_rele(ds, FTAG);
628 	return (0);
629 }
630 
631 static void
632 dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
633 {
634 	char *ddname = (char *)arg;
635 	dsl_pool_t *dp = dmu_tx_pool(tx);
636 	dsl_dataset_t *ds;
637 	spa_t *spa;
638 
639 	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
640 
641 	spa = dsl_dataset_get_spa(ds);
642 
643 	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
644 		/*
645 		 * Since the feature was not active and we're now setting a
646 		 * limit, increment the feature-active counter so that the
647 		 * feature becomes active for the first time.
648 		 *
649 		 * We are already in a sync task so we can update the MOS.
650 		 */
651 		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
652 	}
653 
654 	/*
655 	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
656 	 * we need to ensure the counts are correct. Descend down the tree from
657 	 * this point and update all of the counts to be accurate.
658 	 */
659 	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
660 
661 	dsl_dataset_rele(ds, FTAG);
662 }
663 
664 /*
665  * Make sure the feature is enabled and activate it if necessary.
666  * Since we're setting a limit, ensure the on-disk counts are valid.
667  * This is only called by the ioctl path when setting a limit value.
668  *
669  * We do not need to validate the new limit, since users who can change the
670  * limit are also allowed to exceed the limit.
671  */
672 int
673 dsl_dir_activate_fs_ss_limit(const char *ddname)
674 {
675 	int error;
676 
677 	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
678 	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
679 	    ZFS_SPACE_CHECK_RESERVED);
680 
681 	if (error == EALREADY)
682 		error = 0;
683 
684 	return (error);
685 }
686 
687 /*
688  * Used to determine if the filesystem_limit or snapshot_limit should be
689  * enforced. We allow the limit to be exceeded if the user has permission to
690  * write the property value. We pass in the creds that we got in the open
691  * context since we will always be the GZ root in syncing context. We also have
692  * to handle the case where we are allowed to change the limit on the current
693  * dataset, but there may be another limit in the tree above.
694  *
695  * We can never modify these two properties within a non-global zone. In
696  * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
697  * can't use that function since we are already holding the dp_config_rwlock.
698  * In addition, we already have the dd and dealing with snapshots is simplified
699  * in this code.
700  */
701 
/*
 * Result of dsl_enforce_ds_ss_limits(), consumed by dsl_fs_ss_limit_check().
 */
typedef enum {
	/* The limit on this dir is checked, and so are those above it. */
	ENFORCE_ALWAYS = 0,
	/* No limit is checked; the limit check succeeds immediately. */
	ENFORCE_NEVER = 1,
	/* The limit on this dir is skipped; those above it are checked. */
	ENFORCE_ABOVE = 2
} enforce_res_t;
707 
708 static enforce_res_t
709 dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
710 {
711 	enforce_res_t enforce = ENFORCE_ALWAYS;
712 	uint64_t obj;
713 	dsl_dataset_t *ds;
714 	uint64_t zoned;
715 
716 	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
717 	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
718 
719 #ifdef _KERNEL
720 	if (crgetzoneid(cr) != GLOBAL_ZONEID)
721 		return (ENFORCE_ALWAYS);
722 
723 	if (secpolicy_zfs(cr) == 0)
724 		return (ENFORCE_NEVER);
725 #endif
726 
727 	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
728 		return (ENFORCE_ALWAYS);
729 
730 	ASSERT(dsl_pool_config_held(dd->dd_pool));
731 
732 	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
733 		return (ENFORCE_ALWAYS);
734 
735 	if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
736 		/* Only root can access zoned fs's from the GZ */
737 		enforce = ENFORCE_ALWAYS;
738 	} else {
739 		if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
740 			enforce = ENFORCE_ABOVE;
741 	}
742 
743 	dsl_dataset_rele(ds, FTAG);
744 	return (enforce);
745 }
746 
747 /*
748  * Check if adding additional child filesystem(s) would exceed any filesystem
749  * limits or adding additional snapshot(s) would exceed any snapshot limits.
750  * The prop argument indicates which limit to check.
751  *
752  * Note that all filesystem limits up to the root (or the highest
753  * initialized) filesystem or the given ancestor must be satisfied.
754  */
755 int
756 dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
757     dsl_dir_t *ancestor, cred_t *cr)
758 {
759 	objset_t *os = dd->dd_pool->dp_meta_objset;
760 	uint64_t limit, count;
761 	char *count_prop;
762 	enforce_res_t enforce;
763 	int err = 0;
764 
765 	ASSERT(dsl_pool_config_held(dd->dd_pool));
766 	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
767 	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
768 
769 	/*
770 	 * If we're allowed to change the limit, don't enforce the limit
771 	 * e.g. this can happen if a snapshot is taken by an administrative
772 	 * user in the global zone (i.e. a recursive snapshot by root).
773 	 * However, we must handle the case of delegated permissions where we
774 	 * are allowed to change the limit on the current dataset, but there
775 	 * is another limit in the tree above.
776 	 */
777 	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
778 	if (enforce == ENFORCE_NEVER)
779 		return (0);
780 
781 	/*
782 	 * e.g. if renaming a dataset with no snapshots, count adjustment
783 	 * is 0.
784 	 */
785 	if (delta == 0)
786 		return (0);
787 
788 	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
789 		/*
790 		 * We don't enforce the limit for temporary snapshots. This is
791 		 * indicated by a NULL cred_t argument.
792 		 */
793 		if (cr == NULL)
794 			return (0);
795 
796 		count_prop = DD_FIELD_SNAPSHOT_COUNT;
797 	} else {
798 		count_prop = DD_FIELD_FILESYSTEM_COUNT;
799 	}
800 
801 	/*
802 	 * If an ancestor has been provided, stop checking the limit once we
803 	 * hit that dir. We need this during rename so that we don't overcount
804 	 * the check once we recurse up to the common ancestor.
805 	 */
806 	if (ancestor == dd)
807 		return (0);
808 
809 	/*
810 	 * If we hit an uninitialized node while recursing up the tree, we can
811 	 * stop since we know there is no limit here (or above). The counts are
812 	 * not valid on this node and we know we won't touch this node's counts.
813 	 */
814 	if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
815 	    count_prop, sizeof (count), 1, &count) == ENOENT)
816 		return (0);
817 
818 	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
819 	    B_FALSE);
820 	if (err != 0)
821 		return (err);
822 
823 	/* Is there a limit which we've hit? */
824 	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
825 		return (SET_ERROR(EDQUOT));
826 
827 	if (dd->dd_parent != NULL)
828 		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
829 		    ancestor, cr);
830 
831 	return (err);
832 }
833 
834 /*
835  * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
836  * parents. When a new filesystem/snapshot is created, increment the count on
837  * all parents, and when a filesystem/snapshot is destroyed, decrement the
838  * count.
839  */
840 void
841 dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
842     dmu_tx_t *tx)
843 {
844 	int err;
845 	objset_t *os = dd->dd_pool->dp_meta_objset;
846 	uint64_t count;
847 
848 	ASSERT(dsl_pool_config_held(dd->dd_pool));
849 	ASSERT(dmu_tx_is_syncing(tx));
850 	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
851 	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
852 
853 	/*
854 	 * When we receive an incremental stream into a filesystem that already
855 	 * exists, a temporary clone is created.  We don't count this temporary
856 	 * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
857 	 * $MOS & $ORIGIN) objsets.
858 	 */
859 	if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
860 	    strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
861 		return;
862 
863 	/*
864 	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
865 	 */
866 	if (delta == 0)
867 		return;
868 
869 	/*
870 	 * If we hit an uninitialized node while recursing up the tree, we can
871 	 * stop since we know the counts are not valid on this node and we
872 	 * know we shouldn't touch this node's counts. An uninitialized count
873 	 * on the node indicates that either the feature has not yet been
874 	 * activated or there are no limits on this part of the tree.
875 	 */
876 	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
877 	    prop, sizeof (count), 1, &count)) == ENOENT)
878 		return;
879 	VERIFY0(err);
880 
881 	count += delta;
882 	/* Use a signed verify to make sure we're not neg. */
883 	VERIFY3S(count, >=, 0);
884 
885 	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
886 	    tx));
887 
888 	/* Roll up this additional count into our ancestors */
889 	if (dd->dd_parent != NULL)
890 		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
891 }
892 
893 uint64_t
894 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
895     dmu_tx_t *tx)
896 {
897 	objset_t *mos = dp->dp_meta_objset;
898 	uint64_t ddobj;
899 	dsl_dir_phys_t *ddphys;
900 	dmu_buf_t *dbuf;
901 
902 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
903 	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
904 	if (pds) {
905 		VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
906 		    name, sizeof (uint64_t), 1, &ddobj, tx));
907 	} else {
908 		/* it's the root dir */
909 		VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
910 		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
911 	}
912 	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
913 	dmu_buf_will_dirty(dbuf, tx);
914 	ddphys = dbuf->db_data;
915 
916 	ddphys->dd_creation_time = gethrestime_sec();
917 	if (pds) {
918 		ddphys->dd_parent_obj = pds->dd_object;
919 
920 		/* update the filesystem counts */
921 		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
922 	}
923 	ddphys->dd_props_zapobj = zap_create(mos,
924 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
925 	ddphys->dd_child_dir_zapobj = zap_create(mos,
926 	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
927 	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
928 		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
929 	dmu_buf_rele(dbuf, FTAG);
930 
931 	return (ddobj);
932 }
933 
934 boolean_t
935 dsl_dir_is_clone(dsl_dir_t *dd)
936 {
937 	return (dsl_dir_phys(dd)->dd_origin_obj &&
938 	    (dd->dd_pool->dp_origin_snap == NULL ||
939 	    dsl_dir_phys(dd)->dd_origin_obj !=
940 	    dd->dd_pool->dp_origin_snap->ds_object));
941 }
942 
943 void
944 dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
945 {
946 	mutex_enter(&dd->dd_lock);
947 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
948 	    dsl_dir_phys(dd)->dd_used_bytes);
949 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
950 	    dsl_dir_phys(dd)->dd_quota);
951 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
952 	    dsl_dir_phys(dd)->dd_reserved);
953 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
954 	    dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
955 	    (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
956 	    dsl_dir_phys(dd)->dd_compressed_bytes));
957 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
958 	    dsl_dir_phys(dd)->dd_uncompressed_bytes);
959 	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
960 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
961 		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
962 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
963 		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
964 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
965 		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
966 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
967 		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
968 		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
969 	}
970 	mutex_exit(&dd->dd_lock);
971 
972 	if (dsl_dir_is_zapified(dd)) {
973 		uint64_t count;
974 		objset_t *os = dd->dd_pool->dp_meta_objset;
975 
976 		if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
977 		    sizeof (count), 1, &count) == 0) {
978 			dsl_prop_nvlist_add_uint64(nv,
979 			    ZFS_PROP_FILESYSTEM_COUNT, count);
980 		}
981 		if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
982 		    sizeof (count), 1, &count) == 0) {
983 			dsl_prop_nvlist_add_uint64(nv,
984 			    ZFS_PROP_SNAPSHOT_COUNT, count);
985 		}
986 	}
987 
988 	if (dsl_dir_is_clone(dd)) {
989 		dsl_dataset_t *ds;
990 		char buf[ZFS_MAX_DATASET_NAME_LEN];
991 
992 		VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
993 		    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
994 		dsl_dataset_name(ds, buf);
995 		dsl_dataset_rele(ds, FTAG);
996 		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
997 	}
998 }
999 
1000 void
1001 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
1002 {
1003 	dsl_pool_t *dp = dd->dd_pool;
1004 
1005 	ASSERT(dsl_dir_phys(dd));
1006 
1007 	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
1008 		/* up the hold count until we can be written out */
1009 		dmu_buf_add_ref(dd->dd_dbuf, dd);
1010 	}
1011 }
1012 
1013 static int64_t
1014 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
1015 {
1016 	uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
1017 	uint64_t new_accounted =
1018 	    MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
1019 	return (new_accounted - old_accounted);
1020 }
1021 
1022 void
1023 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
1024 {
1025 	ASSERT(dmu_tx_is_syncing(tx));
1026 
1027 	mutex_enter(&dd->dd_lock);
1028 	ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
1029 	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
1030 	    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
1031 	dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
1032 	mutex_exit(&dd->dd_lock);
1033 
1034 	/* release the hold from dsl_dir_dirty */
1035 	dmu_buf_rele(dd->dd_dbuf, dd);
1036 }
1037 
1038 static uint64_t
1039 dsl_dir_space_towrite(dsl_dir_t *dd)
1040 {
1041 	uint64_t space = 0;
1042 	int i;
1043 
1044 	ASSERT(MUTEX_HELD(&dd->dd_lock));
1045 
1046 	for (i = 0; i < TXG_SIZE; i++) {
1047 		space += dd->dd_space_towrite[i&TXG_MASK];
1048 		ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
1049 	}
1050 	return (space);
1051 }
1052 
1053 /*
1054  * How much space would dd have available if ancestor had delta applied
1055  * to it?  If ondiskonly is set, we're only interested in what's
1056  * on-disk, not estimated pending changes.
1057  */
1058 uint64_t
1059 dsl_dir_space_available(dsl_dir_t *dd,
1060     dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
1061 {
1062 	uint64_t parentspace, myspace, quota, used;
1063 
1064 	/*
1065 	 * If there are no restrictions otherwise, assume we have
1066 	 * unlimited space available.
1067 	 */
1068 	quota = UINT64_MAX;
1069 	parentspace = UINT64_MAX;
1070 
1071 	if (dd->dd_parent != NULL) {
1072 		parentspace = dsl_dir_space_available(dd->dd_parent,
1073 		    ancestor, delta, ondiskonly);
1074 	}
1075 
1076 	mutex_enter(&dd->dd_lock);
1077 	if (dsl_dir_phys(dd)->dd_quota != 0)
1078 		quota = dsl_dir_phys(dd)->dd_quota;
1079 	used = dsl_dir_phys(dd)->dd_used_bytes;
1080 	if (!ondiskonly)
1081 		used += dsl_dir_space_towrite(dd);
1082 
1083 	if (dd->dd_parent == NULL) {
1084 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
1085 		quota = MIN(quota, poolsize);
1086 	}
1087 
1088 	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
1089 		/*
1090 		 * We have some space reserved, in addition to what our
1091 		 * parent gave us.
1092 		 */
1093 		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
1094 	}
1095 
1096 	if (dd == ancestor) {
1097 		ASSERT(delta <= 0);
1098 		ASSERT(used >= -delta);
1099 		used += delta;
1100 		if (parentspace != UINT64_MAX)
1101 			parentspace -= delta;
1102 	}
1103 
1104 	if (used > quota) {
1105 		/* over quota */
1106 		myspace = 0;
1107 	} else {
1108 		/*
1109 		 * the lesser of the space provided by our parent and
1110 		 * the space left in our quota
1111 		 */
1112 		myspace = MIN(parentspace, quota - used);
1113 	}
1114 
1115 	mutex_exit(&dd->dd_lock);
1116 
1117 	return (myspace);
1118 }
1119 
/*
 * One record on the list built by dsl_dir_tempreserve_space() and
 * dsl_dir_tempreserve_impl(); dsl_dir_tempreserve_clear() walks the list
 * and undoes each record.
 */
struct tempreserve {
	/* linkage on the reservation list */
	list_node_t tr_node;
	/*
	 * dsl_dir whose dd_tempreserved was increased, or NULL for the
	 * record that tracks the ARC reservation.
	 */
	dsl_dir_t *tr_ds;
	/* number of bytes reserved by this record */
	uint64_t tr_size;
};
1125 
/*
 * Try to reserve "asize" bytes against dd for tx's txg, then recurse up
 * the parent chain with the portion that parent_delta() says the parent
 * has to account for.
 *
 * For every dsl_dir whose dd_tempreserved is increased, a struct
 * tempreserve is appended to tr_list.  The records stay on the list even
 * when a later (parent) reservation fails; the caller undoes them with
 * dsl_dir_tempreserve_clear().
 *
 * netfree, ignorequota: when either is set, dd's own quota is not
 *	enforced (it is treated as UINT64_MAX); netfree is also handed to
 *	dsl_pool_adjustedsize() at the root dir.
 * checkrefquota: passed through to dsl_dataset_check_quota(), which is
 *	only called on the first iteration and when tx has an objset.
 * first: TRUE for the initial call, FALSE for the recursive calls.
 *
 * Returns 0 on success, the error from dsl_dataset_check_quota(), or
 * EDQUOT / ENOSPC / ERESTART when the estimated usage is at or over the
 * effective quota.
 */
static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
    boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
    dmu_tx_t *tx, boolean_t first)
{
	uint64_t txg = tx->tx_txg;
	uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
	uint64_t deferred = 0;
	struct tempreserve *tr;
	int retval = EDQUOT;
	int txgidx = txg & TXG_MASK;
	int i;
	uint64_t ref_rsrv = 0;

	ASSERT3U(txg, !=, 0);
	ASSERT3S(asize, >, 0);

	mutex_enter(&dd->dd_lock);

	/*
	 * Check against the dsl_dir's quota.  We don't add in the delta
	 * when checking for over-quota because they get one free hit.
	 */
	est_inflight = dsl_dir_space_towrite(dd);
	for (i = 0; i < TXG_SIZE; i++)
		est_inflight += dd->dd_tempreserved[i];
	used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;

	/*
	 * On the first iteration, fetch the dataset's used-on-disk and
	 * refreservation values. Also, if checkrefquota is set, test if
	 * allocating this space would exceed the dataset's refquota.
	 */
	if (first && tx->tx_objset) {
		int error;
		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

		error = dsl_dataset_check_quota(ds, checkrefquota,
		    asize, est_inflight, &used_on_disk, &ref_rsrv);
		if (error) {
			mutex_exit(&dd->dd_lock);
			return (error);
		}
	}

	/*
	 * If this transaction will result in a net free of space,
	 * we want to let it through.
	 */
	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
		quota = UINT64_MAX;
	else
		quota = dsl_dir_phys(dd)->dd_quota;

	/*
	 * Adjust the quota against the actual pool size at the root
	 * minus any outstanding deferred frees.
	 * To ensure that it's possible to remove files from a full
	 * pool without inducing transient overcommits, we throttle
	 * netfree transactions against a quota that is slightly larger,
	 * but still within the pool's allocation slop.  In cases where
	 * we're very close to full, this will allow a steady trickle of
	 * removes to get through.
	 */
	if (dd->dd_parent == NULL) {
		spa_t *spa = dd->dd_pool->dp_spa;
		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
		deferred = metaslab_class_get_deferred(spa_normal_class(spa));
		if (poolsize - deferred < quota) {
			quota = poolsize - deferred;
			/* the pool, not a quota, is now the limiting factor */
			retval = ENOSPC;
		}
	}

	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes (which
	 * may free up space for us).
	 */
	if (used_on_disk + est_inflight >= quota) {
		if (est_inflight > early_edquot_threshold ||
		    used_on_disk + early_edquot_threshold < quota ||
		    (retval == ENOSPC && used_on_disk < quota + deferred))
			retval = ERESTART;
		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
		    "quota=%lluK tr=%lluK err=%d\n",
		    used_on_disk>>10, est_inflight>>10,
		    quota>>10, asize>>10, retval);
		mutex_exit(&dd->dd_lock);
		return (SET_ERROR(retval));
	}

	/* We need to up our estimated delta before dropping dd_lock */
	dd->dd_tempreserved[txgidx] += asize;

	parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
	    asize - ref_rsrv);
	mutex_exit(&dd->dd_lock);

	/* record the reservation so dsl_dir_tempreserve_clear() can undo it */
	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
	tr->tr_ds = dd;
	tr->tr_size = asize;
	list_insert_tail(tr_list, tr);

	/* see if it's OK with our parent */
	if (dd->dd_parent && parent_rsrv) {
		/*
		 * A dir with no head dataset recurses with ignorequota
		 * set, so the parent's own quota is not enforced for it.
		 */
		boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);

		return (dsl_dir_tempreserve_impl(dd->dd_parent,
		    parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
	} else {
		return (0);
	}
}
1241 
1242 /*
1243  * Reserve space in this dsl_dir, to be used in this tx's txg.
1244  * After the space has been dirtied (and dsl_dir_willuse_space()
1245  * has been called), the reservation should be canceled, using
1246  * dsl_dir_tempreserve_clear().
1247  */
1248 int
1249 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
1250     uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
1251 {
1252 	int err;
1253 	list_t *tr_list;
1254 
1255 	if (asize == 0) {
1256 		*tr_cookiep = NULL;
1257 		return (0);
1258 	}
1259 
1260 	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
1261 	list_create(tr_list, sizeof (struct tempreserve),
1262 	    offsetof(struct tempreserve, tr_node));
1263 	ASSERT3S(asize, >, 0);
1264 	ASSERT3S(fsize, >=, 0);
1265 
1266 	err = arc_tempreserve_space(lsize, tx->tx_txg);
1267 	if (err == 0) {
1268 		struct tempreserve *tr;
1269 
1270 		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
1271 		tr->tr_size = lsize;
1272 		list_insert_tail(tr_list, tr);
1273 	} else {
1274 		if (err == EAGAIN) {
1275 			/*
1276 			 * If arc_memory_throttle() detected that pageout
1277 			 * is running and we are low on memory, we delay new
1278 			 * non-pageout transactions to give pageout an
1279 			 * advantage.
1280 			 *
1281 			 * It is unfortunate to be delaying while the caller's
1282 			 * locks are held.
1283 			 */
1284 			txg_delay(dd->dd_pool, tx->tx_txg,
1285 			    MSEC2NSEC(10), MSEC2NSEC(10));
1286 			err = SET_ERROR(ERESTART);
1287 		}
1288 	}
1289 
1290 	if (err == 0) {
1291 		err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
1292 		    FALSE, asize > usize, tr_list, tx, TRUE);
1293 	}
1294 
1295 	if (err != 0)
1296 		dsl_dir_tempreserve_clear(tr_list, tx);
1297 	else
1298 		*tr_cookiep = tr_list;
1299 
1300 	return (err);
1301 }
1302 
1303 /*
1304  * Clear a temporary reservation that we previously made with
1305  * dsl_dir_tempreserve_space().
1306  */
1307 void
1308 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
1309 {
1310 	int txgidx = tx->tx_txg & TXG_MASK;
1311 	list_t *tr_list = tr_cookie;
1312 	struct tempreserve *tr;
1313 
1314 	ASSERT3U(tx->tx_txg, !=, 0);
1315 
1316 	if (tr_cookie == NULL)
1317 		return;
1318 
1319 	while ((tr = list_head(tr_list)) != NULL) {
1320 		if (tr->tr_ds) {
1321 			mutex_enter(&tr->tr_ds->dd_lock);
1322 			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
1323 			    tr->tr_size);
1324 			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
1325 			mutex_exit(&tr->tr_ds->dd_lock);
1326 		} else {
1327 			arc_tempreserve_clear(tr->tr_size);
1328 		}
1329 		list_remove(tr_list, tr);
1330 		kmem_free(tr, sizeof (struct tempreserve));
1331 	}
1332 
1333 	kmem_free(tr_list, sizeof (list_t));
1334 }
1335 
1336 /*
1337  * This should be called from open context when we think we're going to write
1338  * or free space, for example when dirtying data. Be conservative; it's okay
1339  * to write less space or free more, but we don't want to write more or free
1340  * less than the amount specified.
1341  */
1342 void
1343 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
1344 {
1345 	int64_t parent_space;
1346 	uint64_t est_used;
1347 
1348 	mutex_enter(&dd->dd_lock);
1349 	if (space > 0)
1350 		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
1351 
1352 	est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes;
1353 	parent_space = parent_delta(dd, est_used, space);
1354 	mutex_exit(&dd->dd_lock);
1355 
1356 	/* Make sure that we clean up dd_space_to* */
1357 	dsl_dir_dirty(dd, tx);
1358 
1359 	/* XXX this is potentially expensive and unnecessary... */
1360 	if (parent_space && dd->dd_parent)
1361 		dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
1362 }
1363 
1364 /* call from syncing context when we actually write/free space for this dd */
1365 void
1366 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
1367     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
1368 {
1369 	int64_t accounted_delta;
1370 
1371 	/*
1372 	 * dsl_dataset_set_refreservation_sync_impl() calls this with
1373 	 * dd_lock held, so that it can atomically update
1374 	 * ds->ds_reserved and the dsl_dir accounting, so that
1375 	 * dsl_dataset_check_quota() can see dataset and dir accounting
1376 	 * consistently.
1377 	 */
1378 	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
1379 
1380 	ASSERT(dmu_tx_is_syncing(tx));
1381 	ASSERT(type < DD_USED_NUM);
1382 
1383 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1384 
1385 	if (needlock)
1386 		mutex_enter(&dd->dd_lock);
1387 	accounted_delta =
1388 	    parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
1389 	ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
1390 	ASSERT(compressed >= 0 ||
1391 	    dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
1392 	ASSERT(uncompressed >= 0 ||
1393 	    dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
1394 	dsl_dir_phys(dd)->dd_used_bytes += used;
1395 	dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
1396 	dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
1397 
1398 	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
1399 		ASSERT(used > 0 ||
1400 		    dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
1401 		dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
1402 #ifdef DEBUG
1403 		dd_used_t t;
1404 		uint64_t u = 0;
1405 		for (t = 0; t < DD_USED_NUM; t++)
1406 			u += dsl_dir_phys(dd)->dd_used_breakdown[t];
1407 		ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
1408 #endif
1409 	}
1410 	if (needlock)
1411 		mutex_exit(&dd->dd_lock);
1412 
1413 	if (dd->dd_parent != NULL) {
1414 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1415 		    accounted_delta, compressed, uncompressed, tx);
1416 		dsl_dir_transfer_space(dd->dd_parent,
1417 		    used - accounted_delta,
1418 		    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
1419 	}
1420 }
1421 
1422 void
1423 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
1424     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
1425 {
1426 	ASSERT(dmu_tx_is_syncing(tx));
1427 	ASSERT(oldtype < DD_USED_NUM);
1428 	ASSERT(newtype < DD_USED_NUM);
1429 
1430 	if (delta == 0 ||
1431 	    !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
1432 		return;
1433 
1434 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1435 	mutex_enter(&dd->dd_lock);
1436 	ASSERT(delta > 0 ?
1437 	    dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
1438 	    dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
1439 	ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
1440 	dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
1441 	dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
1442 	mutex_exit(&dd->dd_lock);
1443 }
1444 
/*
 * Argument passed to the quota and reservation sync tasks (the check and
 * sync callbacks of dsl_dir_set_quota() and dsl_dir_set_reservation()).
 */
typedef struct dsl_dir_set_qr_arg {
	/* name of the dataset whose dsl_dir is being changed */
	const char *ddsqra_name;
	/* property source, handed to dsl_prop_predict()/set_sync_impl() */
	zprop_source_t ddsqra_source;
	/* requested quota or reservation value */
	uint64_t ddsqra_value;
} dsl_dir_set_qr_arg_t;
1450 
1451 static int
1452 dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
1453 {
1454 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1455 	dsl_pool_t *dp = dmu_tx_pool(tx);
1456 	dsl_dataset_t *ds;
1457 	int error;
1458 	uint64_t towrite, newval;
1459 
1460 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1461 	if (error != 0)
1462 		return (error);
1463 
1464 	error = dsl_prop_predict(ds->ds_dir, "quota",
1465 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1466 	if (error != 0) {
1467 		dsl_dataset_rele(ds, FTAG);
1468 		return (error);
1469 	}
1470 
1471 	if (newval == 0) {
1472 		dsl_dataset_rele(ds, FTAG);
1473 		return (0);
1474 	}
1475 
1476 	mutex_enter(&ds->ds_dir->dd_lock);
1477 	/*
1478 	 * If we are doing the preliminary check in open context, and
1479 	 * there are pending changes, then don't fail it, since the
1480 	 * pending changes could under-estimate the amount of space to be
1481 	 * freed up.
1482 	 */
1483 	towrite = dsl_dir_space_towrite(ds->ds_dir);
1484 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1485 	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
1486 	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
1487 		error = SET_ERROR(ENOSPC);
1488 	}
1489 	mutex_exit(&ds->ds_dir->dd_lock);
1490 	dsl_dataset_rele(ds, FTAG);
1491 	return (error);
1492 }
1493 
1494 static void
1495 dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
1496 {
1497 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1498 	dsl_pool_t *dp = dmu_tx_pool(tx);
1499 	dsl_dataset_t *ds;
1500 	uint64_t newval;
1501 
1502 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1503 
1504 	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1505 		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
1506 		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1507 		    &ddsqra->ddsqra_value, tx);
1508 
1509 		VERIFY0(dsl_prop_get_int_ds(ds,
1510 		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
1511 	} else {
1512 		newval = ddsqra->ddsqra_value;
1513 		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1514 		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
1515 	}
1516 
1517 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1518 	mutex_enter(&ds->ds_dir->dd_lock);
1519 	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
1520 	mutex_exit(&ds->ds_dir->dd_lock);
1521 	dsl_dataset_rele(ds, FTAG);
1522 }
1523 
1524 int
1525 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1526 {
1527 	dsl_dir_set_qr_arg_t ddsqra;
1528 
1529 	ddsqra.ddsqra_name = ddname;
1530 	ddsqra.ddsqra_source = source;
1531 	ddsqra.ddsqra_value = quota;
1532 
1533 	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
1534 	    dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
1535 }
1536 
1537 int
1538 dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
1539 {
1540 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1541 	dsl_pool_t *dp = dmu_tx_pool(tx);
1542 	dsl_dataset_t *ds;
1543 	dsl_dir_t *dd;
1544 	uint64_t newval, used, avail;
1545 	int error;
1546 
1547 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1548 	if (error != 0)
1549 		return (error);
1550 	dd = ds->ds_dir;
1551 
1552 	/*
1553 	 * If we are doing the preliminary check in open context, the
1554 	 * space estimates may be inaccurate.
1555 	 */
1556 	if (!dmu_tx_is_syncing(tx)) {
1557 		dsl_dataset_rele(ds, FTAG);
1558 		return (0);
1559 	}
1560 
1561 	error = dsl_prop_predict(ds->ds_dir,
1562 	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1563 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1564 	if (error != 0) {
1565 		dsl_dataset_rele(ds, FTAG);
1566 		return (error);
1567 	}
1568 
1569 	mutex_enter(&dd->dd_lock);
1570 	used = dsl_dir_phys(dd)->dd_used_bytes;
1571 	mutex_exit(&dd->dd_lock);
1572 
1573 	if (dd->dd_parent) {
1574 		avail = dsl_dir_space_available(dd->dd_parent,
1575 		    NULL, 0, FALSE);
1576 	} else {
1577 		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
1578 	}
1579 
1580 	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
1581 		uint64_t delta = MAX(used, newval) -
1582 		    MAX(used, dsl_dir_phys(dd)->dd_reserved);
1583 
1584 		if (delta > avail ||
1585 		    (dsl_dir_phys(dd)->dd_quota > 0 &&
1586 		    newval > dsl_dir_phys(dd)->dd_quota))
1587 			error = SET_ERROR(ENOSPC);
1588 	}
1589 
1590 	dsl_dataset_rele(ds, FTAG);
1591 	return (error);
1592 }
1593 
1594 void
1595 dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
1596 {
1597 	uint64_t used;
1598 	int64_t delta;
1599 
1600 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1601 
1602 	mutex_enter(&dd->dd_lock);
1603 	used = dsl_dir_phys(dd)->dd_used_bytes;
1604 	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
1605 	dsl_dir_phys(dd)->dd_reserved = value;
1606 
1607 	if (dd->dd_parent != NULL) {
1608 		/* Roll up this additional usage into our ancestors */
1609 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1610 		    delta, 0, 0, tx);
1611 	}
1612 	mutex_exit(&dd->dd_lock);
1613 }
1614 
1615 
1616 static void
1617 dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
1618 {
1619 	dsl_dir_set_qr_arg_t *ddsqra = arg;
1620 	dsl_pool_t *dp = dmu_tx_pool(tx);
1621 	dsl_dataset_t *ds;
1622 	uint64_t newval;
1623 
1624 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1625 
1626 	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1627 		dsl_prop_set_sync_impl(ds,
1628 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1629 		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1630 		    &ddsqra->ddsqra_value, tx);
1631 
1632 		VERIFY0(dsl_prop_get_int_ds(ds,
1633 		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
1634 	} else {
1635 		newval = ddsqra->ddsqra_value;
1636 		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1637 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1638 		    (longlong_t)newval);
1639 	}
1640 
1641 	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
1642 	dsl_dataset_rele(ds, FTAG);
1643 }
1644 
1645 int
1646 dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1647     uint64_t reservation)
1648 {
1649 	dsl_dir_set_qr_arg_t ddsqra;
1650 
1651 	ddsqra.ddsqra_name = ddname;
1652 	ddsqra.ddsqra_source = source;
1653 	ddsqra.ddsqra_value = reservation;
1654 
1655 	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
1656 	    dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
1657 }
1658 
1659 static dsl_dir_t *
1660 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1661 {
1662 	for (; ds1; ds1 = ds1->dd_parent) {
1663 		dsl_dir_t *dd;
1664 		for (dd = ds2; dd; dd = dd->dd_parent) {
1665 			if (ds1 == dd)
1666 				return (dd);
1667 		}
1668 	}
1669 	return (NULL);
1670 }
1671 
1672 /*
1673  * If delta is applied to dd, how much of that delta would be applied to
1674  * ancestor?  Syncing context only.
1675  */
1676 static int64_t
1677 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1678 {
1679 	if (dd == ancestor)
1680 		return (delta);
1681 
1682 	mutex_enter(&dd->dd_lock);
1683 	delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
1684 	mutex_exit(&dd->dd_lock);
1685 	return (would_change(dd->dd_parent, delta, ancestor));
1686 }
1687 
/*
 * Argument passed to the rename sync task (dsl_dir_rename_check() and
 * dsl_dir_rename_sync()); filled in by dsl_dir_rename().
 */
typedef struct dsl_dir_rename_arg {
	/* current name of the dir being renamed */
	const char *ddra_oldname;
	/* requested new name */
	const char *ddra_newname;
	/* caller's credentials, handed to dsl_dir_transfer_possible() */
	cred_t *ddra_cred;
} dsl_dir_rename_arg_t;
1693 
1694 /* ARGSUSED */
1695 static int
1696 dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1697 {
1698 	int *deltap = arg;
1699 	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
1700 
1701 	dsl_dataset_name(ds, namebuf);
1702 
1703 	if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN)
1704 		return (SET_ERROR(ENAMETOOLONG));
1705 	return (0);
1706 }
1707 
1708 static int
1709 dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
1710 {
1711 	dsl_dir_rename_arg_t *ddra = arg;
1712 	dsl_pool_t *dp = dmu_tx_pool(tx);
1713 	dsl_dir_t *dd, *newparent;
1714 	const char *mynewname;
1715 	int error;
1716 	int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
1717 
1718 	/* target dir should exist */
1719 	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
1720 	if (error != 0)
1721 		return (error);
1722 
1723 	/* new parent should exist */
1724 	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
1725 	    &newparent, &mynewname);
1726 	if (error != 0) {
1727 		dsl_dir_rele(dd, FTAG);
1728 		return (error);
1729 	}
1730 
1731 	/* can't rename to different pool */
1732 	if (dd->dd_pool != newparent->dd_pool) {
1733 		dsl_dir_rele(newparent, FTAG);
1734 		dsl_dir_rele(dd, FTAG);
1735 		return (SET_ERROR(ENXIO));
1736 	}
1737 
1738 	/* new name should not already exist */
1739 	if (mynewname == NULL) {
1740 		dsl_dir_rele(newparent, FTAG);
1741 		dsl_dir_rele(dd, FTAG);
1742 		return (SET_ERROR(EEXIST));
1743 	}
1744 
1745 	/* if the name length is growing, validate child name lengths */
1746 	if (delta > 0) {
1747 		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
1748 		    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
1749 		if (error != 0) {
1750 			dsl_dir_rele(newparent, FTAG);
1751 			dsl_dir_rele(dd, FTAG);
1752 			return (error);
1753 		}
1754 	}
1755 
1756 	if (dmu_tx_is_syncing(tx)) {
1757 		if (spa_feature_is_active(dp->dp_spa,
1758 		    SPA_FEATURE_FS_SS_LIMIT)) {
1759 			/*
1760 			 * Although this is the check function and we don't
1761 			 * normally make on-disk changes in check functions,
1762 			 * we need to do that here.
1763 			 *
1764 			 * Ensure this portion of the tree's counts have been
1765 			 * initialized in case the new parent has limits set.
1766 			 */
1767 			dsl_dir_init_fs_ss_count(dd, tx);
1768 		}
1769 	}
1770 
1771 	if (newparent != dd->dd_parent) {
1772 		/* is there enough space? */
1773 		uint64_t myspace =
1774 		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
1775 		    dsl_dir_phys(dd)->dd_reserved);
1776 		objset_t *os = dd->dd_pool->dp_meta_objset;
1777 		uint64_t fs_cnt = 0;
1778 		uint64_t ss_cnt = 0;
1779 
1780 		if (dsl_dir_is_zapified(dd)) {
1781 			int err;
1782 
1783 			err = zap_lookup(os, dd->dd_object,
1784 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
1785 			    &fs_cnt);
1786 			if (err != ENOENT && err != 0) {
1787 				dsl_dir_rele(newparent, FTAG);
1788 				dsl_dir_rele(dd, FTAG);
1789 				return (err);
1790 			}
1791 
1792 			/*
1793 			 * have to add 1 for the filesystem itself that we're
1794 			 * moving
1795 			 */
1796 			fs_cnt++;
1797 
1798 			err = zap_lookup(os, dd->dd_object,
1799 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
1800 			    &ss_cnt);
1801 			if (err != ENOENT && err != 0) {
1802 				dsl_dir_rele(newparent, FTAG);
1803 				dsl_dir_rele(dd, FTAG);
1804 				return (err);
1805 			}
1806 		}
1807 
1808 		/* no rename into our descendant */
1809 		if (closest_common_ancestor(dd, newparent) == dd) {
1810 			dsl_dir_rele(newparent, FTAG);
1811 			dsl_dir_rele(dd, FTAG);
1812 			return (SET_ERROR(EINVAL));
1813 		}
1814 
1815 		error = dsl_dir_transfer_possible(dd->dd_parent,
1816 		    newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
1817 		if (error != 0) {
1818 			dsl_dir_rele(newparent, FTAG);
1819 			dsl_dir_rele(dd, FTAG);
1820 			return (error);
1821 		}
1822 	}
1823 
1824 	dsl_dir_rele(newparent, FTAG);
1825 	dsl_dir_rele(dd, FTAG);
1826 	return (0);
1827 }
1828 
1829 static void
1830 dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
1831 {
1832 	dsl_dir_rename_arg_t *ddra = arg;
1833 	dsl_pool_t *dp = dmu_tx_pool(tx);
1834 	dsl_dir_t *dd, *newparent;
1835 	const char *mynewname;
1836 	int error;
1837 	objset_t *mos = dp->dp_meta_objset;
1838 
1839 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
1840 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
1841 	    &mynewname));
1842 
1843 	/* Log this before we change the name. */
1844 	spa_history_log_internal_dd(dd, "rename", tx,
1845 	    "-> %s", ddra->ddra_newname);
1846 
1847 	if (newparent != dd->dd_parent) {
1848 		objset_t *os = dd->dd_pool->dp_meta_objset;
1849 		uint64_t fs_cnt = 0;
1850 		uint64_t ss_cnt = 0;
1851 
1852 		/*
1853 		 * We already made sure the dd counts were initialized in the
1854 		 * check function.
1855 		 */
1856 		if (spa_feature_is_active(dp->dp_spa,
1857 		    SPA_FEATURE_FS_SS_LIMIT)) {
1858 			VERIFY0(zap_lookup(os, dd->dd_object,
1859 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
1860 			    &fs_cnt));
1861 			/* add 1 for the filesystem itself that we're moving */
1862 			fs_cnt++;
1863 
1864 			VERIFY0(zap_lookup(os, dd->dd_object,
1865 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
1866 			    &ss_cnt));
1867 		}
1868 
1869 		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
1870 		    DD_FIELD_FILESYSTEM_COUNT, tx);
1871 		dsl_fs_ss_count_adjust(newparent, fs_cnt,
1872 		    DD_FIELD_FILESYSTEM_COUNT, tx);
1873 
1874 		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
1875 		    DD_FIELD_SNAPSHOT_COUNT, tx);
1876 		dsl_fs_ss_count_adjust(newparent, ss_cnt,
1877 		    DD_FIELD_SNAPSHOT_COUNT, tx);
1878 
1879 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1880 		    -dsl_dir_phys(dd)->dd_used_bytes,
1881 		    -dsl_dir_phys(dd)->dd_compressed_bytes,
1882 		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
1883 		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
1884 		    dsl_dir_phys(dd)->dd_used_bytes,
1885 		    dsl_dir_phys(dd)->dd_compressed_bytes,
1886 		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
1887 
1888 		if (dsl_dir_phys(dd)->dd_reserved >
1889 		    dsl_dir_phys(dd)->dd_used_bytes) {
1890 			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
1891 			    dsl_dir_phys(dd)->dd_used_bytes;
1892 
1893 			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1894 			    -unused_rsrv, 0, 0, tx);
1895 			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
1896 			    unused_rsrv, 0, 0, tx);
1897 		}
1898 	}
1899 
1900 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1901 
1902 	/* remove from old parent zapobj */
1903 	error = zap_remove(mos,
1904 	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
1905 	    dd->dd_myname, tx);
1906 	ASSERT0(error);
1907 
1908 	(void) strcpy(dd->dd_myname, mynewname);
1909 	dsl_dir_rele(dd->dd_parent, dd);
1910 	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
1911 	VERIFY0(dsl_dir_hold_obj(dp,
1912 	    newparent->dd_object, NULL, dd, &dd->dd_parent));
1913 
1914 	/* add to new parent zapobj */
1915 	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
1916 	    dd->dd_myname, 8, 1, &dd->dd_object, tx));
1917 
1918 	dsl_prop_notify_all(dd);
1919 
1920 	dsl_dir_rele(newparent, FTAG);
1921 	dsl_dir_rele(dd, FTAG);
1922 }
1923 
1924 int
1925 dsl_dir_rename(const char *oldname, const char *newname)
1926 {
1927 	dsl_dir_rename_arg_t ddra;
1928 
1929 	ddra.ddra_oldname = oldname;
1930 	ddra.ddra_newname = newname;
1931 	ddra.ddra_cred = CRED();
1932 
1933 	return (dsl_sync_task(oldname,
1934 	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
1935 	    3, ZFS_SPACE_CHECK_RESERVED));
1936 }
1937 
1938 int
1939 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
1940     uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
1941 {
1942 	dsl_dir_t *ancestor;
1943 	int64_t adelta;
1944 	uint64_t avail;
1945 	int err;
1946 
1947 	ancestor = closest_common_ancestor(sdd, tdd);
1948 	adelta = would_change(sdd, -space, ancestor);
1949 	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1950 	if (avail < space)
1951 		return (SET_ERROR(ENOSPC));
1952 
1953 	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
1954 	    ancestor, cr);
1955 	if (err != 0)
1956 		return (err);
1957 	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
1958 	    ancestor, cr);
1959 	if (err != 0)
1960 		return (err);
1961 
1962 	return (0);
1963 }
1964 
1965 timestruc_t
1966 dsl_dir_snap_cmtime(dsl_dir_t *dd)
1967 {
1968 	timestruc_t t;
1969 
1970 	mutex_enter(&dd->dd_lock);
1971 	t = dd->dd_snap_cmtime;
1972 	mutex_exit(&dd->dd_lock);
1973 
1974 	return (t);
1975 }
1976 
1977 void
1978 dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1979 {
1980 	timestruc_t t;
1981 
1982 	gethrestime(&t);
1983 	mutex_enter(&dd->dd_lock);
1984 	dd->dd_snap_cmtime = t;
1985 	mutex_exit(&dd->dd_lock);
1986 }
1987 
1988 void
1989 dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
1990 {
1991 	objset_t *mos = dd->dd_pool->dp_meta_objset;
1992 	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
1993 }
1994 
1995 boolean_t
1996 dsl_dir_is_zapified(dsl_dir_t *dd)
1997 {
1998 	dmu_object_info_t doi;
1999 
2000 	dmu_object_info_from_db(dd->dd_dbuf, &doi);
2001 	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
2002 }
2003