/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2013 Martin Matuska. All rights reserved.
 * Copyright (c) 2014 Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
 * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/dmu_impl.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/metaslab.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/sunddi.h>
#include <sys/zfeature.h>
#include <sys/policy.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
#include <sys/zvol.h>
#include <sys/zthr.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"

/*
 * This controls whether we verify the ZVOL quota or not.
 * Currently, quotas are not implemented for ZVOLs: the quota is the size of
 * the ZVOL, so the size of the volume already implies the quota. The quota
 * mechanism can also introduce a significant performance drop.
 */
static int zvol_enforce_quotas = B_TRUE;

/*
 * Filesystem and Snapshot Limits
 * ------------------------------
 *
 * These limits are used to restrict the number of filesystems and/or snapshots
 * that can be created at a given level in the tree or below. A typical
 * use-case is with a delegated dataset where the administrator wants to ensure
 * that a user within the zone is not creating too many additional filesystems
 * or snapshots, even though they're not exceeding their space quota.
 *
 * The filesystem and snapshot counts are stored as extensible properties. This
 * capability is controlled by a feature flag and must be enabled to be used.
 * Once enabled, the feature is not active until the first limit is set. At
 * that point, future operations to create/destroy filesystems or snapshots
 * will validate and update the counts.
 *
 * Because the count properties will not exist before the feature is active,
 * the counts are updated when a limit is first set on an uninitialized
 * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
 * all of the nested filesystems/snapshots.
 * Thus, a new leaf node has a filesystem count of 0 and a snapshot count of
 * 0. Non-existent filesystem and snapshot count properties on a node indicate
 * uninitialized counts on that node.) When first setting a limit on an
 * uninitialized node, the code starts at the filesystem with the new limit
 * and descends into all sub-filesystems to add the count properties.
 *
 * In practice this is lightweight since a limit is typically set when the
 * filesystem is created and thus has no children. Once valid, changing the
 * limit value won't require a re-traversal since the counts are already valid.
 * When recursively fixing the counts, if a node with a limit is encountered
 * during the descent, the counts are known to be valid and there is no need to
 * descend into that filesystem's children. The counts on filesystems above the
 * one with the new limit will still be uninitialized, unless a limit is
 * eventually set on one of those filesystems. The counts are always
 * recursively updated when a limit is set on a dataset, unless there is
 * already a limit. When a new limit value is set on a filesystem with an
 * existing limit, it is possible for the new limit to be less than the
 * current count at that level since a user who can change the limit is also
 * allowed to exceed the limit.
 *
 * Once the feature is active, then whenever a filesystem or snapshot is
 * created, the code recurses up the tree, validating the new count against the
 * limit at each initialized level. In practice, most levels will not have a
 * limit set. If there is a limit at any initialized level up the tree, the
 * check must pass or the creation will fail. Likewise, when a filesystem or
 * snapshot is destroyed, the counts are recursively adjusted all the way up
 * the initialized nodes in the tree. Renaming a filesystem into a different
 * point in the tree will first validate, then update the counts on each
 * branch up to the common ancestor. A receive will also validate the counts
 * and then update them.
 *
 * An exception to the above behavior is that the limit is not enforced if the
 * user has permission to modify the limit. This is primarily so that
 * recursive snapshots in the global zone always work. We want to prevent a
 * denial-of-service in which a lower level delegated dataset could max out
 * its limit and thus block recursive snapshots from being taken in the global
 * zone. Because of this, it is possible for the snapshot count to be over the
 * limit and snapshots taken in the global zone could cause a lower level
 * dataset to hit or exceed its limit. The administrator taking the global
 * zone recursive snapshot should be aware of this side-effect and behave
 * accordingly. For consistency, the filesystem limit is also not enforced if
 * the user can modify the limit.
 *
 * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
 * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
 * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
 * dsl_dir_init_fs_ss_count().
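 *
 * For example (an illustrative sketch, not taken from this file; the pool
 * and dataset names are hypothetical):
 *
 *	# zpool set feature@filesystem_limits=enabled tank
 *	# zfs set filesystem_limit=3 tank/delegated
 *
 * After the second command the feature becomes active and the counts under
 * tank/delegated are initialized; an attempt by a delegated user to create
 * a fourth filesystem below tank/delegated would then fail with EDQUOT.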
 */

static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);

typedef struct ddulrt_arg {
	dsl_dir_t	*ddulrta_dd;
	uint64_t	ddlrta_txg;
} ddulrt_arg_t;

static void
dsl_dir_evict_async(void *dbu)
{
	dsl_dir_t *dd = dbu;
	int t;
	dsl_pool_t *dp __maybe_unused = dd->dd_pool;

	dd->dd_dbuf = NULL;

	for (t = 0; t < TXG_SIZE; t++) {
		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
		ASSERT(dd->dd_tempreserved[t] == 0);
		ASSERT(dd->dd_space_towrite[t] == 0);
	}

	if (dd->dd_parent)
		dsl_dir_async_rele(dd->dd_parent, dd);

	spa_async_close(dd->dd_pool->dp_spa, dd);

	if (dsl_deadlist_is_open(&dd->dd_livelist))
		dsl_dir_livelist_close(dd);

	dsl_prop_fini(dd);
	cv_destroy(&dd->dd_activity_cv);
	mutex_destroy(&dd->dd_activity_lock);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
}

int
dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
    const char *tail, const void *tag, dsl_dir_t **ddp)
{
	dmu_buf_t *dbuf;
	dsl_dir_t *dd;
	dmu_object_info_t doi;
	int err;

	ASSERT(dsl_pool_config_held(dp));

	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
	if (err != 0)
		return (err);
	dd = dmu_buf_get_user(dbuf);

	dmu_object_info_from_db(dbuf, &doi);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));

	if (dd == NULL) {
		dsl_dir_t *winner;

		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
		dd->dd_object = ddobj;
		dd->dd_dbuf = dbuf;
		dd->dd_pool = dp;

		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);
		dsl_prop_init(dd);

		if (dsl_dir_is_zapified(dd)) {
			err = zap_lookup(dp->dp_meta_objset,
			    ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
			    sizeof (uint64_t), 1, &dd->dd_crypto_obj);
			if (err == 0) {
				/* check for on-disk format errata */
				if (dsl_dir_incompatible_encryption_version(
				    dd)) {
					dp->dp_spa->spa_errata =
					    ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
				}
			} else if (err != ENOENT) {
				goto errout;
			}
		}

		if (dsl_dir_phys(dd)->dd_parent_obj) {
			err = dsl_dir_hold_obj(dp,
			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
			    &dd->dd_parent);
			if (err != 0)
				goto errout;
			if (tail) {
#ifdef ZFS_DEBUG
				uint64_t foundobj;

				err = zap_lookup(dp->dp_meta_objset,
				    dsl_dir_phys(dd->dd_parent)->
				    dd_child_dir_zapobj, tail,
				    sizeof (foundobj), 1, &foundobj);
				ASSERT(err || foundobj == ddobj);
#endif
				(void) strlcpy(dd->dd_myname, tail,
				    sizeof (dd->dd_myname));
			} else {
				err = zap_value_search(dp->dp_meta_objset,
				    dsl_dir_phys(dd->dd_parent)->
				    dd_child_dir_zapobj,
				    ddobj, 0, dd->dd_myname,
				    sizeof (dd->dd_myname));
			}
			if (err != 0)
				goto errout;
		} else {
			(void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa),
			    sizeof (dd->dd_myname));
		}

		if (dsl_dir_is_clone(dd)) {
			dmu_buf_t *origin_bonus;
			dsl_dataset_phys_t *origin_phys;

			/*
			 * We can't open the origin dataset, because
			 * that would require opening this dsl_dir.
			 * Just look at its phys directly instead.
			 */
			err = dmu_bonus_hold(dp->dp_meta_objset,
			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
			    &origin_bonus);
			if (err != 0)
				goto errout;
			origin_phys = origin_bonus->db_data;
			dd->dd_origin_txg =
			    origin_phys->ds_creation_txg;
			dmu_buf_rele(origin_bonus, FTAG);
			if (dsl_dir_is_zapified(dd)) {
				uint64_t obj;
				err = zap_lookup(dp->dp_meta_objset,
				    dd->dd_object, DD_FIELD_LIVELIST,
				    sizeof (uint64_t), 1, &obj);
				if (err == 0)
					dsl_dir_livelist_open(dd, obj);
				else if (err != ENOENT)
					goto errout;
			}
		}

		if (dsl_dir_is_zapified(dd)) {
			inode_timespec_t t = {0};
			(void) zap_lookup(dp->dp_meta_objset, ddobj,
			    DD_FIELD_SNAPSHOTS_CHANGED,
			    sizeof (uint64_t),
			    sizeof (inode_timespec_t) / sizeof (uint64_t),
			    &t);
			dd->dd_snap_cmtime = t;
		}

		dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
		    &dd->dd_dbuf);
		winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
		if (winner != NULL) {
			if (dd->dd_parent)
				dsl_dir_rele(dd->dd_parent, dd);
			if (dsl_deadlist_is_open(&dd->dd_livelist))
				dsl_dir_livelist_close(dd);
			dsl_prop_fini(dd);
			cv_destroy(&dd->dd_activity_cv);
			mutex_destroy(&dd->dd_activity_lock);
			mutex_destroy(&dd->dd_lock);
			kmem_free(dd, sizeof (dsl_dir_t));
			dd = winner;
		} else {
			spa_open_ref(dp->dp_spa, dd);
		}
	}

	/*
	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
	 * holds on the spa. We need the open-to-close holds because
	 * otherwise the spa_refcnt wouldn't change when we open a
	 * dir which the spa also has open, so we could incorrectly
	 * think it was OK to unload/export/destroy the pool. We need
	 * the instantiate-to-evict hold because the dsl_dir_t has a
	 * pointer to the dd_pool, which has a pointer to the spa_t.
	 */
	spa_open_ref(dp->dp_spa, tag);
	ASSERT3P(dd->dd_pool, ==, dp);
	ASSERT3U(dd->dd_object, ==, ddobj);
	ASSERT3P(dd->dd_dbuf, ==, dbuf);
	*ddp = dd;
	return (0);

errout:
	if (dd->dd_parent)
		dsl_dir_rele(dd->dd_parent, dd);
	if (dsl_deadlist_is_open(&dd->dd_livelist))
		dsl_dir_livelist_close(dd);
	dsl_prop_fini(dd);
	cv_destroy(&dd->dd_activity_cv);
	mutex_destroy(&dd->dd_activity_lock);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
	dmu_buf_rele(dbuf, tag);
	return (err);
}

void
dsl_dir_rele(dsl_dir_t *dd, const void *tag)
{
	dprintf_dd(dd, "%s\n", "");
	spa_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
}

/*
 * Remove a reference to the given dsl dir that is being asynchronously
 * released. Async releases occur from a taskq performing eviction of
 * dsl datasets and dirs. This process is identical to a normal release
 * with the exception of using the async API for releasing the reference on
 * the spa.
 */
void
dsl_dir_async_rele(dsl_dir_t *dd, const void *tag)
{
	dprintf_dd(dd, "%s\n", "");
	spa_async_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
}

/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
void
dsl_dir_name(dsl_dir_t *dd, char *buf)
{
	if (dd->dd_parent) {
		dsl_dir_name(dd->dd_parent, buf);
		VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
		    ZFS_MAX_DATASET_NAME_LEN);
	} else {
		buf[0] = '\0';
	}
	if (!MUTEX_HELD(&dd->dd_lock)) {
		/*
		 * recursive mutex so that we can use
		 * dprintf_dd() with dd_lock held
		 */
		mutex_enter(&dd->dd_lock);
		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
		    <, ZFS_MAX_DATASET_NAME_LEN);
		mutex_exit(&dd->dd_lock);
	} else {
		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
		    <, ZFS_MAX_DATASET_NAME_LEN);
	}
}

/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
int
dsl_dir_namelen(dsl_dir_t *dd)
{
	int result = 0;

	if (dd->dd_parent) {
		/* parent's name + 1 for the "/" */
		result = dsl_dir_namelen(dd->dd_parent) + 1;
	}

	if (!MUTEX_HELD(&dd->dd_lock)) {
		/* see dsl_dir_name */
		mutex_enter(&dd->dd_lock);
		result += strlen(dd->dd_myname);
		mutex_exit(&dd->dd_lock);
	} else {
		result += strlen(dd->dd_myname);
	}

	return (result);
}

static int
getcomponent(const char *path, char *component, const char **nextp)
{
	char *p;

	if ((path == NULL) || (path[0] == '\0'))
		return (SET_ERROR(ENOENT));
	/* This would be a good place to reserve some namespace... */
	p = strpbrk(path, "/@");
	if (p && (p[1] == '/' || p[1] == '@')) {
		/* two separators in a row */
		return (SET_ERROR(EINVAL));
	}
	if (p == NULL || p == path) {
		/*
		 * if the first thing is an @ or /, it had better be an
		 * @ and it had better not have any more ats or slashes,
		 * and it had better have something after the @.
		 */
		if (p != NULL &&
		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
			return (SET_ERROR(EINVAL));
		if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		(void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN);
		p = NULL;
	} else if (p[0] == '/') {
		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		(void) strlcpy(component, path, p - path + 1);
		p++;
	} else if (p[0] == '@') {
		/*
		 * if the next separator is an @, there better not be
		 * any more slashes.
		 */
		if (strchr(path, '/'))
			return (SET_ERROR(EINVAL));
		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		(void) strlcpy(component, path, p - path + 1);
	} else {
		panic("invalid p=%p", (void *)p);
	}
	*nextp = p;
	return (0);
}

/*
 * Return the dsl_dir_t, and possibly the last component which couldn't
 * be found in *tail. The name must be in the specified dsl_pool_t. This
 * thread must hold the dp_config_rwlock for the pool. Returns an error if
 * the path is bogus, or if tail==NULL and we couldn't parse the whole name.
 * (*tail)[0] == '@' means that the last component is a snapshot.
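 *
 * For example (an illustrative sketch; the dataset name is hypothetical):
 *
 *	dsl_dir_t *dd;
 *	const char *tail;
 *	err = dsl_dir_hold(dp, "tank/fs@snap", FTAG, &dd, &tail);
 *	if (err == 0) {
 *		... here tail points at "@snap" ...
 *		dsl_dir_rele(dd, FTAG);
 *	}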
 */
int
dsl_dir_hold(dsl_pool_t *dp, const char *name, const void *tag,
    dsl_dir_t **ddp, const char **tailp)
{
	char *buf;
	const char *spaname, *next, *nextnext = NULL;
	int err;
	dsl_dir_t *dd;
	uint64_t ddobj;

	buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	err = getcomponent(name, buf, &next);
	if (err != 0)
		goto error;

	/* Make sure the name is in the specified pool. */
	spaname = spa_name(dp->dp_spa);
	if (strcmp(buf, spaname) != 0) {
		err = SET_ERROR(EXDEV);
		goto error;
	}

	ASSERT(dsl_pool_config_held(dp));

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
	if (err != 0) {
		goto error;
	}

	while (next != NULL) {
		dsl_dir_t *child_dd;
		err = getcomponent(next, buf, &nextnext);
		if (err != 0)
			break;
		ASSERT(next[0] != '\0');
		if (next[0] == '@')
			break;
		dprintf("looking up %s in obj%lld\n",
		    buf, (longlong_t)dsl_dir_phys(dd)->dd_child_dir_zapobj);

		err = zap_lookup(dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
		    buf, sizeof (ddobj), 1, &ddobj);
		if (err != 0) {
			if (err == ENOENT)
				err = 0;
			break;
		}

		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
		if (err != 0)
			break;
		dsl_dir_rele(dd, tag);
		dd = child_dd;
		next = nextnext;
	}

	if (err != 0) {
		dsl_dir_rele(dd, tag);
		goto error;
	}

	/*
	 * It's an error if there's more than one component left, or
	 * tailp==NULL and there's any component left.
	 */
	if (next != NULL &&
	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
		/* bad path name */
		dsl_dir_rele(dd, tag);
		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
		err = SET_ERROR(ENOENT);
	}
	if (tailp != NULL)
		*tailp = next;
	if (err == 0)
		*ddp = dd;
error:
	kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN);
	return (err);
}

/*
 * If the counts are already initialized for this filesystem and its
 * descendants then do nothing, otherwise initialize the counts.
 *
 * The counts on this filesystem, and those below, may be uninitialized due to
 * either the use of a pre-existing pool which did not support the
 * filesystem/snapshot limit feature, or one in which the feature had not yet
 * been enabled.
 *
 * Recursively descend the filesystem tree and update the filesystem/snapshot
 * counts on each filesystem below, then update the cumulative count on the
 * current filesystem. If the filesystem already has a count set on it,
 * then we know that its counts, and the counts on the filesystems below it,
 * are already correct, so we don't have to update this filesystem.
 */
static void
dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
{
	uint64_t my_fs_cnt = 0;
	uint64_t my_ss_cnt = 0;
	dsl_pool_t *dp = dd->dd_pool;
	objset_t *os = dp->dp_meta_objset;
	zap_cursor_t *zc;
	zap_attribute_t *za;
	dsl_dataset_t *ds;

	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
	ASSERT(dsl_pool_config_held(dp));
	ASSERT(dmu_tx_is_syncing(tx));

	dsl_dir_zapify(dd, tx);

	/*
	 * If the filesystem count has already been initialized then we
	 * don't need to recurse down any further.
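	 * (zap_contains() returns 0 when the entry already exists.)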
	 */
	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
		return;

	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
	za = zap_attribute_alloc();

	/* Iterate my child dirs */
	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
		dsl_dir_t *chld_dd;
		uint64_t count;

		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
		    &chld_dd));

		/*
		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets.
		 */
		if (chld_dd->dd_myname[0] == '$') {
			dsl_dir_rele(chld_dd, FTAG);
			continue;
		}

		my_fs_cnt++;	/* count this child */

		dsl_dir_init_fs_ss_count(chld_dd, tx);

		VERIFY0(zap_lookup(os, chld_dd->dd_object,
		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
		my_fs_cnt += count;
		VERIFY0(zap_lookup(os, chld_dd->dd_object,
		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
		my_ss_cnt += count;

		dsl_dir_rele(chld_dd, FTAG);
	}
	zap_cursor_fini(zc);
	/* Count my snapshots (we counted children's snapshots above) */
	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));

	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
	    zap_cursor_retrieve(zc, za) == 0;
	    zap_cursor_advance(zc)) {
		/* Don't count temporary snapshots */
		if (za->za_name[0] != '%')
			my_ss_cnt++;
	}
	zap_cursor_fini(zc);

	dsl_dataset_rele(ds, FTAG);

	kmem_free(zc, sizeof (zap_cursor_t));
	zap_attribute_free(za);

	/* we're in a sync task, update counts */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
}

static int
dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
{
	char *ddname = (char *)arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	dsl_dir_t *dd;
	int error;

	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
	if (error != 0)
		return (error);

	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	dd = ds->ds_dir;
	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
	    dsl_dir_is_zapified(dd) &&
	    zap_contains(dp->dp_meta_objset, dd->dd_object,
	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EALREADY));
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
{
	char *ddname = (char *)arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	spa_t *spa;

	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));

	spa = dsl_dataset_get_spa(ds);

	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
		/*
		 * Since the feature was not active and we're now setting a
		 * limit, increment the feature-active counter so that the
		 * feature becomes active for the first time.
		 *
		 * We are already in a sync task so we can update the MOS.
		 */
		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
	}

	/*
	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
	 * we need to ensure the counts are correct.
	 * Descend down the tree from this point and update all of the counts
	 * to be accurate.
	 */
	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);

	dsl_dataset_rele(ds, FTAG);
}

/*
 * Make sure the feature is enabled and activate it if necessary.
 * Since we're setting a limit, ensure the on-disk counts are valid.
 * This is only called by the ioctl path when setting a limit value.
 *
 * We do not need to validate the new limit, since users who can change the
 * limit are also allowed to exceed the limit.
 */
int
dsl_dir_activate_fs_ss_limit(const char *ddname)
{
	int error;

	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
	    ZFS_SPACE_CHECK_RESERVED);

	if (error == EALREADY)
		error = 0;

	return (error);
}

/*
 * Used to determine if the filesystem_limit or snapshot_limit should be
 * enforced. We allow the limit to be exceeded if the user has permission to
 * write the property value. We pass in the creds that we got in the open
 * context since we will always be the GZ root in syncing context. We also
 * have to handle the case where we are allowed to change the limit on the
 * current dataset, but there may be another limit in the tree above.
 *
 * We can never modify these two properties within a non-global zone. In
 * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
 * can't use that function since we are already holding the dp_config_rwlock.
 * In addition, we already have the dd and dealing with snapshots is
 * simplified in this code.
 */

typedef enum {
	ENFORCE_ALWAYS,
	ENFORCE_NEVER,
	ENFORCE_ABOVE
} enforce_res_t;

static enforce_res_t
dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop,
    cred_t *cr, proc_t *proc)
{
	enforce_res_t enforce = ENFORCE_ALWAYS;
	uint64_t obj;
	dsl_dataset_t *ds;
	uint64_t zoned;
	const char *zonedstr;

	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
	    prop == ZFS_PROP_SNAPSHOT_LIMIT);

#ifdef _KERNEL
	if (crgetzoneid(cr) != GLOBAL_ZONEID)
		return (ENFORCE_ALWAYS);

	/*
	 * We are checking the saved credentials of the user process, which is
	 * not the current process. Note that we can't use secpolicy_zfs(),
	 * because it only works if the cred is that of the current process
	 * (on Linux).
	 */
	if (secpolicy_zfs_proc(cr, proc) == 0)
		return (ENFORCE_NEVER);
#else
	(void) proc;
#endif

	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
		return (ENFORCE_ALWAYS);

	ASSERT(dsl_pool_config_held(dd->dd_pool));

	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
		return (ENFORCE_ALWAYS);

	zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED);
	if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) {
		/* Only root can access zoned fs's from the GZ */
		enforce = ENFORCE_ALWAYS;
	} else {
		if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
			enforce = ENFORCE_ABOVE;
	}

	dsl_dataset_rele(ds, FTAG);
	return (enforce);
}

/*
 * Check if adding additional child filesystem(s) would exceed any filesystem
 * limits or adding additional snapshot(s) would exceed any snapshot limits.
 * The prop argument indicates which limit to check.
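 *
 * For example (illustrative), creating a single new filesystem arrives here
 * with delta == 1 and prop == ZFS_PROP_FILESYSTEM_LIMIT, while a recursive
 * snapshot checks each affected dsl_dir with the number of snapshots being
 * added below it.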
 *
 * Note that all filesystem limits up to the root (or the highest
 * initialized) filesystem or the given ancestor must be satisfied.
 */
int
dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
    dsl_dir_t *ancestor, cred_t *cr, proc_t *proc)
{
	objset_t *os = dd->dd_pool->dp_meta_objset;
	uint64_t limit, count;
	const char *count_prop;
	enforce_res_t enforce;
	int err = 0;

	ASSERT(dsl_pool_config_held(dd->dd_pool));
	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
	    prop == ZFS_PROP_SNAPSHOT_LIMIT);

	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
		/*
		 * We don't enforce the limit for temporary snapshots. This is
		 * indicated by a NULL cred_t argument.
		 */
		if (cr == NULL)
			return (0);

		count_prop = DD_FIELD_SNAPSHOT_COUNT;
	} else {
		count_prop = DD_FIELD_FILESYSTEM_COUNT;
	}
	/*
	 * If we're allowed to change the limit, don't enforce the limit
	 * e.g. this can happen if a snapshot is taken by an administrative
	 * user in the global zone (i.e. a recursive snapshot by root).
	 * However, we must handle the case of delegated permissions where we
	 * are allowed to change the limit on the current dataset, but there
	 * is another limit in the tree above.
	 */
	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc);
	if (enforce == ENFORCE_NEVER)
		return (0);

	/*
	 * e.g. if renaming a dataset with no snapshots, count adjustment
	 * is 0.
	 */
	if (delta == 0)
		return (0);

	/*
	 * If an ancestor has been provided, stop checking the limit once we
	 * hit that dir. We need this during rename so that we don't overcount
	 * the check once we recurse up to the common ancestor.
	 */
	if (ancestor == dd)
		return (0);

	/*
	 * If we hit an uninitialized node while recursing up the tree, we
	 * can stop since we know there is no limit here (or above). The
	 * counts are not valid on this node and we know we won't touch this
	 * node's counts.
	 */
	if (!dsl_dir_is_zapified(dd))
		return (0);
	err = zap_lookup(os, dd->dd_object,
	    count_prop, sizeof (count), 1, &count);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);

	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
	    B_FALSE);
	if (err != 0)
		return (err);

	/* Is there a limit which we've hit? */
	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
		return (SET_ERROR(EDQUOT));

	if (dd->dd_parent != NULL)
		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
		    ancestor, cr, proc);

	return (err);
}

/*
 * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
 * parents. When a new filesystem/snapshot is created, increment the count on
 * all parents, and when a filesystem/snapshot is destroyed, decrement the
 * count.
 */
void
dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
    dmu_tx_t *tx)
{
	int err;
	objset_t *os = dd->dd_pool->dp_meta_objset;
	uint64_t count;

	ASSERT(dsl_pool_config_held(dd->dd_pool));
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);

	/*
	 * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets.
	 */
	if (dd->dd_myname[0] == '$' && strcmp(prop,
	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
		return;
	}

	/*
	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
	 */
	if (delta == 0)
		return;

	/*
	 * If we hit an uninitialized node while recursing up the tree, we can
	 * stop since we know the counts are not valid on this node and we
	 * know we shouldn't touch this node's counts. An uninitialized count
	 * on the node indicates that either the feature has not yet been
	 * activated or there are no limits on this part of the tree.
	 */
	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
	    prop, sizeof (count), 1, &count)) == ENOENT)
		return;
	VERIFY0(err);

	count += delta;
	/* Use a signed verify to make sure we're not neg. */
	VERIFY3S(count, >=, 0);

	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
	    tx));

	/* Roll up this additional count into our ancestors */
	if (dd->dd_parent != NULL)
		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
}

uint64_t
dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
    dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t ddobj;
	dsl_dir_phys_t *ddphys;
	dmu_buf_t *dbuf;

	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
	if (pds) {
		VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
		    name, sizeof (uint64_t), 1, &ddobj, tx));
	} else {
		/* it's the root dir */
		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
	}
	VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	ddphys = dbuf->db_data;

	ddphys->dd_creation_time = gethrestime_sec();
	if (pds) {
		ddphys->dd_parent_obj = pds->dd_object;

		/* update the filesystem counts */
		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
	}
	ddphys->dd_props_zapobj = zap_create(mos,
	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
	ddphys->dd_child_dir_zapobj = zap_create(mos,
	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;

	dmu_buf_rele(dbuf, FTAG);

	return (ddobj);
}

boolean_t
dsl_dir_is_clone(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_origin_obj &&
	    (dd->dd_pool->dp_origin_snap == NULL ||
	    dsl_dir_phys(dd)->dd_origin_obj !=
	    dd->dd_pool->dp_origin_snap->ds_object));
}

uint64_t
dsl_dir_get_used(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_bytes);
}

uint64_t
dsl_dir_get_compressed(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_compressed_bytes);
}

uint64_t
dsl_dir_get_quota(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_quota);
}

uint64_t
dsl_dir_get_reservation(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_reserved);
}

uint64_t
dsl_dir_get_compressratio(dsl_dir_t *dd)
{
	/* a fixed point number, 100x the ratio */
	return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
	    (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
	    dsl_dir_phys(dd)->dd_compressed_bytes));
}

uint64_t
dsl_dir_get_logicalused(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
}

uint64_t
dsl_dir_get_usedsnap(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
}

uint64_t
dsl_dir_get_usedds(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
}

uint64_t
dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
}

uint64_t
dsl_dir_get_usedchild(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
}

void
dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
{
	dsl_dataset_t *ds;
	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));

	dsl_dataset_name(ds, buf);

	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
{
	if (dsl_dir_is_zapified(dd)) {
		objset_t *os = dd->dd_pool->dp_meta_objset;
		return (zap_lookup(os, dd->dd_object,
		    DD_FIELD_FILESYSTEM_COUNT, sizeof (*count), 1, count));
	} else {
		return (SET_ERROR(ENOENT));
	}
}

int
dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
{
	if (dsl_dir_is_zapified(dd)) {
		objset_t *os = dd->dd_pool->dp_meta_objset;
		return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
		    sizeof (*count), 1, count));
	} else {
		return (SET_ERROR(ENOENT));
	}
}

void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
	mutex_enter(&dd->dd_lock);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
	    dsl_dir_get_quota(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
	    dsl_dir_get_reservation(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
	    dsl_dir_get_logicalused(dd));
	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
		    dsl_dir_get_usedsnap(dd));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
		    dsl_dir_get_usedds(dd));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
		    dsl_dir_get_usedrefreserv(dd));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
		    dsl_dir_get_usedchild(dd));
	}
	mutex_exit(&dd->dd_lock);

	uint64_t count;
	if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
		    count);
	}
	if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
		    count);
	}

	if (dsl_dir_is_clone(dd)) {
		char buf[ZFS_MAX_DATASET_NAME_LEN];
		dsl_dir_get_origin(dd, buf);
		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
	}
}

void
dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;

	ASSERT(dsl_dir_phys(dd));

	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(dd->dd_dbuf, dd);
	}
}

static int64_t
parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
{
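	/*
	 * Compute how much the parent's accounting changes: the parent
	 * charges us for MAX(used, dd_reserved), so a delta only shows
	 * through once usage exceeds the reservation. For example (an
	 * illustrative sketch): with dd_reserved == 10, used == 8 and
	 * delta == +5, old_accounted is 10 and new_accounted is 13, so
	 * only +3 is passed up to the parent.
	 */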
	uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
	uint64_t new_accounted =
	    MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
	return (new_accounted - old_accounted);
}

void
dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));

	mutex_enter(&dd->dd_lock);
	ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]);
	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", (u_longlong_t)tx->tx_txg,
	    (u_longlong_t)dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024);
	dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0;
	mutex_exit(&dd->dd_lock);

	/* release the hold from dsl_dir_dirty */
	dmu_buf_rele(dd->dd_dbuf, dd);
}

static uint64_t
dsl_dir_space_towrite(dsl_dir_t *dd)
{
	uint64_t space = 0;

	ASSERT(MUTEX_HELD(&dd->dd_lock));

	for (int i = 0; i < TXG_SIZE; i++)
		space += dd->dd_space_towrite[i & TXG_MASK];

	return (space);
}

/*
 * How much space would dd have available if ancestor had delta applied
 * to it? If ondiskonly is set, we're only interested in what's
 * on-disk, not estimated pending changes.
 */
uint64_t
dsl_dir_space_available(dsl_dir_t *dd,
    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
{
	uint64_t parentspace, myspace, quota, used;

	/*
	 * If there are no restrictions otherwise, assume we have
	 * unlimited space available.
	 */
	quota = UINT64_MAX;
	parentspace = UINT64_MAX;

	if (dd->dd_parent != NULL) {
		parentspace = dsl_dir_space_available(dd->dd_parent,
		    ancestor, delta, ondiskonly);
	}

	mutex_enter(&dd->dd_lock);
	if (dsl_dir_phys(dd)->dd_quota != 0)
		quota = dsl_dir_phys(dd)->dd_quota;
	used = dsl_dir_phys(dd)->dd_used_bytes;
	if (!ondiskonly)
		used += dsl_dir_space_towrite(dd);

	if (dd->dd_parent == NULL) {
		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
		    ZFS_SPACE_CHECK_NORMAL);
		quota = MIN(quota, poolsize);
	}

	if (dsl_dir_phys(dd)->dd_reserved > used &&
	    parentspace != UINT64_MAX) {
		/*
		 * We have some space reserved, in addition to what our
		 * parent gave us.
		 */
		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
	}

	if (dd == ancestor) {
		ASSERT(delta <= 0);
		ASSERT(used >= -delta);
		used += delta;
		if (parentspace != UINT64_MAX)
			parentspace -= delta;
	}

	if (used > quota) {
		/* over quota */
		myspace = 0;
	} else {
		/*
		 * the lesser of the space provided by our parent and
		 * the space left in our quota
		 */
		myspace = MIN(parentspace, quota - used);
	}

	mutex_exit(&dd->dd_lock);

	return (myspace);
}

struct tempreserve {
	list_node_t tr_node;
	dsl_dir_t *tr_ds;
	uint64_t tr_size;
};

static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
    boolean_t ignorequota, list_t *tr_list,
    dmu_tx_t *tx, boolean_t first)
{
	uint64_t txg;
	uint64_t quota;
	struct tempreserve *tr;
	int retval;
	uint64_t ext_quota;
	uint64_t ref_rsrv;

top_of_function:
	txg = tx->tx_txg;
	retval = EDQUOT;
	ref_rsrv = 0;

	ASSERT3U(txg, !=, 0);
	ASSERT3S(asize, >, 0);

	mutex_enter(&dd->dd_lock);

	/*
	 * Check against the dsl_dir's quota. We don't add in the delta
	 * when checking for over-quota because they get one free hit.
	 */
	uint64_t est_inflight = dsl_dir_space_towrite(dd);
	for (int i = 0; i < TXG_SIZE; i++)
		est_inflight += dd->dd_tempreserved[i];
	uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;

	/*
	 * On the first iteration, fetch the dataset's used-on-disk and
	 * refreservation values. Also, if checkrefquota is set, test if
	 * allocating this space would exceed the dataset's refquota.
	 */
	if (first && tx->tx_objset) {
		int error;
		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

		error = dsl_dataset_check_quota(ds, !netfree,
		    asize, est_inflight, &used_on_disk, &ref_rsrv);
		if (error != 0) {
			mutex_exit(&dd->dd_lock);
			DMU_TX_STAT_BUMP(dmu_tx_quota);
			return (error);
		}
	}

	/*
	 * If this transaction will result in a net free of space,
	 * we want to let it through.
	 */
	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0 ||
	    (tx->tx_objset && dmu_objset_type(tx->tx_objset) == DMU_OST_ZVOL &&
	    zvol_enforce_quotas == B_FALSE))
		quota = UINT64_MAX;
	else
		quota = dsl_dir_phys(dd)->dd_quota;

	/*
	 * Adjust the quota against the actual pool size at the root
	 * minus any outstanding deferred frees.
	 * To ensure that it's possible to remove files from a full
	 * pool without inducing transient overcommits, we throttle
	 * netfree transactions against a quota that is slightly larger,
	 * but still within the pool's allocation slop. In cases where
	 * we're very close to full, this will allow a steady trickle of
	 * removes to get through.
	 */
	if (dd->dd_parent == NULL) {
		uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
		    (netfree) ?
		    ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);

		if (avail < quota) {
			quota = avail;
			retval = SET_ERROR(ENOSPC);
		}
	}

	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes
	 * or deferred frees (which may free up space for us).
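	 *
	 * The headroom computed below is quota >> 5 (about 3% of the
	 * quota), so in-flight estimates may exceed the quota by that
	 * much before new reservations are refused.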
	 */
	ext_quota = quota >> 5;
	if (quota == UINT64_MAX)
		ext_quota = 0;

	if (used_on_disk >= quota) {
		if (retval == ENOSPC && (used_on_disk - quota) <
		    dsl_pool_deferred_space(dd->dd_pool)) {
			retval = SET_ERROR(ERESTART);
		}
		/* Quota exceeded */
		mutex_exit(&dd->dd_lock);
		DMU_TX_STAT_BUMP(dmu_tx_quota);
		return (retval);
	} else if (used_on_disk + est_inflight >= quota + ext_quota) {
		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
		    "quota=%lluK tr=%lluK\n",
		    (u_longlong_t)used_on_disk>>10,
		    (u_longlong_t)est_inflight>>10,
		    (u_longlong_t)quota>>10, (u_longlong_t)asize>>10);
		mutex_exit(&dd->dd_lock);
		DMU_TX_STAT_BUMP(dmu_tx_quota);
		return (SET_ERROR(ERESTART));
	}

	/* We need to up our estimated delta before dropping dd_lock */
	dd->dd_tempreserved[txg & TXG_MASK] += asize;

	uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
	    asize - ref_rsrv);
	mutex_exit(&dd->dd_lock);

	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
	tr->tr_ds = dd;
	tr->tr_size = asize;
	list_insert_tail(tr_list, tr);

	/* see if it's OK with our parent */
	if (dd->dd_parent != NULL && parent_rsrv != 0) {
		/*
		 * Recurse on our parent without recursion. This has been
		 * observed to be potentially large stack usage even within
		 * the test suite. Largest seen stack was 7632 bytes on linux.
		 */

		dd = dd->dd_parent;
		asize = parent_rsrv;
		ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
		first = B_FALSE;
		goto top_of_function;
	}

	return (0);
}

/*
 * Reserve space in this dsl_dir, to be used in this tx's txg.
 * After the space has been dirtied (and dsl_dir_willuse_space()
 * has been called), the reservation should be canceled, using
 * dsl_dir_tempreserve_clear().
 */
int
dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
    boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
{
	int err;
	list_t *tr_list;

	if (asize == 0) {
		*tr_cookiep = NULL;
		return (0);
	}

	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
	list_create(tr_list, sizeof (struct tempreserve),
	    offsetof(struct tempreserve, tr_node));
	ASSERT3S(asize, >, 0);

	err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
	if (err == 0) {
		struct tempreserve *tr;

		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
		tr->tr_size = lsize;
		list_insert_tail(tr_list, tr);
	} else {
		if (err == EAGAIN) {
			/*
			 * If arc_memory_throttle() detected that pageout
			 * is running and we are low on memory, we delay new
			 * non-pageout transactions to give pageout an
			 * advantage.
			 *
			 * It is unfortunate to be delaying while the caller's
			 * locks are held.
			 */
			txg_delay(dd->dd_pool, tx->tx_txg,
			    MSEC2NSEC(10), MSEC2NSEC(10));
			err = SET_ERROR(ERESTART);
		}
	}

	if (err == 0) {
		err = dsl_dir_tempreserve_impl(dd, asize, netfree,
		    B_FALSE, tr_list, tx, B_TRUE);
	}

	if (err != 0)
		dsl_dir_tempreserve_clear(tr_list, tx);
	else
		*tr_cookiep = tr_list;

	return (err);
}

/*
 * Clear a temporary reservation that we previously made with
 * dsl_dir_tempreserve_space().
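 *
 * A typical pairing looks like this (an illustrative sketch, not taken
 * from a real caller):
 *
 *	void *tr_cookie;
 *	err = dsl_dir_tempreserve_space(dd, lsize, asize, B_FALSE,
 *	    &tr_cookie, tx);
 *	if (err == 0) {
 *		... dirty the space, dsl_dir_willuse_space(), etc ...
 *		dsl_dir_tempreserve_clear(tr_cookie, tx);
 *	}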
 */
void
dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
{
	int txgidx = tx->tx_txg & TXG_MASK;
	list_t *tr_list = tr_cookie;
	struct tempreserve *tr;

	ASSERT3U(tx->tx_txg, !=, 0);

	if (tr_cookie == NULL)
		return;

	while ((tr = list_remove_head(tr_list)) != NULL) {
		if (tr->tr_ds) {
			mutex_enter(&tr->tr_ds->dd_lock);
			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
			    tr->tr_size);
			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
			mutex_exit(&tr->tr_ds->dd_lock);
		} else {
			arc_tempreserve_clear(tr->tr_size);
		}
		kmem_free(tr, sizeof (struct tempreserve));
	}

	kmem_free(tr_list, sizeof (list_t));
}

/*
 * This should be called from open context when we think we're going to write
 * or free space, for example when dirtying data. Be conservative; it's okay
 * to write less space or free more, but we don't want to write more or free
 * less than the amount specified.
 *
 * NOTE: The behavior of this function is identical to the Illumos / FreeBSD
 * version; however, it has been adjusted to use an iterative rather than
 * recursive algorithm to minimize stack usage.
 */
void
dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
	int64_t parent_space;
	uint64_t est_used;

	do {
		mutex_enter(&dd->dd_lock);
		if (space > 0)
			dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;

		est_used = dsl_dir_space_towrite(dd) +
		    dsl_dir_phys(dd)->dd_used_bytes;
		parent_space = parent_delta(dd, est_used, space);
		mutex_exit(&dd->dd_lock);

		/* Make sure that we clean up dd_space_to* */
		dsl_dir_dirty(dd, tx);

		dd = dd->dd_parent;
		space = parent_space;
	} while (space && dd);
}

/* call from syncing context when we actually write/free space for this dd */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
	int64_t accounted_delta;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(type < DD_USED_NUM);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/*
	 * dsl_dataset_set_refreservation_sync_impl() calls this with
	 * dd_lock held, so that it can atomically update
	 * ds->ds_reserved and the dsl_dir accounting, so that
	 * dsl_dataset_check_quota() can see dataset and dir accounting
	 * consistently.
	 */
	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
	if (needlock)
		mutex_enter(&dd->dd_lock);
	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
	ASSERT(uncompressed >= 0 ||
	    ddp->dd_uncompressed_bytes >= -uncompressed);
	ddp->dd_used_bytes += used;
	ddp->dd_uncompressed_bytes += uncompressed;
	ddp->dd_compressed_bytes += compressed;

	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used);
		ddp->dd_used_breakdown[type] += used;
#ifdef ZFS_DEBUG
		{
			dd_used_t t;
			uint64_t u = 0;
			for (t = 0; t < DD_USED_NUM; t++)
				u += ddp->dd_used_breakdown[t];
			ASSERT3U(u, ==, ddp->dd_used_bytes);
		}
#endif
	}
	if (needlock)
		mutex_exit(&dd->dd_lock);

	if (dd->dd_parent != NULL) {
		dsl_dir_diduse_transfer_space(dd->dd_parent,
		    accounted_delta, compressed, uncompressed,
		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
	}
}

void
dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(oldtype < DD_USED_NUM);
	ASSERT(newtype < DD_USED_NUM);

	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
	if (delta == 0 ||
	    !(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN))
		return;

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	mutex_enter(&dd->dd_lock);
	ASSERT(delta > 0 ?
	    ddp->dd_used_breakdown[oldtype] >= delta :
	    ddp->dd_used_breakdown[newtype] >= -delta);
	ASSERT(ddp->dd_used_bytes >= ABS(delta));
	ddp->dd_used_breakdown[oldtype] -= delta;
	ddp->dd_used_breakdown[newtype] += delta;
	mutex_exit(&dd->dd_lock);
}

void
dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
    int64_t compressed, int64_t uncompressed, int64_t tonew,
    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
	int64_t accounted_delta;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(oldtype < DD_USED_NUM);
	ASSERT(newtype < DD_USED_NUM);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
	ASSERT(uncompressed >= 0 ||
	    ddp->dd_uncompressed_bytes >= -uncompressed);
	ddp->dd_used_bytes += used;
	ddp->dd_uncompressed_bytes += uncompressed;
	ddp->dd_compressed_bytes += compressed;

	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		ASSERT(tonew - used <= 0 ||
		    ddp->dd_used_breakdown[oldtype] >= tonew - used);
		ASSERT(tonew >= 0 ||
		    ddp->dd_used_breakdown[newtype] >= -tonew);
		ddp->dd_used_breakdown[oldtype] -= tonew - used;
		ddp->dd_used_breakdown[newtype] += tonew;
#ifdef ZFS_DEBUG
		{
			dd_used_t t;
			uint64_t u = 0;
			for (t = 0; t < DD_USED_NUM; t++)
				u += ddp->dd_used_breakdown[t];
			ASSERT3U(u, ==, ddp->dd_used_bytes);
		}
#endif
	}
	mutex_exit(&dd->dd_lock);

	if (dd->dd_parent != NULL) {
		dsl_dir_diduse_transfer_space(dd->dd_parent,
		    accounted_delta, compressed, uncompressed,
		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
	}
}

typedef struct dsl_dir_set_qr_arg {
	const char *ddsqra_name;
	zprop_source_t ddsqra_source;
	uint64_t ddsqra_value;
} dsl_dir_set_qr_arg_t;

static int
dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;
	uint64_t towrite, newval;

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);

	error = dsl_prop_predict(ds->ds_dir, "quota",
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	if (newval == 0) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	mutex_enter(&ds->ds_dir->dd_lock);
	/*
	 * If we are doing the preliminary check in open context, and
	 * there are pending changes, then don't fail it, since the
	 * pending changes could under-estimate the amount of space to be
	 * freed up.
	 */
	towrite = dsl_dir_space_towrite(ds->ds_dir);
	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
		error = SET_ERROR(ENOSPC);
	}
	mutex_exit(&ds->ds_dir->dd_lock);
	dsl_dataset_rele(ds, FTAG);
	return (error);
}

static void
dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	uint64_t newval;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
		    &ddsqra->ddsqra_value, tx);

		VERIFY0(dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
	} else {
		newval = ddsqra->ddsqra_value;
		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
	}

	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
	mutex_enter(&ds->ds_dir->dd_lock);
	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
	mutex_exit(&ds->ds_dir->dd_lock);
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
{
	dsl_dir_set_qr_arg_t ddsqra;

	ddsqra.ddsqra_name = ddname;
	ddsqra.ddsqra_source = source;
	ddsqra.ddsqra_value = quota;

	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
	    dsl_dir_set_quota_sync, &ddsqra, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}

static int
dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	dsl_dir_t *dd;
	uint64_t newval, used, avail;
	int error;

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);
	dd = ds->ds_dir;

	/*
	 * If we are doing the preliminary check in open context, the
	 * space estimates may be inaccurate.
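	 * Defer the real check to syncing context by returning success here.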
	 */
	if (!dmu_tx_is_syncing(tx)) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	error = dsl_prop_predict(ds->ds_dir,
	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	mutex_enter(&dd->dd_lock);
	used = dsl_dir_phys(dd)->dd_used_bytes;
	mutex_exit(&dd->dd_lock);

	if (dd->dd_parent) {
		avail = dsl_dir_space_available(dd->dd_parent,
		    NULL, 0, FALSE);
	} else {
		avail = dsl_pool_adjustedsize(dd->dd_pool,
		    ZFS_SPACE_CHECK_NORMAL) - used;
	}

	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
		uint64_t delta = MAX(used, newval) -
		    MAX(used, dsl_dir_phys(dd)->dd_reserved);

		if (delta > avail ||
		    (dsl_dir_phys(dd)->dd_quota > 0 &&
		    newval > dsl_dir_phys(dd)->dd_quota))
			error = SET_ERROR(ENOSPC);
	}

	dsl_dataset_rele(ds, FTAG);
	return (error);
}

void
dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
{
	uint64_t used;
	int64_t delta;

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	used = dsl_dir_phys(dd)->dd_used_bytes;
	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
	dsl_dir_phys(dd)->dd_reserved = value;

	if (dd->dd_parent != NULL) {
		/* Roll up this additional usage into our ancestors */
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
		    delta, 0, 0, tx);
	}
	mutex_exit(&dd->dd_lock);
}

static void
dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	uint64_t newval;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
		dsl_prop_set_sync_impl(ds,
		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
		    &ddsqra->ddsqra_value, tx);

		VERIFY0(dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
	} else {
		newval = ddsqra->ddsqra_value;
		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
		    (longlong_t)newval);
	}

	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
    uint64_t reservation)
{
	dsl_dir_set_qr_arg_t ddsqra;

	ddsqra.ddsqra_name = ddname;
	ddsqra.ddsqra_source = source;
	ddsqra.ddsqra_value = reservation;

	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
	    dsl_dir_set_reservation_sync, &ddsqra, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}

static dsl_dir_t *
closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
{
	for (; ds1; ds1 = ds1->dd_parent) {
		dsl_dir_t *dd;
		for (dd = ds2; dd; dd = dd->dd_parent) {
			if (ds1 == dd)
				return (dd);
		}
	}
	return (NULL);
}

/*
 * If delta is applied to dd, how much of that delta would be applied to
 * ancestor? Syncing context only.
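 *
 * For example (an illustrative sketch): when renaming tank/a/fs into
 * tank/b, would_change(tank/a/fs's dir, delta, tank's dir) reports how
 * much of fs's space delta survives up to the common ancestor tank,
 * after each intermediate dir's reservation (see parent_delta()) absorbs
 * its share.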
static dsl_dir_t *
closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
{
	for (; ds1; ds1 = ds1->dd_parent) {
		dsl_dir_t *dd;
		for (dd = ds2; dd; dd = dd->dd_parent) {
			if (ds1 == dd)
				return (dd);
		}
	}
	return (NULL);
}

/*
 * If delta is applied to dd, how much of that delta would be applied to
 * ancestor? Syncing context only.
 */
static int64_t
would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
{
	if (dd == ancestor)
		return (delta);

	mutex_enter(&dd->dd_lock);
	delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
	mutex_exit(&dd->dd_lock);
	return (would_change(dd->dd_parent, delta, ancestor));
}

typedef struct dsl_dir_rename_arg {
	const char *ddra_oldname;
	const char *ddra_newname;
	cred_t *ddra_cred;
	proc_t *ddra_proc;
} dsl_dir_rename_arg_t;

typedef struct dsl_valid_rename_arg {
	int char_delta;
	int nest_delta;
} dsl_valid_rename_arg_t;

static int
dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	(void) dp;
	dsl_valid_rename_arg_t *dvra = arg;
	char namebuf[ZFS_MAX_DATASET_NAME_LEN];

	dsl_dataset_name(ds, namebuf);

	ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
	    <, ZFS_MAX_DATASET_NAME_LEN);
	int namelen = strlen(namebuf) + dvra->char_delta;
	int depth = get_dataset_depth(namebuf) + dvra->nest_delta;

	if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));
	if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
		return (SET_ERROR(ENAMETOOLONG));
	return (0);
}
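/*
 * Check callback for dsl_dir_rename(). Verifies that the source dir
 * exists, the new parent exists in the same pool and is a filesystem
 * (not, e.g., a zvol), the new name is not already taken, no child
 * name would exceed the length or nesting limits, and, when moving to
 * a new parent, that encryption permits the move, the target is not a
 * descendant of the source, and the new ancestors have enough space
 * and headroom under any filesystem/snapshot limits.
 */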
static int
dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_rename_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *dd, *newparent;
	dsl_valid_rename_arg_t dvra;
	dsl_dataset_t *parentds;
	objset_t *parentos;
	const char *mynewname;
	int error;

	/* target dir should exist */
	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
	if (error != 0)
		return (error);

	/* new parent should exist */
	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
	    &newparent, &mynewname);
	if (error != 0) {
		dsl_dir_rele(dd, FTAG);
		return (error);
	}

	/* can't rename to a different pool */
	if (dd->dd_pool != newparent->dd_pool) {
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* new name should not already exist */
	if (mynewname == NULL) {
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (SET_ERROR(EEXIST));
	}

	/* can't rename below anything but filesystems (e.g. no ZVOLs) */
	error = dsl_dataset_hold_obj(newparent->dd_pool,
	    dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
	if (error != 0) {
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (error);
	}
	error = dmu_objset_from_ds(parentds, &parentos);
	if (error != 0) {
		dsl_dataset_rele(parentds, FTAG);
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (error);
	}
	if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
		dsl_dataset_rele(parentds, FTAG);
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
	}
	dsl_dataset_rele(parentds, FTAG);

	ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
	    <, ZFS_MAX_DATASET_NAME_LEN);
	ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
	    <, ZFS_MAX_DATASET_NAME_LEN);
	dvra.char_delta = strlen(ddra->ddra_newname)
	    - strlen(ddra->ddra_oldname);
	dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
	    - get_dataset_depth(ddra->ddra_oldname);

	/* if the name length is growing, validate child name lengths */
	if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
		    &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
		if (error != 0) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (error);
		}
	}

	if (dmu_tx_is_syncing(tx)) {
		if (spa_feature_is_active(dp->dp_spa,
		    SPA_FEATURE_FS_SS_LIMIT)) {
			/*
			 * Although this is the check function and we don't
			 * normally make on-disk changes in check functions,
			 * we need to do that here.
			 *
			 * Ensure that the counts in this portion of the tree
			 * have been initialized in case the new parent has
			 * limits set.
			 */
			dsl_dir_init_fs_ss_count(dd, tx);
		}
	}

	if (newparent != dd->dd_parent) {
		/* is there enough space? */
		uint64_t myspace =
		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
		    dsl_dir_phys(dd)->dd_reserved);
		objset_t *os = dd->dd_pool->dp_meta_objset;
		uint64_t fs_cnt = 0;
		uint64_t ss_cnt = 0;

		if (dsl_dir_is_zapified(dd)) {
			int err;

			err = zap_lookup(os, dd->dd_object,
			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
			    &fs_cnt);
			if (err != ENOENT && err != 0) {
				dsl_dir_rele(newparent, FTAG);
				dsl_dir_rele(dd, FTAG);
				return (err);
			}

			/*
			 * We have to add 1 for the filesystem itself that
			 * we're moving.
			 */
			fs_cnt++;

			err = zap_lookup(os, dd->dd_object,
			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
			    &ss_cnt);
			if (err != ENOENT && err != 0) {
				dsl_dir_rele(newparent, FTAG);
				dsl_dir_rele(dd, FTAG);
				return (err);
			}
		}

		/* check for encryption errors */
		error = dsl_dir_rename_crypt_check(dd, newparent);
		if (error != 0) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (SET_ERROR(EACCES));
		}

		/* no rename into our descendant */
		if (closest_common_ancestor(dd, newparent) == dd) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = dsl_dir_transfer_possible(dd->dd_parent,
		    newparent, fs_cnt, ss_cnt, myspace,
		    ddra->ddra_cred, ddra->ddra_proc);
		if (error != 0) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (error);
		}
	}

	dsl_dir_rele(newparent, FTAG);
	dsl_dir_rele(dd, FTAG);
	return (0);
}
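/*
 * Sync callback for dsl_dir_rename(). If the dir is moving to a new
 * parent, transfer the filesystem/snapshot counts and the space
 * accounting (including any unused reservation) from the old ancestors
 * to the new ones, then update the child-dir ZAP entries and the dir's
 * name and parent pointer.
 */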
static void
dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_rename_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *dd, *newparent;
	const char *mynewname;
	objset_t *mos = dp->dp_meta_objset;

	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
	    &mynewname));

	ASSERT3P(mynewname, !=, NULL);

	/* Log this before we change the name. */
	spa_history_log_internal_dd(dd, "rename", tx,
	    "-> %s", ddra->ddra_newname);

	if (newparent != dd->dd_parent) {
		objset_t *os = dd->dd_pool->dp_meta_objset;
		uint64_t fs_cnt = 0;
		uint64_t ss_cnt = 0;

		/*
		 * We already made sure the dd counts were initialized in the
		 * check function.
		 */
		if (spa_feature_is_active(dp->dp_spa,
		    SPA_FEATURE_FS_SS_LIMIT)) {
			VERIFY0(zap_lookup(os, dd->dd_object,
			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
			    &fs_cnt));
			/* add 1 for the filesystem itself that we're moving */
			fs_cnt++;

			VERIFY0(zap_lookup(os, dd->dd_object,
			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
			    &ss_cnt));
		}

		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
		    DD_FIELD_FILESYSTEM_COUNT, tx);
		dsl_fs_ss_count_adjust(newparent, fs_cnt,
		    DD_FIELD_FILESYSTEM_COUNT, tx);

		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
		    DD_FIELD_SNAPSHOT_COUNT, tx);
		dsl_fs_ss_count_adjust(newparent, ss_cnt,
		    DD_FIELD_SNAPSHOT_COUNT, tx);

		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
		    -dsl_dir_phys(dd)->dd_used_bytes,
		    -dsl_dir_phys(dd)->dd_compressed_bytes,
		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
		    dsl_dir_phys(dd)->dd_used_bytes,
		    dsl_dir_phys(dd)->dd_compressed_bytes,
		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);

		if (dsl_dir_phys(dd)->dd_reserved >
		    dsl_dir_phys(dd)->dd_used_bytes) {
			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
			    dsl_dir_phys(dd)->dd_used_bytes;

			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
			    -unused_rsrv, 0, 0, tx);
			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
			    unused_rsrv, 0, 0, tx);
		}
	}

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/* remove from old parent zapobj */
	VERIFY0(zap_remove(mos,
	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
	    dd->dd_myname, tx));

	(void) strlcpy(dd->dd_myname, mynewname,
	    sizeof (dd->dd_myname));
	dsl_dir_rele(dd->dd_parent, dd);
	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
	VERIFY0(dsl_dir_hold_obj(dp,
	    newparent->dd_object, NULL, dd, &dd->dd_parent));

	/* add to new parent zapobj */
	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
	    dd->dd_myname, 8, 1, &dd->dd_object, tx));

	/* TODO: A rename callback to avoid these layering violations. */
	zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
	zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
	    ddra->ddra_newname, B_TRUE);

	dsl_prop_notify_all(dd);

	dsl_dir_rele(newparent, FTAG);
	dsl_dir_rele(dd, FTAG);
}
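/*
 * Rename and/or reparent a dsl_dir; both names must be in the same
 * pool. For example (a sketch with hypothetical dataset names; the
 * usual caller is the rename ioctl path):
 *
 *	error = dsl_dir_rename("tank/a/b", "tank/c/b");
 *
 * would move "b" from "tank/a" to "tank/c", provided "tank/c" exists
 * and "tank/c/b" does not.
 */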
int
dsl_dir_rename(const char *oldname, const char *newname)
{
	dsl_dir_rename_arg_t ddra;

	ddra.ddra_oldname = oldname;
	ddra.ddra_newname = newname;
	ddra.ddra_cred = CRED();
	ddra.ddra_proc = curproc;

	return (dsl_sync_task(oldname,
	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
	    3, ZFS_SPACE_CHECK_RESERVED));
}

int
dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space,
    cred_t *cr, proc_t *proc)
{
	dsl_dir_t *ancestor;
	int64_t adelta;
	uint64_t avail;
	int err;

	ancestor = closest_common_ancestor(sdd, tdd);
	adelta = would_change(sdd, -space, ancestor);
	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
	if (avail < space)
		return (SET_ERROR(ENOSPC));

	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
	    ancestor, cr, proc);
	if (err != 0)
		return (err);
	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
	    ancestor, cr, proc);
	if (err != 0)
		return (err);

	return (0);
}
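/*
 * Snapshot creation/modification time bookkeeping: dd_snap_cmtime
 * caches the time of the most recent snapshot namespace change under
 * dd_lock; when the extensible_dataset feature is enabled, updates are
 * also persisted in the dir's ZAP as DD_FIELD_SNAPSHOTS_CHANGED.
 */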
inode_timespec_t
dsl_dir_snap_cmtime(dsl_dir_t *dd)
{
	inode_timespec_t t;

	mutex_enter(&dd->dd_lock);
	t = dd->dd_snap_cmtime;
	mutex_exit(&dd->dd_lock);

	return (t);
}

void
dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	inode_timespec_t t;
	gethrestime(&t);

	mutex_enter(&dd->dd_lock);
	dd->dd_snap_cmtime = t;
	if (spa_feature_is_enabled(dp->dp_spa,
	    SPA_FEATURE_EXTENSIBLE_DATASET)) {
		objset_t *mos = dd->dd_pool->dp_meta_objset;
		uint64_t ddobj = dd->dd_object;
		dsl_dir_zapify(dd, tx);
		VERIFY0(zap_update(mos, ddobj,
		    DD_FIELD_SNAPSHOTS_CHANGED,
		    sizeof (uint64_t),
		    sizeof (inode_timespec_t) / sizeof (uint64_t),
		    &t, tx));
	}
	mutex_exit(&dd->dd_lock);
}

void
dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
{
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
}

boolean_t
dsl_dir_is_zapified(dsl_dir_t *dd)
{
	dmu_object_info_t doi;

	dmu_object_info_from_db(dd->dd_dbuf, &doi);
	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
}

void
dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
{
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
	    SPA_FEATURE_LIVELIST));
	dsl_deadlist_open(&dd->dd_livelist, mos, obj);
	bplist_create(&dd->dd_pending_allocs);
	bplist_create(&dd->dd_pending_frees);
}

void
dsl_dir_livelist_close(dsl_dir_t *dd)
{
	dsl_deadlist_close(&dd->dd_livelist);
	bplist_destroy(&dd->dd_pending_allocs);
	bplist_destroy(&dd->dd_pending_frees);
}

void
dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
{
	uint64_t obj;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	spa_t *spa = dp->dp_spa;
	livelist_condense_entry_t to_condense = spa->spa_to_condense;

	if (!dsl_deadlist_is_open(&dd->dd_livelist))
		return;

	/*
	 * If the livelist being removed is set to be condensed, stop the
	 * condense zthr and indicate the cancellation in the spa_to_condense
	 * struct in case the condense no-wait synctask has already started.
	 */
	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
	if (ll_condense_thread != NULL &&
	    (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
		/*
		 * We use zthr_wait_cycle_done instead of zthr_cancel
		 * because we don't want to destroy the zthr, just have
		 * it skip its current task.
		 */
		spa->spa_to_condense.cancelled = B_TRUE;
		zthr_wait_cycle_done(ll_condense_thread);
		/*
		 * If we've returned from zthr_wait_cycle_done without
		 * clearing the to_condense data structure, it's either
		 * because the no-wait synctask has started (indicated by
		 * the 'syncing' field of to_condense), in which case we
		 * can expect it to clear to_condense on its own, or
		 * because we returned before the zthr ran. In the latter
		 * case, the checkfunc will now fail since cancelled ==
		 * B_TRUE, so we can safely NULL out ds, allowing a
		 * different dir's livelist to be condensed.
		 *
		 * We can be sure that the to_condense struct will not
		 * be repopulated at this stage because both this
		 * function and dsl_livelist_try_condense execute in
		 * syncing context.
		 */
		if ((spa->spa_to_condense.ds != NULL) &&
		    !spa->spa_to_condense.syncing) {
			dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
			    spa);
			spa->spa_to_condense.ds = NULL;
		}
	}

	dsl_dir_livelist_close(dd);
	VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
	VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
	    DD_FIELD_LIVELIST, tx));
	if (total) {
		dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
		spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
	}
}
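/*
 * Determine whether the given activity is in progress for this dir,
 * setting *in_progress accordingly. Only ZFS_WAIT_DELETEQ is currently
 * recognized: in the kernel, the delete queue counts as busy while the
 * dataset is a mounted, writable ZPL objset whose unlinked set is
 * non-empty. Called with dd_activity_lock held.
 */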
static int
dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds,
    zfs_wait_activity_t activity, boolean_t *in_progress)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&dd->dd_activity_lock));

	switch (activity) {
	case ZFS_WAIT_DELETEQ: {
#ifdef _KERNEL
		objset_t *os;
		error = dmu_objset_from_ds(ds, &os);
		if (error != 0)
			break;

		mutex_enter(&os->os_user_ptr_lock);
		void *user = dmu_objset_get_user(os);
		mutex_exit(&os->os_user_ptr_lock);
		if (dmu_objset_type(os) != DMU_OST_ZFS ||
		    user == NULL || zfs_get_vfs_flag_unmounted(os)) {
			*in_progress = B_FALSE;
			return (0);
		}

		uint64_t readonly = B_FALSE;
		error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY,
		    &readonly, NULL);

		if (error != 0)
			break;

		if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) {
			*in_progress = B_FALSE;
			return (0);
		}

		uint64_t count, unlinked_obj;
		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
		    &unlinked_obj);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			break;
		}
		error = zap_count(os, unlinked_obj, &count);

		if (error == 0)
			*in_progress = (count != 0);
		break;
#else
		/*
		 * The delete queue is ZPL specific, and libzpool doesn't have
		 * it. It doesn't make sense to wait for it.
		 */
		(void) ds;
		*in_progress = B_FALSE;
		break;
#endif
	}
	default:
		panic("unrecognized value for activity %d", activity);
	}

	return (error);
}
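/*
 * Block until the given activity is no longer in progress, or until
 * the wait is interrupted by a signal or by dsl_dir_cancel_waiters()
 * (returning EINTR). The caller holds dd_activity_lock, which
 * cv_wait_sig() drops and reacquires; *waited is set if we slept at
 * least once.
 */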
int
dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity,
    boolean_t *waited)
{
	int error = 0;
	boolean_t in_progress;
	dsl_pool_t *dp = dd->dd_pool;
	for (;;) {
		dsl_pool_config_enter(dp, FTAG);
		error = dsl_dir_activity_in_progress(dd, ds, activity,
		    &in_progress);
		dsl_pool_config_exit(dp, FTAG);
		if (error != 0 || !in_progress)
			break;

		*waited = B_TRUE;

		if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) ==
		    0 || dd->dd_activity_cancelled) {
			error = SET_ERROR(EINTR);
			break;
		}
	}
	return (error);
}

void
dsl_dir_cancel_waiters(dsl_dir_t *dd)
{
	mutex_enter(&dd->dd_activity_lock);
	dd->dd_activity_cancelled = B_TRUE;
	cv_broadcast(&dd->dd_activity_cv);
	while (dd->dd_activity_waiters > 0)
		cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock);
	mutex_exit(&dd->dd_activity_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dsl_dir_set_quota);
EXPORT_SYMBOL(dsl_dir_set_reservation);
#endif

/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zvol_enforce_quotas, INT, ZMOD_RW,
	"Enable strict ZVOL quota enforcement");