/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2013 Martin Matuska. All rights reserved.
 * Copyright (c) 2014 Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
 * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/dmu_impl.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/metaslab.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/sunddi.h>
#include <sys/zfeature.h>
#include <sys/policy.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
#include <sys/zvol.h>
#include <sys/zthr.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"

/*
 * This controls whether we verify the ZVOL quota or not.
 * Currently, quotas are not implemented for ZVOLs; the quota size is the
 * size of the ZVOL, so the volume's size already implies the ZVOL size
 * quota. Enforcing the quota mechanism can also introduce a significant
 * performance drop.
 */
static int zvol_enforce_quotas = B_TRUE;

/*
 * Filesystem and Snapshot Limits
 * ------------------------------
 *
 * These limits are used to restrict the number of filesystems and/or snapshots
 * that can be created at a given level in the tree or below. A typical
 * use-case is with a delegated dataset where the administrator wants to ensure
 * that a user within the zone is not creating too many additional filesystems
 * or snapshots, even though they're not exceeding their space quota.
 *
 * The filesystem and snapshot counts are stored as extensible properties. This
 * capability is controlled by a feature flag and must be enabled to be used.
 * Once enabled, the feature is not active until the first limit is set. At
 * that point, future operations to create/destroy filesystems or snapshots
 * will validate and update the counts.
 *
 * Because the count properties will not exist before the feature is active,
 * the counts are updated when a limit is first set on an uninitialized
 * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
 * all of the nested filesystems/snapshots. Thus, a new leaf node has a
 * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
 * snapshot count properties on a node indicate uninitialized counts on that
 * node.) When first setting a limit on an uninitialized node, the code starts
 * at the filesystem with the new limit and descends into all sub-filesystems
 * to add the count properties.
 *
 * In practice this is lightweight since a limit is typically set when the
 * filesystem is created and thus has no children. Once valid, changing the
 * limit value won't require a re-traversal since the counts are already valid.
 * When recursively fixing the counts, if a node with a limit is encountered
 * during the descent, the counts are known to be valid and there is no need to
 * descend into that filesystem's children. The counts on filesystems above the
 * one with the new limit will still be uninitialized, unless a limit is
 * eventually set on one of those filesystems. The counts are always
 * recursively updated when a limit is set on a dataset, unless there is
 * already a limit. When a new limit value is set on a filesystem with an
 * existing limit, it is possible for the new limit to be less than the current
 * count at that level since a user who can change the limit is also allowed to
 * exceed the limit.
 *
 * Once the feature is active, then whenever a filesystem or snapshot is
 * created, the code recurses up the tree, validating the new count against the
 * limit at each initialized level. In practice, most levels will not have a
 * limit set. If there is a limit at any initialized level up the tree, the
 * check must pass or the creation will fail. Likewise, when a filesystem or
 * snapshot is destroyed, the counts are recursively adjusted all the way up
 * the initialized nodes in the tree. Renaming a filesystem into a different
 * point in the tree will first validate, then update the counts on each branch
 * up to the common ancestor. A receive will also validate the counts and then
 * update them.
 *
 * An exception to the above behavior is that the limit is not enforced if the
 * user has permission to modify the limit. This is primarily so that
 * recursive snapshots in the global zone always work. We want to prevent a
 * denial-of-service in which a lower level delegated dataset could max out its
 * limit and thus block recursive snapshots from being taken in the global
 * zone. Because of this, it is possible for the snapshot count to be over the
 * limit and snapshots taken in the global zone could cause a lower level
 * dataset to hit or exceed its limit. The administrator taking the global zone
 * recursive snapshot should be aware of this side-effect and behave
 * accordingly. For consistency, the filesystem limit is also not enforced if
 * the user can modify the limit.
 *
 * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
 * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
 * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
 * dsl_dir_init_fs_ss_count().
 */
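/*
 * Worked example (illustrative, not code): suppose "tank/a" has children
 * "tank/a/b" and "tank/a/c", and "tank/a/c" has one snapshot. Setting
 * "filesystem_limit" on "tank/a" initializes the counts so that "tank/a"
 * has a filesystem count of 2 and a snapshot count of 1, while each leaf
 * has counts of 0. Creating "tank/a/d" is then validated against the limit
 * on "tank/a" (and any initialized ancestors) and, if allowed, bumps the
 * filesystem count on "tank/a" to 3.
 */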

static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);

typedef struct ddulrt_arg {
	dsl_dir_t	*ddulrta_dd;
	uint64_t	ddlrta_txg;
} ddulrt_arg_t;

static void
dsl_dir_evict_async(void *dbu)
{
	dsl_dir_t *dd = dbu;
	int t;
	dsl_pool_t *dp __maybe_unused = dd->dd_pool;

	dd->dd_dbuf = NULL;

	for (t = 0; t < TXG_SIZE; t++) {
		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
		ASSERT(dd->dd_tempreserved[t] == 0);
		ASSERT(dd->dd_space_towrite[t] == 0);
	}

	if (dd->dd_parent)
		dsl_dir_async_rele(dd->dd_parent, dd);

	spa_async_close(dd->dd_pool->dp_spa, dd);

	if (dsl_deadlist_is_open(&dd->dd_livelist))
		dsl_dir_livelist_close(dd);

	dsl_prop_fini(dd);
	cv_destroy(&dd->dd_activity_cv);
	mutex_destroy(&dd->dd_activity_lock);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
}

int
dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
    const char *tail, const void *tag, dsl_dir_t **ddp)
{
	dmu_buf_t *dbuf;
	dsl_dir_t *dd;
	dmu_object_info_t doi;
	int err;

	ASSERT(dsl_pool_config_held(dp));

	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
	if (err != 0)
		return (err);
	dd = dmu_buf_get_user(dbuf);

	dmu_object_info_from_db(dbuf, &doi);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));

	if (dd == NULL) {
		dsl_dir_t *winner;

		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
		dd->dd_object = ddobj;
		dd->dd_dbuf = dbuf;
		dd->dd_pool = dp;

		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);
		dsl_prop_init(dd);

		if (dsl_dir_is_zapified(dd)) {
			err = zap_lookup(dp->dp_meta_objset,
			    ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
			    sizeof (uint64_t), 1, &dd->dd_crypto_obj);
			if (err == 0) {
				/* check for on-disk format errata */
				if (dsl_dir_incompatible_encryption_version(
				    dd)) {
					dp->dp_spa->spa_errata =
					    ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
				}
			} else if (err != ENOENT) {
				goto errout;
			}
		}

		if (dsl_dir_phys(dd)->dd_parent_obj) {
			err = dsl_dir_hold_obj(dp,
			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
			    &dd->dd_parent);
			if (err != 0)
				goto errout;
			if (tail) {
#ifdef ZFS_DEBUG
				uint64_t foundobj;

				err = zap_lookup(dp->dp_meta_objset,
				    dsl_dir_phys(dd->dd_parent)->
				    dd_child_dir_zapobj, tail,
				    sizeof (foundobj), 1, &foundobj);
				ASSERT(err || foundobj == ddobj);
#endif
				(void) strlcpy(dd->dd_myname, tail,
				    sizeof (dd->dd_myname));
			} else {
				err = zap_value_search(dp->dp_meta_objset,
				    dsl_dir_phys(dd->dd_parent)->
				    dd_child_dir_zapobj,
				    ddobj, 0, dd->dd_myname);
			}
			if (err != 0)
				goto errout;
		} else {
			(void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa),
			    sizeof (dd->dd_myname));
		}

		if (dsl_dir_is_clone(dd)) {
			dmu_buf_t *origin_bonus;
			dsl_dataset_phys_t *origin_phys;

			/*
			 * We can't open the origin dataset, because
			 * that would require opening this dsl_dir.
			 * Just look at its phys directly instead.
			 */
			err = dmu_bonus_hold(dp->dp_meta_objset,
			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
			    &origin_bonus);
			if (err != 0)
				goto errout;
			origin_phys = origin_bonus->db_data;
			dd->dd_origin_txg =
			    origin_phys->ds_creation_txg;
			dmu_buf_rele(origin_bonus, FTAG);
			if (dsl_dir_is_zapified(dd)) {
				uint64_t obj;
				err = zap_lookup(dp->dp_meta_objset,
				    dd->dd_object, DD_FIELD_LIVELIST,
				    sizeof (uint64_t), 1, &obj);
				if (err == 0)
					dsl_dir_livelist_open(dd, obj);
				else if (err != ENOENT)
					goto errout;
			}
		}

		if (dsl_dir_is_zapified(dd)) {
			inode_timespec_t t = {0};
			(void) zap_lookup(dp->dp_meta_objset, ddobj,
			    DD_FIELD_SNAPSHOTS_CHANGED,
			    sizeof (uint64_t),
			    sizeof (inode_timespec_t) / sizeof (uint64_t),
			    &t);
			dd->dd_snap_cmtime = t;
		}

		dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
		    &dd->dd_dbuf);
		winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
		if (winner != NULL) {
			if (dd->dd_parent)
				dsl_dir_rele(dd->dd_parent, dd);
			if (dsl_deadlist_is_open(&dd->dd_livelist))
				dsl_dir_livelist_close(dd);
			dsl_prop_fini(dd);
			cv_destroy(&dd->dd_activity_cv);
			mutex_destroy(&dd->dd_activity_lock);
			mutex_destroy(&dd->dd_lock);
			kmem_free(dd, sizeof (dsl_dir_t));
			dd = winner;
		} else {
			spa_open_ref(dp->dp_spa, dd);
		}
	}

	/*
	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
	 * holds on the spa. We need the open-to-close holds because
	 * otherwise the spa_refcnt wouldn't change when we open a
	 * dir which the spa also has open, so we could incorrectly
	 * think it was OK to unload/export/destroy the pool. We need
	 * the instantiate-to-evict hold because the dsl_dir_t has a
	 * pointer to the dd_pool, which has a pointer to the spa_t.
	 */
	spa_open_ref(dp->dp_spa, tag);
	ASSERT3P(dd->dd_pool, ==, dp);
	ASSERT3U(dd->dd_object, ==, ddobj);
	ASSERT3P(dd->dd_dbuf, ==, dbuf);
	*ddp = dd;
	return (0);

errout:
	if (dd->dd_parent)
		dsl_dir_rele(dd->dd_parent, dd);
	if (dsl_deadlist_is_open(&dd->dd_livelist))
		dsl_dir_livelist_close(dd);
	dsl_prop_fini(dd);
	cv_destroy(&dd->dd_activity_cv);
	mutex_destroy(&dd->dd_activity_lock);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
	dmu_buf_rele(dbuf, tag);
	return (err);
}

void
dsl_dir_rele(dsl_dir_t *dd, const void *tag)
{
	dprintf_dd(dd, "%s\n", "");
	spa_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
}
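
/*
 * Illustrative hold/release pairing (a sketch, not a call site in this
 * file); dsl_dir_hold_obj() requires that the caller already holds the
 * pool config lock:
 *
 *	dsl_dir_t *dd;
 *	ASSERT(dsl_pool_config_held(dp));
 *	if (dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd) == 0) {
 *		... use dd ...
 *		dsl_dir_rele(dd, FTAG);
 *	}
 */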

/*
 * Remove a reference to the given dsl dir that is being asynchronously
 * released. Async releases occur from a taskq performing eviction of
 * dsl datasets and dirs. This process is identical to a normal release
 * with the exception of using the async API for releasing the reference on
 * the spa.
 */
void
dsl_dir_async_rele(dsl_dir_t *dd, const void *tag)
{
	dprintf_dd(dd, "%s\n", "");
	spa_async_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
}

/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
void
dsl_dir_name(dsl_dir_t *dd, char *buf)
{
	if (dd->dd_parent) {
		dsl_dir_name(dd->dd_parent, buf);
		VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
		    ZFS_MAX_DATASET_NAME_LEN);
	} else {
		buf[0] = '\0';
	}
	if (!MUTEX_HELD(&dd->dd_lock)) {
		/*
		 * recursive mutex so that we can use
		 * dprintf_dd() with dd_lock held
		 */
		mutex_enter(&dd->dd_lock);
		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
		    <, ZFS_MAX_DATASET_NAME_LEN);
		mutex_exit(&dd->dd_lock);
	} else {
		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
		    <, ZFS_MAX_DATASET_NAME_LEN);
	}
}

/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
int
dsl_dir_namelen(dsl_dir_t *dd)
{
	int result = 0;

	if (dd->dd_parent) {
		/* parent's name + 1 for the "/" */
		result = dsl_dir_namelen(dd->dd_parent) + 1;
	}

	if (!MUTEX_HELD(&dd->dd_lock)) {
		/* see dsl_dir_name */
		mutex_enter(&dd->dd_lock);
		result += strlen(dd->dd_myname);
		mutex_exit(&dd->dd_lock);
	} else {
		result += strlen(dd->dd_myname);
	}

	return (result);
}

static int
getcomponent(const char *path, char *component, const char **nextp)
{
	char *p;

	if ((path == NULL) || (path[0] == '\0'))
		return (SET_ERROR(ENOENT));
	/* This would be a good place to reserve some namespace... */
	p = strpbrk(path, "/@");
	if (p && (p[1] == '/' || p[1] == '@')) {
		/* two separators in a row */
		return (SET_ERROR(EINVAL));
	}
	if (p == NULL || p == path) {
		/*
		 * if the first thing is an @ or /, it had better be an
		 * @ and it had better not have any more ats or slashes,
		 * and it had better have something after the @.
		 */
		if (p != NULL &&
		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
			return (SET_ERROR(EINVAL));
		if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		(void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN);
		p = NULL;
	} else if (p[0] == '/') {
		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		(void) strlcpy(component, path, p - path + 1);
		p++;
	} else if (p[0] == '@') {
		/*
		 * if the next separator is an @, there better not be
		 * any more slashes.
		 */
		if (strchr(path, '/'))
			return (SET_ERROR(EINVAL));
		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
			return (SET_ERROR(ENAMETOOLONG));
		(void) strlcpy(component, path, p - path + 1);
	} else {
		panic("invalid p=%p", (void *)p);
	}
	*nextp = p;
	return (0);
}
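
/*
 * Examples of getcomponent() splitting (illustrative):
 *	"tank/home@snap" -> component "tank", *nextp -> "home@snap"
 *	"home@snap"      -> component "home", *nextp -> "@snap"
 *	"@snap"          -> component "@snap", *nextp = NULL
 */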

/*
 * Return the dsl_dir_t, and possibly the last component which couldn't
 * be found in *tail. The name must be in the specified dsl_pool_t. This
 * thread must hold the dp_config_rwlock for the pool. Returns an error if
 * the path is bogus, or if tail==NULL and we couldn't parse the whole name.
 * (*tail)[0] == '@' means that the last component is a snapshot.
 */
int
dsl_dir_hold(dsl_pool_t *dp, const char *name, const void *tag,
    dsl_dir_t **ddp, const char **tailp)
{
	char *buf;
	const char *spaname, *next, *nextnext = NULL;
	int err;
	dsl_dir_t *dd;
	uint64_t ddobj;

	buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	err = getcomponent(name, buf, &next);
	if (err != 0)
		goto error;

	/* Make sure the name is in the specified pool. */
	spaname = spa_name(dp->dp_spa);
	if (strcmp(buf, spaname) != 0) {
		err = SET_ERROR(EXDEV);
		goto error;
	}

	ASSERT(dsl_pool_config_held(dp));

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
	if (err != 0) {
		goto error;
	}

	while (next != NULL) {
		dsl_dir_t *child_dd;
		err = getcomponent(next, buf, &nextnext);
		if (err != 0)
			break;
		ASSERT(next[0] != '\0');
		if (next[0] == '@')
			break;
		dprintf("looking up %s in obj%lld\n",
		    buf, (longlong_t)dsl_dir_phys(dd)->dd_child_dir_zapobj);

		err = zap_lookup(dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
		    buf, sizeof (ddobj), 1, &ddobj);
		if (err != 0) {
			if (err == ENOENT)
				err = 0;
			break;
		}

		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
		if (err != 0)
			break;
		dsl_dir_rele(dd, tag);
		dd = child_dd;
		next = nextnext;
	}

	if (err != 0) {
		dsl_dir_rele(dd, tag);
		goto error;
	}

	/*
	 * It's an error if there's more than one component left, or
	 * tailp==NULL and there's any component left.
	 */
	if (next != NULL &&
	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
		/* bad path name */
		dsl_dir_rele(dd, tag);
		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
		err = SET_ERROR(ENOENT);
	}
	if (tailp != NULL)
		*tailp = next;
	if (err == 0)
		*ddp = dd;
error:
	kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN);
	return (err);
}

/*
 * If the counts are already initialized for this filesystem and its
 * descendants then do nothing, otherwise initialize the counts.
 *
 * The counts on this filesystem, and those below, may be uninitialized due to
 * either the use of a pre-existing pool which did not support the
 * filesystem/snapshot limit feature, or one in which the feature had not yet
 * been enabled.
 *
 * Recursively descend the filesystem tree and update the filesystem/snapshot
 * counts on each filesystem below, then update the cumulative count on the
 * current filesystem. If the filesystem already has a count set on it,
 * then we know that its counts, and the counts on the filesystems below it,
 * are already correct, so we don't have to update this filesystem.
 */
static void
dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
{
	uint64_t my_fs_cnt = 0;
	uint64_t my_ss_cnt = 0;
	dsl_pool_t *dp = dd->dd_pool;
	objset_t *os = dp->dp_meta_objset;
	zap_cursor_t *zc;
	zap_attribute_t *za;
	dsl_dataset_t *ds;

	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
	ASSERT(dsl_pool_config_held(dp));
	ASSERT(dmu_tx_is_syncing(tx));

	dsl_dir_zapify(dd, tx);

	/*
	 * If the filesystem count has already been initialized then we
	 * don't need to recurse down any further.
	 */
	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
		return;

	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	/* Iterate my child dirs */
	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
		dsl_dir_t *chld_dd;
		uint64_t count;

		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
		    &chld_dd));

		/*
		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets.
		 */
		if (chld_dd->dd_myname[0] == '$') {
			dsl_dir_rele(chld_dd, FTAG);
			continue;
		}

		my_fs_cnt++;	/* count this child */

		dsl_dir_init_fs_ss_count(chld_dd, tx);

		VERIFY0(zap_lookup(os, chld_dd->dd_object,
		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
		my_fs_cnt += count;
		VERIFY0(zap_lookup(os, chld_dd->dd_object,
		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
		my_ss_cnt += count;

		dsl_dir_rele(chld_dd, FTAG);
	}
	zap_cursor_fini(zc);
	/* Count my snapshots (we counted children's snapshots above) */
	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));

	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
	    zap_cursor_retrieve(zc, za) == 0;
	    zap_cursor_advance(zc)) {
		/* Don't count temporary snapshots */
		if (za->za_name[0] != '%')
			my_ss_cnt++;
	}
	zap_cursor_fini(zc);

	dsl_dataset_rele(ds, FTAG);

	kmem_free(zc, sizeof (zap_cursor_t));
	kmem_free(za, sizeof (zap_attribute_t));

	/* we're in a sync task, update counts */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
}

static int
dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
{
	char *ddname = (char *)arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	dsl_dir_t *dd;
	int error;

	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
	if (error != 0)
		return (error);

	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	dd = ds->ds_dir;
	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
	    dsl_dir_is_zapified(dd) &&
	    zap_contains(dp->dp_meta_objset, dd->dd_object,
	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EALREADY));
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
{
	char *ddname = (char *)arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	spa_t *spa;

	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));

	spa = dsl_dataset_get_spa(ds);

	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
		/*
		 * Since the feature was not active and we're now setting a
		 * limit, increment the feature-active counter so that the
		 * feature becomes active for the first time.
		 *
		 * We are already in a sync task so we can update the MOS.
		 */
		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
	}

	/*
	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
	 * we need to ensure the counts are correct. Descend down the tree from
	 * this point and update all of the counts to be accurate.
	 */
	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);

	dsl_dataset_rele(ds, FTAG);
}
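
/*
 * Example trigger (illustrative): "zfs set filesystem_limit=100 tank/home"
 * arrives via the property-set ioctl path; the check/sync pair above
 * activates the feature (if needed) and initializes the counts before the
 * limit value itself is stored.
 */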

/*
 * Make sure the feature is enabled and activate it if necessary.
 * Since we're setting a limit, ensure the on-disk counts are valid.
 * This is only called by the ioctl path when setting a limit value.
 *
 * We do not need to validate the new limit, since users who can change the
 * limit are also allowed to exceed the limit.
 */
int
dsl_dir_activate_fs_ss_limit(const char *ddname)
{
	int error;

	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
	    ZFS_SPACE_CHECK_RESERVED);

	if (error == EALREADY)
		error = 0;

	return (error);
}

/*
 * Used to determine if the filesystem_limit or snapshot_limit should be
 * enforced. We allow the limit to be exceeded if the user has permission to
 * write the property value. We pass in the creds that we got in the open
 * context since we will always be the GZ root in syncing context. We also
 * have to handle the case where we are allowed to change the limit on the
 * current dataset, but there may be another limit in the tree above.
 *
 * We can never modify these two properties within a non-global zone. In
 * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
 * can't use that function since we are already holding the dp_config_rwlock.
 * In addition, we already have the dd and dealing with snapshots is
 * simplified in this code.
 */

typedef enum {
	ENFORCE_ALWAYS,
	ENFORCE_NEVER,
	ENFORCE_ABOVE
} enforce_res_t;

static enforce_res_t
dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop,
    cred_t *cr, proc_t *proc)
{
	enforce_res_t enforce = ENFORCE_ALWAYS;
	uint64_t obj;
	dsl_dataset_t *ds;
	uint64_t zoned;
	const char *zonedstr;

	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
	    prop == ZFS_PROP_SNAPSHOT_LIMIT);

#ifdef _KERNEL
	if (crgetzoneid(cr) != GLOBAL_ZONEID)
		return (ENFORCE_ALWAYS);

	/*
	 * We are checking the saved credentials of the user process, which is
	 * not the current process. Note that we can't use secpolicy_zfs(),
	 * because it only works if the cred is that of the current process (on
	 * Linux).
	 */
	if (secpolicy_zfs_proc(cr, proc) == 0)
		return (ENFORCE_NEVER);
#else
	(void) proc;
#endif

	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
		return (ENFORCE_ALWAYS);

	ASSERT(dsl_pool_config_held(dd->dd_pool));

	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
		return (ENFORCE_ALWAYS);

	zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED);
	if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) {
		/* Only root can access zoned fs's from the GZ */
		enforce = ENFORCE_ALWAYS;
	} else {
		if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
			enforce = ENFORCE_ABOVE;
	}

	dsl_dataset_rele(ds, FTAG);
	return (enforce);
}

/*
 * Check if adding additional child filesystem(s) would exceed any filesystem
 * limits or adding additional snapshot(s) would exceed any snapshot limits.
 * The prop argument indicates which limit to check.
 *
 * Note that all filesystem limits up to the root (or the highest
 * initialized) filesystem or the given ancestor must be satisfied.
 */
int
dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
    dsl_dir_t *ancestor, cred_t *cr, proc_t *proc)
{
	objset_t *os = dd->dd_pool->dp_meta_objset;
	uint64_t limit, count;
	const char *count_prop;
	enforce_res_t enforce;
	int err = 0;

	ASSERT(dsl_pool_config_held(dd->dd_pool));
	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
	    prop == ZFS_PROP_SNAPSHOT_LIMIT);

	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
		/*
		 * We don't enforce the limit for temporary snapshots. This is
		 * indicated by a NULL cred_t argument.
		 */
		if (cr == NULL)
			return (0);

		count_prop = DD_FIELD_SNAPSHOT_COUNT;
	} else {
		count_prop = DD_FIELD_FILESYSTEM_COUNT;
	}

	/*
	 * If we're allowed to change the limit, don't enforce the limit;
	 * e.g. this can happen if a snapshot is taken by an administrative
	 * user in the global zone (i.e. a recursive snapshot by root).
	 * However, we must handle the case of delegated permissions where we
	 * are allowed to change the limit on the current dataset, but there
	 * is another limit in the tree above.
	 */
	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc);
	if (enforce == ENFORCE_NEVER)
		return (0);

	/*
	 * e.g. if renaming a dataset with no snapshots, count adjustment
	 * is 0.
	 */
	if (delta == 0)
		return (0);

	/*
	 * If an ancestor has been provided, stop checking the limit once we
	 * hit that dir. We need this during rename so that we don't overcount
	 * the check once we recurse up to the common ancestor.
	 */
	if (ancestor == dd)
		return (0);

	/*
	 * If we hit an uninitialized node while recursing up the tree, we can
	 * stop since we know there is no limit here (or above). The counts are
	 * not valid on this node and we know we won't touch this node's
	 * counts.
	 */
	if (!dsl_dir_is_zapified(dd))
		return (0);
	err = zap_lookup(os, dd->dd_object,
	    count_prop, sizeof (count), 1, &count);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);

	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
	    B_FALSE);
	if (err != 0)
		return (err);

	/* Is there a limit which we've hit? */
	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
		return (SET_ERROR(EDQUOT));

	if (dd->dd_parent != NULL)
		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
		    ancestor, cr, proc);

	return (err);
}
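
/*
 * Example call to the function above (illustrative): when creating one new
 * filesystem under a directory "dd", the creation path checks
 * dsl_fs_ss_limit_check(dd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, cr, proc)
 * and fails with EDQUOT if any initialized ancestor's limit would be
 * exceeded.
 */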

/*
 * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
 * parents. When a new filesystem/snapshot is created, increment the count on
 * all parents, and when a filesystem/snapshot is destroyed, decrement the
 * count.
 */
void
dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
    dmu_tx_t *tx)
{
	int err;
	objset_t *os = dd->dd_pool->dp_meta_objset;
	uint64_t count;

	ASSERT(dsl_pool_config_held(dd->dd_pool));
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);

	/*
	 * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets.
	 */
	if (dd->dd_myname[0] == '$' && strcmp(prop,
	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
		return;
	}

	/*
	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
	 */
	if (delta == 0)
		return;

	/*
	 * If we hit an uninitialized node while recursing up the tree, we can
	 * stop since we know the counts are not valid on this node and we
	 * know we shouldn't touch this node's counts. An uninitialized count
	 * on the node indicates that either the feature has not yet been
	 * activated or there are no limits on this part of the tree.
	 */
	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
	    prop, sizeof (count), 1, &count)) == ENOENT)
		return;
	VERIFY0(err);

	count += delta;
	/* Use a signed verify to make sure we're not negative. */
	VERIFY3S(count, >=, 0);

	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
	    tx));

	/* Roll up this additional count into our ancestors */
	if (dd->dd_parent != NULL)
		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
}

uint64_t
dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
    dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t ddobj;
	dsl_dir_phys_t *ddphys;
	dmu_buf_t *dbuf;

	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
	if (pds) {
		VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
		    name, sizeof (uint64_t), 1, &ddobj, tx));
	} else {
		/* it's the root dir */
		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
	}
	VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	ddphys = dbuf->db_data;

	ddphys->dd_creation_time = gethrestime_sec();
	if (pds) {
		ddphys->dd_parent_obj = pds->dd_object;

		/* update the filesystem counts */
		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
	}
	ddphys->dd_props_zapobj = zap_create(mos,
	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
	ddphys->dd_child_dir_zapobj = zap_create(mos,
	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;

	dmu_buf_rele(dbuf, FTAG);

	return (ddobj);
}

boolean_t
dsl_dir_is_clone(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_origin_obj &&
	    (dd->dd_pool->dp_origin_snap == NULL ||
	    dsl_dir_phys(dd)->dd_origin_obj !=
	    dd->dd_pool->dp_origin_snap->ds_object));
}

uint64_t
dsl_dir_get_used(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_bytes);
}

uint64_t
dsl_dir_get_compressed(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_compressed_bytes);
}

uint64_t
dsl_dir_get_quota(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_quota);
}

uint64_t
dsl_dir_get_reservation(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_reserved);
}
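
/*
 * Example: 1000 bytes of logical (uncompressed) data stored in 400 bytes
 * on disk yields 1000 * 100 / 400 = 250, i.e. a 2.50x compression ratio.
 */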
uint64_t
dsl_dir_get_compressratio(dsl_dir_t *dd)
{
	/* a fixed point number, 100x the ratio */
	return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
	    (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
	    dsl_dir_phys(dd)->dd_compressed_bytes));
}

uint64_t
dsl_dir_get_logicalused(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
}

uint64_t
dsl_dir_get_usedsnap(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
}

uint64_t
dsl_dir_get_usedds(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
}

uint64_t
dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
}

uint64_t
dsl_dir_get_usedchild(dsl_dir_t *dd)
{
	return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
}

void
dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
{
	dsl_dataset_t *ds;
	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));

	dsl_dataset_name(ds, buf);

	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
{
	if (dsl_dir_is_zapified(dd)) {
		objset_t *os = dd->dd_pool->dp_meta_objset;
		return (zap_lookup(os, dd->dd_object,
		    DD_FIELD_FILESYSTEM_COUNT, sizeof (*count), 1, count));
	} else {
		return (SET_ERROR(ENOENT));
	}
}

int
dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
{
	if (dsl_dir_is_zapified(dd)) {
		objset_t *os = dd->dd_pool->dp_meta_objset;
		return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
		    sizeof (*count), 1, count));
	} else {
		return (SET_ERROR(ENOENT));
	}
}

void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
	mutex_enter(&dd->dd_lock);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
	    dsl_dir_get_quota(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
	    dsl_dir_get_reservation(dd));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
	    dsl_dir_get_logicalused(dd));
	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
		    dsl_dir_get_usedsnap(dd));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
		    dsl_dir_get_usedds(dd));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
		    dsl_dir_get_usedrefreserv(dd));
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
		    dsl_dir_get_usedchild(dd));
	}
	mutex_exit(&dd->dd_lock);

	uint64_t count;
	if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
		    count);
	}
	if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
		    count);
	}

	if (dsl_dir_is_clone(dd)) {
		char buf[ZFS_MAX_DATASET_NAME_LEN];
		dsl_dir_get_origin(dd, buf);
		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
	}
}

void
dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;

	ASSERT(dsl_dir_phys(dd));

	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(dd->dd_dbuf, dd);
	}
}
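
/*
 * Worked example for parent_delta() below: with dd_reserved = 100, going
 * from used = 80 to used = 90 changes nothing from the parent's point of
 * view (MAX(90, 100) - MAX(80, 100) = 0), since the reservation already
 * charged the parent for 100; going from used = 95 to used = 105 charges
 * the parent 5 (MAX(105, 100) - MAX(95, 100) = 5).
 */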
static int64_t
parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
{
	uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
	uint64_t new_accounted =
	    MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
	return (new_accounted - old_accounted);
}

void
dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));

	mutex_enter(&dd->dd_lock);
	ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]);
	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", (u_longlong_t)tx->tx_txg,
	    (u_longlong_t)dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024);
	dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0;
	mutex_exit(&dd->dd_lock);

	/* release the hold from dsl_dir_dirty */
	dmu_buf_rele(dd->dd_dbuf, dd);
}

static uint64_t
dsl_dir_space_towrite(dsl_dir_t *dd)
{
	uint64_t space = 0;

	ASSERT(MUTEX_HELD(&dd->dd_lock));

	for (int i = 0; i < TXG_SIZE; i++)
		space += dd->dd_space_towrite[i & TXG_MASK];

	return (space);
}

/*
 * How much space would dd have available if ancestor had delta applied
 * to it? If ondiskonly is set, we're only interested in what's
 * on-disk, not estimated pending changes.
 */
uint64_t
dsl_dir_space_available(dsl_dir_t *dd,
    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
{
	uint64_t parentspace, myspace, quota, used;

	/*
	 * If there are no restrictions otherwise, assume we have
	 * unlimited space available.
	 */
	quota = UINT64_MAX;
	parentspace = UINT64_MAX;

	if (dd->dd_parent != NULL) {
		parentspace = dsl_dir_space_available(dd->dd_parent,
		    ancestor, delta, ondiskonly);
	}

	mutex_enter(&dd->dd_lock);
	if (dsl_dir_phys(dd)->dd_quota != 0)
		quota = dsl_dir_phys(dd)->dd_quota;
	used = dsl_dir_phys(dd)->dd_used_bytes;
	if (!ondiskonly)
		used += dsl_dir_space_towrite(dd);

	if (dd->dd_parent == NULL) {
		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
		    ZFS_SPACE_CHECK_NORMAL);
		quota = MIN(quota, poolsize);
	}

	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
		/*
		 * We have some space reserved, in addition to what our
		 * parent gave us.
		 */
		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
	}

	if (dd == ancestor) {
		ASSERT(delta <= 0);
		ASSERT(used >= -delta);
		used += delta;
		if (parentspace != UINT64_MAX)
			parentspace -= delta;
	}

	if (used > quota) {
		/* over quota */
		myspace = 0;
	} else {
		/*
		 * the lesser of the space provided by our parent and
		 * the space left in our quota
		 */
		myspace = MIN(parentspace, quota - used);
	}

	mutex_exit(&dd->dd_lock);

	return (myspace);
}

struct tempreserve {
	list_node_t	tr_node;
	dsl_dir_t	*tr_ds;
	uint64_t	tr_size;
};
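
/*
 * Illustrative numbers for the quota slop applied in
 * dsl_dir_tempreserve_impl() below: with a 32G quota, ext_quota is
 * quota >> 5 = 1G, so on-disk usage plus in-flight estimates may grow to
 * 33G before new reservations fail with ERESTART, while on-disk usage at
 * or beyond 32G fails immediately.
 */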
static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
    boolean_t ignorequota, list_t *tr_list,
    dmu_tx_t *tx, boolean_t first)
{
	uint64_t txg;
	uint64_t quota;
	struct tempreserve *tr;
	int retval;
	uint64_t ext_quota;
	uint64_t ref_rsrv;

top_of_function:
	txg = tx->tx_txg;
	retval = EDQUOT;
	ref_rsrv = 0;

	ASSERT3U(txg, !=, 0);
	ASSERT3S(asize, >, 0);

	mutex_enter(&dd->dd_lock);

	/*
	 * Check against the dsl_dir's quota. We don't add in the delta
	 * when checking for over-quota because they get one free hit.
	 */
	uint64_t est_inflight = dsl_dir_space_towrite(dd);
	for (int i = 0; i < TXG_SIZE; i++)
		est_inflight += dd->dd_tempreserved[i];
	uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;

	/*
	 * On the first iteration, fetch the dataset's used-on-disk and
	 * refreservation values. Also, if this is not a net-free
	 * transaction, test whether allocating this space would exceed
	 * the dataset's refquota.
	 */
	if (first && tx->tx_objset) {
		int error;
		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

		error = dsl_dataset_check_quota(ds, !netfree,
		    asize, est_inflight, &used_on_disk, &ref_rsrv);
		if (error != 0) {
			mutex_exit(&dd->dd_lock);
			DMU_TX_STAT_BUMP(dmu_tx_quota);
			return (error);
		}
	}

	/*
	 * If this transaction will result in a net free of space,
	 * we want to let it through.
	 */
	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0 ||
	    (tx->tx_objset && dmu_objset_type(tx->tx_objset) == DMU_OST_ZVOL &&
	    zvol_enforce_quotas == B_FALSE))
		quota = UINT64_MAX;
	else
		quota = dsl_dir_phys(dd)->dd_quota;

	/*
	 * Adjust the quota against the actual pool size at the root
	 * minus any outstanding deferred frees.
	 * To ensure that it's possible to remove files from a full
	 * pool without inducing transient overcommits, we throttle
	 * netfree transactions against a quota that is slightly larger,
	 * but still within the pool's allocation slop.  In cases where
	 * we're very close to full, this will allow a steady trickle of
	 * removes to get through.
	 */
	if (dd->dd_parent == NULL) {
		uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
		    (netfree) ?
		    ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);

		if (avail < quota) {
			quota = avail;
			retval = SET_ERROR(ENOSPC);
		}
	}

	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes
	 * or deferred frees (which may free up space for us).
	 */
	ext_quota = quota >> 5;
	if (quota == UINT64_MAX)
		ext_quota = 0;

	if (used_on_disk >= quota) {
		if (retval == ENOSPC && (used_on_disk - quota) <
		    dsl_pool_deferred_space(dd->dd_pool)) {
			retval = SET_ERROR(ERESTART);
		}
		/* Quota exceeded */
		mutex_exit(&dd->dd_lock);
		DMU_TX_STAT_BUMP(dmu_tx_quota);
		return (retval);
	} else if (used_on_disk + est_inflight >= quota + ext_quota) {
		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
		    "quota=%lluK tr=%lluK\n",
		    (u_longlong_t)used_on_disk>>10,
		    (u_longlong_t)est_inflight>>10,
		    (u_longlong_t)quota>>10, (u_longlong_t)asize>>10);
		mutex_exit(&dd->dd_lock);
		DMU_TX_STAT_BUMP(dmu_tx_quota);
		return (SET_ERROR(ERESTART));
	}

	/* We need to up our estimated delta before dropping dd_lock */
	dd->dd_tempreserved[txg & TXG_MASK] += asize;

	uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
	    asize - ref_rsrv);
	mutex_exit(&dd->dd_lock);

	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
	tr->tr_ds = dd;
	tr->tr_size = asize;
	list_insert_tail(tr_list, tr);

	/* see if it's OK with our parent */
	if (dd->dd_parent != NULL && parent_rsrv != 0) {
		/*
		 * Recurse on our parent without recursion. This has been
		 * observed to be potentially large stack usage even within
		 * the test suite. Largest seen stack was 7632 bytes on linux.
		 */

		dd = dd->dd_parent;
		asize = parent_rsrv;
		ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
		first = B_FALSE;
		goto top_of_function;
	}

	return (0);
}

/*
 * Reserve space in this dsl_dir, to be used in this tx's txg.
 * After the space has been dirtied (and dsl_dir_willuse_space()
 * has been called), the reservation should be canceled, using
 * dsl_dir_tempreserve_clear().
 */
int
dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
    boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
{
	int err;
	list_t *tr_list;

	if (asize == 0) {
		*tr_cookiep = NULL;
		return (0);
	}

	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
	list_create(tr_list, sizeof (struct tempreserve),
	    offsetof(struct tempreserve, tr_node));
	ASSERT3S(asize, >, 0);

	err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
	if (err == 0) {
		struct tempreserve *tr;

		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
		tr->tr_size = lsize;
		list_insert_tail(tr_list, tr);
	} else {
		if (err == EAGAIN) {
			/*
			 * If arc_memory_throttle() detected that pageout
			 * is running and we are low on memory, we delay new
			 * non-pageout transactions to give pageout an
			 * advantage.
			 *
			 * It is unfortunate to be delaying while the caller's
			 * locks are held.
			 */
			txg_delay(dd->dd_pool, tx->tx_txg,
			    MSEC2NSEC(10), MSEC2NSEC(10));
			err = SET_ERROR(ERESTART);
		}
	}

	if (err == 0) {
		err = dsl_dir_tempreserve_impl(dd, asize, netfree,
		    B_FALSE, tr_list, tx, B_TRUE);
	}

	if (err != 0)
		dsl_dir_tempreserve_clear(tr_list, tx);
	else
		*tr_cookiep = tr_list;

	return (err);
}
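
/*
 * Typical usage of the reservation interfaces (a sketch, not a call site
 * in this file):
 *
 *	void *tr_cookie;
 *	int err = dsl_dir_tempreserve_space(dd, lsize, asize, B_FALSE,
 *	    &tr_cookie, tx);
 *	if (err == 0) {
 *		... dirty the data, dsl_dir_willuse_space(), etc ...
 *		dsl_dir_tempreserve_clear(tr_cookie, tx);
 *	}
 */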

/*
 * Clear a temporary reservation that we previously made with
 * dsl_dir_tempreserve_space().
 */
void
dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
{
	int txgidx = tx->tx_txg & TXG_MASK;
	list_t *tr_list = tr_cookie;
	struct tempreserve *tr;

	ASSERT3U(tx->tx_txg, !=, 0);

	if (tr_cookie == NULL)
		return;

	while ((tr = list_remove_head(tr_list)) != NULL) {
		if (tr->tr_ds) {
			mutex_enter(&tr->tr_ds->dd_lock);
			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
			    tr->tr_size);
			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
			mutex_exit(&tr->tr_ds->dd_lock);
		} else {
			arc_tempreserve_clear(tr->tr_size);
		}
		kmem_free(tr, sizeof (struct tempreserve));
	}

	kmem_free(tr_list, sizeof (list_t));
}

/*
 * This should be called from open context when we think we're going to write
 * or free space, for example when dirtying data. Be conservative; it's okay
 * to write less space or free more, but we don't want to write more or free
 * less than the amount specified.
 *
 * NOTE: The behavior of this function is identical to the Illumos / FreeBSD
 * version however it has been adjusted to use an iterative rather than
 * recursive algorithm to minimize stack usage.
 */
void
dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
	int64_t parent_space;
	uint64_t est_used;

	do {
		mutex_enter(&dd->dd_lock);
		if (space > 0)
			dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;

		est_used = dsl_dir_space_towrite(dd) +
		    dsl_dir_phys(dd)->dd_used_bytes;
		parent_space = parent_delta(dd, est_used, space);
		mutex_exit(&dd->dd_lock);

		/* Make sure that we clean up dd_space_to* */
		dsl_dir_dirty(dd, tx);

		dd = dd->dd_parent;
		space = parent_space;
	} while (space && dd);
}

/* call from syncing context when we actually write/free space for this dd */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
	int64_t accounted_delta;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(type < DD_USED_NUM);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/*
	 * dsl_dataset_set_refreservation_sync_impl() calls this with
	 * dd_lock held, so that it can atomically update
	 * ds->ds_reserved and the dsl_dir accounting, so that
	 * dsl_dataset_check_quota() can see dataset and dir accounting
	 * consistently.
	 */
	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
	if (needlock)
		mutex_enter(&dd->dd_lock);
	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
	ASSERT(uncompressed >= 0 ||
	    ddp->dd_uncompressed_bytes >= -uncompressed);
	ddp->dd_used_bytes += used;
	ddp->dd_uncompressed_bytes += uncompressed;
	ddp->dd_compressed_bytes += compressed;

	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used);
		ddp->dd_used_breakdown[type] += used;
#ifdef ZFS_DEBUG
		{
			dd_used_t t;
			uint64_t u = 0;
			for (t = 0; t < DD_USED_NUM; t++)
				u += ddp->dd_used_breakdown[t];
			ASSERT3U(u, ==, ddp->dd_used_bytes);
		}
#endif
	}
	if (needlock)
		mutex_exit(&dd->dd_lock);

	if (dd->dd_parent != NULL) {
		dsl_dir_diduse_transfer_space(dd->dd_parent,
		    accounted_delta, compressed, uncompressed,
		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
	}
}

void
dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(oldtype < DD_USED_NUM);
	ASSERT(newtype < DD_USED_NUM);

	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
	if (delta == 0 ||
	    !(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN))
		return;

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	mutex_enter(&dd->dd_lock);
	ASSERT(delta > 0 ?
	    ddp->dd_used_breakdown[oldtype] >= delta :
	    ddp->dd_used_breakdown[newtype] >= -delta);
	ASSERT(ddp->dd_used_bytes >= ABS(delta));
	ddp->dd_used_breakdown[oldtype] -= delta;
	ddp->dd_used_breakdown[newtype] += delta;
	mutex_exit(&dd->dd_lock);
}

void
dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
    int64_t compressed, int64_t uncompressed, int64_t tonew,
    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
	int64_t accounted_delta;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(oldtype < DD_USED_NUM);
	ASSERT(newtype < DD_USED_NUM);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
	ASSERT(uncompressed >= 0 ||
	    ddp->dd_uncompressed_bytes >= -uncompressed);
	ddp->dd_used_bytes += used;
	ddp->dd_uncompressed_bytes += uncompressed;
	ddp->dd_compressed_bytes += compressed;

	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		ASSERT(tonew - used <= 0 ||
		    ddp->dd_used_breakdown[oldtype] >= tonew - used);
		ASSERT(tonew >= 0 ||
		    ddp->dd_used_breakdown[newtype] >= -tonew);
		ddp->dd_used_breakdown[oldtype] -= tonew - used;
		ddp->dd_used_breakdown[newtype] += tonew;
#ifdef ZFS_DEBUG
		{
			dd_used_t t;
			uint64_t u = 0;
			for (t = 0; t < DD_USED_NUM; t++)
				u += ddp->dd_used_breakdown[t];
			ASSERT3U(u, ==, ddp->dd_used_bytes);
		}
#endif
	}
	mutex_exit(&dd->dd_lock);

	if (dd->dd_parent != NULL) {
		dsl_dir_diduse_transfer_space(dd->dd_parent,
		    accounted_delta, compressed, uncompressed,
		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
	}
}

typedef struct dsl_dir_set_qr_arg {
	const char *ddsqra_name;
	zprop_source_t ddsqra_source;
	uint64_t ddsqra_value;
} dsl_dir_set_qr_arg_t;

static int
dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;
	uint64_t towrite, newval;

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);

	error = dsl_prop_predict(ds->ds_dir, "quota",
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	if (newval == 0) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	mutex_enter(&ds->ds_dir->dd_lock);
	/*
	 * If we are doing the preliminary check in open context, and
	 * there are pending changes, then don't fail it, since the
	 * pending changes could under-estimate the amount of space to be
	 * freed up.
	 */
	towrite = dsl_dir_space_towrite(ds->ds_dir);
	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
		error = SET_ERROR(ENOSPC);
	}
	mutex_exit(&ds->ds_dir->dd_lock);
	dsl_dataset_rele(ds, FTAG);
	return (error);
}

static void
dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	uint64_t newval;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
		    &ddsqra->ddsqra_value, tx);

		VERIFY0(dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
	} else {
		newval = ddsqra->ddsqra_value;
		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
	}

	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
	mutex_enter(&ds->ds_dir->dd_lock);
	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
	mutex_exit(&ds->ds_dir->dd_lock);
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
{
	dsl_dir_set_qr_arg_t ddsqra;

	ddsqra.ddsqra_name = ddname;
	ddsqra.ddsqra_source = source;
	ddsqra.ddsqra_value = quota;

	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
	    dsl_dir_set_quota_sync, &ddsqra, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
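
/*
 * For example, "zfs set quota=10G tank/home" ends up in
 * dsl_dir_set_quota() above; the check/sync pair runs as a sync task so
 * the new quota is validated against the directory's current usage and
 * reservation in syncing context.
 */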
static int
dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	dsl_dir_t *dd;
	uint64_t newval, used, avail;
	int error;

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);
	dd = ds->ds_dir;

	/*
	 * If we are doing the preliminary check in open context, the
	 * space estimates may be inaccurate.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	error = dsl_prop_predict(ds->ds_dir,
	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	mutex_enter(&dd->dd_lock);
	used = dsl_dir_phys(dd)->dd_used_bytes;
	mutex_exit(&dd->dd_lock);

	if (dd->dd_parent) {
		avail = dsl_dir_space_available(dd->dd_parent,
		    NULL, 0, FALSE);
	} else {
		avail = dsl_pool_adjustedsize(dd->dd_pool,
		    ZFS_SPACE_CHECK_NORMAL) - used;
	}

	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
		uint64_t delta = MAX(used, newval) -
		    MAX(used, dsl_dir_phys(dd)->dd_reserved);

		if (delta > avail ||
		    (dsl_dir_phys(dd)->dd_quota > 0 &&
		    newval > dsl_dir_phys(dd)->dd_quota))
			error = SET_ERROR(ENOSPC);
	}

	dsl_dataset_rele(ds, FTAG);
	return (error);
}

void
dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
{
	uint64_t used;
	int64_t delta;

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	used = dsl_dir_phys(dd)->dd_used_bytes;
	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
	dsl_dir_phys(dd)->dd_reserved = value;

	if (dd->dd_parent != NULL) {
		/* Roll up this additional usage into our ancestors */
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
		    delta, 0, 0, tx);
	}
	mutex_exit(&dd->dd_lock);
}

static void
dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	uint64_t newval;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
		dsl_prop_set_sync_impl(ds,
		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
		    &ddsqra->ddsqra_value, tx);

		VERIFY0(dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
	} else {
		newval = ddsqra->ddsqra_value;
		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
		    (longlong_t)newval);
	}

	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
    uint64_t reservation)
{
	dsl_dir_set_qr_arg_t ddsqra;

	ddsqra.ddsqra_name = ddname;
	ddsqra.ddsqra_source = source;
	ddsqra.ddsqra_value = reservation;

	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
	    dsl_dir_set_reservation_sync, &ddsqra, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
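
/*
 * Example: for "tank/a/b" and "tank/a/c" the closest common ancestor is
 * "tank/a"; for "tank/a" and "tank/a/b" it is "tank/a" itself.
 */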
static dsl_dir_t *
closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
{
	for (; ds1; ds1 = ds1->dd_parent) {
		dsl_dir_t *dd;
		for (dd = ds2; dd; dd = dd->dd_parent) {
			if (ds1 == dd)
				return (dd);
		}
	}
	return (NULL);
}

/*
 * If delta is applied to dd, how much of that delta would be applied to
 * ancestor? Syncing context only.
 */
static int64_t
would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
{
	if (dd == ancestor)
		return (delta);

	mutex_enter(&dd->dd_lock);
	delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
	mutex_exit(&dd->dd_lock);
	return (would_change(dd->dd_parent, delta, ancestor));
}

typedef struct dsl_dir_rename_arg {
	const char *ddra_oldname;
	const char *ddra_newname;
	cred_t *ddra_cred;
	proc_t *ddra_proc;
} dsl_dir_rename_arg_t;

typedef struct dsl_valid_rename_arg {
	int char_delta;
	int nest_delta;
} dsl_valid_rename_arg_t;

static int
dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	(void) dp;
	dsl_valid_rename_arg_t *dvra = arg;
	char namebuf[ZFS_MAX_DATASET_NAME_LEN];

	dsl_dataset_name(ds, namebuf);

	ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
	    <, ZFS_MAX_DATASET_NAME_LEN);
	int namelen = strlen(namebuf) + dvra->char_delta;
	int depth = get_dataset_depth(namebuf) + dvra->nest_delta;

	if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));
	if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
		return (SET_ERROR(ENAMETOOLONG));
	return (0);
}
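
/*
 * Sync-task check function for rename. Verifies that the source dir and
 * the new parent exist in the same pool, that the new parent is a ZFS
 * filesystem with no existing dataset of the new name, that no child
 * name would exceed the length or nesting limits, and, when the parent
 * changes, that the space, filesystem/snapshot count, encryption, and
 * hierarchy constraints all hold.
 */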
static int
dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_rename_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *dd, *newparent;
	dsl_valid_rename_arg_t dvra;
	dsl_dataset_t *parentds;
	objset_t *parentos;
	const char *mynewname;
	int error;

	/* target dir should exist */
	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
	if (error != 0)
		return (error);

	/* new parent should exist */
	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
	    &newparent, &mynewname);
	if (error != 0) {
		dsl_dir_rele(dd, FTAG);
		return (error);
	}

	/* can't rename to a different pool */
	if (dd->dd_pool != newparent->dd_pool) {
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* new name should not already exist */
	if (mynewname == NULL) {
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (SET_ERROR(EEXIST));
	}

	/* can't rename below anything but filesystems (e.g. no ZVOLs) */
	error = dsl_dataset_hold_obj(newparent->dd_pool,
	    dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
	if (error != 0) {
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (error);
	}
	error = dmu_objset_from_ds(parentds, &parentos);
	if (error != 0) {
		dsl_dataset_rele(parentds, FTAG);
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (error);
	}
	if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
		dsl_dataset_rele(parentds, FTAG);
		dsl_dir_rele(newparent, FTAG);
		dsl_dir_rele(dd, FTAG);
		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
	}
	dsl_dataset_rele(parentds, FTAG);

	ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
	    <, ZFS_MAX_DATASET_NAME_LEN);
	ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
	    <, ZFS_MAX_DATASET_NAME_LEN);
	dvra.char_delta = strlen(ddra->ddra_newname)
	    - strlen(ddra->ddra_oldname);
	dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
	    - get_dataset_depth(ddra->ddra_oldname);

	/* if the name length is growing, validate child name lengths */
	if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
		    &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
		if (error != 0) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (error);
		}
	}

	if (dmu_tx_is_syncing(tx)) {
		if (spa_feature_is_active(dp->dp_spa,
		    SPA_FEATURE_FS_SS_LIMIT)) {
			/*
			 * Although this is the check function and we don't
			 * normally make on-disk changes in check functions,
			 * we need to do that here.
			 *
			 * Ensure this portion of the tree's counts have been
			 * initialized in case the new parent has limits set.
			 */
			dsl_dir_init_fs_ss_count(dd, tx);
		}
	}
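
	/*
	 * The remaining checks apply only when the rename moves dd under
	 * a different parent dir.
	 */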
	if (newparent != dd->dd_parent) {
		/* is there enough space? */
		uint64_t myspace =
		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
		    dsl_dir_phys(dd)->dd_reserved);
		objset_t *os = dd->dd_pool->dp_meta_objset;
		uint64_t fs_cnt = 0;
		uint64_t ss_cnt = 0;

		if (dsl_dir_is_zapified(dd)) {
			int err;

			err = zap_lookup(os, dd->dd_object,
			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
			    &fs_cnt);
			if (err != ENOENT && err != 0) {
				dsl_dir_rele(newparent, FTAG);
				dsl_dir_rele(dd, FTAG);
				return (err);
			}

			/*
			 * have to add 1 for the filesystem itself that we're
			 * moving
			 */
			fs_cnt++;

			err = zap_lookup(os, dd->dd_object,
			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
			    &ss_cnt);
			if (err != ENOENT && err != 0) {
				dsl_dir_rele(newparent, FTAG);
				dsl_dir_rele(dd, FTAG);
				return (err);
			}
		}

		/* check for encryption errors */
		error = dsl_dir_rename_crypt_check(dd, newparent);
		if (error != 0) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (SET_ERROR(EACCES));
		}

		/* no rename into our descendant */
		if (closest_common_ancestor(dd, newparent) == dd) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = dsl_dir_transfer_possible(dd->dd_parent,
		    newparent, fs_cnt, ss_cnt, myspace,
		    ddra->ddra_cred, ddra->ddra_proc);
		if (error != 0) {
			dsl_dir_rele(newparent, FTAG);
			dsl_dir_rele(dd, FTAG);
			return (error);
		}
	}

	dsl_dir_rele(newparent, FTAG);
	dsl_dir_rele(dd, FTAG);
	return (0);
}
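
/*
 * Sync-task function for rename. Logs the rename, migrates the space
 * accounting and filesystem/snapshot counts from the old parent to the
 * new one when the parent changes, then rewrites the child-dir ZAP
 * entries and dd_parent_obj to reparent dd under its new name.
 */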
static void
dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_rename_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *dd, *newparent;
	const char *mynewname;
	objset_t *mos = dp->dp_meta_objset;

	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
	    &mynewname));

	ASSERT3P(mynewname, !=, NULL);

	/* Log this before we change the name. */
	spa_history_log_internal_dd(dd, "rename", tx,
	    "-> %s", ddra->ddra_newname);

	if (newparent != dd->dd_parent) {
		objset_t *os = dd->dd_pool->dp_meta_objset;
		uint64_t fs_cnt = 0;
		uint64_t ss_cnt = 0;

		/*
		 * We already made sure the dd counts were initialized in the
		 * check function.
		 */
		if (spa_feature_is_active(dp->dp_spa,
		    SPA_FEATURE_FS_SS_LIMIT)) {
			VERIFY0(zap_lookup(os, dd->dd_object,
			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
			    &fs_cnt));
			/* add 1 for the filesystem itself that we're moving */
			fs_cnt++;

			VERIFY0(zap_lookup(os, dd->dd_object,
			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
			    &ss_cnt));
		}

		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
		    DD_FIELD_FILESYSTEM_COUNT, tx);
		dsl_fs_ss_count_adjust(newparent, fs_cnt,
		    DD_FIELD_FILESYSTEM_COUNT, tx);

		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
		    DD_FIELD_SNAPSHOT_COUNT, tx);
		dsl_fs_ss_count_adjust(newparent, ss_cnt,
		    DD_FIELD_SNAPSHOT_COUNT, tx);

		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
		    -dsl_dir_phys(dd)->dd_used_bytes,
		    -dsl_dir_phys(dd)->dd_compressed_bytes,
		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
		    dsl_dir_phys(dd)->dd_used_bytes,
		    dsl_dir_phys(dd)->dd_compressed_bytes,
		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);

		if (dsl_dir_phys(dd)->dd_reserved >
		    dsl_dir_phys(dd)->dd_used_bytes) {
			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
			    dsl_dir_phys(dd)->dd_used_bytes;

			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
			    -unused_rsrv, 0, 0, tx);
			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
			    unused_rsrv, 0, 0, tx);
		}
	}

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/* remove from old parent zapobj */
	VERIFY0(zap_remove(mos,
	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
	    dd->dd_myname, tx));

	(void) strlcpy(dd->dd_myname, mynewname,
	    sizeof (dd->dd_myname));
	dsl_dir_rele(dd->dd_parent, dd);
	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
	VERIFY0(dsl_dir_hold_obj(dp,
	    newparent->dd_object, NULL, dd, &dd->dd_parent));

	/* add to new parent zapobj */
	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
	    dd->dd_myname, 8, 1, &dd->dd_object, tx));

	/* TODO: A rename callback to avoid these layering violations. */
	zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
	zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
	    ddra->ddra_newname, B_TRUE);

	dsl_prop_notify_all(dd);

	dsl_dir_rele(newparent, FTAG);
	dsl_dir_rele(dd, FTAG);
}
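
/*
 * Rename and/or reparent the dir named oldname to newname. Runs as a
 * sync task; the heavy lifting is done by dsl_dir_rename_check() and
 * dsl_dir_rename_sync() above.
 */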
int
dsl_dir_rename(const char *oldname, const char *newname)
{
	dsl_dir_rename_arg_t ddra;

	ddra.ddra_oldname = oldname;
	ddra.ddra_newname = newname;
	ddra.ddra_cred = CRED();
	ddra.ddra_proc = curproc;

	return (dsl_sync_task(oldname,
	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
	    3, ZFS_SPACE_CHECK_RESERVED));
}

int
dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space,
    cred_t *cr, proc_t *proc)
{
	dsl_dir_t *ancestor;
	int64_t adelta;
	uint64_t avail;
	int err;

	ancestor = closest_common_ancestor(sdd, tdd);
	adelta = would_change(sdd, -space, ancestor);
	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
	if (avail < space)
		return (SET_ERROR(ENOSPC));

	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
	    ancestor, cr, proc);
	if (err != 0)
		return (err);
	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
	    ancestor, cr, proc);
	if (err != 0)
		return (err);

	return (0);
}

inode_timespec_t
dsl_dir_snap_cmtime(dsl_dir_t *dd)
{
	inode_timespec_t t;

	mutex_enter(&dd->dd_lock);
	t = dd->dd_snap_cmtime;
	mutex_exit(&dd->dd_lock);

	return (t);
}

void
dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	inode_timespec_t t;
	gethrestime(&t);

	mutex_enter(&dd->dd_lock);
	dd->dd_snap_cmtime = t;
	if (spa_feature_is_enabled(dp->dp_spa,
	    SPA_FEATURE_EXTENSIBLE_DATASET)) {
		objset_t *mos = dd->dd_pool->dp_meta_objset;
		uint64_t ddobj = dd->dd_object;
		dsl_dir_zapify(dd, tx);
		VERIFY0(zap_update(mos, ddobj,
		    DD_FIELD_SNAPSHOTS_CHANGED,
		    sizeof (uint64_t),
		    sizeof (inode_timespec_t) / sizeof (uint64_t),
		    &t, tx));
	}
	mutex_exit(&dd->dd_lock);
}

void
dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
{
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
}

boolean_t
dsl_dir_is_zapified(dsl_dir_t *dd)
{
	dmu_object_info_t doi;

	dmu_object_info_from_db(dd->dd_dbuf, &doi);
	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
}

void
dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
{
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
	    SPA_FEATURE_LIVELIST));
	dsl_deadlist_open(&dd->dd_livelist, mos, obj);
	bplist_create(&dd->dd_pending_allocs);
	bplist_create(&dd->dd_pending_frees);
}

void
dsl_dir_livelist_close(dsl_dir_t *dd)
{
	dsl_deadlist_close(&dd->dd_livelist);
	bplist_destroy(&dd->dd_pending_allocs);
	bplist_destroy(&dd->dd_pending_frees);
}
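
/*
 * Remove this dir's livelist, first cancelling any pending condense of
 * it. If 'total' is set, the on-disk livelist object is freed and the
 * SPA_FEATURE_LIVELIST refcount is dropped as well; otherwise only the
 * in-memory state and the dir's ZAP reference are torn down.
 */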
void
dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
{
	uint64_t obj;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	spa_t *spa = dp->dp_spa;
	livelist_condense_entry_t to_condense = spa->spa_to_condense;

	if (!dsl_deadlist_is_open(&dd->dd_livelist))
		return;

	/*
	 * If the livelist being removed is set to be condensed, stop the
	 * condense zthr and indicate the cancellation in the spa_to_condense
	 * struct in case the condense no-wait synctask has already started.
	 */
	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
	if (ll_condense_thread != NULL &&
	    (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
		/*
		 * We use zthr_wait_cycle_done instead of zthr_cancel
		 * because we don't want to destroy the zthr, just have
		 * it skip its current task.
		 */
		spa->spa_to_condense.cancelled = B_TRUE;
		zthr_wait_cycle_done(ll_condense_thread);
		/*
		 * If we've returned from zthr_wait_cycle_done without
		 * clearing the to_condense data structure, either the
		 * no-wait synctask has started (indicated by the 'syncing'
		 * field of to_condense) and we can expect it to clear
		 * to_condense on its own, or we returned before the zthr
		 * ran. In the latter case, the checkfunc will now fail as
		 * cancelled == B_TRUE so we can safely NULL out ds,
		 * allowing a different dir's livelist to be condensed.
		 *
		 * We can be sure that the to_condense struct will not
		 * be repopulated at this stage because both this
		 * function and dsl_livelist_try_condense execute in
		 * syncing context.
		 */
		if ((spa->spa_to_condense.ds != NULL) &&
		    !spa->spa_to_condense.syncing) {
			dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
			    spa);
			spa->spa_to_condense.ds = NULL;
		}
	}

	dsl_dir_livelist_close(dd);
	VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
	VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
	    DD_FIELD_LIVELIST, tx));
	if (total) {
		dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
		spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
	}
}
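
/*
 * Determine whether the given activity is still in progress for this
 * dir. Called with dd_activity_lock held. For ZFS_WAIT_DELETEQ (the
 * only activity currently recognized), the delete queue counts as busy
 * when the dataset is a mounted, writable ZFS filesystem whose
 * unlinked-set ZAP is non-empty; in libzpool builds there is no ZPL
 * delete queue, so the answer is always "no".
 */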
static int
dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds,
    zfs_wait_activity_t activity, boolean_t *in_progress)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&dd->dd_activity_lock));

	switch (activity) {
	case ZFS_WAIT_DELETEQ: {
#ifdef _KERNEL
		objset_t *os;
		error = dmu_objset_from_ds(ds, &os);
		if (error != 0)
			break;

		mutex_enter(&os->os_user_ptr_lock);
		void *user = dmu_objset_get_user(os);
		mutex_exit(&os->os_user_ptr_lock);
		if (dmu_objset_type(os) != DMU_OST_ZFS ||
		    user == NULL || zfs_get_vfs_flag_unmounted(os)) {
			*in_progress = B_FALSE;
			return (0);
		}

		uint64_t readonly = B_FALSE;
		error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly,
		    NULL);

		if (error != 0)
			break;

		if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) {
			*in_progress = B_FALSE;
			return (0);
		}

		uint64_t count, unlinked_obj;
		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
		    &unlinked_obj);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			break;
		}
		error = zap_count(os, unlinked_obj, &count);

		if (error == 0)
			*in_progress = (count != 0);
		break;
#else
		/*
		 * The delete queue is ZPL specific, and libzpool doesn't have
		 * it. It doesn't make sense to wait for it.
		 */
		(void) ds;
		*in_progress = B_FALSE;
		break;
#endif
	}
	default:
		panic("unrecognized value for activity %d", activity);
	}

	return (error);
}

int
dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity,
    boolean_t *waited)
{
	int error = 0;
	boolean_t in_progress;
	dsl_pool_t *dp = dd->dd_pool;
	for (;;) {
		dsl_pool_config_enter(dp, FTAG);
		error = dsl_dir_activity_in_progress(dd, ds, activity,
		    &in_progress);
		dsl_pool_config_exit(dp, FTAG);
		if (error != 0 || !in_progress)
			break;

		*waited = B_TRUE;

		if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) ==
		    0 || dd->dd_activity_cancelled) {
			error = SET_ERROR(EINTR);
			break;
		}
	}
	return (error);
}

void
dsl_dir_cancel_waiters(dsl_dir_t *dd)
{
	mutex_enter(&dd->dd_activity_lock);
	dd->dd_activity_cancelled = B_TRUE;
	cv_broadcast(&dd->dd_activity_cv);
	while (dd->dd_activity_waiters > 0)
		cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock);
	mutex_exit(&dd->dd_activity_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(dsl_dir_set_quota);
EXPORT_SYMBOL(dsl_dir_set_reservation);
#endif

/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zvol_enforce_quotas, INT, ZMOD_RW,
	"Enable strict ZVOL quota enforcement");