1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/dmu.h> 30 #include <sys/dmu_tx.h> 31 #include <sys/dsl_dataset.h> 32 #include <sys/dsl_dir.h> 33 #include <sys/dsl_prop.h> 34 #include <sys/spa.h> 35 #include <sys/zap.h> 36 #include <sys/zio.h> 37 #include <sys/arc.h> 38 #include "zfs_namecheck.h" 39 40 static uint64_t dsl_dir_space_accounted(dsl_dir_t *dd); 41 static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd); 42 static int dsl_dir_set_reservation_sync(dsl_dir_t *dd, 43 void *arg, dmu_tx_t *tx); 44 static uint64_t dsl_dir_space_available(dsl_dir_t *dd, 45 dsl_dir_t *ancestor, int64_t delta, int ondiskonly); 46 47 48 /* ARGSUSED */ 49 static void 50 dsl_dir_evict(dmu_buf_t *db, void *arg) 51 { 52 dsl_dir_t *dd = arg; 53 dsl_pool_t *dp = dd->dd_pool; 54 int t; 55 56 for (t = 0; t < TXG_SIZE; t++) { 57 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); 58 ASSERT(dd->dd_tempreserved[t] == 0); 59 ASSERT(dd->dd_space_towrite[t] == 0); 60 } 61 62 ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes); 63 64 ASSERT(dd->dd_sync_txg == 0); 65 66 if (dd->dd_parent) 67 dsl_dir_close(dd->dd_parent, dd); 68 69 spa_close(dd->dd_pool->dp_spa, dd); 70 71 /* 72 * The props callback list should be empty since they hold the 73 * dir open. 74 */ 75 list_destroy(&dd->dd_prop_cbs); 76 kmem_free(dd, sizeof (dsl_dir_t)); 77 } 78 79 dsl_dir_t * 80 dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, 81 const char *tail, void *tag) 82 { 83 dmu_buf_t *dbuf; 84 dsl_dir_t *dd; 85 86 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 87 dsl_pool_sync_context(dp)); 88 89 dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag); 90 dmu_buf_read(dbuf); 91 dd = dmu_buf_get_user(dbuf); 92 #ifdef ZFS_DEBUG 93 { 94 dmu_object_info_t doi; 95 dmu_object_info_from_db(dbuf, &doi); 96 ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); 97 } 98 #endif 99 /* XXX assert bonus buffer size is correct */ 100 if (dd == NULL) { 101 dsl_dir_t *winner; 102 int err; 103 104 dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); 105 dd->dd_object = ddobj; 106 dd->dd_dbuf = dbuf; 107 dd->dd_pool = dp; 108 dd->dd_phys = dbuf->db_data; 109 dd->dd_used_bytes = dd->dd_phys->dd_used_bytes; 110 111 list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), 112 offsetof(dsl_prop_cb_record_t, cbr_node)); 113 114 if (dd->dd_phys->dd_parent_obj) { 115 dd->dd_parent = dsl_dir_open_obj(dp, 116 dd->dd_phys->dd_parent_obj, NULL, dd); 117 if (tail) { 118 #ifdef ZFS_DEBUG 119 uint64_t foundobj; 120 121 err = zap_lookup(dp->dp_meta_objset, 122 dd->dd_parent->dd_phys-> 123 dd_child_dir_zapobj, 124 tail, sizeof (foundobj), 1, &foundobj); 125 ASSERT3U(err, ==, 0); 126 ASSERT3U(foundobj, ==, ddobj); 127 #endif 128 (void) strcpy(dd->dd_myname, tail); 129 } else { 130 err = zap_value_search(dp->dp_meta_objset, 131 dd->dd_parent->dd_phys-> 132 dd_child_dir_zapobj, 133 ddobj, dd->dd_myname); 134 /* 135 * The caller should be protecting this ddobj 136 * from being deleted concurrently 137 */ 138 ASSERT(err == 0); 139 } 140 } else { 141 (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); 142 } 143 144 winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, 145 dsl_dir_evict); 146 if (winner) { 147 if (dd->dd_parent) 148 dsl_dir_close(dd->dd_parent, dd); 149 kmem_free(dd, sizeof (dsl_dir_t)); 150 dd = winner; 151 } else { 152 spa_open_ref(dp->dp_spa, dd); 153 } 154 } 155 156 /* 157 * The dsl_dir_t has both open-to-close and instantiate-to-evict 158 * holds on the spa. We need the open-to-close holds because 159 * otherwise the spa_refcnt wouldn't change when we open a 160 * dir which the spa also has open, so we could incorrectly 161 * think it was OK to unload/export/destroy the pool. We need 162 * the instantiate-to-evict hold because the dsl_dir_t has a 163 * pointer to the dd_pool, which has a pointer to the spa_t. 164 */ 165 spa_open_ref(dp->dp_spa, tag); 166 ASSERT3P(dd->dd_pool, ==, dp); 167 ASSERT3U(dd->dd_object, ==, ddobj); 168 ASSERT3P(dd->dd_dbuf, ==, dbuf); 169 return (dd); 170 } 171 172 void 173 dsl_dir_close(dsl_dir_t *dd, void *tag) 174 { 175 dprintf_dd(dd, "%s\n", ""); 176 spa_close(dd->dd_pool->dp_spa, tag); 177 dmu_buf_rele_tag(dd->dd_dbuf, tag); 178 } 179 180 /* buf must be long enough (MAXNAMELEN should do) */ 181 void 182 dsl_dir_name(dsl_dir_t *dd, char *buf) 183 { 184 if (dd->dd_parent) { 185 dsl_dir_name(dd->dd_parent, buf); 186 (void) strcat(buf, "/"); 187 } else { 188 buf[0] = '\0'; 189 } 190 if (!MUTEX_HELD(&dd->dd_lock)) { 191 /* 192 * recursive mutex so that we can use 193 * dprintf_dd() with dd_lock held 194 */ 195 mutex_enter(&dd->dd_lock); 196 (void) strcat(buf, dd->dd_myname); 197 mutex_exit(&dd->dd_lock); 198 } else { 199 (void) strcat(buf, dd->dd_myname); 200 } 201 } 202 203 int 204 dsl_dir_is_private(dsl_dir_t *dd) 205 { 206 int rv = FALSE; 207 208 if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) 209 rv = TRUE; 210 if (dataset_name_hidden(dd->dd_myname)) 211 rv = TRUE; 212 return (rv); 213 } 214 215 216 static int 217 getcomponent(const char *path, char *component, const char **nextp) 218 { 219 char *p; 220 if (path == NULL) 221 return (NULL); 222 /* This would be a good place to reserve some namespace... */ 223 p = strpbrk(path, "/@"); 224 if (p && (p[1] == '/' || p[1] == '@')) { 225 /* two separators in a row */ 226 return (EINVAL); 227 } 228 if (p == NULL || p == path) { 229 /* 230 * if the first thing is an @ or /, it had better be an 231 * @ and it had better not have any more ats or slashes, 232 * and it had better have something after the @. 233 */ 234 if (p != NULL && 235 (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) 236 return (EINVAL); 237 if (strlen(path) >= MAXNAMELEN) 238 return (ENAMETOOLONG); 239 (void) strcpy(component, path); 240 p = NULL; 241 } else if (p[0] == '/') { 242 if (p-path >= MAXNAMELEN) 243 return (ENAMETOOLONG); 244 (void) strncpy(component, path, p - path); 245 component[p-path] = '\0'; 246 p++; 247 } else if (p[0] == '@') { 248 /* 249 * if the next separator is an @, there better not be 250 * any more slashes. 251 */ 252 if (strchr(path, '/')) 253 return (EINVAL); 254 if (p-path >= MAXNAMELEN) 255 return (ENAMETOOLONG); 256 (void) strncpy(component, path, p - path); 257 component[p-path] = '\0'; 258 } else { 259 ASSERT(!"invalid p"); 260 } 261 *nextp = p; 262 return (0); 263 } 264 265 /* 266 * same as dsl_open_dir, ignore the first component of name and use the 267 * spa instead 268 */ 269 dsl_dir_t * 270 dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) 271 { 272 char buf[MAXNAMELEN]; 273 const char *next, *nextnext = NULL; 274 int err; 275 dsl_dir_t *dd; 276 dsl_pool_t *dp; 277 uint64_t ddobj; 278 int openedspa = FALSE; 279 280 dprintf("%s\n", name); 281 282 if (name == NULL) 283 return (NULL); 284 err = getcomponent(name, buf, &next); 285 if (err) 286 return (NULL); 287 if (spa == NULL) { 288 err = spa_open(buf, &spa, FTAG); 289 if (err) { 290 dprintf("spa_open(%s) failed\n", buf); 291 return (NULL); 292 } 293 openedspa = TRUE; 294 295 /* XXX this assertion belongs in spa_open */ 296 ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); 297 } 298 299 dp = spa_get_dsl(spa); 300 301 rw_enter(&dp->dp_config_rwlock, RW_READER); 302 dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag); 303 while (next != NULL) { 304 dsl_dir_t *child_ds; 305 err = getcomponent(next, buf, &nextnext); 306 if (err) { 307 dsl_dir_close(dd, tag); 308 rw_exit(&dp->dp_config_rwlock); 309 if (openedspa) 310 spa_close(spa, FTAG); 311 return (NULL); 312 } 313 ASSERT(next[0] != '\0'); 314 if (next[0] == '@') 315 break; 316 if (dd->dd_phys->dd_child_dir_zapobj == 0) 317 break; 318 dprintf("looking up %s in obj%lld\n", 319 buf, dd->dd_phys->dd_child_dir_zapobj); 320 321 err = zap_lookup(dp->dp_meta_objset, 322 dd->dd_phys->dd_child_dir_zapobj, 323 buf, sizeof (ddobj), 1, &ddobj); 324 if (err == ENOENT) { 325 break; 326 } 327 ASSERT(err == 0); 328 329 child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag); 330 dsl_dir_close(dd, tag); 331 dd = child_ds; 332 next = nextnext; 333 } 334 rw_exit(&dp->dp_config_rwlock); 335 336 /* 337 * It's an error if there's more than one component left, or 338 * tailp==NULL and there's any component left. 339 */ 340 if (next != NULL && 341 (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { 342 /* bad path name */ 343 dsl_dir_close(dd, tag); 344 dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); 345 next = NULL; 346 dd = NULL; 347 } 348 if (tailp) 349 *tailp = next; 350 if (openedspa) 351 spa_close(spa, FTAG); 352 return (dd); 353 } 354 355 /* 356 * Return the dsl_dir_t, and possibly the last component which couldn't 357 * be found in *tail. Return NULL if the path is bogus, or if 358 * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' 359 * means that the last component is a snapshot. 360 */ 361 dsl_dir_t * 362 dsl_dir_open(const char *name, void *tag, const char **tailp) 363 { 364 return (dsl_dir_open_spa(NULL, name, tag, tailp)); 365 } 366 367 int 368 dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) 369 { 370 objset_t *mos = pds->dd_pool->dp_meta_objset; 371 uint64_t ddobj; 372 dsl_dir_phys_t *dsphys; 373 dmu_buf_t *dbuf; 374 int err; 375 376 ASSERT(dmu_tx_is_syncing(tx)); 377 378 if (pds->dd_phys->dd_child_dir_zapobj == 0) { 379 dmu_buf_will_dirty(pds->dd_dbuf, tx); 380 pds->dd_phys->dd_child_dir_zapobj = zap_create(mos, 381 DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); 382 } 383 384 rw_enter(&pds->dd_pool->dp_config_rwlock, RW_WRITER); 385 err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, 386 name, sizeof (uint64_t), 1, &ddobj); 387 if (err != ENOENT) { 388 rw_exit(&pds->dd_pool->dp_config_rwlock); 389 return (err ? err : EEXIST); 390 } 391 392 ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, 393 DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); 394 err = zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, 395 name, sizeof (uint64_t), 1, &ddobj, tx); 396 ASSERT3U(err, ==, 0); 397 dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n", 398 name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err); 399 400 dbuf = dmu_bonus_hold(mos, ddobj); 401 dmu_buf_will_dirty(dbuf, tx); 402 dsphys = dbuf->db_data; 403 404 dsphys->dd_creation_time = gethrestime_sec(); 405 dsphys->dd_parent_obj = pds->dd_object; 406 dsphys->dd_props_zapobj = zap_create(mos, 407 DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); 408 dsphys->dd_child_dir_zapobj = zap_create(mos, 409 DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); 410 dmu_buf_rele(dbuf); 411 412 rw_exit(&pds->dd_pool->dp_config_rwlock); 413 414 return (0); 415 } 416 417 int 418 dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx) 419 { 420 const char *name = arg; 421 dsl_dir_t *dd = NULL; 422 dsl_pool_t *dp = pds->dd_pool; 423 objset_t *mos = dp->dp_meta_objset; 424 uint64_t val, obj, child_zapobj, props_zapobj; 425 int t, err; 426 427 rw_enter(&dp->dp_config_rwlock, RW_WRITER); 428 429 err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, name, 430 8, 1, &obj); 431 if (err) 432 goto out; 433 434 dd = dsl_dir_open_obj(dp, obj, name, FTAG); 435 ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object); 436 437 if (dmu_buf_refcount(dd->dd_dbuf) > 1) { 438 err = EBUSY; 439 goto out; 440 } 441 442 for (t = 0; t < TXG_SIZE; t++) { 443 /* 444 * if they were dirty, they'd also be open. 445 * dp_config_rwlock ensures that it stays that way. 446 */ 447 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); 448 } 449 450 child_zapobj = dd->dd_phys->dd_child_dir_zapobj; 451 props_zapobj = dd->dd_phys->dd_props_zapobj; 452 453 if (child_zapobj != 0) { 454 uint64_t count; 455 err = EEXIST; 456 (void) zap_count(mos, child_zapobj, &count); 457 if (count != 0) 458 goto out; 459 } 460 461 if (dd->dd_phys->dd_head_dataset_obj != 0) { 462 err = dsl_dataset_destroy_sync(dd, NULL, tx); 463 if (err) 464 goto out; 465 } 466 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); 467 468 /* The point of no (unsuccessful) return */ 469 470 /* Make sure parent's used gets updated */ 471 val = 0; 472 err = dsl_dir_set_reservation_sync(dd, &val, tx); 473 ASSERT(err == 0); 474 ASSERT3U(dd->dd_used_bytes, ==, 0); 475 ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); 476 dsl_dir_close(dd, FTAG); 477 dd = NULL; 478 479 err = dmu_object_free(mos, obj, tx); 480 ASSERT(err == 0); 481 482 if (child_zapobj) 483 err = zap_destroy(mos, child_zapobj, tx); 484 ASSERT(err == 0); 485 486 if (props_zapobj) 487 err = zap_destroy(mos, props_zapobj, tx); 488 ASSERT(err == 0); 489 490 err = zap_remove(mos, pds->dd_phys->dd_child_dir_zapobj, name, tx); 491 ASSERT(err == 0); 492 493 out: 494 rw_exit(&dp->dp_config_rwlock); 495 if (dd) 496 dsl_dir_close(dd, FTAG); 497 498 return (err); 499 } 500 501 void 502 dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) 503 { 504 dsl_dir_phys_t *dsp; 505 dmu_buf_t *dbuf; 506 int error; 507 508 *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, 509 DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); 510 511 error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, 512 sizeof (uint64_t), 1, ddobjp, tx); 513 ASSERT3U(error, ==, 0); 514 515 dbuf = dmu_bonus_hold(mos, *ddobjp); 516 dmu_buf_will_dirty(dbuf, tx); 517 dsp = dbuf->db_data; 518 519 dsp->dd_creation_time = gethrestime_sec(); 520 dsp->dd_props_zapobj = zap_create(mos, 521 DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); 522 dsp->dd_child_dir_zapobj = zap_create(mos, 523 DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); 524 525 dmu_buf_rele(dbuf); 526 } 527 528 void 529 dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds) 530 { 531 bzero(dds, sizeof (dmu_objset_stats_t)); 532 533 dds->dds_dir_obj = dd->dd_object; 534 dds->dds_available = dsl_dir_space_available(dd, NULL, 0, TRUE); 535 536 mutex_enter(&dd->dd_lock); 537 dds->dds_space_used = dd->dd_used_bytes; 538 dds->dds_compressed_bytes = dd->dd_phys->dd_compressed_bytes; 539 dds->dds_uncompressed_bytes = dd->dd_phys->dd_uncompressed_bytes; 540 dds->dds_quota = dd->dd_phys->dd_quota; 541 dds->dds_reserved = dd->dd_phys->dd_reserved; 542 mutex_exit(&dd->dd_lock); 543 544 dds->dds_creation_time = dd->dd_phys->dd_creation_time; 545 546 dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0); 547 548 if (dd->dd_phys->dd_clone_parent_obj) { 549 dsl_dataset_t *ds; 550 551 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); 552 ds = dsl_dataset_open_obj(dd->dd_pool, 553 dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG); 554 dsl_dataset_name(ds, dds->dds_clone_of); 555 dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj; 556 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 557 rw_exit(&dd->dd_pool->dp_config_rwlock); 558 } 559 560 VERIFY(dsl_prop_get_ds_integer(dd, "checksum", 561 &dds->dds_checksum, dds->dds_checksum_setpoint) == 0); 562 563 VERIFY(dsl_prop_get_ds_integer(dd, "compression", 564 &dds->dds_compression, dds->dds_compression_setpoint) == 0); 565 566 VERIFY(dsl_prop_get_ds_integer(dd, "zoned", 567 &dds->dds_zoned, dds->dds_zoned_setpoint) == 0); 568 569 spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot, 570 sizeof (dds->dds_altroot)); 571 } 572 573 int 574 dsl_dir_sync_task(dsl_dir_t *dd, 575 int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space) 576 { 577 dmu_tx_t *tx; 578 dsl_pool_t *dp = dd->dd_pool; 579 int err = 0; 580 uint64_t txg; 581 582 dprintf_dd(dd, "func=%p space=%llu\n", func, space); 583 584 again: 585 tx = dmu_tx_create_ds(dd); 586 dmu_tx_hold_space(tx, space); 587 err = dmu_tx_assign(tx, TXG_WAIT); 588 if (err == ENOSPC || err == EDQUOT) { 589 dsl_dir_t *rds; 590 /* 591 * They can get their space from either this dd, or the 592 * root dd. 593 */ 594 for (rds = dd; rds->dd_parent; rds = rds->dd_parent) 595 continue; 596 dmu_tx_abort(tx); 597 tx = dmu_tx_create_ds(rds); 598 dmu_tx_hold_space(tx, space); 599 err = dmu_tx_assign(tx, TXG_WAIT); 600 } 601 if (err) { 602 dmu_tx_abort(tx); 603 return (err); 604 } 605 606 txg = dmu_tx_get_txg(tx); 607 mutex_enter(&dd->dd_lock); 608 if (dd->dd_sync_txg != 0) { 609 mutex_exit(&dd->dd_lock); 610 dmu_tx_commit(tx); 611 txg_wait_synced(dp, 0); 612 goto again; 613 } 614 615 /* We're good to go */ 616 617 dd->dd_sync_txg = txg; 618 dd->dd_sync_func = func; 619 dd->dd_sync_arg = arg; 620 621 mutex_exit(&dd->dd_lock); 622 623 dsl_dir_dirty(dd, tx); 624 dmu_tx_commit(tx); 625 626 txg_wait_synced(dp, txg); 627 628 mutex_enter(&dd->dd_lock); 629 ASSERT(dd->dd_sync_txg == txg); 630 ASSERT(dd->dd_sync_func == NULL); 631 err = dd->dd_sync_err; 632 dd->dd_sync_txg = 0; 633 mutex_exit(&dd->dd_lock); 634 635 return (err); 636 } 637 638 void 639 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) 640 { 641 dsl_pool_t *dp = dd->dd_pool; 642 643 ASSERT(dd->dd_phys); 644 645 if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { 646 /* up the hold count until we can be written out */ 647 dmu_buf_add_ref(dd->dd_dbuf, dd); 648 } 649 } 650 651 static int64_t 652 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) 653 { 654 uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); 655 uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); 656 return (new_accounted - old_accounted); 657 } 658 659 void 660 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) 661 { 662 if (dd->dd_sync_txg == tx->tx_txg && dd->dd_sync_func) { 663 dd->dd_sync_err = dd->dd_sync_func(dd, dd->dd_sync_arg, tx); 664 dd->dd_sync_func = NULL; 665 } 666 667 ASSERT(dmu_tx_is_syncing(tx)); 668 669 dmu_buf_will_dirty(dd->dd_dbuf, tx); 670 671 mutex_enter(&dd->dd_lock); 672 ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); 673 dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, 674 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); 675 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; 676 dd->dd_phys->dd_used_bytes = dd->dd_used_bytes; 677 mutex_exit(&dd->dd_lock); 678 679 /* release the hold from dsl_dir_dirty */ 680 dmu_buf_remove_ref(dd->dd_dbuf, dd); 681 } 682 683 static uint64_t 684 dsl_dir_estimated_space(dsl_dir_t *dd) 685 { 686 int64_t space; 687 int i; 688 689 ASSERT(MUTEX_HELD(&dd->dd_lock)); 690 691 space = dd->dd_used_bytes; 692 ASSERT(space >= 0); 693 for (i = 0; i < TXG_SIZE; i++) { 694 space += dd->dd_space_towrite[i&TXG_MASK]; 695 ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); 696 } 697 return (space); 698 } 699 700 /* 701 * How much space would dd have available if ancestor had delta applied 702 * to it? If ondiskonly is set, we're only interested in what's 703 * on-disk, not estimated pending changes. 704 */ 705 static uint64_t 706 dsl_dir_space_available(dsl_dir_t *dd, 707 dsl_dir_t *ancestor, int64_t delta, int ondiskonly) 708 { 709 uint64_t parentspace, myspace, quota, used; 710 711 /* 712 * If there are no restrictions otherwise, assume we have 713 * unlimited space available. 714 */ 715 quota = UINT64_MAX; 716 parentspace = UINT64_MAX; 717 718 if (dd->dd_parent != NULL) { 719 parentspace = dsl_dir_space_available(dd->dd_parent, 720 ancestor, delta, ondiskonly); 721 } 722 723 mutex_enter(&dd->dd_lock); 724 if (dd->dd_phys->dd_quota != 0) 725 quota = dd->dd_phys->dd_quota; 726 if (ondiskonly) { 727 used = dd->dd_used_bytes; 728 } else { 729 used = dsl_dir_estimated_space(dd); 730 } 731 if (dd == ancestor) 732 used += delta; 733 734 if (dd->dd_parent == NULL) { 735 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE); 736 quota = MIN(quota, poolsize); 737 } 738 739 if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { 740 /* 741 * We have some space reserved, in addition to what our 742 * parent gave us. 743 */ 744 parentspace += dd->dd_phys->dd_reserved - used; 745 } 746 747 if (used > quota) { 748 /* over quota */ 749 myspace = 0; 750 #ifdef ZFS_DEBUG 751 { 752 /* 753 * While it's OK to be a little over quota, if 754 * we think we are using more space than there 755 * is in the pool (which is already 6% more than 756 * dsl_pool_adjustedsize()), something is very 757 * wrong. 758 */ 759 uint64_t space = spa_get_space(dd->dd_pool->dp_spa); 760 ASSERT3U(used, <=, space); 761 } 762 #endif 763 } else { 764 /* 765 * the lesser of parent's space and the space 766 * left in our quota 767 */ 768 myspace = MIN(parentspace, quota - used); 769 } 770 771 mutex_exit(&dd->dd_lock); 772 773 return (myspace); 774 } 775 776 struct tempreserve { 777 list_node_t tr_node; 778 dsl_dir_t *tr_ds; 779 uint64_t tr_size; 780 }; 781 782 /* 783 * Reserve space in this dsl_dir, to be used in this tx's txg. 784 * After the space has been dirtied (and thus 785 * dsl_dir_willuse_space() has been called), the reservation should 786 * be canceled, using dsl_dir_tempreserve_clear(). 787 */ 788 static int 789 dsl_dir_tempreserve_impl(dsl_dir_t *dd, 790 uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx) 791 { 792 uint64_t txg = tx->tx_txg; 793 uint64_t est_used, quota, parent_rsrv; 794 int edquot = EDQUOT; 795 int txgidx = txg & TXG_MASK; 796 int i; 797 struct tempreserve *tr; 798 799 ASSERT3U(txg, !=, 0); 800 801 mutex_enter(&dd->dd_lock); 802 /* 803 * Check against the dsl_dir's quota. We don't add in the delta 804 * when checking for over-quota because they get one free hit. 805 */ 806 est_used = dsl_dir_estimated_space(dd); 807 for (i = 0; i < TXG_SIZE; i++) 808 est_used += dd->dd_tempreserved[i]; 809 810 quota = UINT64_MAX; 811 812 if (dd->dd_phys->dd_quota) 813 quota = dd->dd_phys->dd_quota; 814 815 /* 816 * If this transaction will result in a net free of space, we want 817 * to let it through, but we have to be careful: the space that it 818 * frees won't become available until *after* this txg syncs. 819 * Therefore, to ensure that it's possible to remove files from 820 * a full pool without inducing transient overcommits, we throttle 821 * netfree transactions against a quota that is slightly larger, 822 * but still within the pool's allocation slop. In cases where 823 * we're very close to full, this will allow a steady trickle of 824 * removes to get through. 825 */ 826 if (dd->dd_parent == NULL) { 827 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); 828 if (poolsize < quota) { 829 quota = poolsize; 830 edquot = ENOSPC; 831 } 832 } else if (netfree) { 833 quota = UINT64_MAX; 834 } 835 836 /* 837 * If they are requesting more space, and our current estimate 838 * is over quota. They get to try again unless the actual 839 * on-disk is over quota. 840 */ 841 if (asize > 0 && est_used > quota) { 842 if (dd->dd_used_bytes < quota) 843 edquot = ERESTART; 844 dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " 845 "quota=%lluK tr=%lluK err=%d\n", 846 dd->dd_used_bytes>>10, est_used>>10, 847 quota>>10, asize>>10, edquot); 848 mutex_exit(&dd->dd_lock); 849 return (edquot); 850 } 851 852 /* We need to up our estimated delta before dropping dd_lock */ 853 dd->dd_tempreserved[txgidx] += asize; 854 855 parent_rsrv = parent_delta(dd, est_used, asize); 856 mutex_exit(&dd->dd_lock); 857 858 tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); 859 tr->tr_ds = dd; 860 tr->tr_size = asize; 861 list_insert_tail(tr_list, tr); 862 863 /* see if it's OK with our parent */ 864 if (dd->dd_parent && parent_rsrv) { 865 return (dsl_dir_tempreserve_impl(dd->dd_parent, 866 parent_rsrv, netfree, tr_list, tx)); 867 } else { 868 return (0); 869 } 870 } 871 872 /* 873 * Reserve space in this dsl_dir, to be used in this tx's txg. 874 * After the space has been dirtied (and thus 875 * dsl_dir_willuse_space() has been called), the reservation should 876 * be canceled, using dsl_dir_tempreserve_clear(). 877 */ 878 int 879 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, 880 uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx) 881 { 882 int err = 0; 883 list_t *tr_list; 884 885 tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 886 list_create(tr_list, sizeof (struct tempreserve), 887 offsetof(struct tempreserve, tr_node)); 888 889 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, 890 tr_list, tx); 891 892 if (err == 0) { 893 struct tempreserve *tr; 894 895 err = arc_tempreserve_space(lsize); 896 if (err == 0) { 897 tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); 898 tr->tr_ds = NULL; 899 tr->tr_size = lsize; 900 list_insert_tail(tr_list, tr); 901 } 902 } 903 904 if (err) 905 dsl_dir_tempreserve_clear(tr_list, tx); 906 else 907 *tr_cookiep = tr_list; 908 return (err); 909 } 910 911 /* 912 * Clear a temporary reservation that we previously made with 913 * dsl_dir_tempreserve_space(). 914 */ 915 void 916 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) 917 { 918 int txgidx = tx->tx_txg & TXG_MASK; 919 list_t *tr_list = tr_cookie; 920 struct tempreserve *tr; 921 922 ASSERT3U(tx->tx_txg, !=, 0); 923 924 while (tr = list_head(tr_list)) { 925 if (tr->tr_ds == NULL) { 926 arc_tempreserve_clear(tr->tr_size); 927 } else { 928 mutex_enter(&tr->tr_ds->dd_lock); 929 ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, 930 tr->tr_size); 931 tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; 932 mutex_exit(&tr->tr_ds->dd_lock); 933 } 934 list_remove(tr_list, tr); 935 kmem_free(tr, sizeof (struct tempreserve)); 936 } 937 938 kmem_free(tr_list, sizeof (list_t)); 939 } 940 941 /* 942 * Call in open context when we think we're going to write/free space, 943 * eg. when dirtying data. Be conservative (ie. OK to write less than 944 * this or free more than this, but don't write more or free less). 945 */ 946 void 947 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) 948 { 949 int64_t parent_space; 950 uint64_t est_used; 951 952 mutex_enter(&dd->dd_lock); 953 if (space > 0) 954 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; 955 956 est_used = dsl_dir_estimated_space(dd); 957 parent_space = parent_delta(dd, est_used, space); 958 mutex_exit(&dd->dd_lock); 959 960 /* Make sure that we clean up dd_space_to* */ 961 dsl_dir_dirty(dd, tx); 962 963 /* XXX this is potentially expensive and unnecessary... */ 964 if (parent_space && dd->dd_parent) 965 dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); 966 } 967 968 /* call from syncing context when we actually write/free space for this dd */ 969 void 970 dsl_dir_diduse_space(dsl_dir_t *dd, 971 int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) 972 { 973 int64_t accounted_delta; 974 975 ASSERT(dmu_tx_is_syncing(tx)); 976 977 dsl_dir_dirty(dd, tx); 978 979 mutex_enter(&dd->dd_lock); 980 accounted_delta = parent_delta(dd, dd->dd_used_bytes, used); 981 ASSERT(used >= 0 || dd->dd_used_bytes >= -used); 982 ASSERT(compressed >= 0 || 983 dd->dd_phys->dd_compressed_bytes >= -compressed); 984 ASSERT(uncompressed >= 0 || 985 dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); 986 dd->dd_used_bytes += used; 987 if (used > 0) 988 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used; 989 dd->dd_phys->dd_uncompressed_bytes += uncompressed; 990 dd->dd_phys->dd_compressed_bytes += compressed; 991 mutex_exit(&dd->dd_lock); 992 993 if (dd->dd_parent != NULL) { 994 dsl_dir_diduse_space(dd->dd_parent, 995 accounted_delta, compressed, uncompressed, tx); 996 } 997 } 998 999 static int 1000 dsl_dir_set_quota_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) 1001 { 1002 uint64_t *quotap = arg; 1003 uint64_t new_quota = *quotap; 1004 int err = 0; 1005 1006 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1007 1008 mutex_enter(&dd->dd_lock); 1009 if (new_quota != 0 && (new_quota < dd->dd_phys->dd_reserved || 1010 new_quota < dsl_dir_estimated_space(dd))) { 1011 err = ENOSPC; 1012 } else { 1013 dd->dd_phys->dd_quota = new_quota; 1014 } 1015 mutex_exit(&dd->dd_lock); 1016 return (err); 1017 } 1018 1019 int 1020 dsl_dir_set_quota(const char *ddname, uint64_t quota) 1021 { 1022 dsl_dir_t *dd; 1023 int err; 1024 1025 dd = dsl_dir_open(ddname, FTAG, NULL); 1026 if (dd == NULL) 1027 return (ENOENT); 1028 /* 1029 * If someone removes a file, then tries to set the quota, we 1030 * want to make sure the file freeing takes effect. 1031 */ 1032 txg_wait_open(dd->dd_pool, 0); 1033 1034 err = dsl_dir_sync_task(dd, dsl_dir_set_quota_sync, "a, 0); 1035 dsl_dir_close(dd, FTAG); 1036 return (err); 1037 } 1038 1039 static int 1040 dsl_dir_set_reservation_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) 1041 { 1042 uint64_t *reservationp = arg; 1043 uint64_t new_reservation = *reservationp; 1044 uint64_t used, avail; 1045 int64_t delta; 1046 1047 if (new_reservation > INT64_MAX) 1048 return (EOVERFLOW); 1049 1050 mutex_enter(&dd->dd_lock); 1051 used = dd->dd_used_bytes; 1052 delta = MAX(used, new_reservation) - 1053 MAX(used, dd->dd_phys->dd_reserved); 1054 mutex_exit(&dd->dd_lock); 1055 1056 if (dd->dd_parent) { 1057 avail = dsl_dir_space_available(dd->dd_parent, 1058 NULL, 0, FALSE); 1059 } else { 1060 avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; 1061 } 1062 1063 if (delta > 0 && delta > avail) 1064 return (ENOSPC); 1065 if (delta > 0 && dd->dd_phys->dd_quota > 0 && 1066 new_reservation > dd->dd_phys->dd_quota) 1067 return (ENOSPC); 1068 1069 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1070 dd->dd_phys->dd_reserved = new_reservation; 1071 1072 if (dd->dd_parent != NULL) { 1073 /* Roll up this additional usage into our ancestors */ 1074 dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx); 1075 } 1076 return (0); 1077 } 1078 1079 int 1080 dsl_dir_set_reservation(const char *ddname, uint64_t reservation) 1081 { 1082 dsl_dir_t *dd; 1083 int err; 1084 1085 dd = dsl_dir_open(ddname, FTAG, NULL); 1086 if (dd == NULL) 1087 return (ENOENT); 1088 err = dsl_dir_sync_task(dd, 1089 dsl_dir_set_reservation_sync, &reservation, 0); 1090 dsl_dir_close(dd, FTAG); 1091 return (err); 1092 } 1093 1094 static dsl_dir_t * 1095 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) 1096 { 1097 for (; ds1; ds1 = ds1->dd_parent) { 1098 dsl_dir_t *dd; 1099 for (dd = ds2; dd; dd = dd->dd_parent) { 1100 if (ds1 == dd) 1101 return (dd); 1102 } 1103 } 1104 return (NULL); 1105 } 1106 1107 /* 1108 * If delta is applied to dd, how much of that delta would be applied to 1109 * ancestor? Syncing context only. 1110 */ 1111 static int64_t 1112 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) 1113 { 1114 if (dd == ancestor) 1115 return (delta); 1116 1117 mutex_enter(&dd->dd_lock); 1118 delta = parent_delta(dd, dd->dd_used_bytes, delta); 1119 mutex_exit(&dd->dd_lock); 1120 return (would_change(dd->dd_parent, delta, ancestor)); 1121 } 1122 1123 int 1124 dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) 1125 { 1126 const char *newname = arg; 1127 dsl_pool_t *dp = dd->dd_pool; 1128 objset_t *mos = dp->dp_meta_objset; 1129 dsl_dir_t *newpds; 1130 const char *tail; 1131 int err, len; 1132 1133 /* can't rename to different pool */ 1134 len = strlen(dp->dp_root_dir->dd_myname); 1135 if (strncmp(dp->dp_root_dir->dd_myname, newname, len != 0) || 1136 newname[len] != '/') { 1137 return (ENXIO); 1138 } 1139 1140 newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail); 1141 1142 /* new parent should exist */ 1143 if (newpds == NULL) 1144 return (ENOENT); 1145 1146 /* new name should not already exist */ 1147 if (tail == NULL) { 1148 dsl_dir_close(newpds, FTAG); 1149 return (EEXIST); 1150 } 1151 1152 rw_enter(&dp->dp_config_rwlock, RW_WRITER); 1153 1154 /* There should be 2 references: the open and the dirty */ 1155 if (dmu_buf_refcount(dd->dd_dbuf) > 2) { 1156 rw_exit(&dp->dp_config_rwlock); 1157 dsl_dir_close(newpds, FTAG); 1158 return (EBUSY); 1159 } 1160 1161 if (newpds != dd->dd_parent) { 1162 dsl_dir_t *ancestor; 1163 int64_t adelta; 1164 uint64_t myspace, avail; 1165 1166 ancestor = closest_common_ancestor(dd, newpds); 1167 1168 /* no rename into our descendent */ 1169 if (ancestor == dd) { 1170 dsl_dir_close(newpds, FTAG); 1171 rw_exit(&dp->dp_config_rwlock); 1172 return (EINVAL); 1173 } 1174 1175 myspace = MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); 1176 adelta = would_change(dd->dd_parent, -myspace, ancestor); 1177 avail = dsl_dir_space_available(newpds, 1178 ancestor, adelta, FALSE); 1179 if (avail < myspace) { 1180 dsl_dir_close(newpds, FTAG); 1181 rw_exit(&dp->dp_config_rwlock); 1182 return (ENOSPC); 1183 } 1184 1185 /* The point of no (unsuccessful) return */ 1186 1187 dsl_dir_diduse_space(dd->dd_parent, -myspace, 1188 -dd->dd_phys->dd_compressed_bytes, 1189 -dd->dd_phys->dd_uncompressed_bytes, tx); 1190 dsl_dir_diduse_space(newpds, myspace, 1191 dd->dd_phys->dd_compressed_bytes, 1192 dd->dd_phys->dd_uncompressed_bytes, tx); 1193 } 1194 1195 /* The point of no (unsuccessful) return */ 1196 1197 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1198 1199 /* remove from old parent zapobj */ 1200 err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, 1201 dd->dd_myname, tx); 1202 ASSERT3U(err, ==, 0); 1203 1204 (void) strcpy(dd->dd_myname, tail); 1205 dsl_dir_close(dd->dd_parent, dd); 1206 dd->dd_phys->dd_parent_obj = newpds->dd_object; 1207 dd->dd_parent = dsl_dir_open_obj(dd->dd_pool, 1208 newpds->dd_object, NULL, dd); 1209 1210 /* add to new parent zapobj */ 1211 err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj, 1212 dd->dd_myname, 8, 1, &dd->dd_object, tx); 1213 ASSERT3U(err, ==, 0); 1214 1215 dsl_dir_close(newpds, FTAG); 1216 rw_exit(&dp->dp_config_rwlock); 1217 return (0); 1218 } 1219