1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/dmu.h> 30 #include <sys/dmu_tx.h> 31 #include <sys/dsl_dataset.h> 32 #include <sys/dsl_dir.h> 33 #include <sys/dsl_prop.h> 34 #include <sys/spa.h> 35 #include <sys/zap.h> 36 #include <sys/zio.h> 37 #include <sys/arc.h> 38 #include "zfs_namecheck.h" 39 40 static uint64_t dsl_dir_space_accounted(dsl_dir_t *dd); 41 static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd); 42 static int dsl_dir_set_reservation_sync(dsl_dir_t *dd, 43 void *arg, dmu_tx_t *tx); 44 static uint64_t dsl_dir_space_available(dsl_dir_t *dd, 45 dsl_dir_t *ancestor, int64_t delta, int ondiskonly); 46 47 48 /* ARGSUSED */ 49 static void 50 dsl_dir_evict(dmu_buf_t *db, void *arg) 51 { 52 dsl_dir_t *dd = arg; 53 dsl_pool_t *dp = dd->dd_pool; 54 int t; 55 56 for (t = 0; t < TXG_SIZE; t++) { 57 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); 58 ASSERT(dd->dd_tempreserved[t] == 0); 59 ASSERT(dd->dd_space_towrite[t] == 0); 60 } 61 62 ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes); 63 64 ASSERT(dd->dd_sync_txg == 0); 65 66 if (dd->dd_parent) 67 dsl_dir_close(dd->dd_parent, dd); 68 69 spa_close(dd->dd_pool->dp_spa, dd); 70 71 /* 72 * The props callback list should be empty since they hold the 73 * dir open. 74 */ 75 list_destroy(&dd->dd_prop_cbs); 76 kmem_free(dd, sizeof (dsl_dir_t)); 77 } 78 79 dsl_dir_t * 80 dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, 81 const char *tail, void *tag) 82 { 83 dmu_buf_t *dbuf; 84 dsl_dir_t *dd; 85 86 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 87 dsl_pool_sync_context(dp)); 88 89 dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag); 90 dmu_buf_read(dbuf); 91 dd = dmu_buf_get_user(dbuf); 92 #ifdef ZFS_DEBUG 93 { 94 dmu_object_info_t doi; 95 dmu_object_info_from_db(dbuf, &doi); 96 ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); 97 } 98 #endif 99 /* XXX assert bonus buffer size is correct */ 100 if (dd == NULL) { 101 dsl_dir_t *winner; 102 int err; 103 104 dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); 105 dd->dd_object = ddobj; 106 dd->dd_dbuf = dbuf; 107 dd->dd_pool = dp; 108 dd->dd_phys = dbuf->db_data; 109 dd->dd_used_bytes = dd->dd_phys->dd_used_bytes; 110 111 list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), 112 offsetof(dsl_prop_cb_record_t, cbr_node)); 113 114 if (dd->dd_phys->dd_parent_obj) { 115 dd->dd_parent = dsl_dir_open_obj(dp, 116 dd->dd_phys->dd_parent_obj, NULL, dd); 117 if (tail) { 118 #ifdef ZFS_DEBUG 119 uint64_t foundobj; 120 121 err = zap_lookup(dp->dp_meta_objset, 122 dd->dd_parent->dd_phys-> 123 dd_child_dir_zapobj, 124 tail, sizeof (foundobj), 1, &foundobj); 125 ASSERT3U(err, ==, 0); 126 ASSERT3U(foundobj, ==, ddobj); 127 #endif 128 (void) strcpy(dd->dd_myname, tail); 129 } else { 130 err = zap_value_search(dp->dp_meta_objset, 131 dd->dd_parent->dd_phys-> 132 dd_child_dir_zapobj, 133 ddobj, dd->dd_myname); 134 /* 135 * The caller should be protecting this ddobj 136 * from being deleted concurrently 137 */ 138 ASSERT(err == 0); 139 } 140 } else { 141 (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); 142 } 143 144 winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, 145 dsl_dir_evict); 146 if (winner) { 147 if (dd->dd_parent) 148 dsl_dir_close(dd->dd_parent, dd); 149 kmem_free(dd, sizeof (dsl_dir_t)); 150 dd = winner; 151 } else { 152 spa_open_ref(dp->dp_spa, dd); 153 } 154 } 155 156 /* 157 * The dsl_dir_t has both open-to-close and instantiate-to-evict 158 * holds on the spa. We need the open-to-close holds because 159 * otherwise the spa_refcnt wouldn't change when we open a 160 * dir which the spa also has open, so we could incorrectly 161 * think it was OK to unload/export/destroy the pool. We need 162 * the instantiate-to-evict hold because the dsl_dir_t has a 163 * pointer to the dd_pool, which has a pointer to the spa_t. 164 */ 165 spa_open_ref(dp->dp_spa, tag); 166 ASSERT3P(dd->dd_pool, ==, dp); 167 ASSERT3U(dd->dd_object, ==, ddobj); 168 ASSERT3P(dd->dd_dbuf, ==, dbuf); 169 return (dd); 170 } 171 172 void 173 dsl_dir_close(dsl_dir_t *dd, void *tag) 174 { 175 dprintf_dd(dd, "%s\n", ""); 176 spa_close(dd->dd_pool->dp_spa, tag); 177 dmu_buf_rele_tag(dd->dd_dbuf, tag); 178 } 179 180 /* buf must be long enough (MAXNAMELEN should do) */ 181 void 182 dsl_dir_name(dsl_dir_t *dd, char *buf) 183 { 184 if (dd->dd_parent) { 185 dsl_dir_name(dd->dd_parent, buf); 186 (void) strcat(buf, "/"); 187 } else { 188 buf[0] = '\0'; 189 } 190 if (!MUTEX_HELD(&dd->dd_lock)) { 191 /* 192 * recursive mutex so that we can use 193 * dprintf_dd() with dd_lock held 194 */ 195 mutex_enter(&dd->dd_lock); 196 (void) strcat(buf, dd->dd_myname); 197 mutex_exit(&dd->dd_lock); 198 } else { 199 (void) strcat(buf, dd->dd_myname); 200 } 201 } 202 203 int 204 dsl_dir_is_private(dsl_dir_t *dd) 205 { 206 int rv = FALSE; 207 208 if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) 209 rv = TRUE; 210 if (dataset_name_hidden(dd->dd_myname)) 211 rv = TRUE; 212 return (rv); 213 } 214 215 216 static int 217 getcomponent(const char *path, char *component, const char **nextp) 218 { 219 char *p; 220 if (path == NULL) 221 return (NULL); 222 /* This would be a good place to reserve some namespace... */ 223 p = strpbrk(path, "/@"); 224 if (p && (p[1] == '/' || p[1] == '@')) { 225 /* two separators in a row */ 226 return (EINVAL); 227 } 228 if (p == NULL || p == path) { 229 /* 230 * if the first thing is an @ or /, it had better be an 231 * @ and it had better not have any more ats or slashes, 232 * and it had better have something after the @. 233 */ 234 if (p != NULL && 235 (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) 236 return (EINVAL); 237 if (strlen(path) >= MAXNAMELEN) 238 return (ENAMETOOLONG); 239 (void) strcpy(component, path); 240 p = NULL; 241 } else if (p[0] == '/') { 242 if (p-path >= MAXNAMELEN) 243 return (ENAMETOOLONG); 244 (void) strncpy(component, path, p - path); 245 component[p-path] = '\0'; 246 p++; 247 } else if (p[0] == '@') { 248 /* 249 * if the next separator is an @, there better not be 250 * any more slashes. 251 */ 252 if (strchr(path, '/')) 253 return (EINVAL); 254 if (p-path >= MAXNAMELEN) 255 return (ENAMETOOLONG); 256 (void) strncpy(component, path, p - path); 257 component[p-path] = '\0'; 258 } else { 259 ASSERT(!"invalid p"); 260 } 261 *nextp = p; 262 return (0); 263 } 264 265 /* 266 * same as dsl_open_dir, ignore the first component of name and use the 267 * spa instead 268 */ 269 dsl_dir_t * 270 dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) 271 { 272 char buf[MAXNAMELEN]; 273 const char *next, *nextnext = NULL; 274 int err; 275 dsl_dir_t *dd; 276 dsl_pool_t *dp; 277 uint64_t ddobj; 278 int openedspa = FALSE; 279 280 dprintf("%s\n", name); 281 282 if (name == NULL) 283 return (NULL); 284 err = getcomponent(name, buf, &next); 285 if (err) 286 return (NULL); 287 if (spa == NULL) { 288 err = spa_open(buf, &spa, FTAG); 289 if (err) { 290 dprintf("spa_open(%s) failed\n", buf); 291 return (NULL); 292 } 293 openedspa = TRUE; 294 295 /* XXX this assertion belongs in spa_open */ 296 ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); 297 } 298 299 dp = spa_get_dsl(spa); 300 301 rw_enter(&dp->dp_config_rwlock, RW_READER); 302 dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag); 303 while (next != NULL) { 304 dsl_dir_t *child_ds; 305 err = getcomponent(next, buf, &nextnext); 306 if (err) { 307 dsl_dir_close(dd, tag); 308 rw_exit(&dp->dp_config_rwlock); 309 if (openedspa) 310 spa_close(spa, FTAG); 311 return (NULL); 312 } 313 ASSERT(next[0] != '\0'); 314 if (next[0] == '@') 315 break; 316 if (dd->dd_phys->dd_child_dir_zapobj == 0) 317 break; 318 dprintf("looking up %s in obj%lld\n", 319 buf, dd->dd_phys->dd_child_dir_zapobj); 320 321 err = zap_lookup(dp->dp_meta_objset, 322 dd->dd_phys->dd_child_dir_zapobj, 323 buf, sizeof (ddobj), 1, &ddobj); 324 if (err == ENOENT) { 325 break; 326 } 327 ASSERT(err == 0); 328 329 child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag); 330 dsl_dir_close(dd, tag); 331 dd = child_ds; 332 next = nextnext; 333 } 334 rw_exit(&dp->dp_config_rwlock); 335 336 /* 337 * It's an error if there's more than one component left, or 338 * tailp==NULL and there's any component left. 339 */ 340 if (next != NULL && 341 (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { 342 /* bad path name */ 343 dsl_dir_close(dd, tag); 344 dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); 345 next = NULL; 346 dd = NULL; 347 } 348 if (tailp) 349 *tailp = next; 350 if (openedspa) 351 spa_close(spa, FTAG); 352 return (dd); 353 } 354 355 /* 356 * Return the dsl_dir_t, and possibly the last component which couldn't 357 * be found in *tail. Return NULL if the path is bogus, or if 358 * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' 359 * means that the last component is a snapshot. 360 */ 361 dsl_dir_t * 362 dsl_dir_open(const char *name, void *tag, const char **tailp) 363 { 364 return (dsl_dir_open_spa(NULL, name, tag, tailp)); 365 } 366 367 int 368 dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) 369 { 370 objset_t *mos = pds->dd_pool->dp_meta_objset; 371 uint64_t ddobj; 372 dsl_dir_phys_t *dsphys; 373 dmu_buf_t *dbuf; 374 int err; 375 376 ASSERT(dmu_tx_is_syncing(tx)); 377 378 if (pds->dd_phys->dd_child_dir_zapobj == 0) { 379 dmu_buf_will_dirty(pds->dd_dbuf, tx); 380 pds->dd_phys->dd_child_dir_zapobj = zap_create(mos, 381 DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); 382 } 383 384 rw_enter(&pds->dd_pool->dp_config_rwlock, RW_WRITER); 385 err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, 386 name, sizeof (uint64_t), 1, &ddobj); 387 if (err != ENOENT) { 388 rw_exit(&pds->dd_pool->dp_config_rwlock); 389 return (err ? err : EEXIST); 390 } 391 392 ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, 393 DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); 394 err = zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, 395 name, sizeof (uint64_t), 1, &ddobj, tx); 396 ASSERT3U(err, ==, 0); 397 dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n", 398 name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err); 399 400 dbuf = dmu_bonus_hold(mos, ddobj); 401 dmu_buf_will_dirty(dbuf, tx); 402 dsphys = dbuf->db_data; 403 404 dsphys->dd_creation_time = gethrestime_sec(); 405 dsphys->dd_parent_obj = pds->dd_object; 406 dsphys->dd_props_zapobj = zap_create(mos, 407 DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); 408 dsphys->dd_child_dir_zapobj = zap_create(mos, 409 DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); 410 dmu_buf_rele(dbuf); 411 412 rw_exit(&pds->dd_pool->dp_config_rwlock); 413 414 return (0); 415 } 416 417 int 418 dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx) 419 { 420 const char *name = arg; 421 dsl_dir_t *dd = NULL; 422 dsl_pool_t *dp = pds->dd_pool; 423 objset_t *mos = dp->dp_meta_objset; 424 uint64_t val, obj, child_zapobj, props_zapobj; 425 int t, err; 426 427 rw_enter(&dp->dp_config_rwlock, RW_WRITER); 428 429 err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, name, 430 8, 1, &obj); 431 if (err) 432 goto out; 433 434 dd = dsl_dir_open_obj(dp, obj, name, FTAG); 435 ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object); 436 437 if (dmu_buf_refcount(dd->dd_dbuf) > 1) { 438 err = EBUSY; 439 goto out; 440 } 441 442 for (t = 0; t < TXG_SIZE; t++) { 443 /* 444 * if they were dirty, they'd also be open. 445 * dp_config_rwlock ensures that it stays that way. 446 */ 447 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); 448 } 449 450 child_zapobj = dd->dd_phys->dd_child_dir_zapobj; 451 props_zapobj = dd->dd_phys->dd_props_zapobj; 452 453 if (child_zapobj != 0) { 454 uint64_t count; 455 err = EEXIST; 456 (void) zap_count(mos, child_zapobj, &count); 457 if (count != 0) 458 goto out; 459 } 460 461 if (dd->dd_phys->dd_head_dataset_obj != 0) { 462 err = dsl_dataset_destroy_sync(dd, NULL, tx); 463 if (err) 464 goto out; 465 } 466 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); 467 468 /* The point of no (unsuccessful) return */ 469 470 /* Make sure parent's used gets updated */ 471 val = 0; 472 err = dsl_dir_set_reservation_sync(dd, &val, tx); 473 ASSERT(err == 0); 474 ASSERT3U(dd->dd_used_bytes, ==, 0); 475 ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); 476 dsl_dir_close(dd, FTAG); 477 dd = NULL; 478 479 err = dmu_object_free(mos, obj, tx); 480 ASSERT(err == 0); 481 482 if (child_zapobj) 483 err = zap_destroy(mos, child_zapobj, tx); 484 ASSERT(err == 0); 485 486 if (props_zapobj) 487 err = zap_destroy(mos, props_zapobj, tx); 488 ASSERT(err == 0); 489 490 err = zap_remove(mos, pds->dd_phys->dd_child_dir_zapobj, name, tx); 491 ASSERT(err == 0); 492 493 out: 494 rw_exit(&dp->dp_config_rwlock); 495 if (dd) 496 dsl_dir_close(dd, FTAG); 497 498 return (err); 499 } 500 501 void 502 dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) 503 { 504 dsl_dir_phys_t *dsp; 505 dmu_buf_t *dbuf; 506 int error; 507 508 *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, 509 DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); 510 511 error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, 512 sizeof (uint64_t), 1, ddobjp, tx); 513 ASSERT3U(error, ==, 0); 514 515 dbuf = dmu_bonus_hold(mos, *ddobjp); 516 dmu_buf_will_dirty(dbuf, tx); 517 dsp = dbuf->db_data; 518 519 dsp->dd_creation_time = gethrestime_sec(); 520 dsp->dd_props_zapobj = zap_create(mos, 521 DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); 522 dsp->dd_child_dir_zapobj = zap_create(mos, 523 DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); 524 525 dmu_buf_rele(dbuf); 526 } 527 528 void 529 dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds) 530 { 531 bzero(dds, sizeof (dmu_objset_stats_t)); 532 533 dds->dds_dir_obj = dd->dd_object; 534 dds->dds_available = dsl_dir_space_available(dd, NULL, 0, TRUE); 535 536 mutex_enter(&dd->dd_lock); 537 dds->dds_space_used = dd->dd_used_bytes; 538 dds->dds_compressed_bytes = dd->dd_phys->dd_compressed_bytes; 539 dds->dds_uncompressed_bytes = dd->dd_phys->dd_uncompressed_bytes; 540 dds->dds_quota = dd->dd_phys->dd_quota; 541 dds->dds_reserved = dd->dd_phys->dd_reserved; 542 mutex_exit(&dd->dd_lock); 543 544 dds->dds_creation_time = dd->dd_phys->dd_creation_time; 545 546 dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0); 547 548 if (dd->dd_phys->dd_clone_parent_obj) { 549 dsl_dataset_t *ds; 550 551 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); 552 ds = dsl_dataset_open_obj(dd->dd_pool, 553 dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG); 554 dsl_dataset_name(ds, dds->dds_clone_of); 555 dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj; 556 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 557 rw_exit(&dd->dd_pool->dp_config_rwlock); 558 } 559 560 spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot, 561 sizeof (dds->dds_altroot)); 562 } 563 564 int 565 dsl_dir_sync_task(dsl_dir_t *dd, 566 int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space) 567 { 568 dmu_tx_t *tx; 569 dsl_pool_t *dp = dd->dd_pool; 570 int err = 0; 571 uint64_t txg; 572 573 dprintf_dd(dd, "func=%p space=%llu\n", func, space); 574 575 again: 576 tx = dmu_tx_create_ds(dd); 577 dmu_tx_hold_space(tx, space); 578 err = dmu_tx_assign(tx, TXG_WAIT); 579 if (err == ENOSPC || err == EDQUOT) { 580 dsl_dir_t *rds; 581 /* 582 * They can get their space from either this dd, or the 583 * root dd. 584 */ 585 for (rds = dd; rds->dd_parent; rds = rds->dd_parent) 586 continue; 587 dmu_tx_abort(tx); 588 tx = dmu_tx_create_ds(rds); 589 dmu_tx_hold_space(tx, space); 590 err = dmu_tx_assign(tx, TXG_WAIT); 591 } 592 if (err) { 593 dmu_tx_abort(tx); 594 return (err); 595 } 596 597 txg = dmu_tx_get_txg(tx); 598 mutex_enter(&dd->dd_lock); 599 if (dd->dd_sync_txg != 0) { 600 mutex_exit(&dd->dd_lock); 601 dmu_tx_commit(tx); 602 txg_wait_synced(dp, 0); 603 goto again; 604 } 605 606 /* We're good to go */ 607 608 dd->dd_sync_txg = txg; 609 dd->dd_sync_func = func; 610 dd->dd_sync_arg = arg; 611 612 mutex_exit(&dd->dd_lock); 613 614 dsl_dir_dirty(dd, tx); 615 dmu_tx_commit(tx); 616 617 txg_wait_synced(dp, txg); 618 619 mutex_enter(&dd->dd_lock); 620 ASSERT(dd->dd_sync_txg == txg); 621 ASSERT(dd->dd_sync_func == NULL); 622 err = dd->dd_sync_err; 623 dd->dd_sync_txg = 0; 624 mutex_exit(&dd->dd_lock); 625 626 return (err); 627 } 628 629 void 630 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) 631 { 632 dsl_pool_t *dp = dd->dd_pool; 633 634 ASSERT(dd->dd_phys); 635 636 if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { 637 /* up the hold count until we can be written out */ 638 dmu_buf_add_ref(dd->dd_dbuf, dd); 639 } 640 } 641 642 static int64_t 643 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) 644 { 645 uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); 646 uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); 647 return (new_accounted - old_accounted); 648 } 649 650 void 651 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) 652 { 653 if (dd->dd_sync_txg == tx->tx_txg && dd->dd_sync_func) { 654 dd->dd_sync_err = dd->dd_sync_func(dd, dd->dd_sync_arg, tx); 655 dd->dd_sync_func = NULL; 656 } 657 658 ASSERT(dmu_tx_is_syncing(tx)); 659 660 dmu_buf_will_dirty(dd->dd_dbuf, tx); 661 662 mutex_enter(&dd->dd_lock); 663 ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); 664 dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, 665 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); 666 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; 667 dd->dd_phys->dd_used_bytes = dd->dd_used_bytes; 668 mutex_exit(&dd->dd_lock); 669 670 /* release the hold from dsl_dir_dirty */ 671 dmu_buf_remove_ref(dd->dd_dbuf, dd); 672 } 673 674 static uint64_t 675 dsl_dir_estimated_space(dsl_dir_t *dd) 676 { 677 int64_t space; 678 int i; 679 680 ASSERT(MUTEX_HELD(&dd->dd_lock)); 681 682 space = dd->dd_used_bytes; 683 ASSERT(space >= 0); 684 for (i = 0; i < TXG_SIZE; i++) { 685 space += dd->dd_space_towrite[i&TXG_MASK]; 686 ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); 687 } 688 return (space); 689 } 690 691 /* 692 * How much space would dd have available if ancestor had delta applied 693 * to it? If ondiskonly is set, we're only interested in what's 694 * on-disk, not estimated pending changes. 695 */ 696 static uint64_t 697 dsl_dir_space_available(dsl_dir_t *dd, 698 dsl_dir_t *ancestor, int64_t delta, int ondiskonly) 699 { 700 uint64_t parentspace, myspace, quota, used; 701 702 /* 703 * If there are no restrictions otherwise, assume we have 704 * unlimited space available. 705 */ 706 quota = UINT64_MAX; 707 parentspace = UINT64_MAX; 708 709 if (dd->dd_parent != NULL) { 710 parentspace = dsl_dir_space_available(dd->dd_parent, 711 ancestor, delta, ondiskonly); 712 } 713 714 mutex_enter(&dd->dd_lock); 715 if (dd->dd_phys->dd_quota != 0) 716 quota = dd->dd_phys->dd_quota; 717 if (ondiskonly) { 718 used = dd->dd_used_bytes; 719 } else { 720 used = dsl_dir_estimated_space(dd); 721 } 722 if (dd == ancestor) 723 used += delta; 724 725 if (dd->dd_parent == NULL) { 726 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE); 727 quota = MIN(quota, poolsize); 728 } 729 730 if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { 731 /* 732 * We have some space reserved, in addition to what our 733 * parent gave us. 734 */ 735 parentspace += dd->dd_phys->dd_reserved - used; 736 } 737 738 if (used > quota) { 739 /* over quota */ 740 myspace = 0; 741 #ifdef ZFS_DEBUG 742 { 743 /* 744 * While it's OK to be a little over quota, if 745 * we think we are using more space than there 746 * is in the pool (which is already 6% more than 747 * dsl_pool_adjustedsize()), something is very 748 * wrong. 749 */ 750 uint64_t space = spa_get_space(dd->dd_pool->dp_spa); 751 ASSERT3U(used, <=, space); 752 } 753 #endif 754 } else { 755 /* 756 * the lesser of parent's space and the space 757 * left in our quota 758 */ 759 myspace = MIN(parentspace, quota - used); 760 } 761 762 mutex_exit(&dd->dd_lock); 763 764 return (myspace); 765 } 766 767 struct tempreserve { 768 list_node_t tr_node; 769 dsl_dir_t *tr_ds; 770 uint64_t tr_size; 771 }; 772 773 /* 774 * Reserve space in this dsl_dir, to be used in this tx's txg. 775 * After the space has been dirtied (and thus 776 * dsl_dir_willuse_space() has been called), the reservation should 777 * be canceled, using dsl_dir_tempreserve_clear(). 778 */ 779 static int 780 dsl_dir_tempreserve_impl(dsl_dir_t *dd, 781 uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx) 782 { 783 uint64_t txg = tx->tx_txg; 784 uint64_t est_used, quota, parent_rsrv; 785 int edquot = EDQUOT; 786 int txgidx = txg & TXG_MASK; 787 int i; 788 struct tempreserve *tr; 789 790 ASSERT3U(txg, !=, 0); 791 792 mutex_enter(&dd->dd_lock); 793 /* 794 * Check against the dsl_dir's quota. We don't add in the delta 795 * when checking for over-quota because they get one free hit. 796 */ 797 est_used = dsl_dir_estimated_space(dd); 798 for (i = 0; i < TXG_SIZE; i++) 799 est_used += dd->dd_tempreserved[i]; 800 801 quota = UINT64_MAX; 802 803 if (dd->dd_phys->dd_quota) 804 quota = dd->dd_phys->dd_quota; 805 806 /* 807 * If this transaction will result in a net free of space, we want 808 * to let it through, but we have to be careful: the space that it 809 * frees won't become available until *after* this txg syncs. 810 * Therefore, to ensure that it's possible to remove files from 811 * a full pool without inducing transient overcommits, we throttle 812 * netfree transactions against a quota that is slightly larger, 813 * but still within the pool's allocation slop. In cases where 814 * we're very close to full, this will allow a steady trickle of 815 * removes to get through. 816 */ 817 if (dd->dd_parent == NULL) { 818 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); 819 if (poolsize < quota) { 820 quota = poolsize; 821 edquot = ENOSPC; 822 } 823 } else if (netfree) { 824 quota = UINT64_MAX; 825 } 826 827 /* 828 * If they are requesting more space, and our current estimate 829 * is over quota. They get to try again unless the actual 830 * on-disk is over quota. 831 */ 832 if (asize > 0 && est_used > quota) { 833 if (dd->dd_used_bytes < quota) 834 edquot = ERESTART; 835 dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " 836 "quota=%lluK tr=%lluK err=%d\n", 837 dd->dd_used_bytes>>10, est_used>>10, 838 quota>>10, asize>>10, edquot); 839 mutex_exit(&dd->dd_lock); 840 return (edquot); 841 } 842 843 /* We need to up our estimated delta before dropping dd_lock */ 844 dd->dd_tempreserved[txgidx] += asize; 845 846 parent_rsrv = parent_delta(dd, est_used, asize); 847 mutex_exit(&dd->dd_lock); 848 849 tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); 850 tr->tr_ds = dd; 851 tr->tr_size = asize; 852 list_insert_tail(tr_list, tr); 853 854 /* see if it's OK with our parent */ 855 if (dd->dd_parent && parent_rsrv) { 856 return (dsl_dir_tempreserve_impl(dd->dd_parent, 857 parent_rsrv, netfree, tr_list, tx)); 858 } else { 859 return (0); 860 } 861 } 862 863 /* 864 * Reserve space in this dsl_dir, to be used in this tx's txg. 865 * After the space has been dirtied (and thus 866 * dsl_dir_willuse_space() has been called), the reservation should 867 * be canceled, using dsl_dir_tempreserve_clear(). 868 */ 869 int 870 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, 871 uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx) 872 { 873 int err = 0; 874 list_t *tr_list; 875 876 tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 877 list_create(tr_list, sizeof (struct tempreserve), 878 offsetof(struct tempreserve, tr_node)); 879 880 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, 881 tr_list, tx); 882 883 if (err == 0) { 884 struct tempreserve *tr; 885 886 err = arc_tempreserve_space(lsize); 887 if (err == 0) { 888 tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); 889 tr->tr_ds = NULL; 890 tr->tr_size = lsize; 891 list_insert_tail(tr_list, tr); 892 } 893 } 894 895 if (err) 896 dsl_dir_tempreserve_clear(tr_list, tx); 897 else 898 *tr_cookiep = tr_list; 899 return (err); 900 } 901 902 /* 903 * Clear a temporary reservation that we previously made with 904 * dsl_dir_tempreserve_space(). 905 */ 906 void 907 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) 908 { 909 int txgidx = tx->tx_txg & TXG_MASK; 910 list_t *tr_list = tr_cookie; 911 struct tempreserve *tr; 912 913 ASSERT3U(tx->tx_txg, !=, 0); 914 915 while (tr = list_head(tr_list)) { 916 if (tr->tr_ds == NULL) { 917 arc_tempreserve_clear(tr->tr_size); 918 } else { 919 mutex_enter(&tr->tr_ds->dd_lock); 920 ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, 921 tr->tr_size); 922 tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; 923 mutex_exit(&tr->tr_ds->dd_lock); 924 } 925 list_remove(tr_list, tr); 926 kmem_free(tr, sizeof (struct tempreserve)); 927 } 928 929 kmem_free(tr_list, sizeof (list_t)); 930 } 931 932 /* 933 * Call in open context when we think we're going to write/free space, 934 * eg. when dirtying data. Be conservative (ie. OK to write less than 935 * this or free more than this, but don't write more or free less). 936 */ 937 void 938 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) 939 { 940 int64_t parent_space; 941 uint64_t est_used; 942 943 mutex_enter(&dd->dd_lock); 944 if (space > 0) 945 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; 946 947 est_used = dsl_dir_estimated_space(dd); 948 parent_space = parent_delta(dd, est_used, space); 949 mutex_exit(&dd->dd_lock); 950 951 /* Make sure that we clean up dd_space_to* */ 952 dsl_dir_dirty(dd, tx); 953 954 /* XXX this is potentially expensive and unnecessary... */ 955 if (parent_space && dd->dd_parent) 956 dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); 957 } 958 959 /* call from syncing context when we actually write/free space for this dd */ 960 void 961 dsl_dir_diduse_space(dsl_dir_t *dd, 962 int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) 963 { 964 int64_t accounted_delta; 965 966 ASSERT(dmu_tx_is_syncing(tx)); 967 968 dsl_dir_dirty(dd, tx); 969 970 mutex_enter(&dd->dd_lock); 971 accounted_delta = parent_delta(dd, dd->dd_used_bytes, used); 972 ASSERT(used >= 0 || dd->dd_used_bytes >= -used); 973 ASSERT(compressed >= 0 || 974 dd->dd_phys->dd_compressed_bytes >= -compressed); 975 ASSERT(uncompressed >= 0 || 976 dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); 977 dd->dd_used_bytes += used; 978 if (used > 0) 979 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used; 980 dd->dd_phys->dd_uncompressed_bytes += uncompressed; 981 dd->dd_phys->dd_compressed_bytes += compressed; 982 mutex_exit(&dd->dd_lock); 983 984 if (dd->dd_parent != NULL) { 985 dsl_dir_diduse_space(dd->dd_parent, 986 accounted_delta, compressed, uncompressed, tx); 987 } 988 } 989 990 static int 991 dsl_dir_set_quota_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) 992 { 993 uint64_t *quotap = arg; 994 uint64_t new_quota = *quotap; 995 int err = 0; 996 997 dmu_buf_will_dirty(dd->dd_dbuf, tx); 998 999 mutex_enter(&dd->dd_lock); 1000 if (new_quota != 0 && (new_quota < dd->dd_phys->dd_reserved || 1001 new_quota < dsl_dir_estimated_space(dd))) { 1002 err = ENOSPC; 1003 } else { 1004 dd->dd_phys->dd_quota = new_quota; 1005 } 1006 mutex_exit(&dd->dd_lock); 1007 return (err); 1008 } 1009 1010 int 1011 dsl_dir_set_quota(const char *ddname, uint64_t quota) 1012 { 1013 dsl_dir_t *dd; 1014 int err; 1015 1016 dd = dsl_dir_open(ddname, FTAG, NULL); 1017 if (dd == NULL) 1018 return (ENOENT); 1019 /* 1020 * If someone removes a file, then tries to set the quota, we 1021 * want to make sure the file freeing takes effect. 1022 */ 1023 txg_wait_open(dd->dd_pool, 0); 1024 1025 err = dsl_dir_sync_task(dd, dsl_dir_set_quota_sync, "a, 0); 1026 dsl_dir_close(dd, FTAG); 1027 return (err); 1028 } 1029 1030 static int 1031 dsl_dir_set_reservation_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) 1032 { 1033 uint64_t *reservationp = arg; 1034 uint64_t new_reservation = *reservationp; 1035 uint64_t used, avail; 1036 int64_t delta; 1037 1038 if (new_reservation > INT64_MAX) 1039 return (EOVERFLOW); 1040 1041 mutex_enter(&dd->dd_lock); 1042 used = dd->dd_used_bytes; 1043 delta = MAX(used, new_reservation) - 1044 MAX(used, dd->dd_phys->dd_reserved); 1045 mutex_exit(&dd->dd_lock); 1046 1047 if (dd->dd_parent) { 1048 avail = dsl_dir_space_available(dd->dd_parent, 1049 NULL, 0, FALSE); 1050 } else { 1051 avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; 1052 } 1053 1054 if (delta > 0 && delta > avail) 1055 return (ENOSPC); 1056 if (delta > 0 && dd->dd_phys->dd_quota > 0 && 1057 new_reservation > dd->dd_phys->dd_quota) 1058 return (ENOSPC); 1059 1060 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1061 dd->dd_phys->dd_reserved = new_reservation; 1062 1063 if (dd->dd_parent != NULL) { 1064 /* Roll up this additional usage into our ancestors */ 1065 dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx); 1066 } 1067 return (0); 1068 } 1069 1070 int 1071 dsl_dir_set_reservation(const char *ddname, uint64_t reservation) 1072 { 1073 dsl_dir_t *dd; 1074 int err; 1075 1076 dd = dsl_dir_open(ddname, FTAG, NULL); 1077 if (dd == NULL) 1078 return (ENOENT); 1079 err = dsl_dir_sync_task(dd, 1080 dsl_dir_set_reservation_sync, &reservation, 0); 1081 dsl_dir_close(dd, FTAG); 1082 return (err); 1083 } 1084 1085 static dsl_dir_t * 1086 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) 1087 { 1088 for (; ds1; ds1 = ds1->dd_parent) { 1089 dsl_dir_t *dd; 1090 for (dd = ds2; dd; dd = dd->dd_parent) { 1091 if (ds1 == dd) 1092 return (dd); 1093 } 1094 } 1095 return (NULL); 1096 } 1097 1098 /* 1099 * If delta is applied to dd, how much of that delta would be applied to 1100 * ancestor? Syncing context only. 1101 */ 1102 static int64_t 1103 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) 1104 { 1105 if (dd == ancestor) 1106 return (delta); 1107 1108 mutex_enter(&dd->dd_lock); 1109 delta = parent_delta(dd, dd->dd_used_bytes, delta); 1110 mutex_exit(&dd->dd_lock); 1111 return (would_change(dd->dd_parent, delta, ancestor)); 1112 } 1113 1114 int 1115 dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) 1116 { 1117 const char *newname = arg; 1118 dsl_pool_t *dp = dd->dd_pool; 1119 objset_t *mos = dp->dp_meta_objset; 1120 dsl_dir_t *newpds; 1121 const char *tail; 1122 int err, len; 1123 1124 /* can't rename to different pool */ 1125 len = strlen(dp->dp_root_dir->dd_myname); 1126 if (strncmp(dp->dp_root_dir->dd_myname, newname, len != 0) || 1127 newname[len] != '/') { 1128 return (ENXIO); 1129 } 1130 1131 newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail); 1132 1133 /* new parent should exist */ 1134 if (newpds == NULL) 1135 return (ENOENT); 1136 1137 /* new name should not already exist */ 1138 if (tail == NULL) { 1139 dsl_dir_close(newpds, FTAG); 1140 return (EEXIST); 1141 } 1142 1143 rw_enter(&dp->dp_config_rwlock, RW_WRITER); 1144 1145 /* There should be 2 references: the open and the dirty */ 1146 if (dmu_buf_refcount(dd->dd_dbuf) > 2) { 1147 rw_exit(&dp->dp_config_rwlock); 1148 dsl_dir_close(newpds, FTAG); 1149 return (EBUSY); 1150 } 1151 1152 if (newpds != dd->dd_parent) { 1153 dsl_dir_t *ancestor; 1154 int64_t adelta; 1155 uint64_t myspace, avail; 1156 1157 ancestor = closest_common_ancestor(dd, newpds); 1158 1159 /* no rename into our descendent */ 1160 if (ancestor == dd) { 1161 dsl_dir_close(newpds, FTAG); 1162 rw_exit(&dp->dp_config_rwlock); 1163 return (EINVAL); 1164 } 1165 1166 myspace = MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); 1167 adelta = would_change(dd->dd_parent, -myspace, ancestor); 1168 avail = dsl_dir_space_available(newpds, 1169 ancestor, adelta, FALSE); 1170 if (avail < myspace) { 1171 dsl_dir_close(newpds, FTAG); 1172 rw_exit(&dp->dp_config_rwlock); 1173 return (ENOSPC); 1174 } 1175 1176 /* The point of no (unsuccessful) return */ 1177 1178 dsl_dir_diduse_space(dd->dd_parent, -myspace, 1179 -dd->dd_phys->dd_compressed_bytes, 1180 -dd->dd_phys->dd_uncompressed_bytes, tx); 1181 dsl_dir_diduse_space(newpds, myspace, 1182 dd->dd_phys->dd_compressed_bytes, 1183 dd->dd_phys->dd_uncompressed_bytes, tx); 1184 } 1185 1186 /* The point of no (unsuccessful) return */ 1187 1188 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1189 1190 /* remove from old parent zapobj */ 1191 err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, 1192 dd->dd_myname, tx); 1193 ASSERT3U(err, ==, 0); 1194 1195 (void) strcpy(dd->dd_myname, tail); 1196 dsl_dir_close(dd->dd_parent, dd); 1197 dd->dd_phys->dd_parent_obj = newpds->dd_object; 1198 dd->dd_parent = dsl_dir_open_obj(dd->dd_pool, 1199 newpds->dd_object, NULL, dd); 1200 1201 /* add to new parent zapobj */ 1202 err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj, 1203 dd->dd_myname, 8, 1, &dd->dd_object, tx); 1204 ASSERT3U(err, ==, 0); 1205 1206 dsl_dir_close(newpds, FTAG); 1207 rw_exit(&dp->dp_config_rwlock); 1208 return (0); 1209 } 1210