1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/dmu.h> 30 #include <sys/dmu_tx.h> 31 #include <sys/dsl_dataset.h> 32 #include <sys/dsl_dir.h> 33 #include <sys/dsl_prop.h> 34 #include <sys/spa.h> 35 #include <sys/zap.h> 36 #include <sys/zio.h> 37 #include <sys/arc.h> 38 #include "zfs_namecheck.h" 39 40 static uint64_t dsl_dir_space_accounted(dsl_dir_t *dd); 41 static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd); 42 static int dsl_dir_set_reservation_sync(dsl_dir_t *dd, 43 void *arg, dmu_tx_t *tx); 44 static uint64_t dsl_dir_space_available(dsl_dir_t *dd, 45 dsl_dir_t *ancestor, int64_t delta, int ondiskonly); 46 47 48 /* ARGSUSED */ 49 static void 50 dsl_dir_evict(dmu_buf_t *db, void *arg) 51 { 52 dsl_dir_t *dd = arg; 53 dsl_pool_t *dp = dd->dd_pool; 54 int t; 55 56 for (t = 0; t < TXG_SIZE; t++) { 57 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); 58 ASSERT(dd->dd_tempreserved[t] == 0); 59 ASSERT(dd->dd_space_towrite[t] == 0); 60 } 61 62 ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes); 63 64 ASSERT(dd->dd_sync_txg == 0); 65 66 if (dd->dd_parent) 67 dsl_dir_close(dd->dd_parent, dd); 68 69 spa_close(dd->dd_pool->dp_spa, dd); 70 71 /* 72 * The props callback list should be empty since they hold the 73 * dir open. 74 */ 75 list_destroy(&dd->dd_prop_cbs); 76 kmem_free(dd, sizeof (dsl_dir_t)); 77 } 78 79 dsl_dir_t * 80 dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, 81 const char *tail, void *tag) 82 { 83 dmu_buf_t *dbuf; 84 dsl_dir_t *dd; 85 86 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 87 dsl_pool_sync_context(dp)); 88 89 dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag); 90 dmu_buf_read(dbuf); 91 dd = dmu_buf_get_user(dbuf); 92 #ifdef ZFS_DEBUG 93 { 94 dmu_object_info_t doi; 95 dmu_object_info_from_db(dbuf, &doi); 96 ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DATASET); 97 } 98 #endif 99 /* XXX assert bonus buffer size is correct */ 100 if (dd == NULL) { 101 dsl_dir_t *winner; 102 int err; 103 104 dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); 105 dd->dd_object = ddobj; 106 dd->dd_dbuf = dbuf; 107 dd->dd_pool = dp; 108 dd->dd_phys = dbuf->db_data; 109 dd->dd_used_bytes = dd->dd_phys->dd_used_bytes; 110 111 list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), 112 offsetof(dsl_prop_cb_record_t, cbr_node)); 113 114 if (dd->dd_phys->dd_parent_obj) { 115 dd->dd_parent = dsl_dir_open_obj(dp, 116 dd->dd_phys->dd_parent_obj, NULL, dd); 117 if (tail) { 118 #ifdef ZFS_DEBUG 119 uint64_t foundobj; 120 121 err = zap_lookup(dp->dp_meta_objset, 122 dd->dd_parent->dd_phys-> 123 dd_child_dir_zapobj, 124 tail, sizeof (foundobj), 1, &foundobj); 125 ASSERT3U(err, ==, 0); 126 ASSERT3U(foundobj, ==, ddobj); 127 #endif 128 (void) strcpy(dd->dd_myname, tail); 129 } else { 130 err = zap_value_search(dp->dp_meta_objset, 131 dd->dd_parent->dd_phys-> 132 dd_child_dir_zapobj, 133 ddobj, dd->dd_myname); 134 /* 135 * The caller should be protecting this ddobj 136 * from being deleted concurrently 137 */ 138 ASSERT(err == 0); 139 } 140 } else { 141 (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); 142 } 143 144 winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, 145 dsl_dir_evict); 146 if (winner) { 147 if (dd->dd_parent) 148 dsl_dir_close(dd->dd_parent, dd); 149 kmem_free(dd, sizeof (dsl_dir_t)); 150 dd = winner; 151 } else { 152 spa_open_ref(dp->dp_spa, dd); 153 } 154 } 155 156 /* 157 * The dsl_dir_t has both open-to-close and instantiate-to-evict 158 * holds on the spa. We need the open-to-close holds because 159 * otherwise the spa_refcnt wouldn't change when we open a 160 * dir which the spa also has open, so we could incorrectly 161 * think it was OK to unload/export/destroy the pool. We need 162 * the instantiate-to-evict hold because the dsl_dir_t has a 163 * pointer to the dd_pool, which has a pointer to the spa_t. 164 */ 165 spa_open_ref(dp->dp_spa, tag); 166 ASSERT3P(dd->dd_pool, ==, dp); 167 ASSERT3U(dd->dd_object, ==, ddobj); 168 ASSERT3P(dd->dd_dbuf, ==, dbuf); 169 return (dd); 170 } 171 172 void 173 dsl_dir_close(dsl_dir_t *dd, void *tag) 174 { 175 dprintf_dd(dd, "%s\n", ""); 176 spa_close(dd->dd_pool->dp_spa, tag); 177 dmu_buf_rele_tag(dd->dd_dbuf, tag); 178 } 179 180 /* buf must be long enough (MAXNAMELEN should do) */ 181 void 182 dsl_dir_name(dsl_dir_t *dd, char *buf) 183 { 184 if (dd->dd_parent) { 185 dsl_dir_name(dd->dd_parent, buf); 186 (void) strcat(buf, "/"); 187 } else { 188 buf[0] = '\0'; 189 } 190 if (!MUTEX_HELD(&dd->dd_lock)) { 191 /* 192 * recursive mutex so that we can use 193 * dprintf_dd() with dd_lock held 194 */ 195 mutex_enter(&dd->dd_lock); 196 (void) strcat(buf, dd->dd_myname); 197 mutex_exit(&dd->dd_lock); 198 } else { 199 (void) strcat(buf, dd->dd_myname); 200 } 201 } 202 203 int 204 dsl_dir_is_private(dsl_dir_t *dd) 205 { 206 int rv = FALSE; 207 208 if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) 209 rv = TRUE; 210 if (dataset_name_hidden(dd->dd_myname)) 211 rv = TRUE; 212 return (rv); 213 } 214 215 216 static int 217 getcomponent(const char *path, char *component, const char **nextp) 218 { 219 char *p; 220 if (path == NULL) 221 return (NULL); 222 /* This would be a good place to reserve some namespace... */ 223 p = strpbrk(path, "/@"); 224 if (p && (p[1] == '/' || p[1] == '@')) { 225 /* two separators in a row */ 226 return (EINVAL); 227 } 228 if (p == NULL || p == path) { 229 /* 230 * if the first thing is an @ or /, it had better be an 231 * @ and it had better not have any more ats or slashes, 232 * and it had better have something after the @. 233 */ 234 if (p != NULL && 235 (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) 236 return (EINVAL); 237 if (strlen(path) >= MAXNAMELEN) 238 return (ENAMETOOLONG); 239 (void) strcpy(component, path); 240 p = NULL; 241 } else if (p[0] == '/') { 242 if (p-path >= MAXNAMELEN) 243 return (ENAMETOOLONG); 244 (void) strncpy(component, path, p - path); 245 component[p-path] = '\0'; 246 p++; 247 } else if (p[0] == '@') { 248 /* 249 * if the next separator is an @, there better not be 250 * any more slashes. 251 */ 252 if (strchr(path, '/')) 253 return (EINVAL); 254 if (p-path >= MAXNAMELEN) 255 return (ENAMETOOLONG); 256 (void) strncpy(component, path, p - path); 257 component[p-path] = '\0'; 258 } else { 259 ASSERT(!"invalid p"); 260 } 261 *nextp = p; 262 return (0); 263 } 264 265 /* 266 * same as dsl_open_dir, ignore the first component of name and use the 267 * spa instead 268 */ 269 dsl_dir_t * 270 dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) 271 { 272 char buf[MAXNAMELEN]; 273 const char *next, *nextnext = NULL; 274 int err; 275 dsl_dir_t *dd; 276 dsl_pool_t *dp; 277 uint64_t ddobj; 278 int openedspa = FALSE; 279 280 dprintf("%s\n", name); 281 282 if (name == NULL) 283 return (NULL); 284 err = getcomponent(name, buf, &next); 285 if (err) 286 return (NULL); 287 if (spa == NULL) { 288 err = spa_open(buf, &spa, FTAG); 289 if (err) { 290 dprintf("spa_open(%s) failed\n", buf); 291 return (NULL); 292 } 293 openedspa = TRUE; 294 295 /* XXX this assertion belongs in spa_open */ 296 ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); 297 } 298 299 dp = spa_get_dsl(spa); 300 301 rw_enter(&dp->dp_config_rwlock, RW_READER); 302 dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag); 303 while (next != NULL) { 304 dsl_dir_t *child_ds; 305 err = getcomponent(next, buf, &nextnext); 306 if (err) { 307 dsl_dir_close(dd, tag); 308 if (openedspa) 309 spa_close(spa, FTAG); 310 return (NULL); 311 } 312 ASSERT(next[0] != '\0'); 313 if (next[0] == '@') 314 break; 315 if (dd->dd_phys->dd_child_dir_zapobj == 0) 316 break; 317 dprintf("looking up %s in obj%lld\n", 318 buf, dd->dd_phys->dd_child_dir_zapobj); 319 320 err = zap_lookup(dp->dp_meta_objset, 321 dd->dd_phys->dd_child_dir_zapobj, 322 buf, sizeof (ddobj), 1, &ddobj); 323 if (err == ENOENT) { 324 break; 325 } 326 ASSERT(err == 0); 327 328 child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag); 329 dsl_dir_close(dd, tag); 330 dd = child_ds; 331 next = nextnext; 332 } 333 rw_exit(&dp->dp_config_rwlock); 334 335 /* 336 * It's an error if there's more than one component left, or 337 * tailp==NULL and there's any component left. 338 */ 339 if (next != NULL && 340 (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { 341 /* bad path name */ 342 dsl_dir_close(dd, tag); 343 dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); 344 next = NULL; 345 dd = NULL; 346 } 347 if (tailp) 348 *tailp = next; 349 if (openedspa) 350 spa_close(spa, FTAG); 351 return (dd); 352 } 353 354 /* 355 * Return the dsl_dir_t, and possibly the last component which couldn't 356 * be found in *tail. Return NULL if the path is bogus, or if 357 * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' 358 * means that the last component is a snapshot. 359 */ 360 dsl_dir_t * 361 dsl_dir_open(const char *name, void *tag, const char **tailp) 362 { 363 return (dsl_dir_open_spa(NULL, name, tag, tailp)); 364 } 365 366 int 367 dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) 368 { 369 objset_t *mos = pds->dd_pool->dp_meta_objset; 370 uint64_t ddobj; 371 dsl_dir_phys_t *dsphys; 372 dmu_buf_t *dbuf; 373 int err; 374 375 ASSERT(dmu_tx_is_syncing(tx)); 376 377 if (pds->dd_phys->dd_child_dir_zapobj == 0) { 378 dmu_buf_will_dirty(pds->dd_dbuf, tx); 379 pds->dd_phys->dd_child_dir_zapobj = zap_create(mos, 380 DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx); 381 } 382 383 rw_enter(&pds->dd_pool->dp_config_rwlock, RW_WRITER); 384 err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, 385 name, sizeof (uint64_t), 1, &ddobj); 386 if (err != ENOENT) { 387 rw_exit(&pds->dd_pool->dp_config_rwlock); 388 return (err ? err : EEXIST); 389 } 390 391 ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 392 DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx); 393 err = zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, 394 name, sizeof (uint64_t), 1, &ddobj, tx); 395 ASSERT3U(err, ==, 0); 396 dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n", 397 name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err); 398 399 dbuf = dmu_bonus_hold(mos, ddobj); 400 dmu_buf_will_dirty(dbuf, tx); 401 dsphys = dbuf->db_data; 402 403 dsphys->dd_creation_time = gethrestime_sec(); 404 dsphys->dd_parent_obj = pds->dd_object; 405 dsphys->dd_props_zapobj = zap_create(mos, 406 DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); 407 dsphys->dd_child_dir_zapobj = zap_create(mos, 408 DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx); 409 dmu_buf_rele(dbuf); 410 411 rw_exit(&pds->dd_pool->dp_config_rwlock); 412 413 return (0); 414 } 415 416 int 417 dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx) 418 { 419 const char *name = arg; 420 dsl_dir_t *dd = NULL; 421 dsl_pool_t *dp = pds->dd_pool; 422 objset_t *mos = dp->dp_meta_objset; 423 uint64_t val, obj, child_zapobj, props_zapobj; 424 int t, err; 425 426 rw_enter(&dp->dp_config_rwlock, RW_WRITER); 427 428 err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, name, 429 8, 1, &obj); 430 if (err) 431 goto out; 432 433 dd = dsl_dir_open_obj(dp, obj, name, FTAG); 434 ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object); 435 436 if (dmu_buf_refcount(dd->dd_dbuf) > 1) { 437 err = EBUSY; 438 goto out; 439 } 440 441 for (t = 0; t < TXG_SIZE; t++) { 442 /* 443 * if they were dirty, they'd also be open. 444 * dp_config_rwlock ensures that it stays that way. 445 */ 446 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); 447 } 448 449 child_zapobj = dd->dd_phys->dd_child_dir_zapobj; 450 props_zapobj = dd->dd_phys->dd_props_zapobj; 451 452 if (child_zapobj != 0) { 453 uint64_t count; 454 err = EEXIST; 455 (void) zap_count(mos, child_zapobj, &count); 456 if (count != 0) 457 goto out; 458 } 459 460 if (dd->dd_phys->dd_head_dataset_obj != 0) { 461 err = dsl_dataset_destroy_sync(dd, NULL, tx); 462 if (err) 463 goto out; 464 } 465 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); 466 467 /* The point of no (unsuccessful) return */ 468 469 /* Make sure parent's used gets updated */ 470 val = 0; 471 err = dsl_dir_set_reservation_sync(dd, &val, tx); 472 ASSERT(err == 0); 473 ASSERT3U(dd->dd_used_bytes, ==, 0); 474 ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); 475 dsl_dir_close(dd, FTAG); 476 dd = NULL; 477 478 err = dmu_object_free(mos, obj, tx); 479 ASSERT(err == 0); 480 481 if (child_zapobj) 482 err = zap_destroy(mos, child_zapobj, tx); 483 ASSERT(err == 0); 484 485 if (props_zapobj) 486 err = zap_destroy(mos, props_zapobj, tx); 487 ASSERT(err == 0); 488 489 err = zap_remove(mos, pds->dd_phys->dd_child_dir_zapobj, name, tx); 490 ASSERT(err == 0); 491 492 out: 493 rw_exit(&dp->dp_config_rwlock); 494 if (dd) 495 dsl_dir_close(dd, FTAG); 496 497 return (err); 498 } 499 500 void 501 dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) 502 { 503 dsl_dir_phys_t *dsp; 504 dmu_buf_t *dbuf; 505 int error; 506 507 *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 508 DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx); 509 510 error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, 511 sizeof (uint64_t), 1, ddobjp, tx); 512 ASSERT3U(error, ==, 0); 513 514 dbuf = dmu_bonus_hold(mos, *ddobjp); 515 dmu_buf_will_dirty(dbuf, tx); 516 dsp = dbuf->db_data; 517 518 dsp->dd_creation_time = gethrestime_sec(); 519 dsp->dd_props_zapobj = zap_create(mos, 520 DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); 521 dsp->dd_child_dir_zapobj = zap_create(mos, 522 DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx); 523 524 dmu_buf_rele(dbuf); 525 } 526 527 void 528 dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds) 529 { 530 bzero(dds, sizeof (dmu_objset_stats_t)); 531 532 dds->dds_dir_obj = dd->dd_object; 533 dds->dds_available = dsl_dir_space_available(dd, NULL, 0, TRUE); 534 535 mutex_enter(&dd->dd_lock); 536 dds->dds_space_used = dd->dd_used_bytes; 537 dds->dds_compressed_bytes = dd->dd_phys->dd_compressed_bytes; 538 dds->dds_uncompressed_bytes = dd->dd_phys->dd_uncompressed_bytes; 539 dds->dds_quota = dd->dd_phys->dd_quota; 540 dds->dds_reserved = dd->dd_phys->dd_reserved; 541 mutex_exit(&dd->dd_lock); 542 543 dds->dds_creation_time = dd->dd_phys->dd_creation_time; 544 545 dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0); 546 547 if (dd->dd_phys->dd_clone_parent_obj) { 548 dsl_dataset_t *ds; 549 550 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); 551 ds = dsl_dataset_open_obj(dd->dd_pool, 552 dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG); 553 dsl_dataset_name(ds, dds->dds_clone_of); 554 dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj; 555 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 556 rw_exit(&dd->dd_pool->dp_config_rwlock); 557 } 558 559 VERIFY(dsl_prop_get_ds_integer(dd, "checksum", 560 &dds->dds_checksum, dds->dds_checksum_setpoint) == 0); 561 562 VERIFY(dsl_prop_get_ds_integer(dd, "compression", 563 &dds->dds_compression, dds->dds_compression_setpoint) == 0); 564 565 VERIFY(dsl_prop_get_ds_integer(dd, "zoned", 566 &dds->dds_zoned, dds->dds_zoned_setpoint) == 0); 567 568 spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot, 569 sizeof (dds->dds_altroot)); 570 } 571 572 int 573 dsl_dir_sync_task(dsl_dir_t *dd, 574 int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space) 575 { 576 dmu_tx_t *tx; 577 dsl_pool_t *dp = dd->dd_pool; 578 int err = 0; 579 uint64_t txg; 580 581 dprintf_dd(dd, "func=%p space=%llu\n", func, space); 582 583 again: 584 tx = dmu_tx_create_ds(dd); 585 dmu_tx_hold_space(tx, space); 586 err = dmu_tx_assign(tx, TXG_WAIT); 587 if (err == ENOSPC || err == EDQUOT) { 588 dsl_dir_t *rds; 589 /* 590 * They can get their space from either this dd, or the 591 * root dd. 592 */ 593 for (rds = dd; rds->dd_parent; rds = rds->dd_parent) 594 continue; 595 dmu_tx_abort(tx); 596 tx = dmu_tx_create_ds(rds); 597 dmu_tx_hold_space(tx, space); 598 err = dmu_tx_assign(tx, TXG_WAIT); 599 } 600 if (err) { 601 dmu_tx_abort(tx); 602 return (err); 603 } 604 605 txg = dmu_tx_get_txg(tx); 606 mutex_enter(&dd->dd_lock); 607 if (dd->dd_sync_txg != 0) { 608 mutex_exit(&dd->dd_lock); 609 dmu_tx_commit(tx); 610 txg_wait_synced(dp, 0); 611 goto again; 612 } 613 614 /* We're good to go */ 615 616 dd->dd_sync_txg = txg; 617 dd->dd_sync_func = func; 618 dd->dd_sync_arg = arg; 619 620 mutex_exit(&dd->dd_lock); 621 622 dsl_dir_dirty(dd, tx); 623 dmu_tx_commit(tx); 624 625 txg_wait_synced(dp, txg); 626 627 mutex_enter(&dd->dd_lock); 628 ASSERT(dd->dd_sync_txg == txg); 629 ASSERT(dd->dd_sync_func == NULL); 630 err = dd->dd_sync_err; 631 dd->dd_sync_txg = 0; 632 mutex_exit(&dd->dd_lock); 633 634 return (err); 635 } 636 637 void 638 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) 639 { 640 dsl_pool_t *dp = dd->dd_pool; 641 642 ASSERT(dd->dd_phys); 643 644 if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { 645 /* up the hold count until we can be written out */ 646 dmu_buf_add_ref(dd->dd_dbuf, dd); 647 } 648 } 649 650 static int64_t 651 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) 652 { 653 uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); 654 uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); 655 return (new_accounted - old_accounted); 656 } 657 658 void 659 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) 660 { 661 if (dd->dd_sync_txg == tx->tx_txg && dd->dd_sync_func) { 662 dd->dd_sync_err = dd->dd_sync_func(dd, dd->dd_sync_arg, tx); 663 dd->dd_sync_func = NULL; 664 } 665 666 ASSERT(dmu_tx_is_syncing(tx)); 667 668 dmu_buf_will_dirty(dd->dd_dbuf, tx); 669 670 mutex_enter(&dd->dd_lock); 671 ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); 672 dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, 673 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); 674 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; 675 dd->dd_phys->dd_used_bytes = dd->dd_used_bytes; 676 mutex_exit(&dd->dd_lock); 677 678 /* release the hold from dsl_dir_dirty */ 679 dmu_buf_remove_ref(dd->dd_dbuf, dd); 680 } 681 682 static uint64_t 683 dsl_dir_estimated_space(dsl_dir_t *dd) 684 { 685 int64_t space; 686 int i; 687 688 ASSERT(MUTEX_HELD(&dd->dd_lock)); 689 690 space = dd->dd_used_bytes; 691 ASSERT(space >= 0); 692 for (i = 0; i < TXG_SIZE; i++) { 693 space += dd->dd_space_towrite[i&TXG_MASK]; 694 ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); 695 } 696 return (space); 697 } 698 699 /* 700 * How much space would dd have available if ancestor had delta applied 701 * to it? If ondiskonly is set, we're only interested in what's 702 * on-disk, not estimated pending changes. 703 */ 704 static uint64_t 705 dsl_dir_space_available(dsl_dir_t *dd, 706 dsl_dir_t *ancestor, int64_t delta, int ondiskonly) 707 { 708 uint64_t parentspace, myspace, quota, used; 709 710 /* 711 * If there are no restrictions otherwise, assume we have 712 * unlimited space available. 713 */ 714 quota = UINT64_MAX; 715 parentspace = UINT64_MAX; 716 717 if (dd->dd_parent != NULL) { 718 parentspace = dsl_dir_space_available(dd->dd_parent, 719 ancestor, delta, ondiskonly); 720 } 721 722 mutex_enter(&dd->dd_lock); 723 if (dd->dd_phys->dd_quota != 0) 724 quota = dd->dd_phys->dd_quota; 725 if (ondiskonly) { 726 used = dd->dd_used_bytes; 727 } else { 728 used = dsl_dir_estimated_space(dd); 729 } 730 if (dd == ancestor) 731 used += delta; 732 733 if (dd->dd_parent == NULL) { 734 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE); 735 quota = MIN(quota, poolsize); 736 } 737 738 if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { 739 /* 740 * We have some space reserved, in addition to what our 741 * parent gave us. 742 */ 743 parentspace += dd->dd_phys->dd_reserved - used; 744 } 745 746 if (used > quota) { 747 /* over quota */ 748 myspace = 0; 749 #ifdef ZFS_DEBUG 750 { 751 /* 752 * While it's OK to be a little over quota, if 753 * we think we are using more space than there 754 * is in the pool (which is already 6% more than 755 * dsl_pool_adjustedsize()), something is very 756 * wrong. 757 */ 758 uint64_t space = spa_get_space(dd->dd_pool->dp_spa); 759 ASSERT3U(used, <=, space); 760 } 761 #endif 762 } else { 763 /* 764 * the lesser of parent's space and the space 765 * left in our quota 766 */ 767 myspace = MIN(parentspace, quota - used); 768 } 769 770 mutex_exit(&dd->dd_lock); 771 772 return (myspace); 773 } 774 775 struct tempreserve { 776 list_node_t tr_node; 777 dsl_dir_t *tr_ds; 778 uint64_t tr_size; 779 }; 780 781 /* 782 * Reserve space in this dsl_dir, to be used in this tx's txg. 783 * After the space has been dirtied (and thus 784 * dsl_dir_willuse_space() has been called), the reservation should 785 * be canceled, using dsl_dir_tempreserve_clear(). 786 */ 787 static int 788 dsl_dir_tempreserve_impl(dsl_dir_t *dd, 789 uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx) 790 { 791 uint64_t txg = tx->tx_txg; 792 uint64_t est_used, quota, parent_rsrv; 793 int edquot = EDQUOT; 794 int txgidx = txg & TXG_MASK; 795 int i; 796 struct tempreserve *tr; 797 798 ASSERT3U(txg, !=, 0); 799 800 mutex_enter(&dd->dd_lock); 801 /* 802 * Check against the dsl_dir's quota. We don't add in the delta 803 * when checking for over-quota because they get one free hit. 804 */ 805 est_used = dsl_dir_estimated_space(dd); 806 for (i = 0; i < TXG_SIZE; i++) 807 est_used += dd->dd_tempreserved[i]; 808 809 quota = UINT64_MAX; 810 811 if (dd->dd_phys->dd_quota) 812 quota = dd->dd_phys->dd_quota; 813 814 /* 815 * If this transaction will result in a net free of space, we want 816 * to let it through, but we have to be careful: the space that it 817 * frees won't become available until *after* this txg syncs. 818 * Therefore, to ensure that it's possible to remove files from 819 * a full pool without inducing transient overcommits, we throttle 820 * netfree transactions against a quota that is slightly larger, 821 * but still within the pool's allocation slop. In cases where 822 * we're very close to full, this will allow a steady trickle of 823 * removes to get through. 824 */ 825 if (dd->dd_parent == NULL) { 826 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); 827 if (poolsize < quota) { 828 quota = poolsize; 829 edquot = ENOSPC; 830 } 831 } else if (netfree) { 832 quota = UINT64_MAX; 833 } 834 835 /* 836 * If they are requesting more space, and our current estimate 837 * is over quota. They get to try again unless the actual 838 * on-disk is over quota. 839 */ 840 if (asize > 0 && est_used > quota) { 841 if (dd->dd_used_bytes < quota) 842 edquot = ERESTART; 843 dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " 844 "quota=%lluK tr=%lluK err=%d\n", 845 dd->dd_used_bytes>>10, est_used>>10, 846 quota>>10, asize>>10, edquot); 847 mutex_exit(&dd->dd_lock); 848 return (edquot); 849 } 850 851 /* We need to up our estimated delta before dropping dd_lock */ 852 dd->dd_tempreserved[txgidx] += asize; 853 854 parent_rsrv = parent_delta(dd, est_used, asize); 855 mutex_exit(&dd->dd_lock); 856 857 tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); 858 tr->tr_ds = dd; 859 tr->tr_size = asize; 860 list_insert_tail(tr_list, tr); 861 862 /* see if it's OK with our parent */ 863 if (dd->dd_parent && parent_rsrv) { 864 return (dsl_dir_tempreserve_impl(dd->dd_parent, 865 parent_rsrv, netfree, tr_list, tx)); 866 } else { 867 return (0); 868 } 869 } 870 871 /* 872 * Reserve space in this dsl_dir, to be used in this tx's txg. 873 * After the space has been dirtied (and thus 874 * dsl_dir_willuse_space() has been called), the reservation should 875 * be canceled, using dsl_dir_tempreserve_clear(). 876 */ 877 int 878 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, 879 uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx) 880 { 881 int err = 0; 882 list_t *tr_list; 883 884 tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 885 list_create(tr_list, sizeof (struct tempreserve), 886 offsetof(struct tempreserve, tr_node)); 887 888 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, 889 tr_list, tx); 890 891 if (err == 0) { 892 struct tempreserve *tr; 893 894 err = arc_tempreserve_space(lsize); 895 if (err == 0) { 896 tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); 897 tr->tr_ds = NULL; 898 tr->tr_size = lsize; 899 list_insert_tail(tr_list, tr); 900 } 901 } 902 903 if (err) 904 dsl_dir_tempreserve_clear(tr_list, tx); 905 else 906 *tr_cookiep = tr_list; 907 return (err); 908 } 909 910 /* 911 * Clear a temporary reservation that we previously made with 912 * dsl_dir_tempreserve_space(). 913 */ 914 void 915 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) 916 { 917 int txgidx = tx->tx_txg & TXG_MASK; 918 list_t *tr_list = tr_cookie; 919 struct tempreserve *tr; 920 921 ASSERT3U(tx->tx_txg, !=, 0); 922 923 while (tr = list_head(tr_list)) { 924 if (tr->tr_ds == NULL) { 925 arc_tempreserve_clear(tr->tr_size); 926 } else { 927 mutex_enter(&tr->tr_ds->dd_lock); 928 ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, 929 tr->tr_size); 930 tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; 931 mutex_exit(&tr->tr_ds->dd_lock); 932 } 933 list_remove(tr_list, tr); 934 kmem_free(tr, sizeof (struct tempreserve)); 935 } 936 937 kmem_free(tr_list, sizeof (list_t)); 938 } 939 940 /* 941 * Call in open context when we think we're going to write/free space, 942 * eg. when dirtying data. Be conservative (ie. OK to write less than 943 * this or free more than this, but don't write more or free less). 944 */ 945 void 946 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) 947 { 948 int64_t parent_space; 949 uint64_t est_used; 950 951 mutex_enter(&dd->dd_lock); 952 if (space > 0) 953 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; 954 955 est_used = dsl_dir_estimated_space(dd); 956 parent_space = parent_delta(dd, est_used, space); 957 mutex_exit(&dd->dd_lock); 958 959 /* Make sure that we clean up dd_space_to* */ 960 dsl_dir_dirty(dd, tx); 961 962 /* XXX this is potentially expensive and unnecessary... */ 963 if (parent_space && dd->dd_parent) 964 dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); 965 } 966 967 /* call from syncing context when we actually write/free space for this dd */ 968 void 969 dsl_dir_diduse_space(dsl_dir_t *dd, 970 int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) 971 { 972 int64_t accounted_delta; 973 974 ASSERT(dmu_tx_is_syncing(tx)); 975 976 dsl_dir_dirty(dd, tx); 977 978 mutex_enter(&dd->dd_lock); 979 accounted_delta = parent_delta(dd, dd->dd_used_bytes, used); 980 ASSERT(used >= 0 || dd->dd_used_bytes >= -used); 981 ASSERT(compressed >= 0 || 982 dd->dd_phys->dd_compressed_bytes >= -compressed); 983 ASSERT(uncompressed >= 0 || 984 dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); 985 dd->dd_used_bytes += used; 986 if (used > 0) 987 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used; 988 dd->dd_phys->dd_uncompressed_bytes += uncompressed; 989 dd->dd_phys->dd_compressed_bytes += compressed; 990 mutex_exit(&dd->dd_lock); 991 992 if (dd->dd_parent != NULL) { 993 dsl_dir_diduse_space(dd->dd_parent, 994 accounted_delta, compressed, uncompressed, tx); 995 } 996 } 997 998 static int 999 dsl_dir_set_quota_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) 1000 { 1001 uint64_t *quotap = arg; 1002 uint64_t new_quota = *quotap; 1003 int err = 0; 1004 1005 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1006 1007 mutex_enter(&dd->dd_lock); 1008 if (new_quota != 0 && (new_quota < dd->dd_phys->dd_reserved || 1009 new_quota < dsl_dir_estimated_space(dd))) { 1010 err = ENOSPC; 1011 } else { 1012 dd->dd_phys->dd_quota = new_quota; 1013 } 1014 mutex_exit(&dd->dd_lock); 1015 return (err); 1016 } 1017 1018 int 1019 dsl_dir_set_quota(const char *ddname, uint64_t quota) 1020 { 1021 dsl_dir_t *dd; 1022 int err; 1023 1024 dd = dsl_dir_open(ddname, FTAG, NULL); 1025 if (dd == NULL) 1026 return (ENOENT); 1027 /* 1028 * If someone removes a file, then tries to set the quota, we 1029 * want to make sure the file freeing takes effect. 1030 */ 1031 txg_wait_open(dd->dd_pool, 0); 1032 1033 err = dsl_dir_sync_task(dd, dsl_dir_set_quota_sync, "a, 0); 1034 dsl_dir_close(dd, FTAG); 1035 return (err); 1036 } 1037 1038 static int 1039 dsl_dir_set_reservation_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) 1040 { 1041 uint64_t *reservationp = arg; 1042 uint64_t new_reservation = *reservationp; 1043 uint64_t used, avail; 1044 int64_t delta; 1045 1046 if (new_reservation > INT64_MAX) 1047 return (EOVERFLOW); 1048 1049 mutex_enter(&dd->dd_lock); 1050 used = dd->dd_used_bytes; 1051 delta = MAX(used, new_reservation) - 1052 MAX(used, dd->dd_phys->dd_reserved); 1053 mutex_exit(&dd->dd_lock); 1054 1055 if (dd->dd_parent) { 1056 avail = dsl_dir_space_available(dd->dd_parent, 1057 NULL, 0, FALSE); 1058 } else { 1059 avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; 1060 } 1061 1062 if (delta > 0 && delta > avail) 1063 return (ENOSPC); 1064 if (delta > 0 && dd->dd_phys->dd_quota > 0 && 1065 new_reservation > dd->dd_phys->dd_quota) 1066 return (ENOSPC); 1067 1068 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1069 dd->dd_phys->dd_reserved = new_reservation; 1070 1071 if (dd->dd_parent != NULL) { 1072 /* Roll up this additional usage into our ancestors */ 1073 dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx); 1074 } 1075 return (0); 1076 } 1077 1078 int 1079 dsl_dir_set_reservation(const char *ddname, uint64_t reservation) 1080 { 1081 dsl_dir_t *dd; 1082 int err; 1083 1084 dd = dsl_dir_open(ddname, FTAG, NULL); 1085 if (dd == NULL) 1086 return (ENOENT); 1087 err = dsl_dir_sync_task(dd, 1088 dsl_dir_set_reservation_sync, &reservation, 0); 1089 dsl_dir_close(dd, FTAG); 1090 return (err); 1091 } 1092 1093 static dsl_dir_t * 1094 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) 1095 { 1096 for (; ds1; ds1 = ds1->dd_parent) { 1097 dsl_dir_t *dd; 1098 for (dd = ds2; dd; dd = dd->dd_parent) { 1099 if (ds1 == dd) 1100 return (dd); 1101 } 1102 } 1103 return (NULL); 1104 } 1105 1106 /* 1107 * If delta is applied to dd, how much of that delta would be applied to 1108 * ancestor? Syncing context only. 1109 */ 1110 static int64_t 1111 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) 1112 { 1113 if (dd == ancestor) 1114 return (delta); 1115 1116 mutex_enter(&dd->dd_lock); 1117 delta = parent_delta(dd, dd->dd_used_bytes, delta); 1118 mutex_exit(&dd->dd_lock); 1119 return (would_change(dd->dd_parent, delta, ancestor)); 1120 } 1121 1122 int 1123 dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) 1124 { 1125 const char *newname = arg; 1126 dsl_pool_t *dp = dd->dd_pool; 1127 objset_t *mos = dp->dp_meta_objset; 1128 dsl_dir_t *newpds; 1129 const char *tail; 1130 int err, len; 1131 1132 /* can't rename to different pool */ 1133 len = strlen(dp->dp_root_dir->dd_myname); 1134 if (strncmp(dp->dp_root_dir->dd_myname, newname, len != 0) || 1135 newname[len] != '/') { 1136 return (ENXIO); 1137 } 1138 1139 newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail); 1140 1141 /* new parent should exist */ 1142 if (newpds == NULL) 1143 return (ENOENT); 1144 1145 /* new name should not already exist */ 1146 if (tail == NULL) { 1147 dsl_dir_close(newpds, FTAG); 1148 return (EEXIST); 1149 } 1150 1151 rw_enter(&dp->dp_config_rwlock, RW_WRITER); 1152 1153 /* There should be 2 references: the open and the dirty */ 1154 if (dmu_buf_refcount(dd->dd_dbuf) > 2) { 1155 rw_exit(&dp->dp_config_rwlock); 1156 dsl_dir_close(newpds, FTAG); 1157 return (EBUSY); 1158 } 1159 1160 if (newpds != dd->dd_parent) { 1161 dsl_dir_t *ancestor; 1162 int64_t adelta; 1163 uint64_t myspace, avail; 1164 1165 ancestor = closest_common_ancestor(dd, newpds); 1166 1167 /* no rename into our descendent */ 1168 if (ancestor == dd) { 1169 dsl_dir_close(newpds, FTAG); 1170 rw_exit(&dp->dp_config_rwlock); 1171 return (EINVAL); 1172 } 1173 1174 myspace = MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); 1175 adelta = would_change(dd->dd_parent, -myspace, ancestor); 1176 avail = dsl_dir_space_available(newpds, 1177 ancestor, adelta, FALSE); 1178 if (avail < myspace) { 1179 dsl_dir_close(newpds, FTAG); 1180 rw_exit(&dp->dp_config_rwlock); 1181 return (ENOSPC); 1182 } 1183 1184 /* The point of no (unsuccessful) return */ 1185 1186 dsl_dir_diduse_space(dd->dd_parent, -myspace, 1187 -dd->dd_phys->dd_compressed_bytes, 1188 -dd->dd_phys->dd_uncompressed_bytes, tx); 1189 dsl_dir_diduse_space(newpds, myspace, 1190 dd->dd_phys->dd_compressed_bytes, 1191 dd->dd_phys->dd_uncompressed_bytes, tx); 1192 } 1193 1194 /* The point of no (unsuccessful) return */ 1195 1196 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1197 1198 /* remove from old parent zapobj */ 1199 err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, 1200 dd->dd_myname, tx); 1201 ASSERT3U(err, ==, 0); 1202 1203 (void) strcpy(dd->dd_myname, tail); 1204 dsl_dir_close(dd->dd_parent, dd); 1205 dd->dd_phys->dd_parent_obj = newpds->dd_object; 1206 dd->dd_parent = dsl_dir_open_obj(dd->dd_pool, 1207 newpds->dd_object, NULL, dd); 1208 1209 /* add to new parent zapobj */ 1210 err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj, 1211 dd->dd_myname, 8, 1, &dd->dd_object, tx); 1212 ASSERT3U(err, ==, 0); 1213 1214 dsl_dir_close(newpds, FTAG); 1215 rw_exit(&dp->dp_config_rwlock); 1216 return (0); 1217 } 1218