1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 26 */ 27 28 /* Portions Copyright 2010 Robert Milkowski */ 29 30 #include <sys/cred.h> 31 #include <sys/zfs_context.h> 32 #include <sys/dmu_objset.h> 33 #include <sys/dsl_dir.h> 34 #include <sys/dsl_dataset.h> 35 #include <sys/dsl_prop.h> 36 #include <sys/dsl_pool.h> 37 #include <sys/dsl_synctask.h> 38 #include <sys/dsl_deleg.h> 39 #include <sys/dnode.h> 40 #include <sys/dbuf.h> 41 #include <sys/zvol.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/dmu_impl.h> 46 #include <sys/zfs_ioctl.h> 47 #include <sys/sa.h> 48 #include <sys/zfs_onexit.h> 49 #include <sys/dsl_destroy.h> 50 51 /* 52 * Needed to close a window in dnode_move() that allows the objset to be freed 53 * before it can be safely accessed. 54 */ 55 krwlock_t os_lock; 56 57 void 58 dmu_objset_init(void) 59 { 60 rw_init(&os_lock, NULL, RW_DEFAULT, NULL); 61 } 62 63 void 64 dmu_objset_fini(void) 65 { 66 rw_destroy(&os_lock); 67 } 68 69 spa_t * 70 dmu_objset_spa(objset_t *os) 71 { 72 return (os->os_spa); 73 } 74 75 zilog_t * 76 dmu_objset_zil(objset_t *os) 77 { 78 return (os->os_zil); 79 } 80 81 dsl_pool_t * 82 dmu_objset_pool(objset_t *os) 83 { 84 dsl_dataset_t *ds; 85 86 if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) 87 return (ds->ds_dir->dd_pool); 88 else 89 return (spa_get_dsl(os->os_spa)); 90 } 91 92 dsl_dataset_t * 93 dmu_objset_ds(objset_t *os) 94 { 95 return (os->os_dsl_dataset); 96 } 97 98 dmu_objset_type_t 99 dmu_objset_type(objset_t *os) 100 { 101 return (os->os_phys->os_type); 102 } 103 104 void 105 dmu_objset_name(objset_t *os, char *buf) 106 { 107 dsl_dataset_name(os->os_dsl_dataset, buf); 108 } 109 110 uint64_t 111 dmu_objset_id(objset_t *os) 112 { 113 dsl_dataset_t *ds = os->os_dsl_dataset; 114 115 return (ds ? ds->ds_object : 0); 116 } 117 118 uint64_t 119 dmu_objset_syncprop(objset_t *os) 120 { 121 return (os->os_sync); 122 } 123 124 uint64_t 125 dmu_objset_logbias(objset_t *os) 126 { 127 return (os->os_logbias); 128 } 129 130 static void 131 checksum_changed_cb(void *arg, uint64_t newval) 132 { 133 objset_t *os = arg; 134 135 /* 136 * Inheritance should have been done by now. 137 */ 138 ASSERT(newval != ZIO_CHECKSUM_INHERIT); 139 140 os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); 141 } 142 143 static void 144 compression_changed_cb(void *arg, uint64_t newval) 145 { 146 objset_t *os = arg; 147 148 /* 149 * Inheritance and range checking should have been done by now. 150 */ 151 ASSERT(newval != ZIO_COMPRESS_INHERIT); 152 153 os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); 154 } 155 156 static void 157 copies_changed_cb(void *arg, uint64_t newval) 158 { 159 objset_t *os = arg; 160 161 /* 162 * Inheritance and range checking should have been done by now. 163 */ 164 ASSERT(newval > 0); 165 ASSERT(newval <= spa_max_replication(os->os_spa)); 166 167 os->os_copies = newval; 168 } 169 170 static void 171 dedup_changed_cb(void *arg, uint64_t newval) 172 { 173 objset_t *os = arg; 174 spa_t *spa = os->os_spa; 175 enum zio_checksum checksum; 176 177 /* 178 * Inheritance should have been done by now. 179 */ 180 ASSERT(newval != ZIO_CHECKSUM_INHERIT); 181 182 checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); 183 184 os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; 185 os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); 186 } 187 188 static void 189 primary_cache_changed_cb(void *arg, uint64_t newval) 190 { 191 objset_t *os = arg; 192 193 /* 194 * Inheritance and range checking should have been done by now. 195 */ 196 ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || 197 newval == ZFS_CACHE_METADATA); 198 199 os->os_primary_cache = newval; 200 } 201 202 static void 203 secondary_cache_changed_cb(void *arg, uint64_t newval) 204 { 205 objset_t *os = arg; 206 207 /* 208 * Inheritance and range checking should have been done by now. 209 */ 210 ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || 211 newval == ZFS_CACHE_METADATA); 212 213 os->os_secondary_cache = newval; 214 } 215 216 static void 217 sync_changed_cb(void *arg, uint64_t newval) 218 { 219 objset_t *os = arg; 220 221 /* 222 * Inheritance and range checking should have been done by now. 223 */ 224 ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || 225 newval == ZFS_SYNC_DISABLED); 226 227 os->os_sync = newval; 228 if (os->os_zil) 229 zil_set_sync(os->os_zil, newval); 230 } 231 232 static void 233 logbias_changed_cb(void *arg, uint64_t newval) 234 { 235 objset_t *os = arg; 236 237 ASSERT(newval == ZFS_LOGBIAS_LATENCY || 238 newval == ZFS_LOGBIAS_THROUGHPUT); 239 os->os_logbias = newval; 240 if (os->os_zil) 241 zil_set_logbias(os->os_zil, newval); 242 } 243 244 void 245 dmu_objset_byteswap(void *buf, size_t size) 246 { 247 objset_phys_t *osp = buf; 248 249 ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); 250 dnode_byteswap(&osp->os_meta_dnode); 251 byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); 252 osp->os_type = BSWAP_64(osp->os_type); 253 osp->os_flags = BSWAP_64(osp->os_flags); 254 if (size == sizeof (objset_phys_t)) { 255 dnode_byteswap(&osp->os_userused_dnode); 256 dnode_byteswap(&osp->os_groupused_dnode); 257 } 258 } 259 260 int 261 dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, 262 objset_t **osp) 263 { 264 objset_t *os; 265 int i, err; 266 267 ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); 268 269 os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); 270 os->os_dsl_dataset = ds; 271 os->os_spa = spa; 272 os->os_rootbp = bp; 273 if (!BP_IS_HOLE(os->os_rootbp)) { 274 uint32_t aflags = ARC_WAIT; 275 zbookmark_t zb; 276 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 277 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 278 279 if (DMU_OS_IS_L2CACHEABLE(os)) 280 aflags |= ARC_L2CACHE; 281 if (DMU_OS_IS_L2COMPRESSIBLE(os)) 282 aflags |= ARC_L2COMPRESS; 283 284 dprintf_bp(os->os_rootbp, "reading %s", ""); 285 err = arc_read(NULL, spa, os->os_rootbp, 286 arc_getbuf_func, &os->os_phys_buf, 287 ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); 288 if (err != 0) { 289 kmem_free(os, sizeof (objset_t)); 290 /* convert checksum errors into IO errors */ 291 if (err == ECKSUM) 292 err = SET_ERROR(EIO); 293 return (err); 294 } 295 296 /* Increase the blocksize if we are permitted. */ 297 if (spa_version(spa) >= SPA_VERSION_USERSPACE && 298 arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { 299 arc_buf_t *buf = arc_buf_alloc(spa, 300 sizeof (objset_phys_t), &os->os_phys_buf, 301 ARC_BUFC_METADATA); 302 bzero(buf->b_data, sizeof (objset_phys_t)); 303 bcopy(os->os_phys_buf->b_data, buf->b_data, 304 arc_buf_size(os->os_phys_buf)); 305 (void) arc_buf_remove_ref(os->os_phys_buf, 306 &os->os_phys_buf); 307 os->os_phys_buf = buf; 308 } 309 310 os->os_phys = os->os_phys_buf->b_data; 311 os->os_flags = os->os_phys->os_flags; 312 } else { 313 int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 314 sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; 315 os->os_phys_buf = arc_buf_alloc(spa, size, 316 &os->os_phys_buf, ARC_BUFC_METADATA); 317 os->os_phys = os->os_phys_buf->b_data; 318 bzero(os->os_phys, size); 319 } 320 321 /* 322 * Note: the changed_cb will be called once before the register 323 * func returns, thus changing the checksum/compression from the 324 * default (fletcher2/off). Snapshots don't need to know about 325 * checksum/compression/copies. 326 */ 327 if (ds) { 328 err = dsl_prop_register(ds, 329 zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), 330 primary_cache_changed_cb, os); 331 if (err == 0) { 332 err = dsl_prop_register(ds, 333 zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), 334 secondary_cache_changed_cb, os); 335 } 336 if (!dsl_dataset_is_snapshot(ds)) { 337 if (err == 0) { 338 err = dsl_prop_register(ds, 339 zfs_prop_to_name(ZFS_PROP_CHECKSUM), 340 checksum_changed_cb, os); 341 } 342 if (err == 0) { 343 err = dsl_prop_register(ds, 344 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 345 compression_changed_cb, os); 346 } 347 if (err == 0) { 348 err = dsl_prop_register(ds, 349 zfs_prop_to_name(ZFS_PROP_COPIES), 350 copies_changed_cb, os); 351 } 352 if (err == 0) { 353 err = dsl_prop_register(ds, 354 zfs_prop_to_name(ZFS_PROP_DEDUP), 355 dedup_changed_cb, os); 356 } 357 if (err == 0) { 358 err = dsl_prop_register(ds, 359 zfs_prop_to_name(ZFS_PROP_LOGBIAS), 360 logbias_changed_cb, os); 361 } 362 if (err == 0) { 363 err = dsl_prop_register(ds, 364 zfs_prop_to_name(ZFS_PROP_SYNC), 365 sync_changed_cb, os); 366 } 367 } 368 if (err != 0) { 369 VERIFY(arc_buf_remove_ref(os->os_phys_buf, 370 &os->os_phys_buf)); 371 kmem_free(os, sizeof (objset_t)); 372 return (err); 373 } 374 } else if (ds == NULL) { 375 /* It's the meta-objset. */ 376 os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; 377 os->os_compress = ZIO_COMPRESS_LZJB; 378 os->os_copies = spa_max_replication(spa); 379 os->os_dedup_checksum = ZIO_CHECKSUM_OFF; 380 os->os_dedup_verify = 0; 381 os->os_logbias = 0; 382 os->os_sync = 0; 383 os->os_primary_cache = ZFS_CACHE_ALL; 384 os->os_secondary_cache = ZFS_CACHE_ALL; 385 } 386 387 if (ds == NULL || !dsl_dataset_is_snapshot(ds)) 388 os->os_zil_header = os->os_phys->os_zil_header; 389 os->os_zil = zil_alloc(os, &os->os_zil_header); 390 391 for (i = 0; i < TXG_SIZE; i++) { 392 list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), 393 offsetof(dnode_t, dn_dirty_link[i])); 394 list_create(&os->os_free_dnodes[i], sizeof (dnode_t), 395 offsetof(dnode_t, dn_dirty_link[i])); 396 } 397 list_create(&os->os_dnodes, sizeof (dnode_t), 398 offsetof(dnode_t, dn_link)); 399 list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), 400 offsetof(dmu_buf_impl_t, db_link)); 401 402 mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); 403 mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); 404 mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); 405 406 DMU_META_DNODE(os) = dnode_special_open(os, 407 &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, 408 &os->os_meta_dnode); 409 if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { 410 DMU_USERUSED_DNODE(os) = dnode_special_open(os, 411 &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, 412 &os->os_userused_dnode); 413 DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, 414 &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, 415 &os->os_groupused_dnode); 416 } 417 418 /* 419 * We should be the only thread trying to do this because we 420 * have ds_opening_lock 421 */ 422 if (ds) { 423 mutex_enter(&ds->ds_lock); 424 ASSERT(ds->ds_objset == NULL); 425 ds->ds_objset = os; 426 mutex_exit(&ds->ds_lock); 427 } 428 429 *osp = os; 430 return (0); 431 } 432 433 int 434 dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) 435 { 436 int err = 0; 437 438 mutex_enter(&ds->ds_opening_lock); 439 *osp = ds->ds_objset; 440 if (*osp == NULL) { 441 err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), 442 ds, dsl_dataset_get_blkptr(ds), osp); 443 } 444 mutex_exit(&ds->ds_opening_lock); 445 return (err); 446 } 447 448 /* 449 * Holds the pool while the objset is held. Therefore only one objset 450 * can be held at a time. 451 */ 452 int 453 dmu_objset_hold(const char *name, void *tag, objset_t **osp) 454 { 455 dsl_pool_t *dp; 456 dsl_dataset_t *ds; 457 int err; 458 459 err = dsl_pool_hold(name, tag, &dp); 460 if (err != 0) 461 return (err); 462 err = dsl_dataset_hold(dp, name, tag, &ds); 463 if (err != 0) { 464 dsl_pool_rele(dp, tag); 465 return (err); 466 } 467 468 err = dmu_objset_from_ds(ds, osp); 469 if (err != 0) { 470 dsl_dataset_rele(ds, tag); 471 dsl_pool_rele(dp, tag); 472 } 473 474 return (err); 475 } 476 477 /* 478 * dsl_pool must not be held when this is called. 479 * Upon successful return, there will be a longhold on the dataset, 480 * and the dsl_pool will not be held. 481 */ 482 int 483 dmu_objset_own(const char *name, dmu_objset_type_t type, 484 boolean_t readonly, void *tag, objset_t **osp) 485 { 486 dsl_pool_t *dp; 487 dsl_dataset_t *ds; 488 int err; 489 490 err = dsl_pool_hold(name, FTAG, &dp); 491 if (err != 0) 492 return (err); 493 err = dsl_dataset_own(dp, name, tag, &ds); 494 if (err != 0) { 495 dsl_pool_rele(dp, FTAG); 496 return (err); 497 } 498 499 err = dmu_objset_from_ds(ds, osp); 500 dsl_pool_rele(dp, FTAG); 501 if (err != 0) { 502 dsl_dataset_disown(ds, tag); 503 } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { 504 dsl_dataset_disown(ds, tag); 505 return (SET_ERROR(EINVAL)); 506 } else if (!readonly && dsl_dataset_is_snapshot(ds)) { 507 dsl_dataset_disown(ds, tag); 508 return (SET_ERROR(EROFS)); 509 } 510 return (err); 511 } 512 513 void 514 dmu_objset_rele(objset_t *os, void *tag) 515 { 516 dsl_pool_t *dp = dmu_objset_pool(os); 517 dsl_dataset_rele(os->os_dsl_dataset, tag); 518 dsl_pool_rele(dp, tag); 519 } 520 521 /* 522 * When we are called, os MUST refer to an objset associated with a dataset 523 * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner 524 * == tag. We will then release and reacquire ownership of the dataset while 525 * holding the pool config_rwlock to avoid intervening namespace or ownership 526 * changes may occur. 527 * 528 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to 529 * release the hold on its dataset and acquire a new one on the dataset of the 530 * same name so that it can be partially torn down and reconstructed. 531 */ 532 void 533 dmu_objset_refresh_ownership(objset_t *os, void *tag) 534 { 535 dsl_pool_t *dp; 536 dsl_dataset_t *ds, *newds; 537 char name[MAXNAMELEN]; 538 539 ds = os->os_dsl_dataset; 540 VERIFY3P(ds, !=, NULL); 541 VERIFY3P(ds->ds_owner, ==, tag); 542 VERIFY(dsl_dataset_long_held(ds)); 543 544 dsl_dataset_name(ds, name); 545 dp = dmu_objset_pool(os); 546 dsl_pool_config_enter(dp, FTAG); 547 dmu_objset_disown(os, tag); 548 VERIFY0(dsl_dataset_own(dp, name, tag, &newds)); 549 VERIFY3P(newds, ==, os->os_dsl_dataset); 550 dsl_pool_config_exit(dp, FTAG); 551 } 552 553 void 554 dmu_objset_disown(objset_t *os, void *tag) 555 { 556 dsl_dataset_disown(os->os_dsl_dataset, tag); 557 } 558 559 void 560 dmu_objset_evict_dbufs(objset_t *os) 561 { 562 dnode_t *dn; 563 564 mutex_enter(&os->os_lock); 565 566 /* process the mdn last, since the other dnodes have holds on it */ 567 list_remove(&os->os_dnodes, DMU_META_DNODE(os)); 568 list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); 569 570 /* 571 * Find the first dnode with holds. We have to do this dance 572 * because dnode_add_ref() only works if you already have a 573 * hold. If there are no holds then it has no dbufs so OK to 574 * skip. 575 */ 576 for (dn = list_head(&os->os_dnodes); 577 dn && !dnode_add_ref(dn, FTAG); 578 dn = list_next(&os->os_dnodes, dn)) 579 continue; 580 581 while (dn) { 582 dnode_t *next_dn = dn; 583 584 do { 585 next_dn = list_next(&os->os_dnodes, next_dn); 586 } while (next_dn && !dnode_add_ref(next_dn, FTAG)); 587 588 mutex_exit(&os->os_lock); 589 dnode_evict_dbufs(dn); 590 dnode_rele(dn, FTAG); 591 mutex_enter(&os->os_lock); 592 dn = next_dn; 593 } 594 mutex_exit(&os->os_lock); 595 } 596 597 void 598 dmu_objset_evict(objset_t *os) 599 { 600 dsl_dataset_t *ds = os->os_dsl_dataset; 601 602 for (int t = 0; t < TXG_SIZE; t++) 603 ASSERT(!dmu_objset_is_dirty(os, t)); 604 605 if (ds) { 606 if (!dsl_dataset_is_snapshot(ds)) { 607 VERIFY0(dsl_prop_unregister(ds, 608 zfs_prop_to_name(ZFS_PROP_CHECKSUM), 609 checksum_changed_cb, os)); 610 VERIFY0(dsl_prop_unregister(ds, 611 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 612 compression_changed_cb, os)); 613 VERIFY0(dsl_prop_unregister(ds, 614 zfs_prop_to_name(ZFS_PROP_COPIES), 615 copies_changed_cb, os)); 616 VERIFY0(dsl_prop_unregister(ds, 617 zfs_prop_to_name(ZFS_PROP_DEDUP), 618 dedup_changed_cb, os)); 619 VERIFY0(dsl_prop_unregister(ds, 620 zfs_prop_to_name(ZFS_PROP_LOGBIAS), 621 logbias_changed_cb, os)); 622 VERIFY0(dsl_prop_unregister(ds, 623 zfs_prop_to_name(ZFS_PROP_SYNC), 624 sync_changed_cb, os)); 625 } 626 VERIFY0(dsl_prop_unregister(ds, 627 zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), 628 primary_cache_changed_cb, os)); 629 VERIFY0(dsl_prop_unregister(ds, 630 zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), 631 secondary_cache_changed_cb, os)); 632 } 633 634 if (os->os_sa) 635 sa_tear_down(os); 636 637 dmu_objset_evict_dbufs(os); 638 639 dnode_special_close(&os->os_meta_dnode); 640 if (DMU_USERUSED_DNODE(os)) { 641 dnode_special_close(&os->os_userused_dnode); 642 dnode_special_close(&os->os_groupused_dnode); 643 } 644 zil_free(os->os_zil); 645 646 ASSERT3P(list_head(&os->os_dnodes), ==, NULL); 647 648 VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); 649 650 /* 651 * This is a barrier to prevent the objset from going away in 652 * dnode_move() until we can safely ensure that the objset is still in 653 * use. We consider the objset valid before the barrier and invalid 654 * after the barrier. 655 */ 656 rw_enter(&os_lock, RW_READER); 657 rw_exit(&os_lock); 658 659 mutex_destroy(&os->os_lock); 660 mutex_destroy(&os->os_obj_lock); 661 mutex_destroy(&os->os_user_ptr_lock); 662 kmem_free(os, sizeof (objset_t)); 663 } 664 665 timestruc_t 666 dmu_objset_snap_cmtime(objset_t *os) 667 { 668 return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); 669 } 670 671 /* called from dsl for meta-objset */ 672 objset_t * 673 dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, 674 dmu_objset_type_t type, dmu_tx_t *tx) 675 { 676 objset_t *os; 677 dnode_t *mdn; 678 679 ASSERT(dmu_tx_is_syncing(tx)); 680 681 if (ds != NULL) 682 VERIFY0(dmu_objset_from_ds(ds, &os)); 683 else 684 VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); 685 686 mdn = DMU_META_DNODE(os); 687 688 dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, 689 DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); 690 691 /* 692 * We don't want to have to increase the meta-dnode's nlevels 693 * later, because then we could do it in quescing context while 694 * we are also accessing it in open context. 695 * 696 * This precaution is not necessary for the MOS (ds == NULL), 697 * because the MOS is only updated in syncing context. 698 * This is most fortunate: the MOS is the only objset that 699 * needs to be synced multiple times as spa_sync() iterates 700 * to convergence, so minimizing its dn_nlevels matters. 701 */ 702 if (ds != NULL) { 703 int levels = 1; 704 705 /* 706 * Determine the number of levels necessary for the meta-dnode 707 * to contain DN_MAX_OBJECT dnodes. 708 */ 709 while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + 710 (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < 711 DN_MAX_OBJECT * sizeof (dnode_phys_t)) 712 levels++; 713 714 mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = 715 mdn->dn_nlevels = levels; 716 } 717 718 ASSERT(type != DMU_OST_NONE); 719 ASSERT(type != DMU_OST_ANY); 720 ASSERT(type < DMU_OST_NUMTYPES); 721 os->os_phys->os_type = type; 722 if (dmu_objset_userused_enabled(os)) { 723 os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; 724 os->os_flags = os->os_phys->os_flags; 725 } 726 727 dsl_dataset_dirty(ds, tx); 728 729 return (os); 730 } 731 732 typedef struct dmu_objset_create_arg { 733 const char *doca_name; 734 cred_t *doca_cred; 735 void (*doca_userfunc)(objset_t *os, void *arg, 736 cred_t *cr, dmu_tx_t *tx); 737 void *doca_userarg; 738 dmu_objset_type_t doca_type; 739 uint64_t doca_flags; 740 } dmu_objset_create_arg_t; 741 742 /*ARGSUSED*/ 743 static int 744 dmu_objset_create_check(void *arg, dmu_tx_t *tx) 745 { 746 dmu_objset_create_arg_t *doca = arg; 747 dsl_pool_t *dp = dmu_tx_pool(tx); 748 dsl_dir_t *pdd; 749 const char *tail; 750 int error; 751 752 if (strchr(doca->doca_name, '@') != NULL) 753 return (SET_ERROR(EINVAL)); 754 755 error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); 756 if (error != 0) 757 return (error); 758 if (tail == NULL) { 759 dsl_dir_rele(pdd, FTAG); 760 return (SET_ERROR(EEXIST)); 761 } 762 error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, 763 doca->doca_cred); 764 dsl_dir_rele(pdd, FTAG); 765 766 return (error); 767 } 768 769 static void 770 dmu_objset_create_sync(void *arg, dmu_tx_t *tx) 771 { 772 dmu_objset_create_arg_t *doca = arg; 773 dsl_pool_t *dp = dmu_tx_pool(tx); 774 dsl_dir_t *pdd; 775 const char *tail; 776 dsl_dataset_t *ds; 777 uint64_t obj; 778 blkptr_t *bp; 779 objset_t *os; 780 781 VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); 782 783 obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, 784 doca->doca_cred, tx); 785 786 VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); 787 bp = dsl_dataset_get_blkptr(ds); 788 os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, 789 ds, bp, doca->doca_type, tx); 790 791 if (doca->doca_userfunc != NULL) { 792 doca->doca_userfunc(os, doca->doca_userarg, 793 doca->doca_cred, tx); 794 } 795 796 spa_history_log_internal_ds(ds, "create", tx, ""); 797 dsl_dataset_rele(ds, FTAG); 798 dsl_dir_rele(pdd, FTAG); 799 } 800 801 int 802 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, 803 void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) 804 { 805 dmu_objset_create_arg_t doca; 806 807 doca.doca_name = name; 808 doca.doca_cred = CRED(); 809 doca.doca_flags = flags; 810 doca.doca_userfunc = func; 811 doca.doca_userarg = arg; 812 doca.doca_type = type; 813 814 return (dsl_sync_task(name, 815 dmu_objset_create_check, dmu_objset_create_sync, &doca, 5)); 816 } 817 818 typedef struct dmu_objset_clone_arg { 819 const char *doca_clone; 820 const char *doca_origin; 821 cred_t *doca_cred; 822 } dmu_objset_clone_arg_t; 823 824 /*ARGSUSED*/ 825 static int 826 dmu_objset_clone_check(void *arg, dmu_tx_t *tx) 827 { 828 dmu_objset_clone_arg_t *doca = arg; 829 dsl_dir_t *pdd; 830 const char *tail; 831 int error; 832 dsl_dataset_t *origin; 833 dsl_pool_t *dp = dmu_tx_pool(tx); 834 835 if (strchr(doca->doca_clone, '@') != NULL) 836 return (SET_ERROR(EINVAL)); 837 838 error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); 839 if (error != 0) 840 return (error); 841 if (tail == NULL) { 842 dsl_dir_rele(pdd, FTAG); 843 return (SET_ERROR(EEXIST)); 844 } 845 /* You can't clone across pools. */ 846 if (pdd->dd_pool != dp) { 847 dsl_dir_rele(pdd, FTAG); 848 return (SET_ERROR(EXDEV)); 849 } 850 error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, 851 doca->doca_cred); 852 if (error != 0) { 853 dsl_dir_rele(pdd, FTAG); 854 return (SET_ERROR(EDQUOT)); 855 } 856 dsl_dir_rele(pdd, FTAG); 857 858 error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); 859 if (error != 0) 860 return (error); 861 862 /* You can't clone across pools. */ 863 if (origin->ds_dir->dd_pool != dp) { 864 dsl_dataset_rele(origin, FTAG); 865 return (SET_ERROR(EXDEV)); 866 } 867 868 /* You can only clone snapshots, not the head datasets. */ 869 if (!dsl_dataset_is_snapshot(origin)) { 870 dsl_dataset_rele(origin, FTAG); 871 return (SET_ERROR(EINVAL)); 872 } 873 dsl_dataset_rele(origin, FTAG); 874 875 return (0); 876 } 877 878 static void 879 dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) 880 { 881 dmu_objset_clone_arg_t *doca = arg; 882 dsl_pool_t *dp = dmu_tx_pool(tx); 883 dsl_dir_t *pdd; 884 const char *tail; 885 dsl_dataset_t *origin, *ds; 886 uint64_t obj; 887 char namebuf[MAXNAMELEN]; 888 889 VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); 890 VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); 891 892 obj = dsl_dataset_create_sync(pdd, tail, origin, 0, 893 doca->doca_cred, tx); 894 895 VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); 896 dsl_dataset_name(origin, namebuf); 897 spa_history_log_internal_ds(ds, "clone", tx, 898 "origin=%s (%llu)", namebuf, origin->ds_object); 899 dsl_dataset_rele(ds, FTAG); 900 dsl_dataset_rele(origin, FTAG); 901 dsl_dir_rele(pdd, FTAG); 902 } 903 904 int 905 dmu_objset_clone(const char *clone, const char *origin) 906 { 907 dmu_objset_clone_arg_t doca; 908 909 doca.doca_clone = clone; 910 doca.doca_origin = origin; 911 doca.doca_cred = CRED(); 912 913 return (dsl_sync_task(clone, 914 dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5)); 915 } 916 917 int 918 dmu_objset_snapshot_one(const char *fsname, const char *snapname) 919 { 920 int err; 921 char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); 922 nvlist_t *snaps = fnvlist_alloc(); 923 924 fnvlist_add_boolean(snaps, longsnap); 925 strfree(longsnap); 926 err = dsl_dataset_snapshot(snaps, NULL, NULL); 927 fnvlist_free(snaps); 928 return (err); 929 } 930 931 static void 932 dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) 933 { 934 dnode_t *dn; 935 936 while (dn = list_head(list)) { 937 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 938 ASSERT(dn->dn_dbuf->db_data_pending); 939 /* 940 * Initialize dn_zio outside dnode_sync() because the 941 * meta-dnode needs to set it ouside dnode_sync(). 942 */ 943 dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; 944 ASSERT(dn->dn_zio); 945 946 ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); 947 list_remove(list, dn); 948 949 if (newlist) { 950 (void) dnode_add_ref(dn, newlist); 951 list_insert_tail(newlist, dn); 952 } 953 954 dnode_sync(dn, tx); 955 } 956 } 957 958 /* ARGSUSED */ 959 static void 960 dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) 961 { 962 blkptr_t *bp = zio->io_bp; 963 objset_t *os = arg; 964 dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; 965 966 ASSERT3P(bp, ==, os->os_rootbp); 967 ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); 968 ASSERT0(BP_GET_LEVEL(bp)); 969 970 /* 971 * Update rootbp fill count: it should be the number of objects 972 * allocated in the object set (not counting the "special" 973 * objects that are stored in the objset_phys_t -- the meta 974 * dnode and user/group accounting objects). 975 */ 976 bp->blk_fill = 0; 977 for (int i = 0; i < dnp->dn_nblkptr; i++) 978 bp->blk_fill += dnp->dn_blkptr[i].blk_fill; 979 } 980 981 /* ARGSUSED */ 982 static void 983 dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) 984 { 985 blkptr_t *bp = zio->io_bp; 986 blkptr_t *bp_orig = &zio->io_bp_orig; 987 objset_t *os = arg; 988 989 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 990 ASSERT(BP_EQUAL(bp, bp_orig)); 991 } else { 992 dsl_dataset_t *ds = os->os_dsl_dataset; 993 dmu_tx_t *tx = os->os_synctx; 994 995 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 996 dsl_dataset_block_born(ds, bp, tx); 997 } 998 } 999 1000 /* called from dsl */ 1001 void 1002 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) 1003 { 1004 int txgoff; 1005 zbookmark_t zb; 1006 zio_prop_t zp; 1007 zio_t *zio; 1008 list_t *list; 1009 list_t *newlist = NULL; 1010 dbuf_dirty_record_t *dr; 1011 1012 dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); 1013 1014 ASSERT(dmu_tx_is_syncing(tx)); 1015 /* XXX the write_done callback should really give us the tx... */ 1016 os->os_synctx = tx; 1017 1018 if (os->os_dsl_dataset == NULL) { 1019 /* 1020 * This is the MOS. If we have upgraded, 1021 * spa_max_replication() could change, so reset 1022 * os_copies here. 1023 */ 1024 os->os_copies = spa_max_replication(os->os_spa); 1025 } 1026 1027 /* 1028 * Create the root block IO 1029 */ 1030 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 1031 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 1032 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 1033 arc_release(os->os_phys_buf, &os->os_phys_buf); 1034 1035 dmu_write_policy(os, NULL, 0, 0, &zp); 1036 1037 zio = arc_write(pio, os->os_spa, tx->tx_txg, 1038 os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), 1039 DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, 1040 NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, 1041 ZIO_FLAG_MUSTSUCCEED, &zb); 1042 1043 /* 1044 * Sync special dnodes - the parent IO for the sync is the root block 1045 */ 1046 DMU_META_DNODE(os)->dn_zio = zio; 1047 dnode_sync(DMU_META_DNODE(os), tx); 1048 1049 os->os_phys->os_flags = os->os_flags; 1050 1051 if (DMU_USERUSED_DNODE(os) && 1052 DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { 1053 DMU_USERUSED_DNODE(os)->dn_zio = zio; 1054 dnode_sync(DMU_USERUSED_DNODE(os), tx); 1055 DMU_GROUPUSED_DNODE(os)->dn_zio = zio; 1056 dnode_sync(DMU_GROUPUSED_DNODE(os), tx); 1057 } 1058 1059 txgoff = tx->tx_txg & TXG_MASK; 1060 1061 if (dmu_objset_userused_enabled(os)) { 1062 newlist = &os->os_synced_dnodes; 1063 /* 1064 * We must create the list here because it uses the 1065 * dn_dirty_link[] of this txg. 1066 */ 1067 list_create(newlist, sizeof (dnode_t), 1068 offsetof(dnode_t, dn_dirty_link[txgoff])); 1069 } 1070 1071 dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); 1072 dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); 1073 1074 list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; 1075 while (dr = list_head(list)) { 1076 ASSERT0(dr->dr_dbuf->db_level); 1077 list_remove(list, dr); 1078 if (dr->dr_zio) 1079 zio_nowait(dr->dr_zio); 1080 } 1081 /* 1082 * Free intent log blocks up to this tx. 1083 */ 1084 zil_sync(os->os_zil, tx); 1085 os->os_phys->os_zil_header = os->os_zil_header; 1086 zio_nowait(zio); 1087 } 1088 1089 boolean_t 1090 dmu_objset_is_dirty(objset_t *os, uint64_t txg) 1091 { 1092 return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || 1093 !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); 1094 } 1095 1096 static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; 1097 1098 void 1099 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) 1100 { 1101 used_cbs[ost] = cb; 1102 } 1103 1104 boolean_t 1105 dmu_objset_userused_enabled(objset_t *os) 1106 { 1107 return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && 1108 used_cbs[os->os_phys->os_type] != NULL && 1109 DMU_USERUSED_DNODE(os) != NULL); 1110 } 1111 1112 static void 1113 do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, 1114 uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) 1115 { 1116 if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { 1117 int64_t delta = DNODE_SIZE + used; 1118 if (subtract) 1119 delta = -delta; 1120 VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, 1121 user, delta, tx)); 1122 VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, 1123 group, delta, tx)); 1124 } 1125 } 1126 1127 void 1128 dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) 1129 { 1130 dnode_t *dn; 1131 list_t *list = &os->os_synced_dnodes; 1132 1133 ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); 1134 1135 while (dn = list_head(list)) { 1136 int flags; 1137 ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); 1138 ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || 1139 dn->dn_phys->dn_flags & 1140 DNODE_FLAG_USERUSED_ACCOUNTED); 1141 1142 /* Allocate the user/groupused objects if necessary. */ 1143 if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { 1144 VERIFY(0 == zap_create_claim(os, 1145 DMU_USERUSED_OBJECT, 1146 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); 1147 VERIFY(0 == zap_create_claim(os, 1148 DMU_GROUPUSED_OBJECT, 1149 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); 1150 } 1151 1152 /* 1153 * We intentionally modify the zap object even if the 1154 * net delta is zero. Otherwise 1155 * the block of the zap obj could be shared between 1156 * datasets but need to be different between them after 1157 * a bprewrite. 1158 */ 1159 1160 flags = dn->dn_id_flags; 1161 ASSERT(flags); 1162 if (flags & DN_ID_OLD_EXIST) { 1163 do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, 1164 dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); 1165 } 1166 if (flags & DN_ID_NEW_EXIST) { 1167 do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), 1168 dn->dn_phys->dn_flags, dn->dn_newuid, 1169 dn->dn_newgid, B_FALSE, tx); 1170 } 1171 1172 mutex_enter(&dn->dn_mtx); 1173 dn->dn_oldused = 0; 1174 dn->dn_oldflags = 0; 1175 if (dn->dn_id_flags & DN_ID_NEW_EXIST) { 1176 dn->dn_olduid = dn->dn_newuid; 1177 dn->dn_oldgid = dn->dn_newgid; 1178 dn->dn_id_flags |= DN_ID_OLD_EXIST; 1179 if (dn->dn_bonuslen == 0) 1180 dn->dn_id_flags |= DN_ID_CHKED_SPILL; 1181 else 1182 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1183 } 1184 dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); 1185 mutex_exit(&dn->dn_mtx); 1186 1187 list_remove(list, dn); 1188 dnode_rele(dn, list); 1189 } 1190 } 1191 1192 /* 1193 * Returns a pointer to data to find uid/gid from 1194 * 1195 * If a dirty record for transaction group that is syncing can't 1196 * be found then NULL is returned. In the NULL case it is assumed 1197 * the uid/gid aren't changing. 1198 */ 1199 static void * 1200 dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) 1201 { 1202 dbuf_dirty_record_t *dr, **drp; 1203 void *data; 1204 1205 if (db->db_dirtycnt == 0) 1206 return (db->db.db_data); /* Nothing is changing */ 1207 1208 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1209 if (dr->dr_txg == tx->tx_txg) 1210 break; 1211 1212 if (dr == NULL) { 1213 data = NULL; 1214 } else { 1215 dnode_t *dn; 1216 1217 DB_DNODE_ENTER(dr->dr_dbuf); 1218 dn = DB_DNODE(dr->dr_dbuf); 1219 1220 if (dn->dn_bonuslen == 0 && 1221 dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) 1222 data = dr->dt.dl.dr_data->b_data; 1223 else 1224 data = dr->dt.dl.dr_data; 1225 1226 DB_DNODE_EXIT(dr->dr_dbuf); 1227 } 1228 1229 return (data); 1230 } 1231 1232 void 1233 dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) 1234 { 1235 objset_t *os = dn->dn_objset; 1236 void *data = NULL; 1237 dmu_buf_impl_t *db = NULL; 1238 uint64_t *user = NULL; 1239 uint64_t *group = NULL; 1240 int flags = dn->dn_id_flags; 1241 int error; 1242 boolean_t have_spill = B_FALSE; 1243 1244 if (!dmu_objset_userused_enabled(dn->dn_objset)) 1245 return; 1246 1247 if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| 1248 DN_ID_CHKED_SPILL))) 1249 return; 1250 1251 if (before && dn->dn_bonuslen != 0) 1252 data = DN_BONUS(dn->dn_phys); 1253 else if (!before && dn->dn_bonuslen != 0) { 1254 if (dn->dn_bonus) { 1255 db = dn->dn_bonus; 1256 mutex_enter(&db->db_mtx); 1257 data = dmu_objset_userquota_find_data(db, tx); 1258 } else { 1259 data = DN_BONUS(dn->dn_phys); 1260 } 1261 } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { 1262 int rf = 0; 1263 1264 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) 1265 rf |= DB_RF_HAVESTRUCT; 1266 error = dmu_spill_hold_by_dnode(dn, 1267 rf | DB_RF_MUST_SUCCEED, 1268 FTAG, (dmu_buf_t **)&db); 1269 ASSERT(error == 0); 1270 mutex_enter(&db->db_mtx); 1271 data = (before) ? db->db.db_data : 1272 dmu_objset_userquota_find_data(db, tx); 1273 have_spill = B_TRUE; 1274 } else { 1275 mutex_enter(&dn->dn_mtx); 1276 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1277 mutex_exit(&dn->dn_mtx); 1278 return; 1279 } 1280 1281 if (before) { 1282 ASSERT(data); 1283 user = &dn->dn_olduid; 1284 group = &dn->dn_oldgid; 1285 } else if (data) { 1286 user = &dn->dn_newuid; 1287 group = &dn->dn_newgid; 1288 } 1289 1290 /* 1291 * Must always call the callback in case the object 1292 * type has changed and that type isn't an object type to track 1293 */ 1294 error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, 1295 user, group); 1296 1297 /* 1298 * Preserve existing uid/gid when the callback can't determine 1299 * what the new uid/gid are and the callback returned EEXIST. 1300 * The EEXIST error tells us to just use the existing uid/gid. 1301 * If we don't know what the old values are then just assign 1302 * them to 0, since that is a new file being created. 1303 */ 1304 if (!before && data == NULL && error == EEXIST) { 1305 if (flags & DN_ID_OLD_EXIST) { 1306 dn->dn_newuid = dn->dn_olduid; 1307 dn->dn_newgid = dn->dn_oldgid; 1308 } else { 1309 dn->dn_newuid = 0; 1310 dn->dn_newgid = 0; 1311 } 1312 error = 0; 1313 } 1314 1315 if (db) 1316 mutex_exit(&db->db_mtx); 1317 1318 mutex_enter(&dn->dn_mtx); 1319 if (error == 0 && before) 1320 dn->dn_id_flags |= DN_ID_OLD_EXIST; 1321 if (error == 0 && !before) 1322 dn->dn_id_flags |= DN_ID_NEW_EXIST; 1323 1324 if (have_spill) { 1325 dn->dn_id_flags |= DN_ID_CHKED_SPILL; 1326 } else { 1327 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1328 } 1329 mutex_exit(&dn->dn_mtx); 1330 if (have_spill) 1331 dmu_buf_rele((dmu_buf_t *)db, FTAG); 1332 } 1333 1334 boolean_t 1335 dmu_objset_userspace_present(objset_t *os) 1336 { 1337 return (os->os_phys->os_flags & 1338 OBJSET_FLAG_USERACCOUNTING_COMPLETE); 1339 } 1340 1341 int 1342 dmu_objset_userspace_upgrade(objset_t *os) 1343 { 1344 uint64_t obj; 1345 int err = 0; 1346 1347 if (dmu_objset_userspace_present(os)) 1348 return (0); 1349 if (!dmu_objset_userused_enabled(os)) 1350 return (SET_ERROR(ENOTSUP)); 1351 if (dmu_objset_is_snapshot(os)) 1352 return (SET_ERROR(EINVAL)); 1353 1354 /* 1355 * We simply need to mark every object dirty, so that it will be 1356 * synced out and now accounted. If this is called 1357 * concurrently, or if we already did some work before crashing, 1358 * that's fine, since we track each object's accounted state 1359 * independently. 1360 */ 1361 1362 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 1363 dmu_tx_t *tx; 1364 dmu_buf_t *db; 1365 int objerr; 1366 1367 if (issig(JUSTLOOKING) && issig(FORREAL)) 1368 return (SET_ERROR(EINTR)); 1369 1370 objerr = dmu_bonus_hold(os, obj, FTAG, &db); 1371 if (objerr != 0) 1372 continue; 1373 tx = dmu_tx_create(os); 1374 dmu_tx_hold_bonus(tx, obj); 1375 objerr = dmu_tx_assign(tx, TXG_WAIT); 1376 if (objerr != 0) { 1377 dmu_tx_abort(tx); 1378 continue; 1379 } 1380 dmu_buf_will_dirty(db, tx); 1381 dmu_buf_rele(db, FTAG); 1382 dmu_tx_commit(tx); 1383 } 1384 1385 os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; 1386 txg_wait_synced(dmu_objset_pool(os), 0); 1387 return (0); 1388 } 1389 1390 void 1391 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, 1392 uint64_t *usedobjsp, uint64_t *availobjsp) 1393 { 1394 dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, 1395 usedobjsp, availobjsp); 1396 } 1397 1398 uint64_t 1399 dmu_objset_fsid_guid(objset_t *os) 1400 { 1401 return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); 1402 } 1403 1404 void 1405 dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) 1406 { 1407 stat->dds_type = os->os_phys->os_type; 1408 if (os->os_dsl_dataset) 1409 dsl_dataset_fast_stat(os->os_dsl_dataset, stat); 1410 } 1411 1412 void 1413 dmu_objset_stats(objset_t *os, nvlist_t *nv) 1414 { 1415 ASSERT(os->os_dsl_dataset || 1416 os->os_phys->os_type == DMU_OST_META); 1417 1418 if (os->os_dsl_dataset != NULL) 1419 dsl_dataset_stats(os->os_dsl_dataset, nv); 1420 1421 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, 1422 os->os_phys->os_type); 1423 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, 1424 dmu_objset_userspace_present(os)); 1425 } 1426 1427 int 1428 dmu_objset_is_snapshot(objset_t *os) 1429 { 1430 if (os->os_dsl_dataset != NULL) 1431 return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); 1432 else 1433 return (B_FALSE); 1434 } 1435 1436 int 1437 dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, 1438 boolean_t *conflict) 1439 { 1440 dsl_dataset_t *ds = os->os_dsl_dataset; 1441 uint64_t ignored; 1442 1443 if (ds->ds_phys->ds_snapnames_zapobj == 0) 1444 return (SET_ERROR(ENOENT)); 1445 1446 return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, 1447 ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, 1448 real, maxlen, conflict)); 1449 } 1450 1451 int 1452 dmu_snapshot_list_next(objset_t *os, int namelen, char *name, 1453 uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) 1454 { 1455 dsl_dataset_t *ds = os->os_dsl_dataset; 1456 zap_cursor_t cursor; 1457 zap_attribute_t attr; 1458 1459 ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); 1460 1461 if (ds->ds_phys->ds_snapnames_zapobj == 0) 1462 return (SET_ERROR(ENOENT)); 1463 1464 zap_cursor_init_serialized(&cursor, 1465 ds->ds_dir->dd_pool->dp_meta_objset, 1466 ds->ds_phys->ds_snapnames_zapobj, *offp); 1467 1468 if (zap_cursor_retrieve(&cursor, &attr) != 0) { 1469 zap_cursor_fini(&cursor); 1470 return (SET_ERROR(ENOENT)); 1471 } 1472 1473 if (strlen(attr.za_name) + 1 > namelen) { 1474 zap_cursor_fini(&cursor); 1475 return (SET_ERROR(ENAMETOOLONG)); 1476 } 1477 1478 (void) strcpy(name, attr.za_name); 1479 if (idp) 1480 *idp = attr.za_first_integer; 1481 if (case_conflict) 1482 *case_conflict = attr.za_normalization_conflict; 1483 zap_cursor_advance(&cursor); 1484 *offp = zap_cursor_serialize(&cursor); 1485 zap_cursor_fini(&cursor); 1486 1487 return (0); 1488 } 1489 1490 int 1491 dmu_dir_list_next(objset_t *os, int namelen, char *name, 1492 uint64_t *idp, uint64_t *offp) 1493 { 1494 dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; 1495 zap_cursor_t cursor; 1496 zap_attribute_t attr; 1497 1498 /* there is no next dir on a snapshot! */ 1499 if (os->os_dsl_dataset->ds_object != 1500 dd->dd_phys->dd_head_dataset_obj) 1501 return (SET_ERROR(ENOENT)); 1502 1503 zap_cursor_init_serialized(&cursor, 1504 dd->dd_pool->dp_meta_objset, 1505 dd->dd_phys->dd_child_dir_zapobj, *offp); 1506 1507 if (zap_cursor_retrieve(&cursor, &attr) != 0) { 1508 zap_cursor_fini(&cursor); 1509 return (SET_ERROR(ENOENT)); 1510 } 1511 1512 if (strlen(attr.za_name) + 1 > namelen) { 1513 zap_cursor_fini(&cursor); 1514 return (SET_ERROR(ENAMETOOLONG)); 1515 } 1516 1517 (void) strcpy(name, attr.za_name); 1518 if (idp) 1519 *idp = attr.za_first_integer; 1520 zap_cursor_advance(&cursor); 1521 *offp = zap_cursor_serialize(&cursor); 1522 zap_cursor_fini(&cursor); 1523 1524 return (0); 1525 } 1526 1527 /* 1528 * Find objsets under and including ddobj, call func(ds) on each. 1529 */ 1530 int 1531 dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, 1532 int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) 1533 { 1534 dsl_dir_t *dd; 1535 dsl_dataset_t *ds; 1536 zap_cursor_t zc; 1537 zap_attribute_t *attr; 1538 uint64_t thisobj; 1539 int err; 1540 1541 ASSERT(dsl_pool_config_held(dp)); 1542 1543 err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); 1544 if (err != 0) 1545 return (err); 1546 1547 /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ 1548 if (dd->dd_myname[0] == '$') { 1549 dsl_dir_rele(dd, FTAG); 1550 return (0); 1551 } 1552 1553 thisobj = dd->dd_phys->dd_head_dataset_obj; 1554 attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 1555 1556 /* 1557 * Iterate over all children. 1558 */ 1559 if (flags & DS_FIND_CHILDREN) { 1560 for (zap_cursor_init(&zc, dp->dp_meta_objset, 1561 dd->dd_phys->dd_child_dir_zapobj); 1562 zap_cursor_retrieve(&zc, attr) == 0; 1563 (void) zap_cursor_advance(&zc)) { 1564 ASSERT3U(attr->za_integer_length, ==, 1565 sizeof (uint64_t)); 1566 ASSERT3U(attr->za_num_integers, ==, 1); 1567 1568 err = dmu_objset_find_dp(dp, attr->za_first_integer, 1569 func, arg, flags); 1570 if (err != 0) 1571 break; 1572 } 1573 zap_cursor_fini(&zc); 1574 1575 if (err != 0) { 1576 dsl_dir_rele(dd, FTAG); 1577 kmem_free(attr, sizeof (zap_attribute_t)); 1578 return (err); 1579 } 1580 } 1581 1582 /* 1583 * Iterate over all snapshots. 1584 */ 1585 if (flags & DS_FIND_SNAPSHOTS) { 1586 dsl_dataset_t *ds; 1587 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); 1588 1589 if (err == 0) { 1590 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 1591 dsl_dataset_rele(ds, FTAG); 1592 1593 for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); 1594 zap_cursor_retrieve(&zc, attr) == 0; 1595 (void) zap_cursor_advance(&zc)) { 1596 ASSERT3U(attr->za_integer_length, ==, 1597 sizeof (uint64_t)); 1598 ASSERT3U(attr->za_num_integers, ==, 1); 1599 1600 err = dsl_dataset_hold_obj(dp, 1601 attr->za_first_integer, FTAG, &ds); 1602 if (err != 0) 1603 break; 1604 err = func(dp, ds, arg); 1605 dsl_dataset_rele(ds, FTAG); 1606 if (err != 0) 1607 break; 1608 } 1609 zap_cursor_fini(&zc); 1610 } 1611 } 1612 1613 dsl_dir_rele(dd, FTAG); 1614 kmem_free(attr, sizeof (zap_attribute_t)); 1615 1616 if (err != 0) 1617 return (err); 1618 1619 /* 1620 * Apply to self. 1621 */ 1622 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); 1623 if (err != 0) 1624 return (err); 1625 err = func(dp, ds, arg); 1626 dsl_dataset_rele(ds, FTAG); 1627 return (err); 1628 } 1629 1630 /* 1631 * Find all objsets under name, and for each, call 'func(child_name, arg)'. 1632 * The dp_config_rwlock must not be held when this is called, and it 1633 * will not be held when the callback is called. 1634 * Therefore this function should only be used when the pool is not changing 1635 * (e.g. in syncing context), or the callback can deal with the possible races. 1636 */ 1637 static int 1638 dmu_objset_find_impl(spa_t *spa, const char *name, 1639 int func(const char *, void *), void *arg, int flags) 1640 { 1641 dsl_dir_t *dd; 1642 dsl_pool_t *dp = spa_get_dsl(spa); 1643 dsl_dataset_t *ds; 1644 zap_cursor_t zc; 1645 zap_attribute_t *attr; 1646 char *child; 1647 uint64_t thisobj; 1648 int err; 1649 1650 dsl_pool_config_enter(dp, FTAG); 1651 1652 err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); 1653 if (err != 0) { 1654 dsl_pool_config_exit(dp, FTAG); 1655 return (err); 1656 } 1657 1658 /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ 1659 if (dd->dd_myname[0] == '$') { 1660 dsl_dir_rele(dd, FTAG); 1661 dsl_pool_config_exit(dp, FTAG); 1662 return (0); 1663 } 1664 1665 thisobj = dd->dd_phys->dd_head_dataset_obj; 1666 attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 1667 1668 /* 1669 * Iterate over all children. 1670 */ 1671 if (flags & DS_FIND_CHILDREN) { 1672 for (zap_cursor_init(&zc, dp->dp_meta_objset, 1673 dd->dd_phys->dd_child_dir_zapobj); 1674 zap_cursor_retrieve(&zc, attr) == 0; 1675 (void) zap_cursor_advance(&zc)) { 1676 ASSERT3U(attr->za_integer_length, ==, 1677 sizeof (uint64_t)); 1678 ASSERT3U(attr->za_num_integers, ==, 1); 1679 1680 child = kmem_asprintf("%s/%s", name, attr->za_name); 1681 dsl_pool_config_exit(dp, FTAG); 1682 err = dmu_objset_find_impl(spa, child, 1683 func, arg, flags); 1684 dsl_pool_config_enter(dp, FTAG); 1685 strfree(child); 1686 if (err != 0) 1687 break; 1688 } 1689 zap_cursor_fini(&zc); 1690 1691 if (err != 0) { 1692 dsl_dir_rele(dd, FTAG); 1693 dsl_pool_config_exit(dp, FTAG); 1694 kmem_free(attr, sizeof (zap_attribute_t)); 1695 return (err); 1696 } 1697 } 1698 1699 /* 1700 * Iterate over all snapshots. 1701 */ 1702 if (flags & DS_FIND_SNAPSHOTS) { 1703 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); 1704 1705 if (err == 0) { 1706 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 1707 dsl_dataset_rele(ds, FTAG); 1708 1709 for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); 1710 zap_cursor_retrieve(&zc, attr) == 0; 1711 (void) zap_cursor_advance(&zc)) { 1712 ASSERT3U(attr->za_integer_length, ==, 1713 sizeof (uint64_t)); 1714 ASSERT3U(attr->za_num_integers, ==, 1); 1715 1716 child = kmem_asprintf("%s@%s", 1717 name, attr->za_name); 1718 dsl_pool_config_exit(dp, FTAG); 1719 err = func(child, arg); 1720 dsl_pool_config_enter(dp, FTAG); 1721 strfree(child); 1722 if (err != 0) 1723 break; 1724 } 1725 zap_cursor_fini(&zc); 1726 } 1727 } 1728 1729 dsl_dir_rele(dd, FTAG); 1730 kmem_free(attr, sizeof (zap_attribute_t)); 1731 dsl_pool_config_exit(dp, FTAG); 1732 1733 if (err != 0) 1734 return (err); 1735 1736 /* Apply to self. */ 1737 return (func(name, arg)); 1738 } 1739 1740 /* 1741 * See comment above dmu_objset_find_impl(). 1742 */ 1743 int 1744 dmu_objset_find(char *name, int func(const char *, void *), void *arg, 1745 int flags) 1746 { 1747 spa_t *spa; 1748 int error; 1749 1750 error = spa_open(name, &spa, FTAG); 1751 if (error != 0) 1752 return (error); 1753 error = dmu_objset_find_impl(spa, name, func, arg, flags); 1754 spa_close(spa, FTAG); 1755 return (error); 1756 } 1757 1758 void 1759 dmu_objset_set_user(objset_t *os, void *user_ptr) 1760 { 1761 ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); 1762 os->os_user_ptr = user_ptr; 1763 } 1764 1765 void * 1766 dmu_objset_get_user(objset_t *os) 1767 { 1768 ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); 1769 return (os->os_user_ptr); 1770 } 1771 1772 /* 1773 * Determine name of filesystem, given name of snapshot. 1774 * buf must be at least MAXNAMELEN bytes 1775 */ 1776 int 1777 dmu_fsname(const char *snapname, char *buf) 1778 { 1779 char *atp = strchr(snapname, '@'); 1780 if (atp == NULL) 1781 return (SET_ERROR(EINVAL)); 1782 if (atp - snapname >= MAXNAMELEN) 1783 return (SET_ERROR(ENAMETOOLONG)); 1784 (void) strlcpy(buf, snapname, atp - snapname + 1); 1785 return (0); 1786 } 1787