/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/cred.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zvol.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/sa.h>
#include <sys/zfs_onexit.h>

/*
 * Needed to close a window in dnode_move() that allows the objset to be freed
 * before it can be safely accessed.
 */
krwlock_t os_lock;

void
dmu_objset_init(void)
{
	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
}

void
dmu_objset_fini(void)
{
	rw_destroy(&os_lock);
}

spa_t *
dmu_objset_spa(objset_t *os)
{
	return (os->os_spa);
}

zilog_t *
dmu_objset_zil(objset_t *os)
{
	return (os->os_zil);
}

dsl_pool_t *
dmu_objset_pool(objset_t *os)
{
	dsl_dataset_t *ds;

	if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
		return (ds->ds_dir->dd_pool);
	else
		return (spa_get_dsl(os->os_spa));
}

dsl_dataset_t *
dmu_objset_ds(objset_t *os)
{
	return (os->os_dsl_dataset);
}

dmu_objset_type_t
dmu_objset_type(objset_t *os)
{
	return (os->os_phys->os_type);
}

void
dmu_objset_name(objset_t *os, char *buf)
{
	dsl_dataset_name(os->os_dsl_dataset, buf);
}

uint64_t
dmu_objset_id(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	return (ds ? ds->ds_object : 0);
}

uint64_t
dmu_objset_syncprop(objset_t *os)
{
	return (os->os_sync);
}

uint64_t
dmu_objset_logbias(objset_t *os)
{
	return (os->os_logbias);
}

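/*
 * Dataset property callbacks, registered in dmu_objset_open_impl().  The
 * DSL property code invokes each callback when the corresponding property
 * changes (and once at registration time), caching the new value in the
 * objset.
 */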
static void
checksum_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}

static void
compression_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval != ZIO_COMPRESS_INHERIT);

	os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
}

static void
copies_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval > 0);
	ASSERT(newval <= spa_max_replication(os->os_spa));

	os->os_copies = newval;
}

static void
dedup_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;
	spa_t *spa = os->os_spa;
	enum zio_checksum checksum;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);

	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
}

static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_primary_cache = newval;
}

static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_secondary_cache = newval;
}

static void
sync_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
	    newval == ZFS_SYNC_DISABLED);

	os->os_sync = newval;
	if (os->os_zil)
		zil_set_sync(os->os_zil, newval);
}

static void
logbias_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
	    newval == ZFS_LOGBIAS_THROUGHPUT);
	os->os_logbias = newval;
	if (os->os_zil)
		zil_set_logbias(os->os_zil, newval);
}

void
dmu_objset_byteswap(void *buf, size_t size)
{
	objset_phys_t *osp = buf;

	ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
	dnode_byteswap(&osp->os_meta_dnode);
	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
	osp->os_type = BSWAP_64(osp->os_type);
	osp->os_flags = BSWAP_64(osp->os_flags);
	if (size == sizeof (objset_phys_t)) {
		dnode_byteswap(&osp->os_userused_dnode);
		dnode_byteswap(&osp->os_groupused_dnode);
	}
}

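/*
 * Instantiate an objset_t for a dataset: read in (or zero-initialize) the
 * objset_phys_t, register the property callbacks, and open the special
 * meta/userused/groupused dnodes.  The caller must hold ds_opening_lock
 * if ds is non-NULL.
 */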
int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    objset_t **osp)
{
	objset_t *os;
	int i, err;

	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));

	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
	os->os_dsl_dataset = ds;
	os->os_spa = spa;
	os->os_rootbp = bp;
	if (!BP_IS_HOLE(os->os_rootbp)) {
		uint32_t aflags = ARC_WAIT;
		zbookmark_t zb;
		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

		if (DMU_OS_IS_L2CACHEABLE(os))
			aflags |= ARC_L2CACHE;

		dprintf_bp(os->os_rootbp, "reading %s", "");
		err = arc_read(NULL, spa, os->os_rootbp,
		    arc_getbuf_func, &os->os_phys_buf,
		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
		if (err) {
			kmem_free(os, sizeof (objset_t));
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = EIO;
			return (err);
		}

		/* Increase the blocksize if we are permitted. */
		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
			arc_buf_t *buf = arc_buf_alloc(spa,
			    sizeof (objset_phys_t), &os->os_phys_buf,
			    ARC_BUFC_METADATA);
			bzero(buf->b_data, sizeof (objset_phys_t));
			bcopy(os->os_phys_buf->b_data, buf->b_data,
			    arc_buf_size(os->os_phys_buf));
			(void) arc_buf_remove_ref(os->os_phys_buf,
			    &os->os_phys_buf);
			os->os_phys_buf = buf;
		}

		os->os_phys = os->os_phys_buf->b_data;
		os->os_flags = os->os_phys->os_flags;
	} else {
		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
		os->os_phys_buf = arc_buf_alloc(spa, size,
		    &os->os_phys_buf, ARC_BUFC_METADATA);
		os->os_phys = os->os_phys_buf->b_data;
		bzero(os->os_phys, size);
	}

	/*
	 * Note: the changed_cb will be called once before the register
	 * func returns, thus changing the checksum/compression from the
	 * default (fletcher2/off).  Snapshots don't need to know about
	 * checksum/compression/copies.
	 */
	if (ds) {
		err = dsl_prop_register(ds, "primarycache",
		    primary_cache_changed_cb, os);
		if (err == 0)
			err = dsl_prop_register(ds, "secondarycache",
			    secondary_cache_changed_cb, os);
		if (!dsl_dataset_is_snapshot(ds)) {
			if (err == 0)
				err = dsl_prop_register(ds, "checksum",
				    checksum_changed_cb, os);
			if (err == 0)
				err = dsl_prop_register(ds, "compression",
				    compression_changed_cb, os);
			if (err == 0)
				err = dsl_prop_register(ds, "copies",
				    copies_changed_cb, os);
			if (err == 0)
				err = dsl_prop_register(ds, "dedup",
				    dedup_changed_cb, os);
			if (err == 0)
				err = dsl_prop_register(ds, "logbias",
				    logbias_changed_cb, os);
			if (err == 0)
				err = dsl_prop_register(ds, "sync",
				    sync_changed_cb, os);
		}
		if (err) {
			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
			    &os->os_phys_buf) == 1);
			kmem_free(os, sizeof (objset_t));
			return (err);
		}
	} else if (ds == NULL) {
		/* It's the meta-objset. */
		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
		os->os_compress = ZIO_COMPRESS_LZJB;
		os->os_copies = spa_max_replication(spa);
		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
		os->os_dedup_verify = 0;
		os->os_logbias = 0;
		os->os_sync = 0;
		os->os_primary_cache = ZFS_CACHE_ALL;
		os->os_secondary_cache = ZFS_CACHE_ALL;
	}

	if (ds == NULL || !dsl_dataset_is_snapshot(ds))
		os->os_zil_header = os->os_phys->os_zil_header;
	os->os_zil = zil_alloc(os, &os->os_zil_header);

	for (i = 0; i < TXG_SIZE; i++) {
		list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]));
		list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]));
	}
	list_create(&os->os_dnodes, sizeof (dnode_t),
	    offsetof(dnode_t, dn_link));
	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);

	DMU_META_DNODE(os) = dnode_special_open(os,
	    &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
	    &os->os_meta_dnode);
	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
		DMU_USERUSED_DNODE(os) = dnode_special_open(os,
		    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
		    &os->os_userused_dnode);
		DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
		    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
		    &os->os_groupused_dnode);
	}

	/*
	 * We should be the only thread trying to do this because we
	 * have ds_opening_lock
	 */
	if (ds) {
		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_objset == NULL);
		ds->ds_objset = os;
		mutex_exit(&ds->ds_lock);
	}

	*osp = os;
	return (0);
}

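/*
 * Return the dataset's cached objset, opening it first if necessary.
 * Serialized by ds_opening_lock.
 */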
int
dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
	int err = 0;

	mutex_enter(&ds->ds_opening_lock);
	*osp = ds->ds_objset;
	if (*osp == NULL) {
		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
		    ds, dsl_dataset_get_blkptr(ds), osp);
	}
	mutex_exit(&ds->ds_opening_lock);
	return (err);
}

/* called from zpl */
int
dmu_objset_hold(const char *name, void *tag, objset_t **osp)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_hold(name, tag, &ds);
	if (err)
		return (err);

	err = dmu_objset_from_ds(ds, osp);
	if (err)
		dsl_dataset_rele(ds, tag);

	return (err);
}

/* called from zpl */
int
dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, void *tag, objset_t **osp)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_own(name, B_FALSE, tag, &ds);
	if (err)
		return (err);

	err = dmu_objset_from_ds(ds, osp);
	if (err) {
		dsl_dataset_disown(ds, tag);
	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
		dmu_objset_disown(*osp, tag);
		return (EINVAL);
	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
		dmu_objset_disown(*osp, tag);
		return (EROFS);
	}
	return (err);
}

void
dmu_objset_rele(objset_t *os, void *tag)
{
	dsl_dataset_rele(os->os_dsl_dataset, tag);
}

void
dmu_objset_disown(objset_t *os, void *tag)
{
	dsl_dataset_disown(os->os_dsl_dataset, tag);
}

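/*
 * Evict the cached dbufs of every dnode in the objset, dropping and
 * reacquiring os_lock around each eviction.  Returns nonzero if dnodes
 * other than the meta-dnode remain on the dnode list afterward.
 */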
int
dmu_objset_evict_dbufs(objset_t *os)
{
	dnode_t *dn;

	mutex_enter(&os->os_lock);

	/* process the mdn last, since the other dnodes have holds on it */
	list_remove(&os->os_dnodes, DMU_META_DNODE(os));
	list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));

	/*
	 * Find the first dnode with holds.  We have to do this dance
	 * because dnode_add_ref() only works if you already have a
	 * hold.  If there are no holds then it has no dbufs so OK to
	 * skip.
	 */
	for (dn = list_head(&os->os_dnodes);
	    dn && !dnode_add_ref(dn, FTAG);
	    dn = list_next(&os->os_dnodes, dn))
		continue;

	while (dn) {
		dnode_t *next_dn = dn;

		do {
			next_dn = list_next(&os->os_dnodes, next_dn);
		} while (next_dn && !dnode_add_ref(next_dn, FTAG));

		mutex_exit(&os->os_lock);
		dnode_evict_dbufs(dn);
		dnode_rele(dn, FTAG);
		mutex_enter(&os->os_lock);
		dn = next_dn;
	}
	dn = list_head(&os->os_dnodes);
	mutex_exit(&os->os_lock);
	return (dn != DMU_META_DNODE(os));
}

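/*
 * Tear down an objset that is no longer in use: unregister the property
 * callbacks, evict all dbufs, close the special dnodes, free the ZIL,
 * and free the objset_t itself.  The objset must not be dirty in any txg.
 */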
void
dmu_objset_evict(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	for (int t = 0; t < TXG_SIZE; t++)
		ASSERT(!dmu_objset_is_dirty(os, t));

	if (ds) {
		if (!dsl_dataset_is_snapshot(ds)) {
			VERIFY(0 == dsl_prop_unregister(ds, "checksum",
			    checksum_changed_cb, os));
			VERIFY(0 == dsl_prop_unregister(ds, "compression",
			    compression_changed_cb, os));
			VERIFY(0 == dsl_prop_unregister(ds, "copies",
			    copies_changed_cb, os));
			VERIFY(0 == dsl_prop_unregister(ds, "dedup",
			    dedup_changed_cb, os));
			VERIFY(0 == dsl_prop_unregister(ds, "logbias",
			    logbias_changed_cb, os));
			VERIFY(0 == dsl_prop_unregister(ds, "sync",
			    sync_changed_cb, os));
		}
		VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
		    primary_cache_changed_cb, os));
		VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
		    secondary_cache_changed_cb, os));
	}

	if (os->os_sa)
		sa_tear_down(os);

	/*
	 * We should need only a single pass over the dnode list, since
	 * nothing can be added to the list at this point.
	 */
	(void) dmu_objset_evict_dbufs(os);

	dnode_special_close(&os->os_meta_dnode);
	if (DMU_USERUSED_DNODE(os)) {
		dnode_special_close(&os->os_userused_dnode);
		dnode_special_close(&os->os_groupused_dnode);
	}
	zil_free(os->os_zil);

	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);

	/*
	 * This is a barrier to prevent the objset from going away in
	 * dnode_move() until we can safely ensure that the objset is still in
	 * use.  We consider the objset valid before the barrier and invalid
	 * after the barrier.
	 */
	rw_enter(&os_lock, RW_READER);
	rw_exit(&os_lock);

	mutex_destroy(&os->os_lock);
	mutex_destroy(&os->os_obj_lock);
	mutex_destroy(&os->os_user_ptr_lock);
	kmem_free(os, sizeof (objset_t));
}

timestruc_t
dmu_objset_snap_cmtime(objset_t *os)
{
	return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}

/* called from dsl for meta-objset */
objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, dmu_tx_t *tx)
{
	objset_t *os;
	dnode_t *mdn;

	ASSERT(dmu_tx_is_syncing(tx));
	if (ds != NULL)
		VERIFY(0 == dmu_objset_from_ds(ds, &os));
	else
		VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os));

	mdn = DMU_META_DNODE(os);

	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);

	/*
	 * We don't want to have to increase the meta-dnode's nlevels
	 * later, because then we could do it in quiescing context while
	 * we are also accessing it in open context.
	 *
	 * This precaution is not necessary for the MOS (ds == NULL),
	 * because the MOS is only updated in syncing context.
	 * This is most fortunate: the MOS is the only objset that
	 * needs to be synced multiple times as spa_sync() iterates
	 * to convergence, so minimizing its dn_nlevels matters.
	 */
	if (ds != NULL) {
		int levels = 1;

		/*
		 * Determine the number of levels necessary for the meta-dnode
		 * to contain DN_MAX_OBJECT dnodes.
		 */
		while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
		    DN_MAX_OBJECT * sizeof (dnode_phys_t))
			levels++;

		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
		    mdn->dn_nlevels = levels;
	}

	ASSERT(type != DMU_OST_NONE);
	ASSERT(type != DMU_OST_ANY);
	ASSERT(type < DMU_OST_NUMTYPES);
	os->os_phys->os_type = type;
	if (dmu_objset_userused_enabled(os)) {
		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
		os->os_flags = os->os_phys->os_flags;
	}

	dsl_dataset_dirty(ds, tx);

	return (os);
}

struct oscarg {
	void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
	void *userarg;
	dsl_dataset_t *clone_origin;
	const char *lastname;
	dmu_objset_type_t type;
	uint64_t flags;
	cred_t *cr;
};

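/*
 * Check function for the create/clone sync task: fail with EEXIST if the
 * target name is already present in the parent's child-directory ZAP, and
 * validate the clone origin if one was supplied.
 */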
/*ARGSUSED*/
static int
dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct oscarg *oa = arg2;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	int err;
	uint64_t ddobj;

	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
	    oa->lastname, sizeof (uint64_t), 1, &ddobj);
	if (err != ENOENT)
		return (err ? err : EEXIST);

	if (oa->clone_origin != NULL) {
		/* You can't clone across pools. */
		if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool)
			return (EXDEV);

		/* You can only clone snapshots, not the head datasets. */
		if (!dsl_dataset_is_snapshot(oa->clone_origin))
			return (EINVAL);
	}

	return (0);
}

static void
dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	spa_t *spa = dd->dd_pool->dp_spa;
	struct oscarg *oa = arg2;
	uint64_t obj;
	dsl_dataset_t *ds;
	blkptr_t *bp;

	ASSERT(dmu_tx_is_syncing(tx));

	obj = dsl_dataset_create_sync(dd, oa->lastname,
	    oa->clone_origin, oa->flags, oa->cr, tx);

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds));
	bp = dsl_dataset_get_blkptr(ds);
	if (BP_IS_HOLE(bp)) {
		objset_t *os =
		    dmu_objset_create_impl(spa, ds, bp, oa->type, tx);

		if (oa->userfunc)
			oa->userfunc(os, oa->userarg, oa->cr, tx);
	}

	if (oa->clone_origin == NULL) {
		spa_history_log_internal_ds(ds, "create", tx, "");
	} else {
		char namebuf[MAXNAMELEN];
		dsl_dataset_name(oa->clone_origin, namebuf);
		spa_history_log_internal_ds(ds, "clone", tx,
		    "origin=%s (%llu)", namebuf, oa->clone_origin->ds_object);
	}
	dsl_dataset_rele(ds, FTAG);
}

int
dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
{
	dsl_dir_t *pdd;
	const char *tail;
	int err = 0;
	struct oscarg oa = { 0 };

	ASSERT(strchr(name, '@') == NULL);
	err = dsl_dir_open(name, FTAG, &pdd, &tail);
	if (err)
		return (err);
	if (tail == NULL) {
		dsl_dir_close(pdd, FTAG);
		return (EEXIST);
	}

	oa.userfunc = func;
	oa.userarg = arg;
	oa.lastname = tail;
	oa.type = type;
	oa.flags = flags;
	oa.cr = CRED();

	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
	    dmu_objset_create_sync, pdd, &oa, 5);
	dsl_dir_close(pdd, FTAG);
	return (err);
}

int
dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
{
	dsl_dir_t *pdd;
	const char *tail;
	int err = 0;
	struct oscarg oa = { 0 };

	ASSERT(strchr(name, '@') == NULL);
	err = dsl_dir_open(name, FTAG, &pdd, &tail);
	if (err)
		return (err);
	if (tail == NULL) {
		dsl_dir_close(pdd, FTAG);
		return (EEXIST);
	}

	oa.lastname = tail;
	oa.clone_origin = clone_origin;
	oa.flags = flags;
	oa.cr = CRED();

	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
	    dmu_objset_create_sync, pdd, &oa, 5);
	dsl_dir_close(pdd, FTAG);
	return (err);
}

int
dmu_objset_destroy(const char *name, boolean_t defer)
{
	dsl_dataset_t *ds;
	int error;

	error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
	if (error == 0) {
		error = dsl_dataset_destroy(ds, FTAG, defer);
		/* dsl_dataset_destroy() closes the ds. */
	}

	return (error);
}

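/*
 * Per-request state for dmu_objset_snapshot(): one snapallarg_t is shared
 * by the whole sync task group, and one snaponearg_t is created for each
 * dataset being snapshotted.
 */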
typedef struct snapallarg {
	dsl_sync_task_group_t *saa_dstg;
	boolean_t saa_needsuspend;
	nvlist_t *saa_props;

	/* the following are used only if 'temporary' is set: */
	boolean_t saa_temporary;
	const char *saa_htag;
	struct dsl_ds_holdarg *saa_ha;
	dsl_dataset_t *saa_newds;
} snapallarg_t;

typedef struct snaponearg {
	const char *soa_longname;	/* long snap name */
	const char *soa_snapname;	/* short snap name */
	snapallarg_t *soa_saa;
} snaponearg_t;

static int
snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	objset_t *os = arg1;
	snaponearg_t *soa = arg2;
	snapallarg_t *saa = soa->soa_saa;
	int error;

	/* The props have already been checked by zfs_check_userprops(). */

	error = dsl_dataset_snapshot_check(os->os_dsl_dataset,
	    soa->soa_snapname, tx);
	if (error)
		return (error);

	if (saa->saa_temporary) {
		/*
		 * Ideally we would just call
		 * dsl_dataset_user_hold_check() and
		 * dsl_dataset_destroy_check() here.  However the
		 * dataset we want to hold and destroy is the snapshot
		 * that we just confirmed we can create, but it won't
		 * exist until after these checks are run.  Do any
		 * checks we can here and if more checks are added to
		 * those routines in the future, similar checks may be
		 * necessary here.
		 */
		if (spa_version(os->os_spa) < SPA_VERSION_USERREFS)
			return (ENOTSUP);
		/*
		 * Not checking number of tags because the tag will be
		 * unique, as it will be the only tag.
		 */
		if (strlen(saa->saa_htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
			return (E2BIG);

		saa->saa_ha = kmem_alloc(sizeof (struct dsl_ds_holdarg),
		    KM_SLEEP);
		saa->saa_ha->temphold = B_TRUE;
		saa->saa_ha->htag = saa->saa_htag;
	}
	return (error);
}

static void
snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	objset_t *os = arg1;
	dsl_dataset_t *ds = os->os_dsl_dataset;
	snaponearg_t *soa = arg2;
	snapallarg_t *saa = soa->soa_saa;

	dsl_dataset_snapshot_sync(ds, soa->soa_snapname, tx);

	if (saa->saa_props != NULL) {
		dsl_props_arg_t pa;
		pa.pa_props = saa->saa_props;
		pa.pa_source = ZPROP_SRC_LOCAL;
		dsl_props_set_sync(ds->ds_prev, &pa, tx);
	}

	if (saa->saa_temporary) {
		struct dsl_ds_destroyarg da;

		dsl_dataset_user_hold_sync(ds->ds_prev, saa->saa_ha, tx);
		kmem_free(saa->saa_ha, sizeof (struct dsl_ds_holdarg));
		saa->saa_ha = NULL;
		saa->saa_newds = ds->ds_prev;

		da.ds = ds->ds_prev;
		da.defer = B_TRUE;
		dsl_dataset_destroy_sync(&da, FTAG, tx);
	}
}

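/*
 * Hold the filesystem being snapshotted, suspend its ZIL if the pool
 * predates SPA_VERSION_FAST_SNAP, and add a snapshot sync task for it
 * to the group.
 */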
static int
snapshot_one_impl(const char *snapname, void *arg)
{
	char fsname[MAXPATHLEN];
	snapallarg_t *saa = arg;
	snaponearg_t *soa;
	objset_t *os;
	int err;

	(void) strlcpy(fsname, snapname, sizeof (fsname));
	strchr(fsname, '@')[0] = '\0';

	err = dmu_objset_hold(fsname, saa, &os);
	if (err != 0)
		return (err);

	/*
	 * If the objset is in an inconsistent state (eg, in the process
	 * of being destroyed), don't snapshot it.
	 */
	if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
		dmu_objset_rele(os, saa);
		return (EBUSY);
	}

	if (saa->saa_needsuspend) {
		err = zil_suspend(dmu_objset_zil(os));
		if (err) {
			dmu_objset_rele(os, saa);
			return (err);
		}
	}

	soa = kmem_zalloc(sizeof (*soa), KM_SLEEP);
	soa->soa_saa = saa;
	soa->soa_longname = snapname;
	soa->soa_snapname = strchr(snapname, '@') + 1;

	dsl_sync_task_create(saa->saa_dstg, snapshot_check, snapshot_sync,
	    os, soa, 3);

	return (0);
}

/*
 * The snapshots must all be in the same pool.
 */
int
dmu_objset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
{
	dsl_sync_task_t *dst;
	snapallarg_t saa = { 0 };
	spa_t *spa;
	int rv = 0;
	int err;
	nvpair_t *pair;

	pair = nvlist_next_nvpair(snaps, NULL);
	if (pair == NULL)
		return (0);

	err = spa_open(nvpair_name(pair), &spa, FTAG);
	if (err)
		return (err);
	saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	saa.saa_props = props;
	saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);

	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(snaps, pair)) {
		err = snapshot_one_impl(nvpair_name(pair), &saa);
		if (err != 0) {
			if (errors != NULL) {
				fnvlist_add_int32(errors,
				    nvpair_name(pair), err);
			}
			rv = err;
		}
	}

	/*
	 * If any call to snapshot_one_impl() failed, don't execute the
	 * sync task.  The error handling code below will clean up the
	 * snaponearg_t from any successful calls to
	 * snapshot_one_impl().
	 */
	if (rv == 0)
		err = dsl_sync_task_group_wait(saa.saa_dstg);
	if (err != 0)
		rv = err;

	for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst;
	    dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) {
		objset_t *os = dst->dst_arg1;
		snaponearg_t *soa = dst->dst_arg2;
		if (dst->dst_err != 0) {
			if (errors != NULL) {
				fnvlist_add_int32(errors,
				    soa->soa_longname, dst->dst_err);
			}
			rv = dst->dst_err;
		}

		if (saa.saa_needsuspend)
			zil_resume(dmu_objset_zil(os));
		dmu_objset_rele(os, &saa);
		kmem_free(soa, sizeof (*soa));
	}

	dsl_sync_task_group_destroy(saa.saa_dstg);
	spa_close(spa, FTAG);
	return (rv);
}

int
dmu_objset_snapshot_one(const char *fsname, const char *snapname)
{
	int err;
	char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
	nvlist_t *snaps = fnvlist_alloc();

	fnvlist_add_boolean(snaps, longsnap);
	err = dmu_objset_snapshot(snaps, NULL, NULL);
	fnvlist_free(snaps);
	strfree(longsnap);
	return (err);
}

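/*
 * Take a temporary snapshot: the snapshot is created with a temporary
 * user hold registered against cleanup_fd, and is destroyed (deferred)
 * once that hold is released.
 */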
int
dmu_objset_snapshot_tmp(const char *snapname, const char *tag, int cleanup_fd)
{
	dsl_sync_task_t *dst;
	snapallarg_t saa = { 0 };
	spa_t *spa;
	minor_t minor;
	int err;

	err = spa_open(snapname, &spa, FTAG);
	if (err)
		return (err);
	saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	saa.saa_htag = tag;
	saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
	saa.saa_temporary = B_TRUE;

	if (cleanup_fd < 0) {
		spa_close(spa, FTAG);
		return (EINVAL);
	}
	if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) {
		spa_close(spa, FTAG);
		return (err);
	}

	err = snapshot_one_impl(snapname, &saa);

	if (err == 0)
		err = dsl_sync_task_group_wait(saa.saa_dstg);

	for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst;
	    dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) {
		objset_t *os = dst->dst_arg1;
		dsl_register_onexit_hold_cleanup(saa.saa_newds, tag, minor);
		if (saa.saa_needsuspend)
			zil_resume(dmu_objset_zil(os));
		dmu_objset_rele(os, &saa);
	}

	zfs_onexit_fd_rele(cleanup_fd);
	dsl_sync_task_group_destroy(saa.saa_dstg);
	spa_close(spa, FTAG);
	return (err);
}

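/*
 * Sync each dirty dnode on 'list', moving it to 'newlist' (when user
 * accounting is enabled) so the accounting can be updated after the sync.
 */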
static void
dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
{
	dnode_t *dn;

	while (dn = list_head(list)) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		ASSERT(dn->dn_dbuf->db_data_pending);
		/*
		 * Initialize dn_zio outside dnode_sync() because the
		 * meta-dnode needs to set it outside dnode_sync().
		 */
		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
		ASSERT(dn->dn_zio);

		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
		list_remove(list, dn);

		if (newlist) {
			(void) dnode_add_ref(dn, newlist);
			list_insert_tail(newlist, dn);
		}

		dnode_sync(dn, tx);
	}
}

/* ARGSUSED */
static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	blkptr_t *bp = zio->io_bp;
	objset_t *os = arg;
	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;

	ASSERT(bp == os->os_rootbp);
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
	ASSERT(BP_GET_LEVEL(bp) == 0);

	/*
	 * Update rootbp fill count: it should be the number of objects
	 * allocated in the object set (not counting the "special"
	 * objects that are stored in the objset_phys_t -- the meta
	 * dnode and user/group accounting objects).
	 */
	bp->blk_fill = 0;
	for (int i = 0; i < dnp->dn_nblkptr; i++)
		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
}

/* ARGSUSED */
static void
dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	objset_t *os = arg;

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}
}

/* called from dsl */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
	int txgoff;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *zio;
	list_t *list;
	list_t *newlist = NULL;
	dbuf_dirty_record_t *dr;

	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);

	ASSERT(dmu_tx_is_syncing(tx));
	/* XXX the write_done callback should really give us the tx... */
	os->os_synctx = tx;

	if (os->os_dsl_dataset == NULL) {
		/*
		 * This is the MOS.  If we have upgraded,
		 * spa_max_replication() could change, so reset
		 * os_copies here.
		 */
		os->os_copies = spa_max_replication(os->os_spa);
	}

	/*
	 * Create the root block IO
	 */
	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	arc_release(os->os_phys_buf, &os->os_phys_buf);

	dmu_write_policy(os, NULL, 0, 0, &zp);

	zio = arc_write(pio, os->os_spa, tx->tx_txg,
	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
	    dmu_objset_write_ready, dmu_objset_write_done, os,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

	/*
	 * Sync special dnodes - the parent IO for the sync is the root block
	 */
	DMU_META_DNODE(os)->dn_zio = zio;
	dnode_sync(DMU_META_DNODE(os), tx);

	os->os_phys->os_flags = os->os_flags;

	if (DMU_USERUSED_DNODE(os) &&
	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
		DMU_USERUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_USERUSED_DNODE(os), tx);
		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
	}

	txgoff = tx->tx_txg & TXG_MASK;

	if (dmu_objset_userused_enabled(os)) {
		newlist = &os->os_synced_dnodes;
		/*
		 * We must create the list here because it uses the
		 * dn_dirty_link[] of this txg.
		 */
		list_create(newlist, sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[txgoff]));
	}

	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);

	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
	while (dr = list_head(list)) {
		ASSERT(dr->dr_dbuf->db_level == 0);
		list_remove(list, dr);
		if (dr->dr_zio)
			zio_nowait(dr->dr_zio);
	}
	/*
	 * Free intent log blocks up to this tx.
	 */
	zil_sync(os->os_zil, tx);
	os->os_phys->os_zil_header = os->os_zil_header;
	zio_nowait(zio);
}

boolean_t
dmu_objset_is_dirty(objset_t *os, uint64_t txg)
{
	return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
	    !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
}

static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];

void
dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
{
	used_cbs[ost] = cb;
}

boolean_t
dmu_objset_userused_enabled(objset_t *os)
{
	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
	    used_cbs[os->os_phys->os_type] != NULL &&
	    DMU_USERUSED_DNODE(os) != NULL);
}

static void
do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
    uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
{
	if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
		int64_t delta = DNODE_SIZE + used;
		if (subtract)
			delta = -delta;
		VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
		    user, delta, tx));
		VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
		    group, delta, tx));
	}
}

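/*
 * Apply the pending user/group space deltas for every dnode synced in
 * this txg, creating the userused/groupused ZAP objects on first use.
 */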
void
dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
{
	dnode_t *dn;
	list_t *list = &os->os_synced_dnodes;

	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));

	while (dn = list_head(list)) {
		int flags;
		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
		    dn->dn_phys->dn_flags &
		    DNODE_FLAG_USERUSED_ACCOUNTED);

		/* Allocate the user/groupused objects if necessary. */
		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
			VERIFY(0 == zap_create_claim(os,
			    DMU_USERUSED_OBJECT,
			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
			VERIFY(0 == zap_create_claim(os,
			    DMU_GROUPUSED_OBJECT,
			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
		}

		/*
		 * We intentionally modify the zap object even if the
		 * net delta is zero.  Otherwise
		 * the block of the zap obj could be shared between
		 * datasets but need to be different between them after
		 * a bprewrite.
		 */

		flags = dn->dn_id_flags;
		ASSERT(flags);
		if (flags & DN_ID_OLD_EXIST) {
			do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
			    dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
		}
		if (flags & DN_ID_NEW_EXIST) {
			do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
			    dn->dn_phys->dn_flags, dn->dn_newuid,
			    dn->dn_newgid, B_FALSE, tx);
		}

		mutex_enter(&dn->dn_mtx);
		dn->dn_oldused = 0;
		dn->dn_oldflags = 0;
		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
			dn->dn_olduid = dn->dn_newuid;
			dn->dn_oldgid = dn->dn_newgid;
			dn->dn_id_flags |= DN_ID_OLD_EXIST;
			if (dn->dn_bonuslen == 0)
				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
			else
				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		}
		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
		mutex_exit(&dn->dn_mtx);

		list_remove(list, dn);
		dnode_rele(dn, list);
	}
}

/*
 * Returns a pointer to the data from which to find the uid/gid.
 *
 * If a dirty record for the transaction group that is syncing can't
 * be found then NULL is returned.  In the NULL case it is assumed
 * the uid/gid aren't changing.
 */
static void *
dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr, **drp;
	void *data;

	if (db->db_dirtycnt == 0)
		return (db->db.db_data);  /* Nothing is changing */

	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg == tx->tx_txg)
			break;

	if (dr == NULL) {
		data = NULL;
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(dr->dr_dbuf);
		dn = DB_DNODE(dr->dr_dbuf);

		if (dn->dn_bonuslen == 0 &&
		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
			data = dr->dt.dl.dr_data->b_data;
		else
			data = dr->dt.dl.dr_data;

		DB_DNODE_EXIT(dr->dr_dbuf);
	}

	return (data);
}

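/*
 * Record the uid/gid for a dnode, either its old (on-disk) identity when
 * 'before' is set or its pending new identity, by asking the objset
 * type's registered callback to interpret the bonus or spill data.
 */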
void
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	void *data = NULL;
	dmu_buf_impl_t *db = NULL;
	uint64_t *user = NULL;
	uint64_t *group = NULL;
	int flags = dn->dn_id_flags;
	int error;
	boolean_t have_spill = B_FALSE;

	if (!dmu_objset_userused_enabled(dn->dn_objset))
		return;

	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
	    DN_ID_CHKED_SPILL)))
		return;

	if (before && dn->dn_bonuslen != 0)
		data = DN_BONUS(dn->dn_phys);
	else if (!before && dn->dn_bonuslen != 0) {
		if (dn->dn_bonus) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
			data = dmu_objset_userquota_find_data(db, tx);
		} else {
			data = DN_BONUS(dn->dn_phys);
		}
	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
		int rf = 0;

		if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
			rf |= DB_RF_HAVESTRUCT;
		error = dmu_spill_hold_by_dnode(dn,
		    rf | DB_RF_MUST_SUCCEED,
		    FTAG, (dmu_buf_t **)&db);
		ASSERT(error == 0);
		mutex_enter(&db->db_mtx);
		data = (before) ? db->db.db_data :
		    dmu_objset_userquota_find_data(db, tx);
		have_spill = B_TRUE;
	} else {
		mutex_enter(&dn->dn_mtx);
		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		mutex_exit(&dn->dn_mtx);
		return;
	}

	if (before) {
		ASSERT(data);
		user = &dn->dn_olduid;
		group = &dn->dn_oldgid;
	} else if (data) {
		user = &dn->dn_newuid;
		group = &dn->dn_newgid;
	}

	/*
	 * Must always call the callback in case the object
	 * type has changed and that type isn't an object type to track
	 */
	error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
	    user, group);

	/*
	 * Preserve existing uid/gid when the callback can't determine
	 * what the new uid/gid are and the callback returned EEXIST.
	 * The EEXIST error tells us to just use the existing uid/gid.
	 * If we don't know what the old values are then just assign
	 * them to 0, since that is a new file being created.
	 */
	if (!before && data == NULL && error == EEXIST) {
		if (flags & DN_ID_OLD_EXIST) {
			dn->dn_newuid = dn->dn_olduid;
			dn->dn_newgid = dn->dn_oldgid;
		} else {
			dn->dn_newuid = 0;
			dn->dn_newgid = 0;
		}
		error = 0;
	}

	if (db)
		mutex_exit(&db->db_mtx);

	mutex_enter(&dn->dn_mtx);
	if (error == 0 && before)
		dn->dn_id_flags |= DN_ID_OLD_EXIST;
	if (error == 0 && !before)
		dn->dn_id_flags |= DN_ID_NEW_EXIST;

	if (have_spill) {
		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
	} else {
		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
	}
	mutex_exit(&dn->dn_mtx);
	if (have_spill)
		dmu_buf_rele((dmu_buf_t *)db, FTAG);
}

boolean_t
dmu_objset_userspace_present(objset_t *os)
{
	return (os->os_phys->os_flags &
	    OBJSET_FLAG_USERACCOUNTING_COMPLETE);
}

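/*
 * Bring user accounting up to date by dirtying every object in the
 * objset, so that each one is accounted when it is next synced out.
 */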
int
dmu_objset_userspace_upgrade(objset_t *os)
{
	uint64_t obj;
	int err = 0;

	if (dmu_objset_userspace_present(os))
		return (0);
	if (!dmu_objset_userused_enabled(os))
		return (ENOTSUP);
	if (dmu_objset_is_snapshot(os))
		return (EINVAL);

	/*
	 * We simply need to mark every object dirty, so that it will be
	 * synced out and now accounted.  If this is called
	 * concurrently, or if we already did some work before crashing,
	 * that's fine, since we track each object's accounted state
	 * independently.
	 */

	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
		dmu_tx_t *tx;
		dmu_buf_t *db;
		int objerr;

		if (issig(JUSTLOOKING) && issig(FORREAL))
			return (EINTR);

		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
		if (objerr)
			continue;
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, obj);
		objerr = dmu_tx_assign(tx, TXG_WAIT);
		if (objerr) {
			dmu_tx_abort(tx);
			continue;
		}
		dmu_buf_will_dirty(db, tx);
		dmu_buf_rele(db, FTAG);
		dmu_tx_commit(tx);
	}

	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}

void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
	    usedobjsp, availobjsp);
}

uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}

void
dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
{
	stat->dds_type = os->os_phys->os_type;
	if (os->os_dsl_dataset)
		dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
}

void
dmu_objset_stats(objset_t *os, nvlist_t *nv)
{
	ASSERT(os->os_dsl_dataset ||
	    os->os_phys->os_type == DMU_OST_META);

	if (os->os_dsl_dataset != NULL)
		dsl_dataset_stats(os->os_dsl_dataset, nv);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
	    os->os_phys->os_type);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
	    dmu_objset_userspace_present(os));
}

int
dmu_objset_is_snapshot(objset_t *os)
{
	if (os->os_dsl_dataset != NULL)
		return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
	else
		return (B_FALSE);
}

int
dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
    boolean_t *conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	uint64_t ignored;

	if (ds->ds_phys->ds_snapnames_zapobj == 0)
		return (ENOENT);

	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST,
	    real, maxlen, conflict));
}

int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	zap_cursor_t cursor;
	zap_attribute_t attr;

	if (ds->ds_phys->ds_snapnames_zapobj == 0)
		return (ENOENT);

	zap_cursor_init_serialized(&cursor,
	    ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
		zap_cursor_fini(&cursor);
		return (ENOENT);
	}

	if (strlen(attr.za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		return (ENAMETOOLONG);
	}

	(void) strcpy(name, attr.za_name);
	if (idp)
		*idp = attr.za_first_integer;
	if (case_conflict)
		*case_conflict = attr.za_normalization_conflict;
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);

	return (0);
}

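/*
 * Like dmu_snapshot_list_next(), but iterates over the child directories
 * of the dataset's dsl_dir rather than its snapshots.
 */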
int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp)
{
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
	zap_cursor_t cursor;
	zap_attribute_t attr;

	/* there is no next dir on a snapshot! */
	if (os->os_dsl_dataset->ds_object !=
	    dd->dd_phys->dd_head_dataset_obj)
		return (ENOENT);

	zap_cursor_init_serialized(&cursor,
	    dd->dd_pool->dp_meta_objset,
	    dd->dd_phys->dd_child_dir_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
		zap_cursor_fini(&cursor);
		return (ENOENT);
	}

	if (strlen(attr.za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		return (ENAMETOOLONG);
	}

	(void) strcpy(name, attr.za_name);
	if (idp)
		*idp = attr.za_first_integer;
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);

	return (0);
}

struct findarg {
	int (*func)(const char *, void *);
	void *arg;
};

/* ARGSUSED */
static int
findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	struct findarg *fa = arg;
	return (fa->func(dsname, fa->arg));
}

/*
 * Find all objsets under name, and for each, call 'func(child_name, arg)'.
 * Perhaps change all callers to use dmu_objset_find_spa()?
 */
int
dmu_objset_find(char *name, int func(const char *, void *), void *arg,
    int flags)
{
	struct findarg fa;
	fa.func = func;
	fa.arg = arg;
	return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags));
}

/*
 * Find all objsets under name, call func on each
 */
int
dmu_objset_find_spa(spa_t *spa, const char *name,
    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	zap_cursor_t zc;
	zap_attribute_t *attr;
	char *child;
	uint64_t thisobj;
	int err;

	if (name == NULL)
		name = spa_name(spa);
	err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL);
	if (err)
		return (err);

	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
	if (dd->dd_myname[0] == '$') {
		dsl_dir_close(dd, FTAG);
		return (0);
	}

	thisobj = dd->dd_phys->dd_head_dataset_obj;
	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
	dp = dd->dd_pool;

	/*
	 * Iterate over all children.
	 */
	if (flags & DS_FIND_CHILDREN) {
		for (zap_cursor_init(&zc, dp->dp_meta_objset,
		    dd->dd_phys->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, attr) == 0;
		    (void) zap_cursor_advance(&zc)) {
			ASSERT(attr->za_integer_length == sizeof (uint64_t));
			ASSERT(attr->za_num_integers == 1);

			child = kmem_asprintf("%s/%s", name, attr->za_name);
			err = dmu_objset_find_spa(spa, child, func, arg, flags);
			strfree(child);
			if (err)
				break;
		}
		zap_cursor_fini(&zc);

		if (err) {
			dsl_dir_close(dd, FTAG);
			kmem_free(attr, sizeof (zap_attribute_t));
			return (err);
		}
	}

	/*
	 * Iterate over all snapshots.
	 */
	if (flags & DS_FIND_SNAPSHOTS) {
		if (!dsl_pool_sync_context(dp))
			rw_enter(&dp->dp_config_rwlock, RW_READER);
		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
		if (!dsl_pool_sync_context(dp))
			rw_exit(&dp->dp_config_rwlock);

		if (err == 0) {
			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
			dsl_dataset_rele(ds, FTAG);

			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT(attr->za_integer_length ==
				    sizeof (uint64_t));
				ASSERT(attr->za_num_integers == 1);

				child = kmem_asprintf("%s@%s",
				    name, attr->za_name);
				err = func(spa, attr->za_first_integer,
				    child, arg);
				strfree(child);
				if (err)
					break;
			}
			zap_cursor_fini(&zc);
		}
	}

	dsl_dir_close(dd, FTAG);
	kmem_free(attr, sizeof (zap_attribute_t));

	if (err)
		return (err);

	/*
	 * Apply to self if appropriate.
	 */
	err = func(spa, thisobj, name, arg);
	return (err);
}

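/*
 * Issue a speculative, asynchronous read of a dataset's root block into
 * the ARC so a later open does not have to wait for the I/O.
 */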
/* ARGSUSED */
int
dmu_objset_prefetch(const char *name, void *arg)
{
	dsl_dataset_t *ds;

	if (dsl_dataset_hold(name, FTAG, &ds))
		return (0);

	if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
		mutex_enter(&ds->ds_opening_lock);
		if (ds->ds_objset == NULL) {
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;

			SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
			    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

			(void) arc_read(NULL, dsl_dataset_get_spa(ds),
			    &ds->ds_phys->ds_bp, NULL, NULL,
			    ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		mutex_exit(&ds->ds_opening_lock);
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	os->os_user_ptr = user_ptr;
}

void *
dmu_objset_get_user(objset_t *os)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	return (os->os_user_ptr);
}