1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 */ 25 26 /* Portions Copyright 2010 Robert Milkowski */ 27 28 #include <sys/cred.h> 29 #include <sys/zfs_context.h> 30 #include <sys/dmu_objset.h> 31 #include <sys/dsl_dir.h> 32 #include <sys/dsl_dataset.h> 33 #include <sys/dsl_prop.h> 34 #include <sys/dsl_pool.h> 35 #include <sys/dsl_synctask.h> 36 #include <sys/dsl_deleg.h> 37 #include <sys/dnode.h> 38 #include <sys/dbuf.h> 39 #include <sys/zvol.h> 40 #include <sys/dmu_tx.h> 41 #include <sys/zap.h> 42 #include <sys/zil.h> 43 #include <sys/dmu_impl.h> 44 #include <sys/zfs_ioctl.h> 45 #include <sys/sa.h> 46 #include <sys/zfs_onexit.h> 47 48 /* 49 * Needed to close a window in dnode_move() that allows the objset to be freed 50 * before it can be safely accessed. 51 */ 52 krwlock_t os_lock; 53 54 void 55 dmu_objset_init(void) 56 { 57 rw_init(&os_lock, NULL, RW_DEFAULT, NULL); 58 } 59 60 void 61 dmu_objset_fini(void) 62 { 63 rw_destroy(&os_lock); 64 } 65 66 spa_t * 67 dmu_objset_spa(objset_t *os) 68 { 69 return (os->os_spa); 70 } 71 72 zilog_t * 73 dmu_objset_zil(objset_t *os) 74 { 75 return (os->os_zil); 76 } 77 78 dsl_pool_t * 79 dmu_objset_pool(objset_t *os) 80 { 81 dsl_dataset_t *ds; 82 83 if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) 84 return (ds->ds_dir->dd_pool); 85 else 86 return (spa_get_dsl(os->os_spa)); 87 } 88 89 dsl_dataset_t * 90 dmu_objset_ds(objset_t *os) 91 { 92 return (os->os_dsl_dataset); 93 } 94 95 dmu_objset_type_t 96 dmu_objset_type(objset_t *os) 97 { 98 return (os->os_phys->os_type); 99 } 100 101 void 102 dmu_objset_name(objset_t *os, char *buf) 103 { 104 dsl_dataset_name(os->os_dsl_dataset, buf); 105 } 106 107 uint64_t 108 dmu_objset_id(objset_t *os) 109 { 110 dsl_dataset_t *ds = os->os_dsl_dataset; 111 112 return (ds ? ds->ds_object : 0); 113 } 114 115 uint64_t 116 dmu_objset_syncprop(objset_t *os) 117 { 118 return (os->os_sync); 119 } 120 121 uint64_t 122 dmu_objset_logbias(objset_t *os) 123 { 124 return (os->os_logbias); 125 } 126 127 static void 128 checksum_changed_cb(void *arg, uint64_t newval) 129 { 130 objset_t *os = arg; 131 132 /* 133 * Inheritance should have been done by now. 134 */ 135 ASSERT(newval != ZIO_CHECKSUM_INHERIT); 136 137 os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); 138 } 139 140 static void 141 compression_changed_cb(void *arg, uint64_t newval) 142 { 143 objset_t *os = arg; 144 145 /* 146 * Inheritance and range checking should have been done by now. 147 */ 148 ASSERT(newval != ZIO_COMPRESS_INHERIT); 149 150 os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); 151 } 152 153 static void 154 copies_changed_cb(void *arg, uint64_t newval) 155 { 156 objset_t *os = arg; 157 158 /* 159 * Inheritance and range checking should have been done by now. 160 */ 161 ASSERT(newval > 0); 162 ASSERT(newval <= spa_max_replication(os->os_spa)); 163 164 os->os_copies = newval; 165 } 166 167 static void 168 dedup_changed_cb(void *arg, uint64_t newval) 169 { 170 objset_t *os = arg; 171 spa_t *spa = os->os_spa; 172 enum zio_checksum checksum; 173 174 /* 175 * Inheritance should have been done by now. 176 */ 177 ASSERT(newval != ZIO_CHECKSUM_INHERIT); 178 179 checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); 180 181 os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; 182 os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); 183 } 184 185 static void 186 primary_cache_changed_cb(void *arg, uint64_t newval) 187 { 188 objset_t *os = arg; 189 190 /* 191 * Inheritance and range checking should have been done by now. 192 */ 193 ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || 194 newval == ZFS_CACHE_METADATA); 195 196 os->os_primary_cache = newval; 197 } 198 199 static void 200 secondary_cache_changed_cb(void *arg, uint64_t newval) 201 { 202 objset_t *os = arg; 203 204 /* 205 * Inheritance and range checking should have been done by now. 206 */ 207 ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || 208 newval == ZFS_CACHE_METADATA); 209 210 os->os_secondary_cache = newval; 211 } 212 213 static void 214 sync_changed_cb(void *arg, uint64_t newval) 215 { 216 objset_t *os = arg; 217 218 /* 219 * Inheritance and range checking should have been done by now. 220 */ 221 ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || 222 newval == ZFS_SYNC_DISABLED); 223 224 os->os_sync = newval; 225 if (os->os_zil) 226 zil_set_sync(os->os_zil, newval); 227 } 228 229 static void 230 logbias_changed_cb(void *arg, uint64_t newval) 231 { 232 objset_t *os = arg; 233 234 ASSERT(newval == ZFS_LOGBIAS_LATENCY || 235 newval == ZFS_LOGBIAS_THROUGHPUT); 236 os->os_logbias = newval; 237 if (os->os_zil) 238 zil_set_logbias(os->os_zil, newval); 239 } 240 241 void 242 dmu_objset_byteswap(void *buf, size_t size) 243 { 244 objset_phys_t *osp = buf; 245 246 ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); 247 dnode_byteswap(&osp->os_meta_dnode); 248 byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); 249 osp->os_type = BSWAP_64(osp->os_type); 250 osp->os_flags = BSWAP_64(osp->os_flags); 251 if (size == sizeof (objset_phys_t)) { 252 dnode_byteswap(&osp->os_userused_dnode); 253 dnode_byteswap(&osp->os_groupused_dnode); 254 } 255 } 256 257 int 258 dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, 259 objset_t **osp) 260 { 261 objset_t *os; 262 int i, err; 263 264 ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); 265 266 os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); 267 os->os_dsl_dataset = ds; 268 os->os_spa = spa; 269 os->os_rootbp = bp; 270 if (!BP_IS_HOLE(os->os_rootbp)) { 271 uint32_t aflags = ARC_WAIT; 272 zbookmark_t zb; 273 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 274 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 275 276 if (DMU_OS_IS_L2CACHEABLE(os)) 277 aflags |= ARC_L2CACHE; 278 279 dprintf_bp(os->os_rootbp, "reading %s", ""); 280 err = arc_read(NULL, spa, os->os_rootbp, 281 arc_getbuf_func, &os->os_phys_buf, 282 ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); 283 if (err) { 284 kmem_free(os, sizeof (objset_t)); 285 /* convert checksum errors into IO errors */ 286 if (err == ECKSUM) 287 err = EIO; 288 return (err); 289 } 290 291 /* Increase the blocksize if we are permitted. */ 292 if (spa_version(spa) >= SPA_VERSION_USERSPACE && 293 arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { 294 arc_buf_t *buf = arc_buf_alloc(spa, 295 sizeof (objset_phys_t), &os->os_phys_buf, 296 ARC_BUFC_METADATA); 297 bzero(buf->b_data, sizeof (objset_phys_t)); 298 bcopy(os->os_phys_buf->b_data, buf->b_data, 299 arc_buf_size(os->os_phys_buf)); 300 (void) arc_buf_remove_ref(os->os_phys_buf, 301 &os->os_phys_buf); 302 os->os_phys_buf = buf; 303 } 304 305 os->os_phys = os->os_phys_buf->b_data; 306 os->os_flags = os->os_phys->os_flags; 307 } else { 308 int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 309 sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; 310 os->os_phys_buf = arc_buf_alloc(spa, size, 311 &os->os_phys_buf, ARC_BUFC_METADATA); 312 os->os_phys = os->os_phys_buf->b_data; 313 bzero(os->os_phys, size); 314 } 315 316 /* 317 * Note: the changed_cb will be called once before the register 318 * func returns, thus changing the checksum/compression from the 319 * default (fletcher2/off). Snapshots don't need to know about 320 * checksum/compression/copies. 321 */ 322 if (ds) { 323 err = dsl_prop_register(ds, "primarycache", 324 primary_cache_changed_cb, os); 325 if (err == 0) 326 err = dsl_prop_register(ds, "secondarycache", 327 secondary_cache_changed_cb, os); 328 if (!dsl_dataset_is_snapshot(ds)) { 329 if (err == 0) 330 err = dsl_prop_register(ds, "checksum", 331 checksum_changed_cb, os); 332 if (err == 0) 333 err = dsl_prop_register(ds, "compression", 334 compression_changed_cb, os); 335 if (err == 0) 336 err = dsl_prop_register(ds, "copies", 337 copies_changed_cb, os); 338 if (err == 0) 339 err = dsl_prop_register(ds, "dedup", 340 dedup_changed_cb, os); 341 if (err == 0) 342 err = dsl_prop_register(ds, "logbias", 343 logbias_changed_cb, os); 344 if (err == 0) 345 err = dsl_prop_register(ds, "sync", 346 sync_changed_cb, os); 347 } 348 if (err) { 349 VERIFY(arc_buf_remove_ref(os->os_phys_buf, 350 &os->os_phys_buf) == 1); 351 kmem_free(os, sizeof (objset_t)); 352 return (err); 353 } 354 } else if (ds == NULL) { 355 /* It's the meta-objset. */ 356 os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; 357 os->os_compress = ZIO_COMPRESS_LZJB; 358 os->os_copies = spa_max_replication(spa); 359 os->os_dedup_checksum = ZIO_CHECKSUM_OFF; 360 os->os_dedup_verify = 0; 361 os->os_logbias = 0; 362 os->os_sync = 0; 363 os->os_primary_cache = ZFS_CACHE_ALL; 364 os->os_secondary_cache = ZFS_CACHE_ALL; 365 } 366 367 if (ds == NULL || !dsl_dataset_is_snapshot(ds)) 368 os->os_zil_header = os->os_phys->os_zil_header; 369 os->os_zil = zil_alloc(os, &os->os_zil_header); 370 371 for (i = 0; i < TXG_SIZE; i++) { 372 list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), 373 offsetof(dnode_t, dn_dirty_link[i])); 374 list_create(&os->os_free_dnodes[i], sizeof (dnode_t), 375 offsetof(dnode_t, dn_dirty_link[i])); 376 } 377 list_create(&os->os_dnodes, sizeof (dnode_t), 378 offsetof(dnode_t, dn_link)); 379 list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), 380 offsetof(dmu_buf_impl_t, db_link)); 381 382 mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); 383 mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); 384 mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); 385 386 DMU_META_DNODE(os) = dnode_special_open(os, 387 &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, 388 &os->os_meta_dnode); 389 if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { 390 DMU_USERUSED_DNODE(os) = dnode_special_open(os, 391 &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, 392 &os->os_userused_dnode); 393 DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, 394 &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, 395 &os->os_groupused_dnode); 396 } 397 398 /* 399 * We should be the only thread trying to do this because we 400 * have ds_opening_lock 401 */ 402 if (ds) { 403 mutex_enter(&ds->ds_lock); 404 ASSERT(ds->ds_objset == NULL); 405 ds->ds_objset = os; 406 mutex_exit(&ds->ds_lock); 407 } 408 409 *osp = os; 410 return (0); 411 } 412 413 int 414 dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) 415 { 416 int err = 0; 417 418 mutex_enter(&ds->ds_opening_lock); 419 *osp = ds->ds_objset; 420 if (*osp == NULL) { 421 err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), 422 ds, dsl_dataset_get_blkptr(ds), osp); 423 } 424 mutex_exit(&ds->ds_opening_lock); 425 return (err); 426 } 427 428 /* called from zpl */ 429 int 430 dmu_objset_hold(const char *name, void *tag, objset_t **osp) 431 { 432 dsl_dataset_t *ds; 433 int err; 434 435 err = dsl_dataset_hold(name, tag, &ds); 436 if (err) 437 return (err); 438 439 err = dmu_objset_from_ds(ds, osp); 440 if (err) 441 dsl_dataset_rele(ds, tag); 442 443 return (err); 444 } 445 446 /* called from zpl */ 447 int 448 dmu_objset_own(const char *name, dmu_objset_type_t type, 449 boolean_t readonly, void *tag, objset_t **osp) 450 { 451 dsl_dataset_t *ds; 452 int err; 453 454 err = dsl_dataset_own(name, B_FALSE, tag, &ds); 455 if (err) 456 return (err); 457 458 err = dmu_objset_from_ds(ds, osp); 459 if (err) { 460 dsl_dataset_disown(ds, tag); 461 } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { 462 dmu_objset_disown(*osp, tag); 463 return (EINVAL); 464 } else if (!readonly && dsl_dataset_is_snapshot(ds)) { 465 dmu_objset_disown(*osp, tag); 466 return (EROFS); 467 } 468 return (err); 469 } 470 471 void 472 dmu_objset_rele(objset_t *os, void *tag) 473 { 474 dsl_dataset_rele(os->os_dsl_dataset, tag); 475 } 476 477 void 478 dmu_objset_disown(objset_t *os, void *tag) 479 { 480 dsl_dataset_disown(os->os_dsl_dataset, tag); 481 } 482 483 int 484 dmu_objset_evict_dbufs(objset_t *os) 485 { 486 dnode_t *dn; 487 488 mutex_enter(&os->os_lock); 489 490 /* process the mdn last, since the other dnodes have holds on it */ 491 list_remove(&os->os_dnodes, DMU_META_DNODE(os)); 492 list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); 493 494 /* 495 * Find the first dnode with holds. We have to do this dance 496 * because dnode_add_ref() only works if you already have a 497 * hold. If there are no holds then it has no dbufs so OK to 498 * skip. 499 */ 500 for (dn = list_head(&os->os_dnodes); 501 dn && !dnode_add_ref(dn, FTAG); 502 dn = list_next(&os->os_dnodes, dn)) 503 continue; 504 505 while (dn) { 506 dnode_t *next_dn = dn; 507 508 do { 509 next_dn = list_next(&os->os_dnodes, next_dn); 510 } while (next_dn && !dnode_add_ref(next_dn, FTAG)); 511 512 mutex_exit(&os->os_lock); 513 dnode_evict_dbufs(dn); 514 dnode_rele(dn, FTAG); 515 mutex_enter(&os->os_lock); 516 dn = next_dn; 517 } 518 dn = list_head(&os->os_dnodes); 519 mutex_exit(&os->os_lock); 520 return (dn != DMU_META_DNODE(os)); 521 } 522 523 void 524 dmu_objset_evict(objset_t *os) 525 { 526 dsl_dataset_t *ds = os->os_dsl_dataset; 527 528 for (int t = 0; t < TXG_SIZE; t++) 529 ASSERT(!dmu_objset_is_dirty(os, t)); 530 531 if (ds) { 532 if (!dsl_dataset_is_snapshot(ds)) { 533 VERIFY(0 == dsl_prop_unregister(ds, "checksum", 534 checksum_changed_cb, os)); 535 VERIFY(0 == dsl_prop_unregister(ds, "compression", 536 compression_changed_cb, os)); 537 VERIFY(0 == dsl_prop_unregister(ds, "copies", 538 copies_changed_cb, os)); 539 VERIFY(0 == dsl_prop_unregister(ds, "dedup", 540 dedup_changed_cb, os)); 541 VERIFY(0 == dsl_prop_unregister(ds, "logbias", 542 logbias_changed_cb, os)); 543 VERIFY(0 == dsl_prop_unregister(ds, "sync", 544 sync_changed_cb, os)); 545 } 546 VERIFY(0 == dsl_prop_unregister(ds, "primarycache", 547 primary_cache_changed_cb, os)); 548 VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", 549 secondary_cache_changed_cb, os)); 550 } 551 552 if (os->os_sa) 553 sa_tear_down(os); 554 555 /* 556 * We should need only a single pass over the dnode list, since 557 * nothing can be added to the list at this point. 558 */ 559 (void) dmu_objset_evict_dbufs(os); 560 561 dnode_special_close(&os->os_meta_dnode); 562 if (DMU_USERUSED_DNODE(os)) { 563 dnode_special_close(&os->os_userused_dnode); 564 dnode_special_close(&os->os_groupused_dnode); 565 } 566 zil_free(os->os_zil); 567 568 ASSERT3P(list_head(&os->os_dnodes), ==, NULL); 569 570 VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); 571 572 /* 573 * This is a barrier to prevent the objset from going away in 574 * dnode_move() until we can safely ensure that the objset is still in 575 * use. We consider the objset valid before the barrier and invalid 576 * after the barrier. 577 */ 578 rw_enter(&os_lock, RW_READER); 579 rw_exit(&os_lock); 580 581 mutex_destroy(&os->os_lock); 582 mutex_destroy(&os->os_obj_lock); 583 mutex_destroy(&os->os_user_ptr_lock); 584 kmem_free(os, sizeof (objset_t)); 585 } 586 587 timestruc_t 588 dmu_objset_snap_cmtime(objset_t *os) 589 { 590 return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); 591 } 592 593 /* called from dsl for meta-objset */ 594 objset_t * 595 dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, 596 dmu_objset_type_t type, dmu_tx_t *tx) 597 { 598 objset_t *os; 599 dnode_t *mdn; 600 601 ASSERT(dmu_tx_is_syncing(tx)); 602 if (ds != NULL) 603 VERIFY(0 == dmu_objset_from_ds(ds, &os)); 604 else 605 VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); 606 607 mdn = DMU_META_DNODE(os); 608 609 dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, 610 DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); 611 612 /* 613 * We don't want to have to increase the meta-dnode's nlevels 614 * later, because then we could do it in quescing context while 615 * we are also accessing it in open context. 616 * 617 * This precaution is not necessary for the MOS (ds == NULL), 618 * because the MOS is only updated in syncing context. 619 * This is most fortunate: the MOS is the only objset that 620 * needs to be synced multiple times as spa_sync() iterates 621 * to convergence, so minimizing its dn_nlevels matters. 622 */ 623 if (ds != NULL) { 624 int levels = 1; 625 626 /* 627 * Determine the number of levels necessary for the meta-dnode 628 * to contain DN_MAX_OBJECT dnodes. 629 */ 630 while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + 631 (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < 632 DN_MAX_OBJECT * sizeof (dnode_phys_t)) 633 levels++; 634 635 mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = 636 mdn->dn_nlevels = levels; 637 } 638 639 ASSERT(type != DMU_OST_NONE); 640 ASSERT(type != DMU_OST_ANY); 641 ASSERT(type < DMU_OST_NUMTYPES); 642 os->os_phys->os_type = type; 643 if (dmu_objset_userused_enabled(os)) { 644 os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; 645 os->os_flags = os->os_phys->os_flags; 646 } 647 648 dsl_dataset_dirty(ds, tx); 649 650 return (os); 651 } 652 653 struct oscarg { 654 void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); 655 void *userarg; 656 dsl_dataset_t *clone_origin; 657 const char *lastname; 658 dmu_objset_type_t type; 659 uint64_t flags; 660 cred_t *cr; 661 }; 662 663 /*ARGSUSED*/ 664 static int 665 dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) 666 { 667 dsl_dir_t *dd = arg1; 668 struct oscarg *oa = arg2; 669 objset_t *mos = dd->dd_pool->dp_meta_objset; 670 int err; 671 uint64_t ddobj; 672 673 err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, 674 oa->lastname, sizeof (uint64_t), 1, &ddobj); 675 if (err != ENOENT) 676 return (err ? err : EEXIST); 677 678 if (oa->clone_origin != NULL) { 679 /* You can't clone across pools. */ 680 if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) 681 return (EXDEV); 682 683 /* You can only clone snapshots, not the head datasets. */ 684 if (!dsl_dataset_is_snapshot(oa->clone_origin)) 685 return (EINVAL); 686 } 687 688 return (0); 689 } 690 691 static void 692 dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) 693 { 694 dsl_dir_t *dd = arg1; 695 spa_t *spa = dd->dd_pool->dp_spa; 696 struct oscarg *oa = arg2; 697 uint64_t obj; 698 dsl_dataset_t *ds; 699 blkptr_t *bp; 700 701 ASSERT(dmu_tx_is_syncing(tx)); 702 703 obj = dsl_dataset_create_sync(dd, oa->lastname, 704 oa->clone_origin, oa->flags, oa->cr, tx); 705 706 VERIFY3U(0, ==, dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds)); 707 bp = dsl_dataset_get_blkptr(ds); 708 if (BP_IS_HOLE(bp)) { 709 objset_t *os = 710 dmu_objset_create_impl(spa, ds, bp, oa->type, tx); 711 712 if (oa->userfunc) 713 oa->userfunc(os, oa->userarg, oa->cr, tx); 714 } 715 716 if (oa->clone_origin == NULL) { 717 spa_history_log_internal_ds(ds, "create", tx, ""); 718 } else { 719 char namebuf[MAXNAMELEN]; 720 dsl_dataset_name(oa->clone_origin, namebuf); 721 spa_history_log_internal_ds(ds, "clone", tx, 722 "origin=%s (%llu)", namebuf, oa->clone_origin->ds_object); 723 } 724 dsl_dataset_rele(ds, FTAG); 725 } 726 727 int 728 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, 729 void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) 730 { 731 dsl_dir_t *pdd; 732 const char *tail; 733 int err = 0; 734 struct oscarg oa = { 0 }; 735 736 ASSERT(strchr(name, '@') == NULL); 737 err = dsl_dir_open(name, FTAG, &pdd, &tail); 738 if (err) 739 return (err); 740 if (tail == NULL) { 741 dsl_dir_close(pdd, FTAG); 742 return (EEXIST); 743 } 744 745 oa.userfunc = func; 746 oa.userarg = arg; 747 oa.lastname = tail; 748 oa.type = type; 749 oa.flags = flags; 750 oa.cr = CRED(); 751 752 err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, 753 dmu_objset_create_sync, pdd, &oa, 5); 754 dsl_dir_close(pdd, FTAG); 755 return (err); 756 } 757 758 int 759 dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) 760 { 761 dsl_dir_t *pdd; 762 const char *tail; 763 int err = 0; 764 struct oscarg oa = { 0 }; 765 766 ASSERT(strchr(name, '@') == NULL); 767 err = dsl_dir_open(name, FTAG, &pdd, &tail); 768 if (err) 769 return (err); 770 if (tail == NULL) { 771 dsl_dir_close(pdd, FTAG); 772 return (EEXIST); 773 } 774 775 oa.lastname = tail; 776 oa.clone_origin = clone_origin; 777 oa.flags = flags; 778 oa.cr = CRED(); 779 780 err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, 781 dmu_objset_create_sync, pdd, &oa, 5); 782 dsl_dir_close(pdd, FTAG); 783 return (err); 784 } 785 786 int 787 dmu_objset_destroy(const char *name, boolean_t defer) 788 { 789 dsl_dataset_t *ds; 790 int error; 791 792 error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); 793 if (error == 0) { 794 error = dsl_dataset_destroy(ds, FTAG, defer); 795 /* dsl_dataset_destroy() closes the ds. */ 796 } 797 798 return (error); 799 } 800 801 typedef struct snapallarg { 802 dsl_sync_task_group_t *saa_dstg; 803 boolean_t saa_needsuspend; 804 nvlist_t *saa_props; 805 806 /* the following are used only if 'temporary' is set: */ 807 boolean_t saa_temporary; 808 const char *saa_htag; 809 struct dsl_ds_holdarg *saa_ha; 810 dsl_dataset_t *saa_newds; 811 } snapallarg_t; 812 813 typedef struct snaponearg { 814 const char *soa_longname; /* long snap name */ 815 const char *soa_snapname; /* short snap name */ 816 snapallarg_t *soa_saa; 817 } snaponearg_t; 818 819 static int 820 snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) 821 { 822 objset_t *os = arg1; 823 snaponearg_t *soa = arg2; 824 snapallarg_t *saa = soa->soa_saa; 825 int error; 826 827 /* The props have already been checked by zfs_check_userprops(). */ 828 829 error = dsl_dataset_snapshot_check(os->os_dsl_dataset, 830 soa->soa_snapname, tx); 831 if (error) 832 return (error); 833 834 if (saa->saa_temporary) { 835 /* 836 * Ideally we would just call 837 * dsl_dataset_user_hold_check() and 838 * dsl_dataset_destroy_check() here. However the 839 * dataset we want to hold and destroy is the snapshot 840 * that we just confirmed we can create, but it won't 841 * exist until after these checks are run. Do any 842 * checks we can here and if more checks are added to 843 * those routines in the future, similar checks may be 844 * necessary here. 845 */ 846 if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) 847 return (ENOTSUP); 848 /* 849 * Not checking number of tags because the tag will be 850 * unique, as it will be the only tag. 851 */ 852 if (strlen(saa->saa_htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) 853 return (E2BIG); 854 855 saa->saa_ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), 856 KM_SLEEP); 857 saa->saa_ha->temphold = B_TRUE; 858 saa->saa_ha->htag = saa->saa_htag; 859 } 860 return (error); 861 } 862 863 static void 864 snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) 865 { 866 objset_t *os = arg1; 867 dsl_dataset_t *ds = os->os_dsl_dataset; 868 snaponearg_t *soa = arg2; 869 snapallarg_t *saa = soa->soa_saa; 870 871 dsl_dataset_snapshot_sync(ds, soa->soa_snapname, tx); 872 873 if (saa->saa_props != NULL) { 874 dsl_props_arg_t pa; 875 pa.pa_props = saa->saa_props; 876 pa.pa_source = ZPROP_SRC_LOCAL; 877 dsl_props_set_sync(ds->ds_prev, &pa, tx); 878 } 879 880 if (saa->saa_temporary) { 881 struct dsl_ds_destroyarg da; 882 883 dsl_dataset_user_hold_sync(ds->ds_prev, saa->saa_ha, tx); 884 kmem_free(saa->saa_ha, sizeof (struct dsl_ds_holdarg)); 885 saa->saa_ha = NULL; 886 saa->saa_newds = ds->ds_prev; 887 888 da.ds = ds->ds_prev; 889 da.defer = B_TRUE; 890 dsl_dataset_destroy_sync(&da, FTAG, tx); 891 } 892 } 893 894 static int 895 snapshot_one_impl(const char *snapname, void *arg) 896 { 897 char fsname[MAXPATHLEN]; 898 snapallarg_t *saa = arg; 899 snaponearg_t *soa; 900 objset_t *os; 901 int err; 902 903 (void) strlcpy(fsname, snapname, sizeof (fsname)); 904 strchr(fsname, '@')[0] = '\0'; 905 906 err = dmu_objset_hold(fsname, saa, &os); 907 if (err != 0) 908 return (err); 909 910 /* 911 * If the objset is in an inconsistent state (eg, in the process 912 * of being destroyed), don't snapshot it. 913 */ 914 if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { 915 dmu_objset_rele(os, saa); 916 return (EBUSY); 917 } 918 919 if (saa->saa_needsuspend) { 920 err = zil_suspend(dmu_objset_zil(os)); 921 if (err) { 922 dmu_objset_rele(os, saa); 923 return (err); 924 } 925 } 926 927 soa = kmem_zalloc(sizeof (*soa), KM_SLEEP); 928 soa->soa_saa = saa; 929 soa->soa_longname = snapname; 930 soa->soa_snapname = strchr(snapname, '@') + 1; 931 932 dsl_sync_task_create(saa->saa_dstg, snapshot_check, snapshot_sync, 933 os, soa, 3); 934 935 return (0); 936 } 937 938 /* 939 * The snapshots must all be in the same pool. 940 */ 941 int 942 dmu_objset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) 943 { 944 dsl_sync_task_t *dst; 945 snapallarg_t saa = { 0 }; 946 spa_t *spa; 947 int rv = 0; 948 int err; 949 nvpair_t *pair; 950 951 pair = nvlist_next_nvpair(snaps, NULL); 952 if (pair == NULL) 953 return (0); 954 955 err = spa_open(nvpair_name(pair), &spa, FTAG); 956 if (err) 957 return (err); 958 saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 959 saa.saa_props = props; 960 saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); 961 962 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; 963 pair = nvlist_next_nvpair(snaps, pair)) { 964 err = snapshot_one_impl(nvpair_name(pair), &saa); 965 if (err != 0) { 966 if (errors != NULL) { 967 fnvlist_add_int32(errors, 968 nvpair_name(pair), err); 969 } 970 rv = err; 971 } 972 } 973 974 /* 975 * If any call to snapshot_one_impl() failed, don't execute the 976 * sync task. The error handling code below will clean up the 977 * snaponearg_t from any successful calls to 978 * snapshot_one_impl(). 979 */ 980 if (rv == 0) 981 err = dsl_sync_task_group_wait(saa.saa_dstg); 982 if (err != 0) 983 rv = err; 984 985 for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst; 986 dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) { 987 objset_t *os = dst->dst_arg1; 988 snaponearg_t *soa = dst->dst_arg2; 989 if (dst->dst_err != 0) { 990 if (errors != NULL) { 991 fnvlist_add_int32(errors, 992 soa->soa_longname, dst->dst_err); 993 } 994 rv = dst->dst_err; 995 } 996 997 if (saa.saa_needsuspend) 998 zil_resume(dmu_objset_zil(os)); 999 dmu_objset_rele(os, &saa); 1000 kmem_free(soa, sizeof (*soa)); 1001 } 1002 1003 dsl_sync_task_group_destroy(saa.saa_dstg); 1004 spa_close(spa, FTAG); 1005 return (rv); 1006 } 1007 1008 int 1009 dmu_objset_snapshot_one(const char *fsname, const char *snapname) 1010 { 1011 int err; 1012 char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); 1013 nvlist_t *snaps = fnvlist_alloc(); 1014 1015 fnvlist_add_boolean(snaps, longsnap); 1016 err = dmu_objset_snapshot(snaps, NULL, NULL); 1017 fnvlist_free(snaps); 1018 strfree(longsnap); 1019 return (err); 1020 } 1021 1022 int 1023 dmu_objset_snapshot_tmp(const char *snapname, const char *tag, int cleanup_fd) 1024 { 1025 dsl_sync_task_t *dst; 1026 snapallarg_t saa = { 0 }; 1027 spa_t *spa; 1028 minor_t minor; 1029 int err; 1030 1031 err = spa_open(snapname, &spa, FTAG); 1032 if (err) 1033 return (err); 1034 saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 1035 saa.saa_htag = tag; 1036 saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); 1037 saa.saa_temporary = B_TRUE; 1038 1039 if (cleanup_fd < 0) { 1040 spa_close(spa, FTAG); 1041 return (EINVAL); 1042 } 1043 if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { 1044 spa_close(spa, FTAG); 1045 return (err); 1046 } 1047 1048 err = snapshot_one_impl(snapname, &saa); 1049 1050 if (err == 0) 1051 err = dsl_sync_task_group_wait(saa.saa_dstg); 1052 1053 for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst; 1054 dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) { 1055 objset_t *os = dst->dst_arg1; 1056 dsl_register_onexit_hold_cleanup(saa.saa_newds, tag, minor); 1057 if (saa.saa_needsuspend) 1058 zil_resume(dmu_objset_zil(os)); 1059 dmu_objset_rele(os, &saa); 1060 } 1061 1062 zfs_onexit_fd_rele(cleanup_fd); 1063 dsl_sync_task_group_destroy(saa.saa_dstg); 1064 spa_close(spa, FTAG); 1065 return (err); 1066 } 1067 1068 1069 static void 1070 dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) 1071 { 1072 dnode_t *dn; 1073 1074 while (dn = list_head(list)) { 1075 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 1076 ASSERT(dn->dn_dbuf->db_data_pending); 1077 /* 1078 * Initialize dn_zio outside dnode_sync() because the 1079 * meta-dnode needs to set it ouside dnode_sync(). 1080 */ 1081 dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; 1082 ASSERT(dn->dn_zio); 1083 1084 ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); 1085 list_remove(list, dn); 1086 1087 if (newlist) { 1088 (void) dnode_add_ref(dn, newlist); 1089 list_insert_tail(newlist, dn); 1090 } 1091 1092 dnode_sync(dn, tx); 1093 } 1094 } 1095 1096 /* ARGSUSED */ 1097 static void 1098 dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) 1099 { 1100 blkptr_t *bp = zio->io_bp; 1101 objset_t *os = arg; 1102 dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; 1103 1104 ASSERT(bp == os->os_rootbp); 1105 ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); 1106 ASSERT(BP_GET_LEVEL(bp) == 0); 1107 1108 /* 1109 * Update rootbp fill count: it should be the number of objects 1110 * allocated in the object set (not counting the "special" 1111 * objects that are stored in the objset_phys_t -- the meta 1112 * dnode and user/group accounting objects). 1113 */ 1114 bp->blk_fill = 0; 1115 for (int i = 0; i < dnp->dn_nblkptr; i++) 1116 bp->blk_fill += dnp->dn_blkptr[i].blk_fill; 1117 } 1118 1119 /* ARGSUSED */ 1120 static void 1121 dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) 1122 { 1123 blkptr_t *bp = zio->io_bp; 1124 blkptr_t *bp_orig = &zio->io_bp_orig; 1125 objset_t *os = arg; 1126 1127 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 1128 ASSERT(BP_EQUAL(bp, bp_orig)); 1129 } else { 1130 dsl_dataset_t *ds = os->os_dsl_dataset; 1131 dmu_tx_t *tx = os->os_synctx; 1132 1133 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 1134 dsl_dataset_block_born(ds, bp, tx); 1135 } 1136 } 1137 1138 /* called from dsl */ 1139 void 1140 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) 1141 { 1142 int txgoff; 1143 zbookmark_t zb; 1144 zio_prop_t zp; 1145 zio_t *zio; 1146 list_t *list; 1147 list_t *newlist = NULL; 1148 dbuf_dirty_record_t *dr; 1149 1150 dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); 1151 1152 ASSERT(dmu_tx_is_syncing(tx)); 1153 /* XXX the write_done callback should really give us the tx... */ 1154 os->os_synctx = tx; 1155 1156 if (os->os_dsl_dataset == NULL) { 1157 /* 1158 * This is the MOS. If we have upgraded, 1159 * spa_max_replication() could change, so reset 1160 * os_copies here. 1161 */ 1162 os->os_copies = spa_max_replication(os->os_spa); 1163 } 1164 1165 /* 1166 * Create the root block IO 1167 */ 1168 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 1169 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 1170 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 1171 arc_release(os->os_phys_buf, &os->os_phys_buf); 1172 1173 dmu_write_policy(os, NULL, 0, 0, &zp); 1174 1175 zio = arc_write(pio, os->os_spa, tx->tx_txg, 1176 os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp, 1177 dmu_objset_write_ready, dmu_objset_write_done, os, 1178 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 1179 1180 /* 1181 * Sync special dnodes - the parent IO for the sync is the root block 1182 */ 1183 DMU_META_DNODE(os)->dn_zio = zio; 1184 dnode_sync(DMU_META_DNODE(os), tx); 1185 1186 os->os_phys->os_flags = os->os_flags; 1187 1188 if (DMU_USERUSED_DNODE(os) && 1189 DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { 1190 DMU_USERUSED_DNODE(os)->dn_zio = zio; 1191 dnode_sync(DMU_USERUSED_DNODE(os), tx); 1192 DMU_GROUPUSED_DNODE(os)->dn_zio = zio; 1193 dnode_sync(DMU_GROUPUSED_DNODE(os), tx); 1194 } 1195 1196 txgoff = tx->tx_txg & TXG_MASK; 1197 1198 if (dmu_objset_userused_enabled(os)) { 1199 newlist = &os->os_synced_dnodes; 1200 /* 1201 * We must create the list here because it uses the 1202 * dn_dirty_link[] of this txg. 1203 */ 1204 list_create(newlist, sizeof (dnode_t), 1205 offsetof(dnode_t, dn_dirty_link[txgoff])); 1206 } 1207 1208 dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); 1209 dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); 1210 1211 list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; 1212 while (dr = list_head(list)) { 1213 ASSERT(dr->dr_dbuf->db_level == 0); 1214 list_remove(list, dr); 1215 if (dr->dr_zio) 1216 zio_nowait(dr->dr_zio); 1217 } 1218 /* 1219 * Free intent log blocks up to this tx. 1220 */ 1221 zil_sync(os->os_zil, tx); 1222 os->os_phys->os_zil_header = os->os_zil_header; 1223 zio_nowait(zio); 1224 } 1225 1226 boolean_t 1227 dmu_objset_is_dirty(objset_t *os, uint64_t txg) 1228 { 1229 return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || 1230 !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); 1231 } 1232 1233 static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; 1234 1235 void 1236 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) 1237 { 1238 used_cbs[ost] = cb; 1239 } 1240 1241 boolean_t 1242 dmu_objset_userused_enabled(objset_t *os) 1243 { 1244 return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && 1245 used_cbs[os->os_phys->os_type] != NULL && 1246 DMU_USERUSED_DNODE(os) != NULL); 1247 } 1248 1249 static void 1250 do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, 1251 uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) 1252 { 1253 if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { 1254 int64_t delta = DNODE_SIZE + used; 1255 if (subtract) 1256 delta = -delta; 1257 VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, 1258 user, delta, tx)); 1259 VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, 1260 group, delta, tx)); 1261 } 1262 } 1263 1264 void 1265 dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) 1266 { 1267 dnode_t *dn; 1268 list_t *list = &os->os_synced_dnodes; 1269 1270 ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); 1271 1272 while (dn = list_head(list)) { 1273 int flags; 1274 ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); 1275 ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || 1276 dn->dn_phys->dn_flags & 1277 DNODE_FLAG_USERUSED_ACCOUNTED); 1278 1279 /* Allocate the user/groupused objects if necessary. */ 1280 if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { 1281 VERIFY(0 == zap_create_claim(os, 1282 DMU_USERUSED_OBJECT, 1283 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); 1284 VERIFY(0 == zap_create_claim(os, 1285 DMU_GROUPUSED_OBJECT, 1286 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); 1287 } 1288 1289 /* 1290 * We intentionally modify the zap object even if the 1291 * net delta is zero. Otherwise 1292 * the block of the zap obj could be shared between 1293 * datasets but need to be different between them after 1294 * a bprewrite. 1295 */ 1296 1297 flags = dn->dn_id_flags; 1298 ASSERT(flags); 1299 if (flags & DN_ID_OLD_EXIST) { 1300 do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, 1301 dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); 1302 } 1303 if (flags & DN_ID_NEW_EXIST) { 1304 do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), 1305 dn->dn_phys->dn_flags, dn->dn_newuid, 1306 dn->dn_newgid, B_FALSE, tx); 1307 } 1308 1309 mutex_enter(&dn->dn_mtx); 1310 dn->dn_oldused = 0; 1311 dn->dn_oldflags = 0; 1312 if (dn->dn_id_flags & DN_ID_NEW_EXIST) { 1313 dn->dn_olduid = dn->dn_newuid; 1314 dn->dn_oldgid = dn->dn_newgid; 1315 dn->dn_id_flags |= DN_ID_OLD_EXIST; 1316 if (dn->dn_bonuslen == 0) 1317 dn->dn_id_flags |= DN_ID_CHKED_SPILL; 1318 else 1319 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1320 } 1321 dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); 1322 mutex_exit(&dn->dn_mtx); 1323 1324 list_remove(list, dn); 1325 dnode_rele(dn, list); 1326 } 1327 } 1328 1329 /* 1330 * Returns a pointer to data to find uid/gid from 1331 * 1332 * If a dirty record for transaction group that is syncing can't 1333 * be found then NULL is returned. In the NULL case it is assumed 1334 * the uid/gid aren't changing. 1335 */ 1336 static void * 1337 dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) 1338 { 1339 dbuf_dirty_record_t *dr, **drp; 1340 void *data; 1341 1342 if (db->db_dirtycnt == 0) 1343 return (db->db.db_data); /* Nothing is changing */ 1344 1345 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1346 if (dr->dr_txg == tx->tx_txg) 1347 break; 1348 1349 if (dr == NULL) { 1350 data = NULL; 1351 } else { 1352 dnode_t *dn; 1353 1354 DB_DNODE_ENTER(dr->dr_dbuf); 1355 dn = DB_DNODE(dr->dr_dbuf); 1356 1357 if (dn->dn_bonuslen == 0 && 1358 dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) 1359 data = dr->dt.dl.dr_data->b_data; 1360 else 1361 data = dr->dt.dl.dr_data; 1362 1363 DB_DNODE_EXIT(dr->dr_dbuf); 1364 } 1365 1366 return (data); 1367 } 1368 1369 void 1370 dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) 1371 { 1372 objset_t *os = dn->dn_objset; 1373 void *data = NULL; 1374 dmu_buf_impl_t *db = NULL; 1375 uint64_t *user, *group; 1376 int flags = dn->dn_id_flags; 1377 int error; 1378 boolean_t have_spill = B_FALSE; 1379 1380 if (!dmu_objset_userused_enabled(dn->dn_objset)) 1381 return; 1382 1383 if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| 1384 DN_ID_CHKED_SPILL))) 1385 return; 1386 1387 if (before && dn->dn_bonuslen != 0) 1388 data = DN_BONUS(dn->dn_phys); 1389 else if (!before && dn->dn_bonuslen != 0) { 1390 if (dn->dn_bonus) { 1391 db = dn->dn_bonus; 1392 mutex_enter(&db->db_mtx); 1393 data = dmu_objset_userquota_find_data(db, tx); 1394 } else { 1395 data = DN_BONUS(dn->dn_phys); 1396 } 1397 } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { 1398 int rf = 0; 1399 1400 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) 1401 rf |= DB_RF_HAVESTRUCT; 1402 error = dmu_spill_hold_by_dnode(dn, 1403 rf | DB_RF_MUST_SUCCEED, 1404 FTAG, (dmu_buf_t **)&db); 1405 ASSERT(error == 0); 1406 mutex_enter(&db->db_mtx); 1407 data = (before) ? db->db.db_data : 1408 dmu_objset_userquota_find_data(db, tx); 1409 have_spill = B_TRUE; 1410 } else { 1411 mutex_enter(&dn->dn_mtx); 1412 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1413 mutex_exit(&dn->dn_mtx); 1414 return; 1415 } 1416 1417 if (before) { 1418 ASSERT(data); 1419 user = &dn->dn_olduid; 1420 group = &dn->dn_oldgid; 1421 } else if (data) { 1422 user = &dn->dn_newuid; 1423 group = &dn->dn_newgid; 1424 } 1425 1426 /* 1427 * Must always call the callback in case the object 1428 * type has changed and that type isn't an object type to track 1429 */ 1430 error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, 1431 user, group); 1432 1433 /* 1434 * Preserve existing uid/gid when the callback can't determine 1435 * what the new uid/gid are and the callback returned EEXIST. 1436 * The EEXIST error tells us to just use the existing uid/gid. 1437 * If we don't know what the old values are then just assign 1438 * them to 0, since that is a new file being created. 1439 */ 1440 if (!before && data == NULL && error == EEXIST) { 1441 if (flags & DN_ID_OLD_EXIST) { 1442 dn->dn_newuid = dn->dn_olduid; 1443 dn->dn_newgid = dn->dn_oldgid; 1444 } else { 1445 dn->dn_newuid = 0; 1446 dn->dn_newgid = 0; 1447 } 1448 error = 0; 1449 } 1450 1451 if (db) 1452 mutex_exit(&db->db_mtx); 1453 1454 mutex_enter(&dn->dn_mtx); 1455 if (error == 0 && before) 1456 dn->dn_id_flags |= DN_ID_OLD_EXIST; 1457 if (error == 0 && !before) 1458 dn->dn_id_flags |= DN_ID_NEW_EXIST; 1459 1460 if (have_spill) { 1461 dn->dn_id_flags |= DN_ID_CHKED_SPILL; 1462 } else { 1463 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1464 } 1465 mutex_exit(&dn->dn_mtx); 1466 if (have_spill) 1467 dmu_buf_rele((dmu_buf_t *)db, FTAG); 1468 } 1469 1470 boolean_t 1471 dmu_objset_userspace_present(objset_t *os) 1472 { 1473 return (os->os_phys->os_flags & 1474 OBJSET_FLAG_USERACCOUNTING_COMPLETE); 1475 } 1476 1477 int 1478 dmu_objset_userspace_upgrade(objset_t *os) 1479 { 1480 uint64_t obj; 1481 int err = 0; 1482 1483 if (dmu_objset_userspace_present(os)) 1484 return (0); 1485 if (!dmu_objset_userused_enabled(os)) 1486 return (ENOTSUP); 1487 if (dmu_objset_is_snapshot(os)) 1488 return (EINVAL); 1489 1490 /* 1491 * We simply need to mark every object dirty, so that it will be 1492 * synced out and now accounted. If this is called 1493 * concurrently, or if we already did some work before crashing, 1494 * that's fine, since we track each object's accounted state 1495 * independently. 1496 */ 1497 1498 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 1499 dmu_tx_t *tx; 1500 dmu_buf_t *db; 1501 int objerr; 1502 1503 if (issig(JUSTLOOKING) && issig(FORREAL)) 1504 return (EINTR); 1505 1506 objerr = dmu_bonus_hold(os, obj, FTAG, &db); 1507 if (objerr) 1508 continue; 1509 tx = dmu_tx_create(os); 1510 dmu_tx_hold_bonus(tx, obj); 1511 objerr = dmu_tx_assign(tx, TXG_WAIT); 1512 if (objerr) { 1513 dmu_tx_abort(tx); 1514 continue; 1515 } 1516 dmu_buf_will_dirty(db, tx); 1517 dmu_buf_rele(db, FTAG); 1518 dmu_tx_commit(tx); 1519 } 1520 1521 os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; 1522 txg_wait_synced(dmu_objset_pool(os), 0); 1523 return (0); 1524 } 1525 1526 void 1527 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, 1528 uint64_t *usedobjsp, uint64_t *availobjsp) 1529 { 1530 dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, 1531 usedobjsp, availobjsp); 1532 } 1533 1534 uint64_t 1535 dmu_objset_fsid_guid(objset_t *os) 1536 { 1537 return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); 1538 } 1539 1540 void 1541 dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) 1542 { 1543 stat->dds_type = os->os_phys->os_type; 1544 if (os->os_dsl_dataset) 1545 dsl_dataset_fast_stat(os->os_dsl_dataset, stat); 1546 } 1547 1548 void 1549 dmu_objset_stats(objset_t *os, nvlist_t *nv) 1550 { 1551 ASSERT(os->os_dsl_dataset || 1552 os->os_phys->os_type == DMU_OST_META); 1553 1554 if (os->os_dsl_dataset != NULL) 1555 dsl_dataset_stats(os->os_dsl_dataset, nv); 1556 1557 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, 1558 os->os_phys->os_type); 1559 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, 1560 dmu_objset_userspace_present(os)); 1561 } 1562 1563 int 1564 dmu_objset_is_snapshot(objset_t *os) 1565 { 1566 if (os->os_dsl_dataset != NULL) 1567 return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); 1568 else 1569 return (B_FALSE); 1570 } 1571 1572 int 1573 dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, 1574 boolean_t *conflict) 1575 { 1576 dsl_dataset_t *ds = os->os_dsl_dataset; 1577 uint64_t ignored; 1578 1579 if (ds->ds_phys->ds_snapnames_zapobj == 0) 1580 return (ENOENT); 1581 1582 return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, 1583 ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, 1584 real, maxlen, conflict)); 1585 } 1586 1587 int 1588 dmu_snapshot_list_next(objset_t *os, int namelen, char *name, 1589 uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) 1590 { 1591 dsl_dataset_t *ds = os->os_dsl_dataset; 1592 zap_cursor_t cursor; 1593 zap_attribute_t attr; 1594 1595 if (ds->ds_phys->ds_snapnames_zapobj == 0) 1596 return (ENOENT); 1597 1598 zap_cursor_init_serialized(&cursor, 1599 ds->ds_dir->dd_pool->dp_meta_objset, 1600 ds->ds_phys->ds_snapnames_zapobj, *offp); 1601 1602 if (zap_cursor_retrieve(&cursor, &attr) != 0) { 1603 zap_cursor_fini(&cursor); 1604 return (ENOENT); 1605 } 1606 1607 if (strlen(attr.za_name) + 1 > namelen) { 1608 zap_cursor_fini(&cursor); 1609 return (ENAMETOOLONG); 1610 } 1611 1612 (void) strcpy(name, attr.za_name); 1613 if (idp) 1614 *idp = attr.za_first_integer; 1615 if (case_conflict) 1616 *case_conflict = attr.za_normalization_conflict; 1617 zap_cursor_advance(&cursor); 1618 *offp = zap_cursor_serialize(&cursor); 1619 zap_cursor_fini(&cursor); 1620 1621 return (0); 1622 } 1623 1624 int 1625 dmu_dir_list_next(objset_t *os, int namelen, char *name, 1626 uint64_t *idp, uint64_t *offp) 1627 { 1628 dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; 1629 zap_cursor_t cursor; 1630 zap_attribute_t attr; 1631 1632 /* there is no next dir on a snapshot! */ 1633 if (os->os_dsl_dataset->ds_object != 1634 dd->dd_phys->dd_head_dataset_obj) 1635 return (ENOENT); 1636 1637 zap_cursor_init_serialized(&cursor, 1638 dd->dd_pool->dp_meta_objset, 1639 dd->dd_phys->dd_child_dir_zapobj, *offp); 1640 1641 if (zap_cursor_retrieve(&cursor, &attr) != 0) { 1642 zap_cursor_fini(&cursor); 1643 return (ENOENT); 1644 } 1645 1646 if (strlen(attr.za_name) + 1 > namelen) { 1647 zap_cursor_fini(&cursor); 1648 return (ENAMETOOLONG); 1649 } 1650 1651 (void) strcpy(name, attr.za_name); 1652 if (idp) 1653 *idp = attr.za_first_integer; 1654 zap_cursor_advance(&cursor); 1655 *offp = zap_cursor_serialize(&cursor); 1656 zap_cursor_fini(&cursor); 1657 1658 return (0); 1659 } 1660 1661 struct findarg { 1662 int (*func)(const char *, void *); 1663 void *arg; 1664 }; 1665 1666 /* ARGSUSED */ 1667 static int 1668 findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) 1669 { 1670 struct findarg *fa = arg; 1671 return (fa->func(dsname, fa->arg)); 1672 } 1673 1674 /* 1675 * Find all objsets under name, and for each, call 'func(child_name, arg)'. 1676 * Perhaps change all callers to use dmu_objset_find_spa()? 1677 */ 1678 int 1679 dmu_objset_find(char *name, int func(const char *, void *), void *arg, 1680 int flags) 1681 { 1682 struct findarg fa; 1683 fa.func = func; 1684 fa.arg = arg; 1685 return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); 1686 } 1687 1688 /* 1689 * Find all objsets under name, call func on each 1690 */ 1691 int 1692 dmu_objset_find_spa(spa_t *spa, const char *name, 1693 int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) 1694 { 1695 dsl_dir_t *dd; 1696 dsl_pool_t *dp; 1697 dsl_dataset_t *ds; 1698 zap_cursor_t zc; 1699 zap_attribute_t *attr; 1700 char *child; 1701 uint64_t thisobj; 1702 int err; 1703 1704 if (name == NULL) 1705 name = spa_name(spa); 1706 err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); 1707 if (err) 1708 return (err); 1709 1710 /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ 1711 if (dd->dd_myname[0] == '$') { 1712 dsl_dir_close(dd, FTAG); 1713 return (0); 1714 } 1715 1716 thisobj = dd->dd_phys->dd_head_dataset_obj; 1717 attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 1718 dp = dd->dd_pool; 1719 1720 /* 1721 * Iterate over all children. 1722 */ 1723 if (flags & DS_FIND_CHILDREN) { 1724 for (zap_cursor_init(&zc, dp->dp_meta_objset, 1725 dd->dd_phys->dd_child_dir_zapobj); 1726 zap_cursor_retrieve(&zc, attr) == 0; 1727 (void) zap_cursor_advance(&zc)) { 1728 ASSERT(attr->za_integer_length == sizeof (uint64_t)); 1729 ASSERT(attr->za_num_integers == 1); 1730 1731 child = kmem_asprintf("%s/%s", name, attr->za_name); 1732 err = dmu_objset_find_spa(spa, child, func, arg, flags); 1733 strfree(child); 1734 if (err) 1735 break; 1736 } 1737 zap_cursor_fini(&zc); 1738 1739 if (err) { 1740 dsl_dir_close(dd, FTAG); 1741 kmem_free(attr, sizeof (zap_attribute_t)); 1742 return (err); 1743 } 1744 } 1745 1746 /* 1747 * Iterate over all snapshots. 1748 */ 1749 if (flags & DS_FIND_SNAPSHOTS) { 1750 if (!dsl_pool_sync_context(dp)) 1751 rw_enter(&dp->dp_config_rwlock, RW_READER); 1752 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); 1753 if (!dsl_pool_sync_context(dp)) 1754 rw_exit(&dp->dp_config_rwlock); 1755 1756 if (err == 0) { 1757 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 1758 dsl_dataset_rele(ds, FTAG); 1759 1760 for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); 1761 zap_cursor_retrieve(&zc, attr) == 0; 1762 (void) zap_cursor_advance(&zc)) { 1763 ASSERT(attr->za_integer_length == 1764 sizeof (uint64_t)); 1765 ASSERT(attr->za_num_integers == 1); 1766 1767 child = kmem_asprintf("%s@%s", 1768 name, attr->za_name); 1769 err = func(spa, attr->za_first_integer, 1770 child, arg); 1771 strfree(child); 1772 if (err) 1773 break; 1774 } 1775 zap_cursor_fini(&zc); 1776 } 1777 } 1778 1779 dsl_dir_close(dd, FTAG); 1780 kmem_free(attr, sizeof (zap_attribute_t)); 1781 1782 if (err) 1783 return (err); 1784 1785 /* 1786 * Apply to self if appropriate. 1787 */ 1788 err = func(spa, thisobj, name, arg); 1789 return (err); 1790 } 1791 1792 /* ARGSUSED */ 1793 int 1794 dmu_objset_prefetch(const char *name, void *arg) 1795 { 1796 dsl_dataset_t *ds; 1797 1798 if (dsl_dataset_hold(name, FTAG, &ds)) 1799 return (0); 1800 1801 if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { 1802 mutex_enter(&ds->ds_opening_lock); 1803 if (ds->ds_objset == NULL) { 1804 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1805 zbookmark_t zb; 1806 1807 SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, 1808 ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 1809 1810 (void) arc_read(NULL, dsl_dataset_get_spa(ds), 1811 &ds->ds_phys->ds_bp, NULL, NULL, 1812 ZIO_PRIORITY_ASYNC_READ, 1813 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1814 &aflags, &zb); 1815 } 1816 mutex_exit(&ds->ds_opening_lock); 1817 } 1818 1819 dsl_dataset_rele(ds, FTAG); 1820 return (0); 1821 } 1822 1823 void 1824 dmu_objset_set_user(objset_t *os, void *user_ptr) 1825 { 1826 ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); 1827 os->os_user_ptr = user_ptr; 1828 } 1829 1830 void * 1831 dmu_objset_get_user(objset_t *os) 1832 { 1833 ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); 1834 return (os->os_user_ptr); 1835 } 1836