1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 */ 25 26 /* Portions Copyright 2010 Robert Milkowski */ 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/sysmacros.h> 32 #include <sys/kmem.h> 33 #include <sys/pathname.h> 34 #include <sys/vnode.h> 35 #include <sys/vfs.h> 36 #include <sys/vfs_opreg.h> 37 #include <sys/mntent.h> 38 #include <sys/mount.h> 39 #include <sys/cmn_err.h> 40 #include "fs/fs_subr.h" 41 #include <sys/zfs_znode.h> 42 #include <sys/zfs_dir.h> 43 #include <sys/zil.h> 44 #include <sys/fs/zfs.h> 45 #include <sys/dmu.h> 46 #include <sys/dsl_prop.h> 47 #include <sys/dsl_dataset.h> 48 #include <sys/dsl_deleg.h> 49 #include <sys/spa.h> 50 #include <sys/zap.h> 51 #include <sys/sa.h> 52 #include <sys/sa_impl.h> 53 #include <sys/varargs.h> 54 #include <sys/policy.h> 55 #include <sys/atomic.h> 56 #include <sys/mkdev.h> 57 #include <sys/modctl.h> 58 #include <sys/refstr.h> 59 #include <sys/zfs_ioctl.h> 60 #include <sys/zfs_ctldir.h> 61 #include <sys/zfs_fuid.h> 62 #include <sys/bootconf.h> 63 #include <sys/sunddi.h> 64 #include <sys/dnlc.h> 65 #include <sys/dmu_objset.h> 66 #include <sys/spa_boot.h> 67 #include "zfs_comutil.h" 68 69 int zfsfstype; 70 vfsops_t *zfs_vfsops = NULL; 71 static major_t zfs_major; 72 static minor_t zfs_minor; 73 static kmutex_t zfs_dev_mtx; 74 75 extern int sys_shutdown; 76 77 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); 78 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); 79 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); 80 static int zfs_root(vfs_t *vfsp, vnode_t **vpp); 81 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); 82 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); 83 static void zfs_freevfs(vfs_t *vfsp); 84 85 static const fs_operation_def_t zfs_vfsops_template[] = { 86 VFSNAME_MOUNT, { .vfs_mount = zfs_mount }, 87 VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot }, 88 VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount }, 89 VFSNAME_ROOT, { .vfs_root = zfs_root }, 90 VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs }, 91 VFSNAME_SYNC, { .vfs_sync = zfs_sync }, 92 VFSNAME_VGET, { .vfs_vget = zfs_vget }, 93 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, 94 NULL, NULL 95 }; 96 97 static const fs_operation_def_t zfs_vfsops_eio_template[] = { 98 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, 99 NULL, NULL 100 }; 101 102 /* 103 * We need to keep a count of active fs's. 104 * This is necessary to prevent our module 105 * from being unloaded after a umount -f 106 */ 107 static uint32_t zfs_active_fs_count = 0; 108 109 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; 110 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; 111 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 112 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 113 114 /* 115 * MO_DEFAULT is not used since the default value is determined 116 * by the equivalent property. 117 */ 118 static mntopt_t mntopts[] = { 119 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, 120 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, 121 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, 122 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } 123 }; 124 125 static mntopts_t zfs_mntopts = { 126 sizeof (mntopts) / sizeof (mntopt_t), 127 mntopts 128 }; 129 130 /*ARGSUSED*/ 131 int 132 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) 133 { 134 /* 135 * Data integrity is job one. We don't want a compromised kernel 136 * writing to the storage pool, so we never sync during panic. 137 */ 138 if (panicstr) 139 return (0); 140 141 /* 142 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS 143 * to sync metadata, which they would otherwise cache indefinitely. 144 * Semantically, the only requirement is that the sync be initiated. 145 * The DMU syncs out txgs frequently, so there's nothing to do. 146 */ 147 if (flag & SYNC_ATTR) 148 return (0); 149 150 if (vfsp != NULL) { 151 /* 152 * Sync a specific filesystem. 153 */ 154 zfsvfs_t *zfsvfs = vfsp->vfs_data; 155 dsl_pool_t *dp; 156 157 ZFS_ENTER(zfsvfs); 158 dp = dmu_objset_pool(zfsvfs->z_os); 159 160 /* 161 * If the system is shutting down, then skip any 162 * filesystems which may exist on a suspended pool. 163 */ 164 if (sys_shutdown && spa_suspended(dp->dp_spa)) { 165 ZFS_EXIT(zfsvfs); 166 return (0); 167 } 168 169 if (zfsvfs->z_log != NULL) 170 zil_commit(zfsvfs->z_log, 0); 171 172 ZFS_EXIT(zfsvfs); 173 } else { 174 /* 175 * Sync all ZFS filesystems. This is what happens when you 176 * run sync(1M). Unlike other filesystems, ZFS honors the 177 * request by waiting for all pools to commit all dirty data. 178 */ 179 spa_sync_allpools(); 180 } 181 182 return (0); 183 } 184 185 static int 186 zfs_create_unique_device(dev_t *dev) 187 { 188 major_t new_major; 189 190 do { 191 ASSERT3U(zfs_minor, <=, MAXMIN32); 192 minor_t start = zfs_minor; 193 do { 194 mutex_enter(&zfs_dev_mtx); 195 if (zfs_minor >= MAXMIN32) { 196 /* 197 * If we're still using the real major 198 * keep out of /dev/zfs and /dev/zvol minor 199 * number space. If we're using a getudev()'ed 200 * major number, we can use all of its minors. 201 */ 202 if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) 203 zfs_minor = ZFS_MIN_MINOR; 204 else 205 zfs_minor = 0; 206 } else { 207 zfs_minor++; 208 } 209 *dev = makedevice(zfs_major, zfs_minor); 210 mutex_exit(&zfs_dev_mtx); 211 } while (vfs_devismounted(*dev) && zfs_minor != start); 212 if (zfs_minor == start) { 213 /* 214 * We are using all ~262,000 minor numbers for the 215 * current major number. Create a new major number. 216 */ 217 if ((new_major = getudev()) == (major_t)-1) { 218 cmn_err(CE_WARN, 219 "zfs_mount: Can't get unique major " 220 "device number."); 221 return (-1); 222 } 223 mutex_enter(&zfs_dev_mtx); 224 zfs_major = new_major; 225 zfs_minor = 0; 226 227 mutex_exit(&zfs_dev_mtx); 228 } else { 229 break; 230 } 231 /* CONSTANTCONDITION */ 232 } while (1); 233 234 return (0); 235 } 236 237 static void 238 atime_changed_cb(void *arg, uint64_t newval) 239 { 240 zfsvfs_t *zfsvfs = arg; 241 242 if (newval == TRUE) { 243 zfsvfs->z_atime = TRUE; 244 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 245 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 246 } else { 247 zfsvfs->z_atime = FALSE; 248 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 249 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 250 } 251 } 252 253 static void 254 xattr_changed_cb(void *arg, uint64_t newval) 255 { 256 zfsvfs_t *zfsvfs = arg; 257 258 if (newval == TRUE) { 259 /* XXX locking on vfs_flag? */ 260 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; 261 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); 262 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); 263 } else { 264 /* XXX locking on vfs_flag? */ 265 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; 266 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); 267 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); 268 } 269 } 270 271 static void 272 blksz_changed_cb(void *arg, uint64_t newval) 273 { 274 zfsvfs_t *zfsvfs = arg; 275 276 if (newval < SPA_MINBLOCKSIZE || 277 newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) 278 newval = SPA_MAXBLOCKSIZE; 279 280 zfsvfs->z_max_blksz = newval; 281 zfsvfs->z_vfs->vfs_bsize = newval; 282 } 283 284 static void 285 readonly_changed_cb(void *arg, uint64_t newval) 286 { 287 zfsvfs_t *zfsvfs = arg; 288 289 if (newval) { 290 /* XXX locking on vfs_flag? */ 291 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 292 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 293 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 294 } else { 295 /* XXX locking on vfs_flag? */ 296 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 297 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 298 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 299 } 300 } 301 302 static void 303 devices_changed_cb(void *arg, uint64_t newval) 304 { 305 zfsvfs_t *zfsvfs = arg; 306 307 if (newval == FALSE) { 308 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; 309 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); 310 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); 311 } else { 312 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; 313 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); 314 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); 315 } 316 } 317 318 static void 319 setuid_changed_cb(void *arg, uint64_t newval) 320 { 321 zfsvfs_t *zfsvfs = arg; 322 323 if (newval == FALSE) { 324 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 325 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 326 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 327 } else { 328 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 329 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 330 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 331 } 332 } 333 334 static void 335 exec_changed_cb(void *arg, uint64_t newval) 336 { 337 zfsvfs_t *zfsvfs = arg; 338 339 if (newval == FALSE) { 340 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 341 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 342 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 343 } else { 344 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 345 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 346 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 347 } 348 } 349 350 /* 351 * The nbmand mount option can be changed at mount time. 352 * We can't allow it to be toggled on live file systems or incorrect 353 * behavior may be seen from cifs clients 354 * 355 * This property isn't registered via dsl_prop_register(), but this callback 356 * will be called when a file system is first mounted 357 */ 358 static void 359 nbmand_changed_cb(void *arg, uint64_t newval) 360 { 361 zfsvfs_t *zfsvfs = arg; 362 if (newval == FALSE) { 363 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 364 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 365 } else { 366 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 367 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 368 } 369 } 370 371 static void 372 snapdir_changed_cb(void *arg, uint64_t newval) 373 { 374 zfsvfs_t *zfsvfs = arg; 375 376 zfsvfs->z_show_ctldir = newval; 377 } 378 379 static void 380 vscan_changed_cb(void *arg, uint64_t newval) 381 { 382 zfsvfs_t *zfsvfs = arg; 383 384 zfsvfs->z_vscan = newval; 385 } 386 387 static void 388 acl_mode_changed_cb(void *arg, uint64_t newval) 389 { 390 zfsvfs_t *zfsvfs = arg; 391 392 zfsvfs->z_acl_mode = newval; 393 } 394 395 static void 396 acl_inherit_changed_cb(void *arg, uint64_t newval) 397 { 398 zfsvfs_t *zfsvfs = arg; 399 400 zfsvfs->z_acl_inherit = newval; 401 } 402 403 static int 404 zfs_register_callbacks(vfs_t *vfsp) 405 { 406 struct dsl_dataset *ds = NULL; 407 objset_t *os = NULL; 408 zfsvfs_t *zfsvfs = NULL; 409 uint64_t nbmand; 410 int readonly, do_readonly = B_FALSE; 411 int setuid, do_setuid = B_FALSE; 412 int exec, do_exec = B_FALSE; 413 int devices, do_devices = B_FALSE; 414 int xattr, do_xattr = B_FALSE; 415 int atime, do_atime = B_FALSE; 416 int error = 0; 417 418 ASSERT(vfsp); 419 zfsvfs = vfsp->vfs_data; 420 ASSERT(zfsvfs); 421 os = zfsvfs->z_os; 422 423 /* 424 * The act of registering our callbacks will destroy any mount 425 * options we may have. In order to enable temporary overrides 426 * of mount options, we stash away the current values and 427 * restore them after we register the callbacks. 428 */ 429 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 430 !spa_writeable(dmu_objset_spa(os))) { 431 readonly = B_TRUE; 432 do_readonly = B_TRUE; 433 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 434 readonly = B_FALSE; 435 do_readonly = B_TRUE; 436 } 437 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 438 devices = B_FALSE; 439 setuid = B_FALSE; 440 do_devices = B_TRUE; 441 do_setuid = B_TRUE; 442 } else { 443 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { 444 devices = B_FALSE; 445 do_devices = B_TRUE; 446 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) { 447 devices = B_TRUE; 448 do_devices = B_TRUE; 449 } 450 451 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 452 setuid = B_FALSE; 453 do_setuid = B_TRUE; 454 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 455 setuid = B_TRUE; 456 do_setuid = B_TRUE; 457 } 458 } 459 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 460 exec = B_FALSE; 461 do_exec = B_TRUE; 462 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 463 exec = B_TRUE; 464 do_exec = B_TRUE; 465 } 466 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 467 xattr = B_FALSE; 468 do_xattr = B_TRUE; 469 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 470 xattr = B_TRUE; 471 do_xattr = B_TRUE; 472 } 473 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 474 atime = B_FALSE; 475 do_atime = B_TRUE; 476 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 477 atime = B_TRUE; 478 do_atime = B_TRUE; 479 } 480 481 /* 482 * nbmand is a special property. It can only be changed at 483 * mount time. 484 * 485 * This is weird, but it is documented to only be changeable 486 * at mount time. 487 */ 488 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 489 nbmand = B_FALSE; 490 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 491 nbmand = B_TRUE; 492 } else { 493 char osname[MAXNAMELEN]; 494 495 dmu_objset_name(os, osname); 496 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, 497 NULL)) { 498 return (error); 499 } 500 } 501 502 /* 503 * Register property callbacks. 504 * 505 * It would probably be fine to just check for i/o error from 506 * the first prop_register(), but I guess I like to go 507 * overboard... 508 */ 509 ds = dmu_objset_ds(os); 510 error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); 511 error = error ? error : dsl_prop_register(ds, 512 "xattr", xattr_changed_cb, zfsvfs); 513 error = error ? error : dsl_prop_register(ds, 514 "recordsize", blksz_changed_cb, zfsvfs); 515 error = error ? error : dsl_prop_register(ds, 516 "readonly", readonly_changed_cb, zfsvfs); 517 error = error ? error : dsl_prop_register(ds, 518 "devices", devices_changed_cb, zfsvfs); 519 error = error ? error : dsl_prop_register(ds, 520 "setuid", setuid_changed_cb, zfsvfs); 521 error = error ? error : dsl_prop_register(ds, 522 "exec", exec_changed_cb, zfsvfs); 523 error = error ? error : dsl_prop_register(ds, 524 "snapdir", snapdir_changed_cb, zfsvfs); 525 error = error ? error : dsl_prop_register(ds, 526 "aclmode", acl_mode_changed_cb, zfsvfs); 527 error = error ? error : dsl_prop_register(ds, 528 "aclinherit", acl_inherit_changed_cb, zfsvfs); 529 error = error ? error : dsl_prop_register(ds, 530 "vscan", vscan_changed_cb, zfsvfs); 531 if (error) 532 goto unregister; 533 534 /* 535 * Invoke our callbacks to restore temporary mount options. 536 */ 537 if (do_readonly) 538 readonly_changed_cb(zfsvfs, readonly); 539 if (do_setuid) 540 setuid_changed_cb(zfsvfs, setuid); 541 if (do_exec) 542 exec_changed_cb(zfsvfs, exec); 543 if (do_devices) 544 devices_changed_cb(zfsvfs, devices); 545 if (do_xattr) 546 xattr_changed_cb(zfsvfs, xattr); 547 if (do_atime) 548 atime_changed_cb(zfsvfs, atime); 549 550 nbmand_changed_cb(zfsvfs, nbmand); 551 552 return (0); 553 554 unregister: 555 /* 556 * We may attempt to unregister some callbacks that are not 557 * registered, but this is OK; it will simply return ENOMSG, 558 * which we will ignore. 559 */ 560 (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); 561 (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); 562 (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); 563 (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); 564 (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs); 565 (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); 566 (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); 567 (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); 568 (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); 569 (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, 570 zfsvfs); 571 (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); 572 return (error); 573 574 } 575 576 static int 577 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, 578 uint64_t *userp, uint64_t *groupp) 579 { 580 int error = 0; 581 582 /* 583 * Is it a valid type of object to track? 584 */ 585 if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) 586 return (ENOENT); 587 588 /* 589 * If we have a NULL data pointer 590 * then assume the id's aren't changing and 591 * return EEXIST to the dmu to let it know to 592 * use the same ids 593 */ 594 if (data == NULL) 595 return (EEXIST); 596 597 if (bonustype == DMU_OT_ZNODE) { 598 znode_phys_t *znp = data; 599 *userp = znp->zp_uid; 600 *groupp = znp->zp_gid; 601 } else { 602 int hdrsize; 603 sa_hdr_phys_t *sap = data; 604 sa_hdr_phys_t sa = *sap; 605 boolean_t swap = B_FALSE; 606 607 ASSERT(bonustype == DMU_OT_SA); 608 609 if (sa.sa_magic == 0) { 610 /* 611 * This should only happen for newly created 612 * files that haven't had the znode data filled 613 * in yet. 614 */ 615 *userp = 0; 616 *groupp = 0; 617 return (0); 618 } 619 if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { 620 sa.sa_magic = SA_MAGIC; 621 sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); 622 swap = B_TRUE; 623 } else { 624 VERIFY3U(sa.sa_magic, ==, SA_MAGIC); 625 } 626 627 hdrsize = sa_hdrsize(&sa); 628 VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); 629 *userp = *((uint64_t *)((uintptr_t)data + hdrsize + 630 SA_UID_OFFSET)); 631 *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + 632 SA_GID_OFFSET)); 633 if (swap) { 634 *userp = BSWAP_64(*userp); 635 *groupp = BSWAP_64(*groupp); 636 } 637 } 638 return (error); 639 } 640 641 static void 642 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, 643 char *domainbuf, int buflen, uid_t *ridp) 644 { 645 uint64_t fuid; 646 const char *domain; 647 648 fuid = strtonum(fuidstr, NULL); 649 650 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); 651 if (domain) 652 (void) strlcpy(domainbuf, domain, buflen); 653 else 654 domainbuf[0] = '\0'; 655 *ridp = FUID_RID(fuid); 656 } 657 658 static uint64_t 659 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) 660 { 661 switch (type) { 662 case ZFS_PROP_USERUSED: 663 return (DMU_USERUSED_OBJECT); 664 case ZFS_PROP_GROUPUSED: 665 return (DMU_GROUPUSED_OBJECT); 666 case ZFS_PROP_USERQUOTA: 667 return (zfsvfs->z_userquota_obj); 668 case ZFS_PROP_GROUPQUOTA: 669 return (zfsvfs->z_groupquota_obj); 670 } 671 return (0); 672 } 673 674 int 675 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 676 uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) 677 { 678 int error; 679 zap_cursor_t zc; 680 zap_attribute_t za; 681 zfs_useracct_t *buf = vbuf; 682 uint64_t obj; 683 684 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 685 return (ENOTSUP); 686 687 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 688 if (obj == 0) { 689 *bufsizep = 0; 690 return (0); 691 } 692 693 for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); 694 (error = zap_cursor_retrieve(&zc, &za)) == 0; 695 zap_cursor_advance(&zc)) { 696 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > 697 *bufsizep) 698 break; 699 700 fuidstr_to_sid(zfsvfs, za.za_name, 701 buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); 702 703 buf->zu_space = za.za_first_integer; 704 buf++; 705 } 706 if (error == ENOENT) 707 error = 0; 708 709 ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); 710 *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; 711 *cookiep = zap_cursor_serialize(&zc); 712 zap_cursor_fini(&zc); 713 return (error); 714 } 715 716 /* 717 * buf must be big enough (eg, 32 bytes) 718 */ 719 static int 720 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, 721 char *buf, boolean_t addok) 722 { 723 uint64_t fuid; 724 int domainid = 0; 725 726 if (domain && domain[0]) { 727 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); 728 if (domainid == -1) 729 return (ENOENT); 730 } 731 fuid = FUID_ENCODE(domainid, rid); 732 (void) sprintf(buf, "%llx", (longlong_t)fuid); 733 return (0); 734 } 735 736 int 737 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 738 const char *domain, uint64_t rid, uint64_t *valp) 739 { 740 char buf[32]; 741 int err; 742 uint64_t obj; 743 744 *valp = 0; 745 746 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 747 return (ENOTSUP); 748 749 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 750 if (obj == 0) 751 return (0); 752 753 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); 754 if (err) 755 return (err); 756 757 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); 758 if (err == ENOENT) 759 err = 0; 760 return (err); 761 } 762 763 int 764 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 765 const char *domain, uint64_t rid, uint64_t quota) 766 { 767 char buf[32]; 768 int err; 769 dmu_tx_t *tx; 770 uint64_t *objp; 771 boolean_t fuid_dirtied; 772 773 if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) 774 return (EINVAL); 775 776 if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) 777 return (ENOTSUP); 778 779 objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : 780 &zfsvfs->z_groupquota_obj; 781 782 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); 783 if (err) 784 return (err); 785 fuid_dirtied = zfsvfs->z_fuid_dirty; 786 787 tx = dmu_tx_create(zfsvfs->z_os); 788 dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); 789 if (*objp == 0) { 790 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 791 zfs_userquota_prop_prefixes[type]); 792 } 793 if (fuid_dirtied) 794 zfs_fuid_txhold(zfsvfs, tx); 795 err = dmu_tx_assign(tx, TXG_WAIT); 796 if (err) { 797 dmu_tx_abort(tx); 798 return (err); 799 } 800 801 mutex_enter(&zfsvfs->z_lock); 802 if (*objp == 0) { 803 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, 804 DMU_OT_NONE, 0, tx); 805 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 806 zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); 807 } 808 mutex_exit(&zfsvfs->z_lock); 809 810 if (quota == 0) { 811 err = zap_remove(zfsvfs->z_os, *objp, buf, tx); 812 if (err == ENOENT) 813 err = 0; 814 } else { 815 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); 816 } 817 ASSERT(err == 0); 818 if (fuid_dirtied) 819 zfs_fuid_sync(zfsvfs, tx); 820 dmu_tx_commit(tx); 821 return (err); 822 } 823 824 boolean_t 825 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) 826 { 827 char buf[32]; 828 uint64_t used, quota, usedobj, quotaobj; 829 int err; 830 831 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; 832 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 833 834 if (quotaobj == 0 || zfsvfs->z_replay) 835 return (B_FALSE); 836 837 (void) sprintf(buf, "%llx", (longlong_t)fuid); 838 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 839 if (err != 0) 840 return (B_FALSE); 841 842 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 843 if (err != 0) 844 return (B_FALSE); 845 return (used >= quota); 846 } 847 848 boolean_t 849 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) 850 { 851 uint64_t fuid; 852 uint64_t quotaobj; 853 854 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 855 856 fuid = isgroup ? zp->z_gid : zp->z_uid; 857 858 if (quotaobj == 0 || zfsvfs->z_replay) 859 return (B_FALSE); 860 861 return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); 862 } 863 864 int 865 zfsvfs_create(const char *osname, zfsvfs_t **zfvp) 866 { 867 objset_t *os; 868 zfsvfs_t *zfsvfs; 869 uint64_t zval; 870 int i, error; 871 uint64_t sa_obj; 872 873 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 874 875 /* 876 * We claim to always be readonly so we can open snapshots; 877 * other ZPL code will prevent us from writing to snapshots. 878 */ 879 error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); 880 if (error) { 881 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 882 return (error); 883 } 884 885 /* 886 * Initialize the zfs-specific filesystem structure. 887 * Should probably make this a kmem cache, shuffle fields, 888 * and just bzero up to z_hold_mtx[]. 889 */ 890 zfsvfs->z_vfs = NULL; 891 zfsvfs->z_parent = zfsvfs; 892 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; 893 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 894 zfsvfs->z_os = os; 895 896 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 897 if (error) { 898 goto out; 899 } else if (zfsvfs->z_version > 900 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 901 (void) printf("Can't mount a version %lld file system " 902 "on a version %lld pool\n. Pool must be upgraded to mount " 903 "this file system.", (u_longlong_t)zfsvfs->z_version, 904 (u_longlong_t)spa_version(dmu_objset_spa(os))); 905 error = ENOTSUP; 906 goto out; 907 } 908 if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) 909 goto out; 910 zfsvfs->z_norm = (int)zval; 911 912 if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) 913 goto out; 914 zfsvfs->z_utf8 = (zval != 0); 915 916 if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) 917 goto out; 918 zfsvfs->z_case = (uint_t)zval; 919 920 /* 921 * Fold case on file systems that are always or sometimes case 922 * insensitive. 923 */ 924 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 925 zfsvfs->z_case == ZFS_CASE_MIXED) 926 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 927 928 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 929 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 930 931 if (zfsvfs->z_use_sa) { 932 /* should either have both of these objects or none */ 933 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 934 &sa_obj); 935 if (error) 936 return (error); 937 } else { 938 /* 939 * Pre SA versions file systems should never touch 940 * either the attribute registration or layout objects. 941 */ 942 sa_obj = 0; 943 } 944 945 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 946 &zfsvfs->z_attr_table); 947 if (error) 948 goto out; 949 950 if (zfsvfs->z_version >= ZPL_VERSION_SA) 951 sa_register_update_callback(os, zfs_sa_upgrade); 952 953 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 954 &zfsvfs->z_root); 955 if (error) 956 goto out; 957 ASSERT(zfsvfs->z_root != 0); 958 959 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 960 &zfsvfs->z_unlinkedobj); 961 if (error) 962 goto out; 963 964 error = zap_lookup(os, MASTER_NODE_OBJ, 965 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 966 8, 1, &zfsvfs->z_userquota_obj); 967 if (error && error != ENOENT) 968 goto out; 969 970 error = zap_lookup(os, MASTER_NODE_OBJ, 971 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 972 8, 1, &zfsvfs->z_groupquota_obj); 973 if (error && error != ENOENT) 974 goto out; 975 976 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 977 &zfsvfs->z_fuid_obj); 978 if (error && error != ENOENT) 979 goto out; 980 981 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 982 &zfsvfs->z_shares_dir); 983 if (error && error != ENOENT) 984 goto out; 985 986 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 987 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 988 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 989 offsetof(znode_t, z_link_node)); 990 rrw_init(&zfsvfs->z_teardown_lock); 991 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); 992 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 993 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 994 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 995 996 *zfvp = zfsvfs; 997 return (0); 998 999 out: 1000 dmu_objset_disown(os, zfsvfs); 1001 *zfvp = NULL; 1002 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1003 return (error); 1004 } 1005 1006 static int 1007 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1008 { 1009 int error; 1010 1011 error = zfs_register_callbacks(zfsvfs->z_vfs); 1012 if (error) 1013 return (error); 1014 1015 /* 1016 * Set the objset user_ptr to track its zfsvfs. 1017 */ 1018 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1019 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1020 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1021 1022 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); 1023 1024 /* 1025 * If we are not mounting (ie: online recv), then we don't 1026 * have to worry about replaying the log as we blocked all 1027 * operations out since we closed the ZIL. 1028 */ 1029 if (mounting) { 1030 boolean_t readonly; 1031 1032 /* 1033 * During replay we remove the read only flag to 1034 * allow replays to succeed. 1035 */ 1036 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1037 if (readonly != 0) 1038 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1039 else 1040 zfs_unlinked_drain(zfsvfs); 1041 1042 /* 1043 * Parse and replay the intent log. 1044 * 1045 * Because of ziltest, this must be done after 1046 * zfs_unlinked_drain(). (Further note: ziltest 1047 * doesn't use readonly mounts, where 1048 * zfs_unlinked_drain() isn't called.) This is because 1049 * ziltest causes spa_sync() to think it's committed, 1050 * but actually it is not, so the intent log contains 1051 * many txg's worth of changes. 1052 * 1053 * In particular, if object N is in the unlinked set in 1054 * the last txg to actually sync, then it could be 1055 * actually freed in a later txg and then reallocated 1056 * in a yet later txg. This would write a "create 1057 * object N" record to the intent log. Normally, this 1058 * would be fine because the spa_sync() would have 1059 * written out the fact that object N is free, before 1060 * we could write the "create object N" intent log 1061 * record. 1062 * 1063 * But when we are in ziltest mode, we advance the "open 1064 * txg" without actually spa_sync()-ing the changes to 1065 * disk. So we would see that object N is still 1066 * allocated and in the unlinked set, and there is an 1067 * intent log record saying to allocate it. 1068 */ 1069 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1070 if (zil_replay_disable) { 1071 zil_destroy(zfsvfs->z_log, B_FALSE); 1072 } else { 1073 zfsvfs->z_replay = B_TRUE; 1074 zil_replay(zfsvfs->z_os, zfsvfs, 1075 zfs_replay_vector); 1076 zfsvfs->z_replay = B_FALSE; 1077 } 1078 } 1079 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ 1080 } 1081 1082 return (0); 1083 } 1084 1085 void 1086 zfsvfs_free(zfsvfs_t *zfsvfs) 1087 { 1088 int i; 1089 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ 1090 1091 /* 1092 * This is a barrier to prevent the filesystem from going away in 1093 * zfs_znode_move() until we can safely ensure that the filesystem is 1094 * not unmounted. We consider the filesystem valid before the barrier 1095 * and invalid after the barrier. 1096 */ 1097 rw_enter(&zfsvfs_lock, RW_READER); 1098 rw_exit(&zfsvfs_lock); 1099 1100 zfs_fuid_destroy(zfsvfs); 1101 1102 mutex_destroy(&zfsvfs->z_znodes_lock); 1103 mutex_destroy(&zfsvfs->z_lock); 1104 list_destroy(&zfsvfs->z_all_znodes); 1105 rrw_destroy(&zfsvfs->z_teardown_lock); 1106 rw_destroy(&zfsvfs->z_teardown_inactive_lock); 1107 rw_destroy(&zfsvfs->z_fuid_lock); 1108 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1109 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1110 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1111 } 1112 1113 static void 1114 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1115 { 1116 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1117 if (zfsvfs->z_vfs) { 1118 if (zfsvfs->z_use_fuids) { 1119 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1120 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1121 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1122 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1123 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1124 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1125 } else { 1126 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1127 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1128 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1129 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1130 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1131 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1132 } 1133 } 1134 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1135 } 1136 1137 static int 1138 zfs_domount(vfs_t *vfsp, char *osname) 1139 { 1140 dev_t mount_dev; 1141 uint64_t recordsize, fsid_guid; 1142 int error = 0; 1143 zfsvfs_t *zfsvfs; 1144 1145 ASSERT(vfsp); 1146 ASSERT(osname); 1147 1148 error = zfsvfs_create(osname, &zfsvfs); 1149 if (error) 1150 return (error); 1151 zfsvfs->z_vfs = vfsp; 1152 1153 /* Initialize the generic filesystem structure. */ 1154 vfsp->vfs_bcount = 0; 1155 vfsp->vfs_data = NULL; 1156 1157 if (zfs_create_unique_device(&mount_dev) == -1) { 1158 error = ENODEV; 1159 goto out; 1160 } 1161 ASSERT(vfs_devismounted(mount_dev) == 0); 1162 1163 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, 1164 NULL)) 1165 goto out; 1166 1167 vfsp->vfs_dev = mount_dev; 1168 vfsp->vfs_fstype = zfsfstype; 1169 vfsp->vfs_bsize = recordsize; 1170 vfsp->vfs_flag |= VFS_NOTRUNC; 1171 vfsp->vfs_data = zfsvfs; 1172 1173 /* 1174 * The fsid is 64 bits, composed of an 8-bit fs type, which 1175 * separates our fsid from any other filesystem types, and a 1176 * 56-bit objset unique ID. The objset unique ID is unique to 1177 * all objsets open on this system, provided by unique_create(). 1178 * The 8-bit fs type must be put in the low bits of fsid[1] 1179 * because that's where other Solaris filesystems put it. 1180 */ 1181 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1182 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 1183 vfsp->vfs_fsid.val[0] = fsid_guid; 1184 vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 1185 zfsfstype & 0xFF; 1186 1187 /* 1188 * Set features for file system. 1189 */ 1190 zfs_set_fuid_feature(zfsvfs); 1191 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 1192 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1193 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1194 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 1195 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 1196 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1197 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1198 } 1199 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); 1200 1201 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1202 uint64_t pval; 1203 1204 atime_changed_cb(zfsvfs, B_FALSE); 1205 readonly_changed_cb(zfsvfs, B_TRUE); 1206 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) 1207 goto out; 1208 xattr_changed_cb(zfsvfs, pval); 1209 zfsvfs->z_issnap = B_TRUE; 1210 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1211 1212 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1213 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1214 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1215 } else { 1216 error = zfsvfs_setup(zfsvfs, B_TRUE); 1217 } 1218 1219 if (!zfsvfs->z_issnap) 1220 zfsctl_create(zfsvfs); 1221 out: 1222 if (error) { 1223 dmu_objset_disown(zfsvfs->z_os, zfsvfs); 1224 zfsvfs_free(zfsvfs); 1225 } else { 1226 atomic_add_32(&zfs_active_fs_count, 1); 1227 } 1228 1229 return (error); 1230 } 1231 1232 void 1233 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1234 { 1235 objset_t *os = zfsvfs->z_os; 1236 struct dsl_dataset *ds; 1237 1238 /* 1239 * Unregister properties. 1240 */ 1241 if (!dmu_objset_is_snapshot(os)) { 1242 ds = dmu_objset_ds(os); 1243 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, 1244 zfsvfs) == 0); 1245 1246 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, 1247 zfsvfs) == 0); 1248 1249 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, 1250 zfsvfs) == 0); 1251 1252 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, 1253 zfsvfs) == 0); 1254 1255 VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, 1256 zfsvfs) == 0); 1257 1258 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, 1259 zfsvfs) == 0); 1260 1261 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, 1262 zfsvfs) == 0); 1263 1264 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, 1265 zfsvfs) == 0); 1266 1267 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, 1268 zfsvfs) == 0); 1269 1270 VERIFY(dsl_prop_unregister(ds, "aclinherit", 1271 acl_inherit_changed_cb, zfsvfs) == 0); 1272 1273 VERIFY(dsl_prop_unregister(ds, "vscan", 1274 vscan_changed_cb, zfsvfs) == 0); 1275 } 1276 } 1277 1278 /* 1279 * Convert a decimal digit string to a uint64_t integer. 1280 */ 1281 static int 1282 str_to_uint64(char *str, uint64_t *objnum) 1283 { 1284 uint64_t num = 0; 1285 1286 while (*str) { 1287 if (*str < '0' || *str > '9') 1288 return (EINVAL); 1289 1290 num = num*10 + *str++ - '0'; 1291 } 1292 1293 *objnum = num; 1294 return (0); 1295 } 1296 1297 /* 1298 * The boot path passed from the boot loader is in the form of 1299 * "rootpool-name/root-filesystem-object-number'. Convert this 1300 * string to a dataset name: "rootpool-name/root-filesystem-name". 1301 */ 1302 static int 1303 zfs_parse_bootfs(char *bpath, char *outpath) 1304 { 1305 char *slashp; 1306 uint64_t objnum; 1307 int error; 1308 1309 if (*bpath == 0 || *bpath == '/') 1310 return (EINVAL); 1311 1312 (void) strcpy(outpath, bpath); 1313 1314 slashp = strchr(bpath, '/'); 1315 1316 /* if no '/', just return the pool name */ 1317 if (slashp == NULL) { 1318 return (0); 1319 } 1320 1321 /* if not a number, just return the root dataset name */ 1322 if (str_to_uint64(slashp+1, &objnum)) { 1323 return (0); 1324 } 1325 1326 *slashp = '\0'; 1327 error = dsl_dsobj_to_dsname(bpath, objnum, outpath); 1328 *slashp = '/'; 1329 1330 return (error); 1331 } 1332 1333 /* 1334 * zfs_check_global_label: 1335 * Check that the hex label string is appropriate for the dataset 1336 * being mounted into the global_zone proper. 1337 * 1338 * Return an error if the hex label string is not default or 1339 * admin_low/admin_high. For admin_low labels, the corresponding 1340 * dataset must be readonly. 1341 */ 1342 int 1343 zfs_check_global_label(const char *dsname, const char *hexsl) 1344 { 1345 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1346 return (0); 1347 if (strcasecmp(hexsl, ADMIN_HIGH) == 0) 1348 return (0); 1349 if (strcasecmp(hexsl, ADMIN_LOW) == 0) { 1350 /* must be readonly */ 1351 uint64_t rdonly; 1352 1353 if (dsl_prop_get_integer(dsname, 1354 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) 1355 return (EACCES); 1356 return (rdonly ? 0 : EACCES); 1357 } 1358 return (EACCES); 1359 } 1360 1361 /* 1362 * zfs_mount_label_policy: 1363 * Determine whether the mount is allowed according to MAC check. 1364 * by comparing (where appropriate) label of the dataset against 1365 * the label of the zone being mounted into. If the dataset has 1366 * no label, create one. 1367 * 1368 * Returns: 1369 * 0 : access allowed 1370 * >0 : error code, such as EACCES 1371 */ 1372 static int 1373 zfs_mount_label_policy(vfs_t *vfsp, char *osname) 1374 { 1375 int error, retv; 1376 zone_t *mntzone = NULL; 1377 ts_label_t *mnt_tsl; 1378 bslabel_t *mnt_sl; 1379 bslabel_t ds_sl; 1380 char ds_hexsl[MAXNAMELEN]; 1381 1382 retv = EACCES; /* assume the worst */ 1383 1384 /* 1385 * Start by getting the dataset label if it exists. 1386 */ 1387 error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1388 1, sizeof (ds_hexsl), &ds_hexsl, NULL); 1389 if (error) 1390 return (EACCES); 1391 1392 /* 1393 * If labeling is NOT enabled, then disallow the mount of datasets 1394 * which have a non-default label already. No other label checks 1395 * are needed. 1396 */ 1397 if (!is_system_labeled()) { 1398 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1399 return (0); 1400 return (EACCES); 1401 } 1402 1403 /* 1404 * Get the label of the mountpoint. If mounting into the global 1405 * zone (i.e. mountpoint is not within an active zone and the 1406 * zoned property is off), the label must be default or 1407 * admin_low/admin_high only; no other checks are needed. 1408 */ 1409 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 1410 if (mntzone->zone_id == GLOBAL_ZONEID) { 1411 uint64_t zoned; 1412 1413 zone_rele(mntzone); 1414 1415 if (dsl_prop_get_integer(osname, 1416 zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) 1417 return (EACCES); 1418 if (!zoned) 1419 return (zfs_check_global_label(osname, ds_hexsl)); 1420 else 1421 /* 1422 * This is the case of a zone dataset being mounted 1423 * initially, before the zone has been fully created; 1424 * allow this mount into global zone. 1425 */ 1426 return (0); 1427 } 1428 1429 mnt_tsl = mntzone->zone_slabel; 1430 ASSERT(mnt_tsl != NULL); 1431 label_hold(mnt_tsl); 1432 mnt_sl = label2bslabel(mnt_tsl); 1433 1434 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { 1435 /* 1436 * The dataset doesn't have a real label, so fabricate one. 1437 */ 1438 char *str = NULL; 1439 1440 if (l_to_str_internal(mnt_sl, &str) == 0 && 1441 dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1442 ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) 1443 retv = 0; 1444 if (str != NULL) 1445 kmem_free(str, strlen(str) + 1); 1446 } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { 1447 /* 1448 * Now compare labels to complete the MAC check. If the 1449 * labels are equal then allow access. If the mountpoint 1450 * label dominates the dataset label, allow readonly access. 1451 * Otherwise, access is denied. 1452 */ 1453 if (blequal(mnt_sl, &ds_sl)) 1454 retv = 0; 1455 else if (bldominates(mnt_sl, &ds_sl)) { 1456 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1457 retv = 0; 1458 } 1459 } 1460 1461 label_rele(mnt_tsl); 1462 zone_rele(mntzone); 1463 return (retv); 1464 } 1465 1466 static int 1467 zfs_mountroot(vfs_t *vfsp, enum whymountroot why) 1468 { 1469 int error = 0; 1470 static int zfsrootdone = 0; 1471 zfsvfs_t *zfsvfs = NULL; 1472 znode_t *zp = NULL; 1473 vnode_t *vp = NULL; 1474 char *zfs_bootfs; 1475 char *zfs_devid; 1476 1477 ASSERT(vfsp); 1478 1479 /* 1480 * The filesystem that we mount as root is defined in the 1481 * boot property "zfs-bootfs" with a format of 1482 * "poolname/root-dataset-objnum". 1483 */ 1484 if (why == ROOT_INIT) { 1485 if (zfsrootdone++) 1486 return (EBUSY); 1487 /* 1488 * the process of doing a spa_load will require the 1489 * clock to be set before we could (for example) do 1490 * something better by looking at the timestamp on 1491 * an uberblock, so just set it to -1. 1492 */ 1493 clkset(-1); 1494 1495 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { 1496 cmn_err(CE_NOTE, "spa_get_bootfs: can not get " 1497 "bootfs name"); 1498 return (EINVAL); 1499 } 1500 zfs_devid = spa_get_bootprop("diskdevid"); 1501 error = spa_import_rootpool(rootfs.bo_name, zfs_devid); 1502 if (zfs_devid) 1503 spa_free_bootprop(zfs_devid); 1504 if (error) { 1505 spa_free_bootprop(zfs_bootfs); 1506 cmn_err(CE_NOTE, "spa_import_rootpool: error %d", 1507 error); 1508 return (error); 1509 } 1510 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { 1511 spa_free_bootprop(zfs_bootfs); 1512 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", 1513 error); 1514 return (error); 1515 } 1516 1517 spa_free_bootprop(zfs_bootfs); 1518 1519 if (error = vfs_lock(vfsp)) 1520 return (error); 1521 1522 if (error = zfs_domount(vfsp, rootfs.bo_name)) { 1523 cmn_err(CE_NOTE, "zfs_domount: error %d", error); 1524 goto out; 1525 } 1526 1527 zfsvfs = (zfsvfs_t *)vfsp->vfs_data; 1528 ASSERT(zfsvfs); 1529 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { 1530 cmn_err(CE_NOTE, "zfs_zget: error %d", error); 1531 goto out; 1532 } 1533 1534 vp = ZTOV(zp); 1535 mutex_enter(&vp->v_lock); 1536 vp->v_flag |= VROOT; 1537 mutex_exit(&vp->v_lock); 1538 rootvp = vp; 1539 1540 /* 1541 * Leave rootvp held. The root file system is never unmounted. 1542 */ 1543 1544 vfs_add((struct vnode *)0, vfsp, 1545 (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); 1546 out: 1547 vfs_unlock(vfsp); 1548 return (error); 1549 } else if (why == ROOT_REMOUNT) { 1550 readonly_changed_cb(vfsp->vfs_data, B_FALSE); 1551 vfsp->vfs_flag |= VFS_REMOUNT; 1552 1553 /* refresh mount options */ 1554 zfs_unregister_callbacks(vfsp->vfs_data); 1555 return (zfs_register_callbacks(vfsp)); 1556 1557 } else if (why == ROOT_UNMOUNT) { 1558 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); 1559 (void) zfs_sync(vfsp, 0, 0); 1560 return (0); 1561 } 1562 1563 /* 1564 * if "why" is equal to anything else other than ROOT_INIT, 1565 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 1566 */ 1567 return (ENOTSUP); 1568 } 1569 1570 /*ARGSUSED*/ 1571 static int 1572 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 1573 { 1574 char *osname; 1575 pathname_t spn; 1576 int error = 0; 1577 uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? 1578 UIO_SYSSPACE : UIO_USERSPACE; 1579 int canwrite; 1580 1581 if (mvp->v_type != VDIR) 1582 return (ENOTDIR); 1583 1584 mutex_enter(&mvp->v_lock); 1585 if ((uap->flags & MS_REMOUNT) == 0 && 1586 (uap->flags & MS_OVERLAY) == 0 && 1587 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 1588 mutex_exit(&mvp->v_lock); 1589 return (EBUSY); 1590 } 1591 mutex_exit(&mvp->v_lock); 1592 1593 /* 1594 * ZFS does not support passing unparsed data in via MS_DATA. 1595 * Users should use the MS_OPTIONSTR interface; this means 1596 * that all option parsing is already done and the options struct 1597 * can be interrogated. 1598 */ 1599 if ((uap->flags & MS_DATA) && uap->datalen > 0) 1600 return (EINVAL); 1601 1602 /* 1603 * Get the objset name (the "special" mount argument). 1604 */ 1605 if (error = pn_get(uap->spec, fromspace, &spn)) 1606 return (error); 1607 1608 osname = spn.pn_path; 1609 1610 /* 1611 * Check for mount privilege? 1612 * 1613 * If we don't have privilege then see if 1614 * we have local permission to allow it 1615 */ 1616 error = secpolicy_fs_mount(cr, mvp, vfsp); 1617 if (error) { 1618 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) { 1619 vattr_t vattr; 1620 1621 /* 1622 * Make sure user is the owner of the mount point 1623 * or has sufficient privileges. 1624 */ 1625 1626 vattr.va_mask = AT_UID; 1627 1628 if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { 1629 goto out; 1630 } 1631 1632 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && 1633 VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) { 1634 goto out; 1635 } 1636 secpolicy_fs_mount_clearopts(cr, vfsp); 1637 } else { 1638 goto out; 1639 } 1640 } 1641 1642 /* 1643 * Refuse to mount a filesystem if we are in a local zone and the 1644 * dataset is not visible. 1645 */ 1646 if (!INGLOBALZONE(curproc) && 1647 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1648 error = EPERM; 1649 goto out; 1650 } 1651 1652 error = zfs_mount_label_policy(vfsp, osname); 1653 if (error) 1654 goto out; 1655 1656 /* 1657 * When doing a remount, we simply refresh our temporary properties 1658 * according to those options set in the current VFS options. 1659 */ 1660 if (uap->flags & MS_REMOUNT) { 1661 /* refresh mount options */ 1662 zfs_unregister_callbacks(vfsp->vfs_data); 1663 error = zfs_register_callbacks(vfsp); 1664 goto out; 1665 } 1666 1667 error = zfs_domount(vfsp, osname); 1668 1669 /* 1670 * Add an extra VFS_HOLD on our parent vfs so that it can't 1671 * disappear due to a forced unmount. 1672 */ 1673 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) 1674 VFS_HOLD(mvp->v_vfsp); 1675 1676 out: 1677 pn_free(&spn); 1678 return (error); 1679 } 1680 1681 static int 1682 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) 1683 { 1684 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1685 dev32_t d32; 1686 uint64_t refdbytes, availbytes, usedobjs, availobjs; 1687 1688 ZFS_ENTER(zfsvfs); 1689 1690 dmu_objset_space(zfsvfs->z_os, 1691 &refdbytes, &availbytes, &usedobjs, &availobjs); 1692 1693 /* 1694 * The underlying storage pool actually uses multiple block sizes. 1695 * We report the fragsize as the smallest block size we support, 1696 * and we report our blocksize as the filesystem's maximum blocksize. 1697 */ 1698 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; 1699 statp->f_bsize = zfsvfs->z_max_blksz; 1700 1701 /* 1702 * The following report "total" blocks of various kinds in the 1703 * file system, but reported in terms of f_frsize - the 1704 * "fragment" size. 1705 */ 1706 1707 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 1708 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT; 1709 statp->f_bavail = statp->f_bfree; /* no root reservation */ 1710 1711 /* 1712 * statvfs() should really be called statufs(), because it assumes 1713 * static metadata. ZFS doesn't preallocate files, so the best 1714 * we can do is report the max that could possibly fit in f_files, 1715 * and that minus the number actually used in f_ffree. 1716 * For f_ffree, report the smaller of the number of object available 1717 * and the number of blocks (each object will take at least a block). 1718 */ 1719 statp->f_ffree = MIN(availobjs, statp->f_bfree); 1720 statp->f_favail = statp->f_ffree; /* no "root reservation" */ 1721 statp->f_files = statp->f_ffree + usedobjs; 1722 1723 (void) cmpldev(&d32, vfsp->vfs_dev); 1724 statp->f_fsid = d32; 1725 1726 /* 1727 * We're a zfs filesystem. 1728 */ 1729 (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name); 1730 1731 statp->f_flag = vf_to_stf(vfsp->vfs_flag); 1732 1733 statp->f_namemax = ZFS_MAXNAMELEN; 1734 1735 /* 1736 * We have all of 32 characters to stuff a string here. 1737 * Is there anything useful we could/should provide? 1738 */ 1739 bzero(statp->f_fstr, sizeof (statp->f_fstr)); 1740 1741 ZFS_EXIT(zfsvfs); 1742 return (0); 1743 } 1744 1745 static int 1746 zfs_root(vfs_t *vfsp, vnode_t **vpp) 1747 { 1748 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1749 znode_t *rootzp; 1750 int error; 1751 1752 ZFS_ENTER(zfsvfs); 1753 1754 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 1755 if (error == 0) 1756 *vpp = ZTOV(rootzp); 1757 1758 ZFS_EXIT(zfsvfs); 1759 return (error); 1760 } 1761 1762 /* 1763 * Teardown the zfsvfs::z_os. 1764 * 1765 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' 1766 * and 'z_teardown_inactive_lock' held. 1767 */ 1768 static int 1769 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 1770 { 1771 znode_t *zp; 1772 1773 rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 1774 1775 if (!unmounting) { 1776 /* 1777 * We purge the parent filesystem's vfsp as the parent 1778 * filesystem and all of its snapshots have their vnode's 1779 * v_vfsp set to the parent's filesystem's vfsp. Note, 1780 * 'z_parent' is self referential for non-snapshots. 1781 */ 1782 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1783 } 1784 1785 /* 1786 * Close the zil. NB: Can't close the zil while zfs_inactive 1787 * threads are blocked as zil_close can call zfs_inactive. 1788 */ 1789 if (zfsvfs->z_log) { 1790 zil_close(zfsvfs->z_log); 1791 zfsvfs->z_log = NULL; 1792 } 1793 1794 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); 1795 1796 /* 1797 * If we are not unmounting (ie: online recv) and someone already 1798 * unmounted this file system while we were doing the switcheroo, 1799 * or a reopen of z_os failed then just bail out now. 1800 */ 1801 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 1802 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1803 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1804 return (EIO); 1805 } 1806 1807 /* 1808 * At this point there are no vops active, and any new vops will 1809 * fail with EIO since we have z_teardown_lock for writer (only 1810 * relavent for forced unmount). 1811 * 1812 * Release all holds on dbufs. 1813 */ 1814 mutex_enter(&zfsvfs->z_znodes_lock); 1815 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 1816 zp = list_next(&zfsvfs->z_all_znodes, zp)) 1817 if (zp->z_sa_hdl) { 1818 ASSERT(ZTOV(zp)->v_count > 0); 1819 zfs_znode_dmu_fini(zp); 1820 } 1821 mutex_exit(&zfsvfs->z_znodes_lock); 1822 1823 /* 1824 * If we are unmounting, set the unmounted flag and let new vops 1825 * unblock. zfs_inactive will have the unmounted behavior, and all 1826 * other vops will fail with EIO. 1827 */ 1828 if (unmounting) { 1829 zfsvfs->z_unmounted = B_TRUE; 1830 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 1831 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1832 } 1833 1834 /* 1835 * z_os will be NULL if there was an error in attempting to reopen 1836 * zfsvfs, so just return as the properties had already been 1837 * unregistered and cached data had been evicted before. 1838 */ 1839 if (zfsvfs->z_os == NULL) 1840 return (0); 1841 1842 /* 1843 * Unregister properties. 1844 */ 1845 zfs_unregister_callbacks(zfsvfs); 1846 1847 /* 1848 * Evict cached data 1849 */ 1850 if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && 1851 !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) 1852 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 1853 (void) dmu_objset_evict_dbufs(zfsvfs->z_os); 1854 1855 return (0); 1856 } 1857 1858 /*ARGSUSED*/ 1859 static int 1860 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) 1861 { 1862 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1863 objset_t *os; 1864 int ret; 1865 1866 ret = secpolicy_fs_unmount(cr, vfsp); 1867 if (ret) { 1868 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), 1869 ZFS_DELEG_PERM_MOUNT, cr)) 1870 return (ret); 1871 } 1872 1873 /* 1874 * We purge the parent filesystem's vfsp as the parent filesystem 1875 * and all of its snapshots have their vnode's v_vfsp set to the 1876 * parent's filesystem's vfsp. Note, 'z_parent' is self 1877 * referential for non-snapshots. 1878 */ 1879 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1880 1881 /* 1882 * Unmount any snapshots mounted under .zfs before unmounting the 1883 * dataset itself. 1884 */ 1885 if (zfsvfs->z_ctldir != NULL && 1886 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) { 1887 return (ret); 1888 } 1889 1890 if (!(fflag & MS_FORCE)) { 1891 /* 1892 * Check the number of active vnodes in the file system. 1893 * Our count is maintained in the vfs structure, but the 1894 * number is off by 1 to indicate a hold on the vfs 1895 * structure itself. 1896 * 1897 * The '.zfs' directory maintains a reference of its 1898 * own, and any active references underneath are 1899 * reflected in the vnode count. 1900 */ 1901 if (zfsvfs->z_ctldir == NULL) { 1902 if (vfsp->vfs_count > 1) 1903 return (EBUSY); 1904 } else { 1905 if (vfsp->vfs_count > 2 || 1906 zfsvfs->z_ctldir->v_count > 1) 1907 return (EBUSY); 1908 } 1909 } 1910 1911 vfsp->vfs_flag |= VFS_UNMOUNTED; 1912 1913 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); 1914 os = zfsvfs->z_os; 1915 1916 /* 1917 * z_os will be NULL if there was an error in 1918 * attempting to reopen zfsvfs. 1919 */ 1920 if (os != NULL) { 1921 /* 1922 * Unset the objset user_ptr. 1923 */ 1924 mutex_enter(&os->os_user_ptr_lock); 1925 dmu_objset_set_user(os, NULL); 1926 mutex_exit(&os->os_user_ptr_lock); 1927 1928 /* 1929 * Finally release the objset 1930 */ 1931 dmu_objset_disown(os, zfsvfs); 1932 } 1933 1934 /* 1935 * We can now safely destroy the '.zfs' directory node. 1936 */ 1937 if (zfsvfs->z_ctldir != NULL) 1938 zfsctl_destroy(zfsvfs); 1939 1940 return (0); 1941 } 1942 1943 static int 1944 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 1945 { 1946 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1947 znode_t *zp; 1948 uint64_t object = 0; 1949 uint64_t fid_gen = 0; 1950 uint64_t gen_mask; 1951 uint64_t zp_gen; 1952 int i, err; 1953 1954 *vpp = NULL; 1955 1956 ZFS_ENTER(zfsvfs); 1957 1958 if (fidp->fid_len == LONG_FID_LEN) { 1959 zfid_long_t *zlfid = (zfid_long_t *)fidp; 1960 uint64_t objsetid = 0; 1961 uint64_t setgen = 0; 1962 1963 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 1964 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 1965 1966 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 1967 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 1968 1969 ZFS_EXIT(zfsvfs); 1970 1971 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 1972 if (err) 1973 return (EINVAL); 1974 ZFS_ENTER(zfsvfs); 1975 } 1976 1977 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 1978 zfid_short_t *zfid = (zfid_short_t *)fidp; 1979 1980 for (i = 0; i < sizeof (zfid->zf_object); i++) 1981 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 1982 1983 for (i = 0; i < sizeof (zfid->zf_gen); i++) 1984 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 1985 } else { 1986 ZFS_EXIT(zfsvfs); 1987 return (EINVAL); 1988 } 1989 1990 /* A zero fid_gen means we are in the .zfs control directories */ 1991 if (fid_gen == 0 && 1992 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { 1993 *vpp = zfsvfs->z_ctldir; 1994 ASSERT(*vpp != NULL); 1995 if (object == ZFSCTL_INO_SNAPDIR) { 1996 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 1997 0, NULL, NULL, NULL, NULL, NULL) == 0); 1998 } else { 1999 VN_HOLD(*vpp); 2000 } 2001 ZFS_EXIT(zfsvfs); 2002 return (0); 2003 } 2004 2005 gen_mask = -1ULL >> (64 - 8 * i); 2006 2007 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 2008 if (err = zfs_zget(zfsvfs, object, &zp)) { 2009 ZFS_EXIT(zfsvfs); 2010 return (err); 2011 } 2012 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 2013 sizeof (uint64_t)); 2014 zp_gen = zp_gen & gen_mask; 2015 if (zp_gen == 0) 2016 zp_gen = 1; 2017 if (zp->z_unlinked || zp_gen != fid_gen) { 2018 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 2019 VN_RELE(ZTOV(zp)); 2020 ZFS_EXIT(zfsvfs); 2021 return (EINVAL); 2022 } 2023 2024 *vpp = ZTOV(zp); 2025 ZFS_EXIT(zfsvfs); 2026 return (0); 2027 } 2028 2029 /* 2030 * Block out VOPs and close zfsvfs_t::z_os 2031 * 2032 * Note, if successful, then we return with the 'z_teardown_lock' and 2033 * 'z_teardown_inactive_lock' write held. 2034 */ 2035 int 2036 zfs_suspend_fs(zfsvfs_t *zfsvfs) 2037 { 2038 int error; 2039 2040 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 2041 return (error); 2042 dmu_objset_disown(zfsvfs->z_os, zfsvfs); 2043 2044 return (0); 2045 } 2046 2047 /* 2048 * Reopen zfsvfs_t::z_os and release VOPs. 2049 */ 2050 int 2051 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) 2052 { 2053 int err; 2054 2055 ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); 2056 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); 2057 2058 err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, 2059 &zfsvfs->z_os); 2060 if (err) { 2061 zfsvfs->z_os = NULL; 2062 } else { 2063 znode_t *zp; 2064 uint64_t sa_obj = 0; 2065 2066 /* 2067 * Make sure version hasn't changed 2068 */ 2069 2070 err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION, 2071 &zfsvfs->z_version); 2072 2073 if (err) 2074 goto bail; 2075 2076 err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, 2077 ZFS_SA_ATTRS, 8, 1, &sa_obj); 2078 2079 if (err && zfsvfs->z_version >= ZPL_VERSION_SA) 2080 goto bail; 2081 2082 if ((err = sa_setup(zfsvfs->z_os, sa_obj, 2083 zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0) 2084 goto bail; 2085 2086 if (zfsvfs->z_version >= ZPL_VERSION_SA) 2087 sa_register_update_callback(zfsvfs->z_os, 2088 zfs_sa_upgrade); 2089 2090 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); 2091 2092 zfs_set_fuid_feature(zfsvfs); 2093 2094 /* 2095 * Attempt to re-establish all the active znodes with 2096 * their dbufs. If a zfs_rezget() fails, then we'll let 2097 * any potential callers discover that via ZFS_ENTER_VERIFY_VP 2098 * when they try to use their znode. 2099 */ 2100 mutex_enter(&zfsvfs->z_znodes_lock); 2101 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 2102 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 2103 (void) zfs_rezget(zp); 2104 } 2105 mutex_exit(&zfsvfs->z_znodes_lock); 2106 } 2107 2108 bail: 2109 /* release the VOPs */ 2110 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2111 rrw_exit(&zfsvfs->z_teardown_lock, FTAG); 2112 2113 if (err) { 2114 /* 2115 * Since we couldn't reopen zfsvfs::z_os, or 2116 * setup the sa framework force unmount this file system. 2117 */ 2118 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) 2119 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED()); 2120 } 2121 return (err); 2122 } 2123 2124 static void 2125 zfs_freevfs(vfs_t *vfsp) 2126 { 2127 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2128 2129 /* 2130 * If this is a snapshot, we have an extra VFS_HOLD on our parent 2131 * from zfs_mount(). Release it here. If we came through 2132 * zfs_mountroot() instead, we didn't grab an extra hold, so 2133 * skip the VFS_RELE for rootvfs. 2134 */ 2135 if (zfsvfs->z_issnap && (vfsp != rootvfs)) 2136 VFS_RELE(zfsvfs->z_parent->z_vfs); 2137 2138 zfsvfs_free(zfsvfs); 2139 2140 atomic_add_32(&zfs_active_fs_count, -1); 2141 } 2142 2143 /* 2144 * VFS_INIT() initialization. Note that there is no VFS_FINI(), 2145 * so we can't safely do any non-idempotent initialization here. 2146 * Leave that to zfs_init() and zfs_fini(), which are called 2147 * from the module's _init() and _fini() entry points. 2148 */ 2149 /*ARGSUSED*/ 2150 static int 2151 zfs_vfsinit(int fstype, char *name) 2152 { 2153 int error; 2154 2155 zfsfstype = fstype; 2156 2157 /* 2158 * Setup vfsops and vnodeops tables. 2159 */ 2160 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); 2161 if (error != 0) { 2162 cmn_err(CE_WARN, "zfs: bad vfs ops template"); 2163 } 2164 2165 error = zfs_create_op_tables(); 2166 if (error) { 2167 zfs_remove_op_tables(); 2168 cmn_err(CE_WARN, "zfs: bad vnode ops template"); 2169 (void) vfs_freevfsops_by_type(zfsfstype); 2170 return (error); 2171 } 2172 2173 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 2174 2175 /* 2176 * Unique major number for all zfs mounts. 2177 * If we run out of 32-bit minors, we'll getudev() another major. 2178 */ 2179 zfs_major = ddi_name_to_major(ZFS_DRIVER); 2180 zfs_minor = ZFS_MIN_MINOR; 2181 2182 return (0); 2183 } 2184 2185 void 2186 zfs_init(void) 2187 { 2188 /* 2189 * Initialize .zfs directory structures 2190 */ 2191 zfsctl_init(); 2192 2193 /* 2194 * Initialize znode cache, vnode ops, etc... 2195 */ 2196 zfs_znode_init(); 2197 2198 dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); 2199 } 2200 2201 void 2202 zfs_fini(void) 2203 { 2204 zfsctl_fini(); 2205 zfs_znode_fini(); 2206 } 2207 2208 int 2209 zfs_busy(void) 2210 { 2211 return (zfs_active_fs_count != 0); 2212 } 2213 2214 int 2215 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2216 { 2217 int error; 2218 objset_t *os = zfsvfs->z_os; 2219 dmu_tx_t *tx; 2220 2221 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2222 return (EINVAL); 2223 2224 if (newvers < zfsvfs->z_version) 2225 return (EINVAL); 2226 2227 if (zfs_spa_version_map(newvers) > 2228 spa_version(dmu_objset_spa(zfsvfs->z_os))) 2229 return (ENOTSUP); 2230 2231 tx = dmu_tx_create(os); 2232 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2233 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2234 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 2235 ZFS_SA_ATTRS); 2236 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2237 } 2238 error = dmu_tx_assign(tx, TXG_WAIT); 2239 if (error) { 2240 dmu_tx_abort(tx); 2241 return (error); 2242 } 2243 2244 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2245 8, 1, &newvers, tx); 2246 2247 if (error) { 2248 dmu_tx_commit(tx); 2249 return (error); 2250 } 2251 2252 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2253 uint64_t sa_obj; 2254 2255 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, 2256 SPA_VERSION_SA); 2257 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 2258 DMU_OT_NONE, 0, tx); 2259 2260 error = zap_add(os, MASTER_NODE_OBJ, 2261 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 2262 ASSERT0(error); 2263 2264 VERIFY(0 == sa_set_sa_object(os, sa_obj)); 2265 sa_register_update_callback(os, zfs_sa_upgrade); 2266 } 2267 2268 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, 2269 "from %llu to %llu", zfsvfs->z_version, newvers); 2270 2271 dmu_tx_commit(tx); 2272 2273 zfsvfs->z_version = newvers; 2274 2275 zfs_set_fuid_feature(zfsvfs); 2276 2277 return (0); 2278 } 2279 2280 /* 2281 * Read a property stored within the master node. 2282 */ 2283 int 2284 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2285 { 2286 const char *pname; 2287 int error = ENOENT; 2288 2289 /* 2290 * Look up the file system's value for the property. For the 2291 * version property, we look up a slightly different string. 2292 */ 2293 if (prop == ZFS_PROP_VERSION) 2294 pname = ZPL_VERSION_STR; 2295 else 2296 pname = zfs_prop_to_name(prop); 2297 2298 if (os != NULL) 2299 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2300 2301 if (error == ENOENT) { 2302 /* No value set, use the default value */ 2303 switch (prop) { 2304 case ZFS_PROP_VERSION: 2305 *value = ZPL_VERSION; 2306 break; 2307 case ZFS_PROP_NORMALIZE: 2308 case ZFS_PROP_UTF8ONLY: 2309 *value = 0; 2310 break; 2311 case ZFS_PROP_CASE: 2312 *value = ZFS_CASE_SENSITIVE; 2313 break; 2314 default: 2315 return (error); 2316 } 2317 error = 0; 2318 } 2319 return (error); 2320 } 2321 2322 static vfsdef_t vfw = { 2323 VFSDEF_VERSION, 2324 MNTTYPE_ZFS, 2325 zfs_vfsinit, 2326 VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| 2327 VSW_XID|VSW_ZMOUNT, 2328 &zfs_mntopts 2329 }; 2330 2331 struct modlfs zfs_modlfs = { 2332 &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw 2333 }; 2334