1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 24 */ 25 26 /* Portions Copyright 2010 Robert Milkowski */ 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/sysmacros.h> 32 #include <sys/kmem.h> 33 #include <sys/pathname.h> 34 #include <sys/vnode.h> 35 #include <sys/vfs.h> 36 #include <sys/vfs_opreg.h> 37 #include <sys/mntent.h> 38 #include <sys/mount.h> 39 #include <sys/cmn_err.h> 40 #include "fs/fs_subr.h" 41 #include <sys/zfs_znode.h> 42 #include <sys/zfs_dir.h> 43 #include <sys/zil.h> 44 #include <sys/fs/zfs.h> 45 #include <sys/dmu.h> 46 #include <sys/dsl_prop.h> 47 #include <sys/dsl_dataset.h> 48 #include <sys/dsl_deleg.h> 49 #include <sys/spa.h> 50 #include <sys/zap.h> 51 #include <sys/sa.h> 52 #include <sys/sa_impl.h> 53 #include <sys/varargs.h> 54 #include <sys/policy.h> 55 #include <sys/atomic.h> 56 #include <sys/mkdev.h> 57 #include <sys/modctl.h> 58 #include <sys/refstr.h> 59 #include <sys/zfs_ioctl.h> 60 #include <sys/zfs_ctldir.h> 61 #include <sys/zfs_fuid.h> 62 #include <sys/bootconf.h> 63 #include <sys/sunddi.h> 64 #include <sys/dnlc.h> 65 #include <sys/dmu_objset.h> 66 #include <sys/spa_boot.h> 67 #include "zfs_comutil.h" 68 69 int zfsfstype; 70 vfsops_t *zfs_vfsops = NULL; 71 static major_t zfs_major; 72 static minor_t zfs_minor; 73 static kmutex_t zfs_dev_mtx; 74 75 extern int sys_shutdown; 76 77 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); 78 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); 79 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); 80 static int zfs_root(vfs_t *vfsp, vnode_t **vpp); 81 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); 82 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); 83 static void zfs_freevfs(vfs_t *vfsp); 84 85 static const fs_operation_def_t zfs_vfsops_template[] = { 86 VFSNAME_MOUNT, { .vfs_mount = zfs_mount }, 87 VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot }, 88 VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount }, 89 VFSNAME_ROOT, { .vfs_root = zfs_root }, 90 VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs }, 91 VFSNAME_SYNC, { .vfs_sync = zfs_sync }, 92 VFSNAME_VGET, { .vfs_vget = zfs_vget }, 93 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, 94 NULL, NULL 95 }; 96 97 static const fs_operation_def_t zfs_vfsops_eio_template[] = { 98 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, 99 NULL, NULL 100 }; 101 102 /* 103 * We need to keep a count of active fs's. 
104 * This is necessary to prevent our module 105 * from being unloaded after a umount -f 106 */ 107 static uint32_t zfs_active_fs_count = 0; 108 109 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; 110 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; 111 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 112 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 113 114 /* 115 * MO_DEFAULT is not used since the default value is determined 116 * by the equivalent property. 117 */ 118 static mntopt_t mntopts[] = { 119 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, 120 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, 121 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, 122 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } 123 }; 124 125 static mntopts_t zfs_mntopts = { 126 sizeof (mntopts) / sizeof (mntopt_t), 127 mntopts 128 }; 129 130 /*ARGSUSED*/ 131 int 132 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) 133 { 134 /* 135 * Data integrity is job one. We don't want a compromised kernel 136 * writing to the storage pool, so we never sync during panic. 137 */ 138 if (panicstr) 139 return (0); 140 141 /* 142 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS 143 * to sync metadata, which they would otherwise cache indefinitely. 144 * Semantically, the only requirement is that the sync be initiated. 145 * The DMU syncs out txgs frequently, so there's nothing to do. 146 */ 147 if (flag & SYNC_ATTR) 148 return (0); 149 150 if (vfsp != NULL) { 151 /* 152 * Sync a specific filesystem. 153 */ 154 zfsvfs_t *zfsvfs = vfsp->vfs_data; 155 dsl_pool_t *dp; 156 157 ZFS_ENTER(zfsvfs); 158 dp = dmu_objset_pool(zfsvfs->z_os); 159 160 /* 161 * If the system is shutting down, then skip any 162 * filesystems which may exist on a suspended pool. 163 */ 164 if (sys_shutdown && spa_suspended(dp->dp_spa)) { 165 ZFS_EXIT(zfsvfs); 166 return (0); 167 } 168 169 if (zfsvfs->z_log != NULL) 170 zil_commit(zfsvfs->z_log, 0); 171 172 ZFS_EXIT(zfsvfs); 173 } else { 174 /* 175 * Sync all ZFS filesystems. This is what happens when you 176 * run sync(1M). Unlike other filesystems, ZFS honors the 177 * request by waiting for all pools to commit all dirty data. 178 */ 179 spa_sync_allpools(); 180 } 181 182 return (0); 183 } 184 185 static int 186 zfs_create_unique_device(dev_t *dev) 187 { 188 major_t new_major; 189 190 do { 191 ASSERT3U(zfs_minor, <=, MAXMIN32); 192 minor_t start = zfs_minor; 193 do { 194 mutex_enter(&zfs_dev_mtx); 195 if (zfs_minor >= MAXMIN32) { 196 /* 197 * If we're still using the real major 198 * keep out of /dev/zfs and /dev/zvol minor 199 * number space. If we're using a getudev()'ed 200 * major number, we can use all of its minors. 201 */ 202 if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) 203 zfs_minor = ZFS_MIN_MINOR; 204 else 205 zfs_minor = 0; 206 } else { 207 zfs_minor++; 208 } 209 *dev = makedevice(zfs_major, zfs_minor); 210 mutex_exit(&zfs_dev_mtx); 211 } while (vfs_devismounted(*dev) && zfs_minor != start); 212 if (zfs_minor == start) { 213 /* 214 * We are using all ~262,000 minor numbers for the 215 * current major number. Create a new major number. 
216 */ 217 if ((new_major = getudev()) == (major_t)-1) { 218 cmn_err(CE_WARN, 219 "zfs_mount: Can't get unique major " 220 "device number."); 221 return (-1); 222 } 223 mutex_enter(&zfs_dev_mtx); 224 zfs_major = new_major; 225 zfs_minor = 0; 226 227 mutex_exit(&zfs_dev_mtx); 228 } else { 229 break; 230 } 231 /* CONSTANTCONDITION */ 232 } while (1); 233 234 return (0); 235 } 236 237 static void 238 atime_changed_cb(void *arg, uint64_t newval) 239 { 240 zfsvfs_t *zfsvfs = arg; 241 242 if (newval == TRUE) { 243 zfsvfs->z_atime = TRUE; 244 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 245 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 246 } else { 247 zfsvfs->z_atime = FALSE; 248 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 249 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 250 } 251 } 252 253 static void 254 xattr_changed_cb(void *arg, uint64_t newval) 255 { 256 zfsvfs_t *zfsvfs = arg; 257 258 if (newval == TRUE) { 259 /* XXX locking on vfs_flag? */ 260 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; 261 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); 262 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); 263 } else { 264 /* XXX locking on vfs_flag? */ 265 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; 266 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); 267 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); 268 } 269 } 270 271 static void 272 blksz_changed_cb(void *arg, uint64_t newval) 273 { 274 zfsvfs_t *zfsvfs = arg; 275 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); 276 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); 277 ASSERT(ISP2(newval)); 278 279 zfsvfs->z_max_blksz = newval; 280 zfsvfs->z_vfs->vfs_bsize = newval; 281 } 282 283 static void 284 readonly_changed_cb(void *arg, uint64_t newval) 285 { 286 zfsvfs_t *zfsvfs = arg; 287 288 if (newval) { 289 /* XXX locking on vfs_flag? */ 290 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 291 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 292 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 293 } else { 294 /* XXX locking on vfs_flag? 
*/ 295 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 296 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 297 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 298 } 299 } 300 301 static void 302 devices_changed_cb(void *arg, uint64_t newval) 303 { 304 zfsvfs_t *zfsvfs = arg; 305 306 if (newval == FALSE) { 307 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; 308 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); 309 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); 310 } else { 311 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; 312 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); 313 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); 314 } 315 } 316 317 static void 318 setuid_changed_cb(void *arg, uint64_t newval) 319 { 320 zfsvfs_t *zfsvfs = arg; 321 322 if (newval == FALSE) { 323 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 324 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 325 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 326 } else { 327 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 328 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 329 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 330 } 331 } 332 333 static void 334 exec_changed_cb(void *arg, uint64_t newval) 335 { 336 zfsvfs_t *zfsvfs = arg; 337 338 if (newval == FALSE) { 339 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 340 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 341 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 342 } else { 343 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 344 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 345 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 346 } 347 } 348 349 /* 350 * The nbmand mount option can be changed at mount time. 351 * We can't allow it to be toggled on live file systems or incorrect 352 * behavior may be seen from cifs clients 353 * 354 * This property isn't registered via dsl_prop_register(), but this callback 355 * will be called when a file system is first mounted 356 */ 357 static void 358 nbmand_changed_cb(void *arg, uint64_t newval) 359 { 360 zfsvfs_t *zfsvfs = arg; 361 if (newval == FALSE) { 362 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 363 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 364 } else { 365 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 366 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 367 } 368 } 369 370 static void 371 snapdir_changed_cb(void *arg, uint64_t newval) 372 { 373 zfsvfs_t *zfsvfs = arg; 374 375 zfsvfs->z_show_ctldir = newval; 376 } 377 378 static void 379 vscan_changed_cb(void *arg, uint64_t newval) 380 { 381 zfsvfs_t *zfsvfs = arg; 382 383 zfsvfs->z_vscan = newval; 384 } 385 386 static void 387 acl_mode_changed_cb(void *arg, uint64_t newval) 388 { 389 zfsvfs_t *zfsvfs = arg; 390 391 zfsvfs->z_acl_mode = newval; 392 } 393 394 static void 395 acl_inherit_changed_cb(void *arg, uint64_t newval) 396 { 397 zfsvfs_t *zfsvfs = arg; 398 399 zfsvfs->z_acl_inherit = newval; 400 } 401 402 static int 403 zfs_register_callbacks(vfs_t *vfsp) 404 { 405 struct dsl_dataset *ds = NULL; 406 objset_t *os = NULL; 407 zfsvfs_t *zfsvfs = NULL; 408 uint64_t nbmand; 409 boolean_t readonly = B_FALSE; 410 boolean_t do_readonly = B_FALSE; 411 boolean_t setuid = B_FALSE; 412 boolean_t do_setuid = B_FALSE; 413 boolean_t exec = B_FALSE; 414 boolean_t do_exec = B_FALSE; 415 boolean_t devices = B_FALSE; 416 boolean_t do_devices = B_FALSE; 417 boolean_t xattr = B_FALSE; 418 boolean_t do_xattr = B_FALSE; 419 boolean_t atime = B_FALSE; 420 boolean_t do_atime = B_FALSE; 421 int error = 0; 422 423 ASSERT(vfsp); 424 zfsvfs = vfsp->vfs_data; 425 ASSERT(zfsvfs); 426 os 
= zfsvfs->z_os; 427 428 /* 429 * The act of registering our callbacks will destroy any mount 430 * options we may have. In order to enable temporary overrides 431 * of mount options, we stash away the current values and 432 * restore them after we register the callbacks. 433 */ 434 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 435 !spa_writeable(dmu_objset_spa(os))) { 436 readonly = B_TRUE; 437 do_readonly = B_TRUE; 438 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 439 readonly = B_FALSE; 440 do_readonly = B_TRUE; 441 } 442 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 443 devices = B_FALSE; 444 setuid = B_FALSE; 445 do_devices = B_TRUE; 446 do_setuid = B_TRUE; 447 } else { 448 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { 449 devices = B_FALSE; 450 do_devices = B_TRUE; 451 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) { 452 devices = B_TRUE; 453 do_devices = B_TRUE; 454 } 455 456 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 457 setuid = B_FALSE; 458 do_setuid = B_TRUE; 459 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 460 setuid = B_TRUE; 461 do_setuid = B_TRUE; 462 } 463 } 464 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 465 exec = B_FALSE; 466 do_exec = B_TRUE; 467 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 468 exec = B_TRUE; 469 do_exec = B_TRUE; 470 } 471 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 472 xattr = B_FALSE; 473 do_xattr = B_TRUE; 474 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 475 xattr = B_TRUE; 476 do_xattr = B_TRUE; 477 } 478 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 479 atime = B_FALSE; 480 do_atime = B_TRUE; 481 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 482 atime = B_TRUE; 483 do_atime = B_TRUE; 484 } 485 486 /* 487 * nbmand is a special property. It can only be changed at 488 * mount time. 489 * 490 * This is weird, but it is documented to only be changeable 491 * at mount time. 492 */ 493 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 494 nbmand = B_FALSE; 495 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 496 nbmand = B_TRUE; 497 } else { 498 char osname[MAXNAMELEN]; 499 500 dmu_objset_name(os, osname); 501 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, 502 NULL)) { 503 return (error); 504 } 505 } 506 507 /* 508 * Register property callbacks. 509 * 510 * It would probably be fine to just check for i/o error from 511 * the first prop_register(), but I guess I like to go 512 * overboard... 513 */ 514 ds = dmu_objset_ds(os); 515 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 516 error = dsl_prop_register(ds, 517 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); 518 error = error ? error : dsl_prop_register(ds, 519 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); 520 error = error ? error : dsl_prop_register(ds, 521 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); 522 error = error ? error : dsl_prop_register(ds, 523 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); 524 error = error ? error : dsl_prop_register(ds, 525 zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); 526 error = error ? error : dsl_prop_register(ds, 527 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); 528 error = error ? error : dsl_prop_register(ds, 529 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); 530 error = error ? error : dsl_prop_register(ds, 531 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); 532 error = error ? 
error : dsl_prop_register(ds, 533 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); 534 error = error ? error : dsl_prop_register(ds, 535 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, 536 zfsvfs); 537 error = error ? error : dsl_prop_register(ds, 538 zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); 539 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 540 if (error) 541 goto unregister; 542 543 /* 544 * Invoke our callbacks to restore temporary mount options. 545 */ 546 if (do_readonly) 547 readonly_changed_cb(zfsvfs, readonly); 548 if (do_setuid) 549 setuid_changed_cb(zfsvfs, setuid); 550 if (do_exec) 551 exec_changed_cb(zfsvfs, exec); 552 if (do_devices) 553 devices_changed_cb(zfsvfs, devices); 554 if (do_xattr) 555 xattr_changed_cb(zfsvfs, xattr); 556 if (do_atime) 557 atime_changed_cb(zfsvfs, atime); 558 559 nbmand_changed_cb(zfsvfs, nbmand); 560 561 return (0); 562 563 unregister: 564 /* 565 * We may attempt to unregister some callbacks that are not 566 * registered, but this is OK; it will simply return ENOMSG, 567 * which we will ignore. 568 */ 569 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME), 570 atime_changed_cb, zfsvfs); 571 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR), 572 xattr_changed_cb, zfsvfs); 573 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), 574 blksz_changed_cb, zfsvfs); 575 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY), 576 readonly_changed_cb, zfsvfs); 577 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), 578 devices_changed_cb, zfsvfs); 579 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID), 580 setuid_changed_cb, zfsvfs); 581 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC), 582 exec_changed_cb, zfsvfs); 583 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), 584 snapdir_changed_cb, zfsvfs); 585 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), 586 acl_mode_changed_cb, zfsvfs); 587 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), 588 acl_inherit_changed_cb, zfsvfs); 589 (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), 590 vscan_changed_cb, zfsvfs); 591 return (error); 592 } 593 594 static int 595 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, 596 uint64_t *userp, uint64_t *groupp) 597 { 598 /* 599 * Is it a valid type of object to track? 600 */ 601 if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) 602 return (SET_ERROR(ENOENT)); 603 604 /* 605 * If we have a NULL data pointer 606 * then assume the id's aren't changing and 607 * return EEXIST to the dmu to let it know to 608 * use the same ids 609 */ 610 if (data == NULL) 611 return (SET_ERROR(EEXIST)); 612 613 if (bonustype == DMU_OT_ZNODE) { 614 znode_phys_t *znp = data; 615 *userp = znp->zp_uid; 616 *groupp = znp->zp_gid; 617 } else { 618 int hdrsize; 619 sa_hdr_phys_t *sap = data; 620 sa_hdr_phys_t sa = *sap; 621 boolean_t swap = B_FALSE; 622 623 ASSERT(bonustype == DMU_OT_SA); 624 625 if (sa.sa_magic == 0) { 626 /* 627 * This should only happen for newly created 628 * files that haven't had the znode data filled 629 * in yet. 
630 */ 631 *userp = 0; 632 *groupp = 0; 633 return (0); 634 } 635 if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { 636 sa.sa_magic = SA_MAGIC; 637 sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); 638 swap = B_TRUE; 639 } else { 640 VERIFY3U(sa.sa_magic, ==, SA_MAGIC); 641 } 642 643 hdrsize = sa_hdrsize(&sa); 644 VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); 645 *userp = *((uint64_t *)((uintptr_t)data + hdrsize + 646 SA_UID_OFFSET)); 647 *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + 648 SA_GID_OFFSET)); 649 if (swap) { 650 *userp = BSWAP_64(*userp); 651 *groupp = BSWAP_64(*groupp); 652 } 653 } 654 return (0); 655 } 656 657 static void 658 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, 659 char *domainbuf, int buflen, uid_t *ridp) 660 { 661 uint64_t fuid; 662 const char *domain; 663 664 fuid = strtonum(fuidstr, NULL); 665 666 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); 667 if (domain) 668 (void) strlcpy(domainbuf, domain, buflen); 669 else 670 domainbuf[0] = '\0'; 671 *ridp = FUID_RID(fuid); 672 } 673 674 static uint64_t 675 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) 676 { 677 switch (type) { 678 case ZFS_PROP_USERUSED: 679 return (DMU_USERUSED_OBJECT); 680 case ZFS_PROP_GROUPUSED: 681 return (DMU_GROUPUSED_OBJECT); 682 case ZFS_PROP_USERQUOTA: 683 return (zfsvfs->z_userquota_obj); 684 case ZFS_PROP_GROUPQUOTA: 685 return (zfsvfs->z_groupquota_obj); 686 } 687 return (0); 688 } 689 690 int 691 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 692 uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) 693 { 694 int error; 695 zap_cursor_t zc; 696 zap_attribute_t za; 697 zfs_useracct_t *buf = vbuf; 698 uint64_t obj; 699 700 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 701 return (SET_ERROR(ENOTSUP)); 702 703 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 704 if (obj == 0) { 705 *bufsizep = 0; 706 return (0); 707 } 708 709 for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); 710 (error = zap_cursor_retrieve(&zc, &za)) == 0; 711 zap_cursor_advance(&zc)) { 712 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > 713 *bufsizep) 714 break; 715 716 fuidstr_to_sid(zfsvfs, za.za_name, 717 buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); 718 719 buf->zu_space = za.za_first_integer; 720 buf++; 721 } 722 if (error == ENOENT) 723 error = 0; 724 725 ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); 726 *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; 727 *cookiep = zap_cursor_serialize(&zc); 728 zap_cursor_fini(&zc); 729 return (error); 730 } 731 732 /* 733 * buf must be big enough (eg, 32 bytes) 734 */ 735 static int 736 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, 737 char *buf, boolean_t addok) 738 { 739 uint64_t fuid; 740 int domainid = 0; 741 742 if (domain && domain[0]) { 743 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); 744 if (domainid == -1) 745 return (SET_ERROR(ENOENT)); 746 } 747 fuid = FUID_ENCODE(domainid, rid); 748 (void) sprintf(buf, "%llx", (longlong_t)fuid); 749 return (0); 750 } 751 752 int 753 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 754 const char *domain, uint64_t rid, uint64_t *valp) 755 { 756 char buf[32]; 757 int err; 758 uint64_t obj; 759 760 *valp = 0; 761 762 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 763 return (SET_ERROR(ENOTSUP)); 764 765 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 766 if (obj == 0) 767 return (0); 768 769 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); 770 if (err) 771 return (err); 
772 773 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); 774 if (err == ENOENT) 775 err = 0; 776 return (err); 777 } 778 779 int 780 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 781 const char *domain, uint64_t rid, uint64_t quota) 782 { 783 char buf[32]; 784 int err; 785 dmu_tx_t *tx; 786 uint64_t *objp; 787 boolean_t fuid_dirtied; 788 789 if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) 790 return (SET_ERROR(EINVAL)); 791 792 if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) 793 return (SET_ERROR(ENOTSUP)); 794 795 objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : 796 &zfsvfs->z_groupquota_obj; 797 798 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); 799 if (err) 800 return (err); 801 fuid_dirtied = zfsvfs->z_fuid_dirty; 802 803 tx = dmu_tx_create(zfsvfs->z_os); 804 dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); 805 if (*objp == 0) { 806 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 807 zfs_userquota_prop_prefixes[type]); 808 } 809 if (fuid_dirtied) 810 zfs_fuid_txhold(zfsvfs, tx); 811 err = dmu_tx_assign(tx, TXG_WAIT); 812 if (err) { 813 dmu_tx_abort(tx); 814 return (err); 815 } 816 817 mutex_enter(&zfsvfs->z_lock); 818 if (*objp == 0) { 819 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, 820 DMU_OT_NONE, 0, tx); 821 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 822 zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); 823 } 824 mutex_exit(&zfsvfs->z_lock); 825 826 if (quota == 0) { 827 err = zap_remove(zfsvfs->z_os, *objp, buf, tx); 828 if (err == ENOENT) 829 err = 0; 830 } else { 831 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); 832 } 833 ASSERT(err == 0); 834 if (fuid_dirtied) 835 zfs_fuid_sync(zfsvfs, tx); 836 dmu_tx_commit(tx); 837 return (err); 838 } 839 840 boolean_t 841 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) 842 { 843 char buf[32]; 844 uint64_t used, quota, usedobj, quotaobj; 845 int err; 846 847 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; 848 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 849 850 if (quotaobj == 0 || zfsvfs->z_replay) 851 return (B_FALSE); 852 853 (void) sprintf(buf, "%llx", (longlong_t)fuid); 854 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 855 if (err != 0) 856 return (B_FALSE); 857 858 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 859 if (err != 0) 860 return (B_FALSE); 861 return (used >= quota); 862 } 863 864 boolean_t 865 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) 866 { 867 uint64_t fuid; 868 uint64_t quotaobj; 869 870 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 871 872 fuid = isgroup ? zp->z_gid : zp->z_uid; 873 874 if (quotaobj == 0 || zfsvfs->z_replay) 875 return (B_FALSE); 876 877 return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); 878 } 879 880 int 881 zfsvfs_create(const char *osname, zfsvfs_t **zfvp) 882 { 883 objset_t *os; 884 zfsvfs_t *zfsvfs; 885 uint64_t zval; 886 int i, error; 887 uint64_t sa_obj; 888 889 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 890 891 /* 892 * We claim to always be readonly so we can open snapshots; 893 * other ZPL code will prevent us from writing to snapshots. 894 */ 895 error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); 896 if (error) { 897 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 898 return (error); 899 } 900 901 /* 902 * Initialize the zfs-specific filesystem structure. 
903 * Should probably make this a kmem cache, shuffle fields, 904 * and just bzero up to z_hold_mtx[]. 905 */ 906 zfsvfs->z_vfs = NULL; 907 zfsvfs->z_parent = zfsvfs; 908 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; 909 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 910 zfsvfs->z_os = os; 911 912 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 913 if (error) { 914 goto out; 915 } else if (zfsvfs->z_version > 916 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 917 (void) printf("Can't mount a version %lld file system " 918 "on a version %lld pool\n. Pool must be upgraded to mount " 919 "this file system.", (u_longlong_t)zfsvfs->z_version, 920 (u_longlong_t)spa_version(dmu_objset_spa(os))); 921 error = SET_ERROR(ENOTSUP); 922 goto out; 923 } 924 if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) 925 goto out; 926 zfsvfs->z_norm = (int)zval; 927 928 if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) 929 goto out; 930 zfsvfs->z_utf8 = (zval != 0); 931 932 if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) 933 goto out; 934 zfsvfs->z_case = (uint_t)zval; 935 936 /* 937 * Fold case on file systems that are always or sometimes case 938 * insensitive. 939 */ 940 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 941 zfsvfs->z_case == ZFS_CASE_MIXED) 942 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 943 944 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 945 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 946 947 if (zfsvfs->z_use_sa) { 948 /* should either have both of these objects or none */ 949 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 950 &sa_obj); 951 if (error) 952 goto out; 953 } else { 954 /* 955 * Pre SA versions file systems should never touch 956 * either the attribute registration or layout objects. 
957 */ 958 sa_obj = 0; 959 } 960 961 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 962 &zfsvfs->z_attr_table); 963 if (error) 964 goto out; 965 966 if (zfsvfs->z_version >= ZPL_VERSION_SA) 967 sa_register_update_callback(os, zfs_sa_upgrade); 968 969 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 970 &zfsvfs->z_root); 971 if (error) 972 goto out; 973 ASSERT(zfsvfs->z_root != 0); 974 975 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 976 &zfsvfs->z_unlinkedobj); 977 if (error) 978 goto out; 979 980 error = zap_lookup(os, MASTER_NODE_OBJ, 981 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 982 8, 1, &zfsvfs->z_userquota_obj); 983 if (error && error != ENOENT) 984 goto out; 985 986 error = zap_lookup(os, MASTER_NODE_OBJ, 987 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 988 8, 1, &zfsvfs->z_groupquota_obj); 989 if (error && error != ENOENT) 990 goto out; 991 992 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 993 &zfsvfs->z_fuid_obj); 994 if (error && error != ENOENT) 995 goto out; 996 997 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 998 &zfsvfs->z_shares_dir); 999 if (error && error != ENOENT) 1000 goto out; 1001 1002 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1003 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 1004 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1005 offsetof(znode_t, z_link_node)); 1006 rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); 1007 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); 1008 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 1009 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1010 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1011 1012 *zfvp = zfsvfs; 1013 return (0); 1014 1015 out: 1016 dmu_objset_disown(os, zfsvfs); 1017 *zfvp = NULL; 1018 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1019 return (error); 1020 } 1021 1022 static int 1023 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1024 { 1025 int error; 1026 1027 error = zfs_register_callbacks(zfsvfs->z_vfs); 1028 if (error) 1029 return (error); 1030 1031 /* 1032 * Set the objset user_ptr to track its zfsvfs. 1033 */ 1034 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1035 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1036 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1037 1038 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); 1039 1040 /* 1041 * If we are not mounting (ie: online recv), then we don't 1042 * have to worry about replaying the log as we blocked all 1043 * operations out since we closed the ZIL. 1044 */ 1045 if (mounting) { 1046 boolean_t readonly; 1047 1048 /* 1049 * During replay we remove the read only flag to 1050 * allow replays to succeed. 1051 */ 1052 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1053 if (readonly != 0) 1054 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1055 else 1056 zfs_unlinked_drain(zfsvfs); 1057 1058 /* 1059 * Parse and replay the intent log. 1060 * 1061 * Because of ziltest, this must be done after 1062 * zfs_unlinked_drain(). (Further note: ziltest 1063 * doesn't use readonly mounts, where 1064 * zfs_unlinked_drain() isn't called.) This is because 1065 * ziltest causes spa_sync() to think it's committed, 1066 * but actually it is not, so the intent log contains 1067 * many txg's worth of changes. 1068 * 1069 * In particular, if object N is in the unlinked set in 1070 * the last txg to actually sync, then it could be 1071 * actually freed in a later txg and then reallocated 1072 * in a yet later txg. 
This would write a "create 1073 * object N" record to the intent log. Normally, this 1074 * would be fine because the spa_sync() would have 1075 * written out the fact that object N is free, before 1076 * we could write the "create object N" intent log 1077 * record. 1078 * 1079 * But when we are in ziltest mode, we advance the "open 1080 * txg" without actually spa_sync()-ing the changes to 1081 * disk. So we would see that object N is still 1082 * allocated and in the unlinked set, and there is an 1083 * intent log record saying to allocate it. 1084 */ 1085 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1086 if (zil_replay_disable) { 1087 zil_destroy(zfsvfs->z_log, B_FALSE); 1088 } else { 1089 zfsvfs->z_replay = B_TRUE; 1090 zil_replay(zfsvfs->z_os, zfsvfs, 1091 zfs_replay_vector); 1092 zfsvfs->z_replay = B_FALSE; 1093 } 1094 } 1095 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ 1096 } 1097 1098 return (0); 1099 } 1100 1101 void 1102 zfsvfs_free(zfsvfs_t *zfsvfs) 1103 { 1104 int i; 1105 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ 1106 1107 /* 1108 * This is a barrier to prevent the filesystem from going away in 1109 * zfs_znode_move() until we can safely ensure that the filesystem is 1110 * not unmounted. We consider the filesystem valid before the barrier 1111 * and invalid after the barrier. 1112 */ 1113 rw_enter(&zfsvfs_lock, RW_READER); 1114 rw_exit(&zfsvfs_lock); 1115 1116 zfs_fuid_destroy(zfsvfs); 1117 1118 mutex_destroy(&zfsvfs->z_znodes_lock); 1119 mutex_destroy(&zfsvfs->z_lock); 1120 list_destroy(&zfsvfs->z_all_znodes); 1121 rrm_destroy(&zfsvfs->z_teardown_lock); 1122 rw_destroy(&zfsvfs->z_teardown_inactive_lock); 1123 rw_destroy(&zfsvfs->z_fuid_lock); 1124 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1125 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1126 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1127 } 1128 1129 static void 1130 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1131 { 1132 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1133 if (zfsvfs->z_vfs) { 1134 if (zfsvfs->z_use_fuids) { 1135 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1136 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1137 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1138 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1139 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1140 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1141 } else { 1142 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1143 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1144 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1145 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1146 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1147 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1148 } 1149 } 1150 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1151 } 1152 1153 static int 1154 zfs_domount(vfs_t *vfsp, char *osname) 1155 { 1156 dev_t mount_dev; 1157 uint64_t recordsize, fsid_guid; 1158 int error = 0; 1159 zfsvfs_t *zfsvfs; 1160 1161 ASSERT(vfsp); 1162 ASSERT(osname); 1163 1164 error = zfsvfs_create(osname, &zfsvfs); 1165 if (error) 1166 return (error); 1167 zfsvfs->z_vfs = vfsp; 1168 1169 /* Initialize the generic filesystem structure. 
*/ 1170 vfsp->vfs_bcount = 0; 1171 vfsp->vfs_data = NULL; 1172 1173 if (zfs_create_unique_device(&mount_dev) == -1) { 1174 error = SET_ERROR(ENODEV); 1175 goto out; 1176 } 1177 ASSERT(vfs_devismounted(mount_dev) == 0); 1178 1179 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, 1180 NULL)) 1181 goto out; 1182 1183 vfsp->vfs_dev = mount_dev; 1184 vfsp->vfs_fstype = zfsfstype; 1185 vfsp->vfs_bsize = recordsize; 1186 vfsp->vfs_flag |= VFS_NOTRUNC; 1187 vfsp->vfs_data = zfsvfs; 1188 1189 /* 1190 * The fsid is 64 bits, composed of an 8-bit fs type, which 1191 * separates our fsid from any other filesystem types, and a 1192 * 56-bit objset unique ID. The objset unique ID is unique to 1193 * all objsets open on this system, provided by unique_create(). 1194 * The 8-bit fs type must be put in the low bits of fsid[1] 1195 * because that's where other Solaris filesystems put it. 1196 */ 1197 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1198 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 1199 vfsp->vfs_fsid.val[0] = fsid_guid; 1200 vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 1201 zfsfstype & 0xFF; 1202 1203 /* 1204 * Set features for file system. 1205 */ 1206 zfs_set_fuid_feature(zfsvfs); 1207 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 1208 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1209 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1210 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 1211 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 1212 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1213 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1214 } 1215 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); 1216 1217 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1218 uint64_t pval; 1219 1220 atime_changed_cb(zfsvfs, B_FALSE); 1221 readonly_changed_cb(zfsvfs, B_TRUE); 1222 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) 1223 goto out; 1224 xattr_changed_cb(zfsvfs, pval); 1225 zfsvfs->z_issnap = B_TRUE; 1226 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1227 1228 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1229 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1230 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1231 } else { 1232 error = zfsvfs_setup(zfsvfs, B_TRUE); 1233 } 1234 1235 if (!zfsvfs->z_issnap) 1236 zfsctl_create(zfsvfs); 1237 out: 1238 if (error) { 1239 dmu_objset_disown(zfsvfs->z_os, zfsvfs); 1240 zfsvfs_free(zfsvfs); 1241 } else { 1242 atomic_inc_32(&zfs_active_fs_count); 1243 } 1244 1245 return (error); 1246 } 1247 1248 void 1249 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1250 { 1251 objset_t *os = zfsvfs->z_os; 1252 struct dsl_dataset *ds; 1253 1254 /* 1255 * Unregister properties. 
1256 */ 1257 if (!dmu_objset_is_snapshot(os)) { 1258 ds = dmu_objset_ds(os); 1259 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, 1260 zfsvfs) == 0); 1261 1262 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, 1263 zfsvfs) == 0); 1264 1265 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, 1266 zfsvfs) == 0); 1267 1268 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, 1269 zfsvfs) == 0); 1270 1271 VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, 1272 zfsvfs) == 0); 1273 1274 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, 1275 zfsvfs) == 0); 1276 1277 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, 1278 zfsvfs) == 0); 1279 1280 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, 1281 zfsvfs) == 0); 1282 1283 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, 1284 zfsvfs) == 0); 1285 1286 VERIFY(dsl_prop_unregister(ds, "aclinherit", 1287 acl_inherit_changed_cb, zfsvfs) == 0); 1288 1289 VERIFY(dsl_prop_unregister(ds, "vscan", 1290 vscan_changed_cb, zfsvfs) == 0); 1291 } 1292 } 1293 1294 /* 1295 * Convert a decimal digit string to a uint64_t integer. 1296 */ 1297 static int 1298 str_to_uint64(char *str, uint64_t *objnum) 1299 { 1300 uint64_t num = 0; 1301 1302 while (*str) { 1303 if (*str < '0' || *str > '9') 1304 return (SET_ERROR(EINVAL)); 1305 1306 num = num*10 + *str++ - '0'; 1307 } 1308 1309 *objnum = num; 1310 return (0); 1311 } 1312 1313 /* 1314 * The boot path passed from the boot loader is in the form of 1315 * "rootpool-name/root-filesystem-object-number'. Convert this 1316 * string to a dataset name: "rootpool-name/root-filesystem-name". 1317 */ 1318 static int 1319 zfs_parse_bootfs(char *bpath, char *outpath) 1320 { 1321 char *slashp; 1322 uint64_t objnum; 1323 int error; 1324 1325 if (*bpath == 0 || *bpath == '/') 1326 return (SET_ERROR(EINVAL)); 1327 1328 (void) strcpy(outpath, bpath); 1329 1330 slashp = strchr(bpath, '/'); 1331 1332 /* if no '/', just return the pool name */ 1333 if (slashp == NULL) { 1334 return (0); 1335 } 1336 1337 /* if not a number, just return the root dataset name */ 1338 if (str_to_uint64(slashp+1, &objnum)) { 1339 return (0); 1340 } 1341 1342 *slashp = '\0'; 1343 error = dsl_dsobj_to_dsname(bpath, objnum, outpath); 1344 *slashp = '/'; 1345 1346 return (error); 1347 } 1348 1349 /* 1350 * Check that the hex label string is appropriate for the dataset being 1351 * mounted into the global_zone proper. 1352 * 1353 * Return an error if the hex label string is not default or 1354 * admin_low/admin_high. For admin_low labels, the corresponding 1355 * dataset must be readonly. 1356 */ 1357 int 1358 zfs_check_global_label(const char *dsname, const char *hexsl) 1359 { 1360 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1361 return (0); 1362 if (strcasecmp(hexsl, ADMIN_HIGH) == 0) 1363 return (0); 1364 if (strcasecmp(hexsl, ADMIN_LOW) == 0) { 1365 /* must be readonly */ 1366 uint64_t rdonly; 1367 1368 if (dsl_prop_get_integer(dsname, 1369 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) 1370 return (SET_ERROR(EACCES)); 1371 return (rdonly ? 0 : EACCES); 1372 } 1373 return (SET_ERROR(EACCES)); 1374 } 1375 1376 /* 1377 * Determine whether the mount is allowed according to MAC check. 1378 * by comparing (where appropriate) label of the dataset against 1379 * the label of the zone being mounted into. If the dataset has 1380 * no label, create one. 1381 * 1382 * Returns 0 if access allowed, error otherwise (e.g. 
EACCES) 1383 */ 1384 static int 1385 zfs_mount_label_policy(vfs_t *vfsp, char *osname) 1386 { 1387 int error, retv; 1388 zone_t *mntzone = NULL; 1389 ts_label_t *mnt_tsl; 1390 bslabel_t *mnt_sl; 1391 bslabel_t ds_sl; 1392 char ds_hexsl[MAXNAMELEN]; 1393 1394 retv = EACCES; /* assume the worst */ 1395 1396 /* 1397 * Start by getting the dataset label if it exists. 1398 */ 1399 error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1400 1, sizeof (ds_hexsl), &ds_hexsl, NULL); 1401 if (error) 1402 return (SET_ERROR(EACCES)); 1403 1404 /* 1405 * If labeling is NOT enabled, then disallow the mount of datasets 1406 * which have a non-default label already. No other label checks 1407 * are needed. 1408 */ 1409 if (!is_system_labeled()) { 1410 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1411 return (0); 1412 return (SET_ERROR(EACCES)); 1413 } 1414 1415 /* 1416 * Get the label of the mountpoint. If mounting into the global 1417 * zone (i.e. mountpoint is not within an active zone and the 1418 * zoned property is off), the label must be default or 1419 * admin_low/admin_high only; no other checks are needed. 1420 */ 1421 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 1422 if (mntzone->zone_id == GLOBAL_ZONEID) { 1423 uint64_t zoned; 1424 1425 zone_rele(mntzone); 1426 1427 if (dsl_prop_get_integer(osname, 1428 zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) 1429 return (SET_ERROR(EACCES)); 1430 if (!zoned) 1431 return (zfs_check_global_label(osname, ds_hexsl)); 1432 else 1433 /* 1434 * This is the case of a zone dataset being mounted 1435 * initially, before the zone has been fully created; 1436 * allow this mount into global zone. 1437 */ 1438 return (0); 1439 } 1440 1441 mnt_tsl = mntzone->zone_slabel; 1442 ASSERT(mnt_tsl != NULL); 1443 label_hold(mnt_tsl); 1444 mnt_sl = label2bslabel(mnt_tsl); 1445 1446 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { 1447 /* 1448 * The dataset doesn't have a real label, so fabricate one. 1449 */ 1450 char *str = NULL; 1451 1452 if (l_to_str_internal(mnt_sl, &str) == 0 && 1453 dsl_prop_set_string(osname, 1454 zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1455 ZPROP_SRC_LOCAL, str) == 0) 1456 retv = 0; 1457 if (str != NULL) 1458 kmem_free(str, strlen(str) + 1); 1459 } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { 1460 /* 1461 * Now compare labels to complete the MAC check. If the 1462 * labels are equal then allow access. If the mountpoint 1463 * label dominates the dataset label, allow readonly access. 1464 * Otherwise, access is denied. 1465 */ 1466 if (blequal(mnt_sl, &ds_sl)) 1467 retv = 0; 1468 else if (bldominates(mnt_sl, &ds_sl)) { 1469 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1470 retv = 0; 1471 } 1472 } 1473 1474 label_rele(mnt_tsl); 1475 zone_rele(mntzone); 1476 return (retv); 1477 } 1478 1479 static int 1480 zfs_mountroot(vfs_t *vfsp, enum whymountroot why) 1481 { 1482 int error = 0; 1483 static int zfsrootdone = 0; 1484 zfsvfs_t *zfsvfs = NULL; 1485 znode_t *zp = NULL; 1486 vnode_t *vp = NULL; 1487 char *zfs_bootfs; 1488 char *zfs_devid; 1489 1490 ASSERT(vfsp); 1491 1492 /* 1493 * The filesystem that we mount as root is defined in the 1494 * boot property "zfs-bootfs" with a format of 1495 * "poolname/root-dataset-objnum". 
1496 */ 1497 if (why == ROOT_INIT) { 1498 if (zfsrootdone++) 1499 return (SET_ERROR(EBUSY)); 1500 /* 1501 * the process of doing a spa_load will require the 1502 * clock to be set before we could (for example) do 1503 * something better by looking at the timestamp on 1504 * an uberblock, so just set it to -1. 1505 */ 1506 clkset(-1); 1507 1508 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { 1509 cmn_err(CE_NOTE, "spa_get_bootfs: can not get " 1510 "bootfs name"); 1511 return (SET_ERROR(EINVAL)); 1512 } 1513 zfs_devid = spa_get_bootprop("diskdevid"); 1514 error = spa_import_rootpool(rootfs.bo_name, zfs_devid); 1515 if (zfs_devid) 1516 spa_free_bootprop(zfs_devid); 1517 if (error) { 1518 spa_free_bootprop(zfs_bootfs); 1519 cmn_err(CE_NOTE, "spa_import_rootpool: error %d", 1520 error); 1521 return (error); 1522 } 1523 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { 1524 spa_free_bootprop(zfs_bootfs); 1525 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", 1526 error); 1527 return (error); 1528 } 1529 1530 spa_free_bootprop(zfs_bootfs); 1531 1532 if (error = vfs_lock(vfsp)) 1533 return (error); 1534 1535 if (error = zfs_domount(vfsp, rootfs.bo_name)) { 1536 cmn_err(CE_NOTE, "zfs_domount: error %d", error); 1537 goto out; 1538 } 1539 1540 zfsvfs = (zfsvfs_t *)vfsp->vfs_data; 1541 ASSERT(zfsvfs); 1542 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { 1543 cmn_err(CE_NOTE, "zfs_zget: error %d", error); 1544 goto out; 1545 } 1546 1547 vp = ZTOV(zp); 1548 mutex_enter(&vp->v_lock); 1549 vp->v_flag |= VROOT; 1550 mutex_exit(&vp->v_lock); 1551 rootvp = vp; 1552 1553 /* 1554 * Leave rootvp held. The root file system is never unmounted. 1555 */ 1556 1557 vfs_add((struct vnode *)0, vfsp, 1558 (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); 1559 out: 1560 vfs_unlock(vfsp); 1561 return (error); 1562 } else if (why == ROOT_REMOUNT) { 1563 readonly_changed_cb(vfsp->vfs_data, B_FALSE); 1564 vfsp->vfs_flag |= VFS_REMOUNT; 1565 1566 /* refresh mount options */ 1567 zfs_unregister_callbacks(vfsp->vfs_data); 1568 return (zfs_register_callbacks(vfsp)); 1569 1570 } else if (why == ROOT_UNMOUNT) { 1571 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); 1572 (void) zfs_sync(vfsp, 0, 0); 1573 return (0); 1574 } 1575 1576 /* 1577 * if "why" is equal to anything else other than ROOT_INIT, 1578 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 1579 */ 1580 return (SET_ERROR(ENOTSUP)); 1581 } 1582 1583 /*ARGSUSED*/ 1584 static int 1585 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 1586 { 1587 char *osname; 1588 pathname_t spn; 1589 int error = 0; 1590 uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? 1591 UIO_SYSSPACE : UIO_USERSPACE; 1592 int canwrite; 1593 1594 if (mvp->v_type != VDIR) 1595 return (SET_ERROR(ENOTDIR)); 1596 1597 mutex_enter(&mvp->v_lock); 1598 if ((uap->flags & MS_REMOUNT) == 0 && 1599 (uap->flags & MS_OVERLAY) == 0 && 1600 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 1601 mutex_exit(&mvp->v_lock); 1602 return (SET_ERROR(EBUSY)); 1603 } 1604 mutex_exit(&mvp->v_lock); 1605 1606 /* 1607 * ZFS does not support passing unparsed data in via MS_DATA. 1608 * Users should use the MS_OPTIONSTR interface; this means 1609 * that all option parsing is already done and the options struct 1610 * can be interrogated. 1611 */ 1612 if ((uap->flags & MS_DATA) && uap->datalen > 0) 1613 return (SET_ERROR(EINVAL)); 1614 1615 /* 1616 * Get the objset name (the "special" mount argument). 
1617 */ 1618 if (error = pn_get(uap->spec, fromspace, &spn)) 1619 return (error); 1620 1621 osname = spn.pn_path; 1622 1623 /* 1624 * Check for mount privilege? 1625 * 1626 * If we don't have privilege then see if 1627 * we have local permission to allow it 1628 */ 1629 error = secpolicy_fs_mount(cr, mvp, vfsp); 1630 if (error) { 1631 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) { 1632 vattr_t vattr; 1633 1634 /* 1635 * Make sure user is the owner of the mount point 1636 * or has sufficient privileges. 1637 */ 1638 1639 vattr.va_mask = AT_UID; 1640 1641 if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { 1642 goto out; 1643 } 1644 1645 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && 1646 VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) { 1647 goto out; 1648 } 1649 secpolicy_fs_mount_clearopts(cr, vfsp); 1650 } else { 1651 goto out; 1652 } 1653 } 1654 1655 /* 1656 * Refuse to mount a filesystem if we are in a local zone and the 1657 * dataset is not visible. 1658 */ 1659 if (!INGLOBALZONE(curproc) && 1660 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1661 error = SET_ERROR(EPERM); 1662 goto out; 1663 } 1664 1665 error = zfs_mount_label_policy(vfsp, osname); 1666 if (error) 1667 goto out; 1668 1669 /* 1670 * When doing a remount, we simply refresh our temporary properties 1671 * according to those options set in the current VFS options. 1672 */ 1673 if (uap->flags & MS_REMOUNT) { 1674 /* refresh mount options */ 1675 zfs_unregister_callbacks(vfsp->vfs_data); 1676 error = zfs_register_callbacks(vfsp); 1677 goto out; 1678 } 1679 1680 error = zfs_domount(vfsp, osname); 1681 1682 /* 1683 * Add an extra VFS_HOLD on our parent vfs so that it can't 1684 * disappear due to a forced unmount. 1685 */ 1686 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) 1687 VFS_HOLD(mvp->v_vfsp); 1688 1689 out: 1690 pn_free(&spn); 1691 return (error); 1692 } 1693 1694 static int 1695 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) 1696 { 1697 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1698 dev32_t d32; 1699 uint64_t refdbytes, availbytes, usedobjs, availobjs; 1700 1701 ZFS_ENTER(zfsvfs); 1702 1703 dmu_objset_space(zfsvfs->z_os, 1704 &refdbytes, &availbytes, &usedobjs, &availobjs); 1705 1706 /* 1707 * The underlying storage pool actually uses multiple block sizes. 1708 * We report the fragsize as the smallest block size we support, 1709 * and we report our blocksize as the filesystem's maximum blocksize. 1710 */ 1711 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; 1712 statp->f_bsize = zfsvfs->z_max_blksz; 1713 1714 /* 1715 * The following report "total" blocks of various kinds in the 1716 * file system, but reported in terms of f_frsize - the 1717 * "fragment" size. 1718 */ 1719 1720 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 1721 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT; 1722 statp->f_bavail = statp->f_bfree; /* no root reservation */ 1723 1724 /* 1725 * statvfs() should really be called statufs(), because it assumes 1726 * static metadata. ZFS doesn't preallocate files, so the best 1727 * we can do is report the max that could possibly fit in f_files, 1728 * and that minus the number actually used in f_ffree. 1729 * For f_ffree, report the smaller of the number of object available 1730 * and the number of blocks (each object will take at least a block). 
1731 */ 1732 statp->f_ffree = MIN(availobjs, statp->f_bfree); 1733 statp->f_favail = statp->f_ffree; /* no "root reservation" */ 1734 statp->f_files = statp->f_ffree + usedobjs; 1735 1736 (void) cmpldev(&d32, vfsp->vfs_dev); 1737 statp->f_fsid = d32; 1738 1739 /* 1740 * We're a zfs filesystem. 1741 */ 1742 (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name); 1743 1744 statp->f_flag = vf_to_stf(vfsp->vfs_flag); 1745 1746 statp->f_namemax = ZFS_MAXNAMELEN; 1747 1748 /* 1749 * We have all of 32 characters to stuff a string here. 1750 * Is there anything useful we could/should provide? 1751 */ 1752 bzero(statp->f_fstr, sizeof (statp->f_fstr)); 1753 1754 ZFS_EXIT(zfsvfs); 1755 return (0); 1756 } 1757 1758 static int 1759 zfs_root(vfs_t *vfsp, vnode_t **vpp) 1760 { 1761 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1762 znode_t *rootzp; 1763 int error; 1764 1765 ZFS_ENTER(zfsvfs); 1766 1767 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 1768 if (error == 0) 1769 *vpp = ZTOV(rootzp); 1770 1771 ZFS_EXIT(zfsvfs); 1772 return (error); 1773 } 1774 1775 /* 1776 * Teardown the zfsvfs::z_os. 1777 * 1778 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' 1779 * and 'z_teardown_inactive_lock' held. 1780 */ 1781 static int 1782 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 1783 { 1784 znode_t *zp; 1785 1786 rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 1787 1788 if (!unmounting) { 1789 /* 1790 * We purge the parent filesystem's vfsp as the parent 1791 * filesystem and all of its snapshots have their vnode's 1792 * v_vfsp set to the parent's filesystem's vfsp. Note, 1793 * 'z_parent' is self referential for non-snapshots. 1794 */ 1795 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1796 } 1797 1798 /* 1799 * Close the zil. NB: Can't close the zil while zfs_inactive 1800 * threads are blocked as zil_close can call zfs_inactive. 1801 */ 1802 if (zfsvfs->z_log) { 1803 zil_close(zfsvfs->z_log); 1804 zfsvfs->z_log = NULL; 1805 } 1806 1807 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); 1808 1809 /* 1810 * If we are not unmounting (ie: online recv) and someone already 1811 * unmounted this file system while we were doing the switcheroo, 1812 * or a reopen of z_os failed then just bail out now. 1813 */ 1814 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 1815 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1816 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 1817 return (SET_ERROR(EIO)); 1818 } 1819 1820 /* 1821 * At this point there are no vops active, and any new vops will 1822 * fail with EIO since we have z_teardown_lock for writer (only 1823 * relavent for forced unmount). 1824 * 1825 * Release all holds on dbufs. 1826 */ 1827 mutex_enter(&zfsvfs->z_znodes_lock); 1828 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 1829 zp = list_next(&zfsvfs->z_all_znodes, zp)) 1830 if (zp->z_sa_hdl) { 1831 ASSERT(ZTOV(zp)->v_count > 0); 1832 zfs_znode_dmu_fini(zp); 1833 } 1834 mutex_exit(&zfsvfs->z_znodes_lock); 1835 1836 /* 1837 * If we are unmounting, set the unmounted flag and let new vops 1838 * unblock. zfs_inactive will have the unmounted behavior, and all 1839 * other vops will fail with EIO. 
1840 */ 1841 if (unmounting) { 1842 zfsvfs->z_unmounted = B_TRUE; 1843 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 1844 rw_exit(&zfsvfs->z_teardown_inactive_lock); 1845 } 1846 1847 /* 1848 * z_os will be NULL if there was an error in attempting to reopen 1849 * zfsvfs, so just return as the properties had already been 1850 * unregistered and cached data had been evicted before. 1851 */ 1852 if (zfsvfs->z_os == NULL) 1853 return (0); 1854 1855 /* 1856 * Unregister properties. 1857 */ 1858 zfs_unregister_callbacks(zfsvfs); 1859 1860 /* 1861 * Evict cached data 1862 */ 1863 if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && 1864 !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) 1865 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 1866 dmu_objset_evict_dbufs(zfsvfs->z_os); 1867 1868 return (0); 1869 } 1870 1871 /*ARGSUSED*/ 1872 static int 1873 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) 1874 { 1875 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1876 objset_t *os; 1877 int ret; 1878 1879 ret = secpolicy_fs_unmount(cr, vfsp); 1880 if (ret) { 1881 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), 1882 ZFS_DELEG_PERM_MOUNT, cr)) 1883 return (ret); 1884 } 1885 1886 /* 1887 * We purge the parent filesystem's vfsp as the parent filesystem 1888 * and all of its snapshots have their vnode's v_vfsp set to the 1889 * parent's filesystem's vfsp. Note, 'z_parent' is self 1890 * referential for non-snapshots. 1891 */ 1892 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 1893 1894 /* 1895 * Unmount any snapshots mounted under .zfs before unmounting the 1896 * dataset itself. 1897 */ 1898 if (zfsvfs->z_ctldir != NULL && 1899 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) { 1900 return (ret); 1901 } 1902 1903 if (!(fflag & MS_FORCE)) { 1904 /* 1905 * Check the number of active vnodes in the file system. 1906 * Our count is maintained in the vfs structure, but the 1907 * number is off by 1 to indicate a hold on the vfs 1908 * structure itself. 1909 * 1910 * The '.zfs' directory maintains a reference of its 1911 * own, and any active references underneath are 1912 * reflected in the vnode count. 1913 */ 1914 if (zfsvfs->z_ctldir == NULL) { 1915 if (vfsp->vfs_count > 1) 1916 return (SET_ERROR(EBUSY)); 1917 } else { 1918 if (vfsp->vfs_count > 2 || 1919 zfsvfs->z_ctldir->v_count > 1) 1920 return (SET_ERROR(EBUSY)); 1921 } 1922 } 1923 1924 vfsp->vfs_flag |= VFS_UNMOUNTED; 1925 1926 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); 1927 os = zfsvfs->z_os; 1928 1929 /* 1930 * z_os will be NULL if there was an error in 1931 * attempting to reopen zfsvfs. 1932 */ 1933 if (os != NULL) { 1934 /* 1935 * Unset the objset user_ptr. 1936 */ 1937 mutex_enter(&os->os_user_ptr_lock); 1938 dmu_objset_set_user(os, NULL); 1939 mutex_exit(&os->os_user_ptr_lock); 1940 1941 /* 1942 * Finally release the objset 1943 */ 1944 dmu_objset_disown(os, zfsvfs); 1945 } 1946 1947 /* 1948 * We can now safely destroy the '.zfs' directory node. 

/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * We purge the parent filesystem's vfsp as the parent filesystem
	 * and all of its snapshots have their vnode's v_vfsp set to the
	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
	 * referential for non-snapshots.
	 */
	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL &&
	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
		return (ret);
	}

	if (!(fflag & MS_FORCE)) {
		/*
		 * Check the number of active vnodes in the file system.
		 * Our count is maintained in the vfs structure, but the
		 * number is off by 1 to indicate a hold on the vfs
		 * structure itself.
		 *
		 * The '.zfs' directory maintains a reference of its
		 * own, and any active references underneath are
		 * reflected in the vnode count.
		 */
		if (zfsvfs->z_ctldir == NULL) {
			if (vfsp->vfs_count > 1)
				return (SET_ERROR(EBUSY));
		} else {
			if (vfsp->vfs_count > 2 ||
			    zfsvfs->z_ctldir->v_count > 1)
				return (SET_ERROR(EBUSY));
		}
	}

	vfsp->vfs_flag |= VFS_UNMOUNTED;

	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset.
		 */
		dmu_objset_disown(os, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);

	return (0);
}

static int
zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *zp;
	uint64_t object = 0;
	uint64_t fid_gen = 0;
	uint64_t gen_mask;
	uint64_t zp_gen;
	int i, err;

	*vpp = NULL;

	ZFS_ENTER(zfsvfs);

	if (fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t *zlfid = (zfid_long_t *)fidp;
		uint64_t objsetid = 0;
		uint64_t setgen = 0;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		ZFS_EXIT(zfsvfs);

		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (SET_ERROR(EINVAL));
		ZFS_ENTER(zfsvfs);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t *zfid = (zfid_short_t *)fidp;

		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/* A zero fid_gen means we are in the .zfs control directories */
	if (fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
		*vpp = zfsvfs->z_ctldir;
		ASSERT(*vpp != NULL);
		if (object == ZFSCTL_INO_SNAPDIR) {
			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
			    0, NULL, NULL, NULL, NULL, NULL) == 0);
		} else {
			VN_HOLD(*vpp);
		}
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	gen_mask = -1ULL >> (64 - 8 * i);

	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
	if (err = zfs_zget(zfsvfs, object, &zp)) {
		ZFS_EXIT(zfsvfs);
		return (err);
	}
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (uint64_t));
	zp_gen = zp_gen & gen_mask;
	if (zp_gen == 0)
		zp_gen = 1;
	if (zp->z_unlinked || zp_gen != fid_gen) {
		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
		VN_RELE(ZTOV(zp));
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	*vpp = ZTOV(zp);
	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Block out VOPs and close zfsvfs_t::z_os.
 *
 * Note, if successful, then we return with the 'z_teardown_lock' and
 * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
 * dataset and objset intact so that they can be atomically handed off during
 * a subsequent rollback or recv operation and the resume thereafter.
 */
int
zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
	int error;

	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
		return (error);

	return (0);
}
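
/*
 * Sketch of the intended suspend/resume usage (error handling omitted;
 * the real call sites are in the rollback and receive ioctl paths):
 *
 *	if (zfs_suspend_fs(zfsvfs) == 0) {
 *		... roll back or receive into the suspended dataset ...
 *		(void) zfs_resume_fs(zfsvfs, osname);
 *	}
 *
 * zfs_resume_fs() always drops the two teardown locks again, and if it
 * cannot rebuild the SA layout it attempts a forced unmount instead.
 */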

/*
 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
 * is an invariant across any of the operations that can be performed while the
 * filesystem is suspended.  Whether it succeeded or failed, the preconditions
 * are the same: the relevant objset and associated dataset are owned by
 * zfsvfs, held, and long held on entry.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
{
	int err;
	znode_t *zp;
	uint64_t sa_obj = 0;

	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));

	/*
	 * We already own this, so just hold and rele it to update the
	 * objset_t, as the one we had before may have been evicted.
	 */
	VERIFY0(dmu_objset_hold(osname, zfsvfs, &zfsvfs->z_os));
	VERIFY3P(zfsvfs->z_os->os_dsl_dataset->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(zfsvfs->z_os->os_dsl_dataset));
	dmu_objset_rele(zfsvfs->z_os, zfsvfs);

	/*
	 * Make sure version hasn't changed.
	 */

	err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
	    &zfsvfs->z_version);

	if (err)
		goto bail;

	err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
	    ZFS_SA_ATTRS, 8, 1, &sa_obj);

	if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
		goto bail;

	if ((err = sa_setup(zfsvfs->z_os, sa_obj,
	    zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0)
		goto bail;

	if (zfsvfs->z_version >= ZPL_VERSION_SA)
		sa_register_update_callback(zfsvfs->z_os,
		    zfs_sa_upgrade);

	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);

	if (err) {
		/*
		 * Since we couldn't set up the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
	}
	return (err);
}

static void
zfs_freevfs(vfs_t *vfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	/*
	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
	 * from zfs_mount().  Release it here.  If we came through
	 * zfs_mountroot() instead, we didn't grab an extra hold, so
	 * skip the VFS_RELE for rootvfs.
	 */
	if (zfsvfs->z_issnap && (vfsp != rootvfs))
		VFS_RELE(zfsvfs->z_parent->z_vfs);

	zfsvfs_free(zfsvfs);

	atomic_dec_32(&zfs_active_fs_count);
}
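
/*
 * zfs_freevfs() runs from the VFS_FREEVFS() hook, which the generic VFS
 * layer is expected to invoke once the final reference on the vfs_t is
 * released after unmount; at that point the only thing left to undo is
 * the zfsvfs_t allocation (and the parent hold for snapshots, above).
 */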

/*
 * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
 * so we can't safely do any non-idempotent initialization here.
 * Leave that to zfs_init() and zfs_fini(), which are called
 * from the module's _init() and _fini() entry points.
 */
/*ARGSUSED*/
static int
zfs_vfsinit(int fstype, char *name)
{
	int error;

	zfsfstype = fstype;

	/*
	 * Set up the vfsops and vnodeops tables.
	 */
	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
	if (error != 0) {
		cmn_err(CE_WARN, "zfs: bad vfs ops template");
	}

	error = zfs_create_op_tables();
	if (error) {
		zfs_remove_op_tables();
		cmn_err(CE_WARN, "zfs: bad vnode ops template");
		(void) vfs_freevfsops_by_type(zfsfstype);
		return (error);
	}

	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Unique major number for all zfs mounts.
	 * If we run out of 32-bit minors, we'll getudev() another major.
	 */
	zfs_major = ddi_name_to_major(ZFS_DRIVER);
	zfs_minor = ZFS_MIN_MINOR;

	return (0);
}

void
zfs_init(void)
{
	/*
	 * Initialize .zfs directory structures.
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
}

void
zfs_fini(void)
{
	zfsctl_fini();
	zfs_znode_fini();
}

int
zfs_busy(void)
{
	return (zfs_active_fs_count != 0);
}

int
zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
{
	int error;
	objset_t *os = zfsvfs->z_os;
	dmu_tx_t *tx;

	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
		return (SET_ERROR(EINVAL));

	if (newvers < zfsvfs->z_version)
		return (SET_ERROR(EINVAL));

	if (zfs_spa_version_map(newvers) >
	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
		return (SET_ERROR(ENOTSUP));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
		    ZFS_SA_ATTRS);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
	    8, 1, &newvers, tx);

	if (error) {
		dmu_tx_commit(tx);
		return (error);
	}

	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		uint64_t sa_obj;

		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
		    SPA_VERSION_SA);
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);

		error = zap_add(os, MASTER_NODE_OBJ,
		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT0(error);

		VERIFY(0 == sa_set_sa_object(os, sa_obj));
		sa_register_update_callback(os, zfs_sa_upgrade);
	}

	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
	    "from %llu to %llu", zfsvfs->z_version, newvers);

	dmu_tx_commit(tx);

	zfsvfs->z_version = newvers;

	zfs_set_fuid_feature(zfsvfs);

	return (0);
}
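
/*
 * For context: zfs_set_version() backs ZPL version upgrades requested
 * from userland, e.g. (illustrative command only):
 *
 *	# zfs upgrade -V 5 tank/fs
 *
 * As the checks above enforce, the version can only move forward, and
 * only as far as the pool's SPA version allows (EINVAL and ENOTSUP
 * otherwise).
 */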

/*
 * Read a property stored within the master node.
 */
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
	const char *pname;
	int error = ENOENT;

	/*
	 * Look up the file system's value for the property.  For the
	 * version property, we look up a slightly different string.
	 */
	if (prop == ZFS_PROP_VERSION)
		pname = ZPL_VERSION_STR;
	else
		pname = zfs_prop_to_name(prop);

	if (os != NULL)
		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);

	if (error == ENOENT) {
		/* No value set, use the default value */
		switch (prop) {
		case ZFS_PROP_VERSION:
			*value = ZPL_VERSION;
			break;
		case ZFS_PROP_NORMALIZE:
		case ZFS_PROP_UTF8ONLY:
			*value = 0;
			break;
		case ZFS_PROP_CASE:
			*value = ZFS_CASE_SENSITIVE;
			break;
		default:
			return (error);
		}
		error = 0;
	}
	return (error);
}

static vfsdef_t vfw = {
	VFSDEF_VERSION,
	MNTTYPE_ZFS,
	zfs_vfsinit,
	VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
	    VSW_XID|VSW_ZMOUNT,
	&zfs_mntopts
};

struct modlfs zfs_modlfs = {
	&mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
};
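
/*
 * zfs_modlfs is not installed from this file; the expectation is that
 * the module's _init() entry point (in zfs_ioctl.c) lists it in the
 * module linkage, so that loading the zfs module registers MNTTYPE_ZFS
 * and causes zfs_vfsinit() above to run.
 */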