/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/pathname.h>
#include <sys/acl.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/mntent.h>
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include "fs/fs_subr.h"
#include <sys/zfs_znode.h>
#include <sys/zil.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dsl_prop.h>
#include <sys/spa.h>
#include <sys/zap.h>
#include <sys/varargs.h>
#include <sys/policy.h>
#include <sys/atomic.h>
#include <sys/mkdev.h>
#include <sys/modctl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_ctldir.h>
#include <sys/bootconf.h>
#include <sys/sunddi.h>
#include <sys/dnlc.h>

/* Filesystem type number assigned to ZFS by vfs_setfsops() registration. */
int zfsfstype;
vfsops_t *zfs_vfsops = NULL;
/* Current major/minor used to fabricate unique vfs_dev values for mounts. */
static major_t zfs_major;
static minor_t zfs_minor;
/* Protects zfs_major/zfs_minor during unique-device allocation. */
static kmutex_t zfs_dev_mtx;

/* Boot pool/dataset path copied from /etc/system early in startup. */
extern char zfs_bootpath[BO_MAXOBJNAME];

static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
static void zfs_freevfs(vfs_t *vfsp);
static void zfs_objset_close(zfsvfs_t *zfsvfs);

/* Normal vfsops table for healthy, mounted filesystems. */
static const fs_operation_def_t zfs_vfsops_template[] = {
	VFSNAME_MOUNT, zfs_mount,
	VFSNAME_MOUNTROOT, zfs_mountroot,
	VFSNAME_UNMOUNT, zfs_umount,
	VFSNAME_ROOT, zfs_root,
	VFSNAME_STATVFS, zfs_statvfs,
	VFSNAME_SYNC, (fs_generic_func_p) zfs_sync,
	VFSNAME_VGET, zfs_vget,
	VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs,
	NULL, NULL
};

/*
 * Degraded vfsops table: after a forced unmount only freevfs is
 * still meaningful; everything else errors out (EIO).
 */
static const fs_operation_def_t zfs_vfsops_eio_template[] = {
	VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs,
	NULL, NULL
};

/*
 * We need to keep a count of active fs's.
 * This is necessary to prevent our module
 * from being unloaded after a umount -f
 */
static uint32_t zfs_active_fs_count = 0;

/* Mutually-exclusive mount option pairs: setting one cancels the other. */
static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };

static mntopt_t mntopts[] = {
	{ MNTOPT_XATTR, NULL, NULL, MO_NODISPLAY|MO_DEFAULT, NULL },
	{ MNTOPT_NOATIME, noatime_cancel, NULL, MO_DEFAULT, NULL },
	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
};

static mntopts_t zfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	mntopts
};

/*
 * Sync one ZFS filesystem (vfsp != NULL) or all pools (vfsp == NULL).
 * Never writes during panic, and treats SYNC_ATTR as a no-op since the
 * DMU already syncs txgs continuously.
 */
/*ARGSUSED*/
int
zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
{
	/*
	 * Data integrity is job one.  We don't want a compromised kernel
	 * writing to the storage pool, so we never sync during panic.
	 */
	if (panicstr)
		return (0);

	/*
	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
	 * to sync metadata, which they would otherwise cache indefinitely.
	 * Semantically, the only requirement is that the sync be initiated.
	 * The DMU syncs out txgs frequently, so there's nothing to do.
	 */
	if (flag & SYNC_ATTR)
		return (0);

	if (vfsp != NULL) {
		/*
		 * Sync a specific filesystem.
		 */
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		ZFS_ENTER(zfsvfs);
		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, UINT64_MAX, FSYNC);
		else
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
		ZFS_EXIT(zfsvfs);
	} else {
		/*
		 * Sync all ZFS filesystems.  This is what happens when you
		 * run sync(1M).  Unlike other filesystems, ZFS honors the
		 * request by waiting for all pools to commit all dirty data.
		 */
		spa_sync_allpools();
	}

	return (0);
}

/*
 * Allocate a dev_t not currently used by any mounted filesystem.
 * Scans minors under the current zfs_major; if the minor space is
 * exhausted, getudev()s a fresh major and starts over.
 * Returns 0 on success, -1 if no new major could be obtained.
 */
static int
zfs_create_unique_device(dev_t *dev)
{
	major_t new_major;

	do {
		ASSERT3U(zfs_minor, <=, MAXMIN32);
		/*
		 * NOTE(review): zfs_minor is read here without holding
		 * zfs_dev_mtx; presumably benign since a stale 'start'
		 * only affects loop termination — confirm.
		 */
		minor_t start = zfs_minor;
		do {
			mutex_enter(&zfs_dev_mtx);
			if (zfs_minor >= MAXMIN32) {
				/*
				 * If we're still using the real major
				 * keep out of /dev/zfs and /dev/zvol minor
				 * number space.  If we're using a getudev()'ed
				 * major number, we can use all of its minors.
				 */
				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
					zfs_minor = ZFS_MIN_MINOR;
				else
					zfs_minor = 0;
			} else {
				zfs_minor++;
			}
			*dev = makedevice(zfs_major, zfs_minor);
			mutex_exit(&zfs_dev_mtx);
		} while (vfs_devismounted(*dev) && zfs_minor != start);
		if (zfs_minor == start) {
			/*
			 * We are using all ~262,000 minor numbers for the
			 * current major number.  Create a new major number.
			 */
			if ((new_major = getudev()) == (major_t)-1) {
				cmn_err(CE_WARN,
				    "zfs_mount: Can't get unique major "
				    "device number.");
				return (-1);
			}
			mutex_enter(&zfs_dev_mtx);
			zfs_major = new_major;
			zfs_minor = 0;

			mutex_exit(&zfs_dev_mtx);
		} else {
			break;
		}
		/* CONSTANTCONDITION */
	} while (1);

	return (0);
}

/*
 * The *_changed_cb functions below are dsl_prop_register() callbacks.
 * Each mirrors a dataset property change into the in-core zfsvfs state
 * and the displayed VFS mount options.
 */

static void
atime_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == TRUE) {
		zfsvfs->z_atime = TRUE;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
	} else {
		zfsvfs->z_atime = FALSE;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
	}
}

static void
blksz_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	/* Clamp out-of-range or non-power-of-2 recordsize to the maximum. */
	if (newval < SPA_MINBLOCKSIZE ||
	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
		newval = SPA_MAXBLOCKSIZE;

	zfsvfs->z_max_blksz = newval;
	zfsvfs->z_vfs->vfs_bsize = newval;
}

static void
readonly_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval) {
		/* XXX locking on vfs_flag? */
		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
		/* Read-only: stop the delete (unlinked-drain) threads. */
		(void) zfs_delete_thread_target(zfsvfs, 0);
	} else {
		/* XXX locking on vfs_flag? */
		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
		/* Read-write: ensure a delete thread is running. */
		(void) zfs_delete_thread_target(zfsvfs, 1);
	}
}

static void
devices_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
	}
}

static void
setuid_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
	}
}

static void
exec_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
	}
}

static void
snapdir_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_show_ctldir = newval;
}

static void
acl_mode_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_mode = newval;
}

static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_inherit = newval;
}

/*
 * Re-apply temporary mount options from the current VFS option string.
 * Used on remount (MS_REMOUNT) and for the root-filesystem remount.
 * Returns 0, or EROFS if "rw" was requested on a snapshot.
 */
static int
zfs_refresh_properties(vfs_t *vfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	/*
	 * Remount operations default to "rw" unless "ro" is explicitly
	 * specified.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
		readonly_changed_cb(zfsvfs, B_TRUE);
	} else {
		if (!dmu_objset_is_snapshot(zfsvfs->z_os))
			readonly_changed_cb(zfsvfs, B_FALSE);
		else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
			return (EROFS);
	}

	/* "nosuid" implies both "nodevices" and "nosetuid". */
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
		devices_changed_cb(zfsvfs, B_FALSE);
		setuid_changed_cb(zfsvfs, B_FALSE);
	} else {
		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
			devices_changed_cb(zfsvfs, B_FALSE);
		else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
			devices_changed_cb(zfsvfs, B_TRUE);

		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
			setuid_changed_cb(zfsvfs, B_FALSE);
		else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
			setuid_changed_cb(zfsvfs, B_TRUE);
	}

	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
		exec_changed_cb(zfsvfs, B_FALSE);
	else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
		exec_changed_cb(zfsvfs, B_TRUE);

	return (0);
}

/*
 * Register dataset property callbacks for this mount, then re-apply any
 * temporary mount-option overrides that registration clobbered.
 * On any registration failure, unregisters everything and returns the
 * first error.
 */
static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	int do_readonly = FALSE, readonly;
	int do_setuid = FALSE, setuid;
	int do_exec = FALSE, exec;
	int do_devices = FALSE, devices;
	int error = 0;

	ASSERT(vfsp);
	zfsvfs = vfsp->vfs_data;
	ASSERT(zfsvfs);
	os = zfsvfs->z_os;

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have.  In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
		devices = B_FALSE;
		setuid = B_FALSE;
		do_devices = B_TRUE;
		do_setuid = B_TRUE;
	} else {
		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
			devices = B_FALSE;
			do_devices = B_TRUE;
		} else if (vfs_optionisset(vfsp,
		    MNTOPT_DEVICES, NULL)) {
			devices = B_TRUE;
			do_devices = B_TRUE;
		}

		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
			setuid = B_FALSE;
			do_setuid = B_TRUE;
		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
			setuid = B_TRUE;
			do_setuid = B_TRUE;
		}
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 */
	ds = dmu_objset_ds(os);
	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "recordsize", blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "readonly", readonly_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "devices", devices_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "setuid", setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "exec", exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "snapdir", snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "aclmode", acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
	if (error)
		goto unregister;

	/*
	 * Invoke our callbacks to restore temporary mount options.
	 */
	if (do_readonly)
		readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
		setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
		exec_changed_cb(zfsvfs, exec);
	if (do_devices)
		devices_changed_cb(zfsvfs, devices);

	return (0);

unregister:
	/*
	 * We may attempt to unregister some callbacks that are not
	 * registered, but this is OK; it will simply return ENOMSG,
	 * which we will ignore.
	 */
	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
	    zfsvfs);
	return (error);

}

/*
 * Core mount path shared by zfs_mount() and zfs_mountroot().
 * Allocates and initializes the zfsvfs, opens the objset named by
 * 'osname', replays the intent log (unless mounting a snapshot), and
 * creates the '.zfs' control directory.  On failure all partially
 * initialized state is torn down.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
{
	dev_t mount_dev;
	uint64_t recordsize, readonly;
	int error = 0;
	int mode;
	zfsvfs_t *zfsvfs;
	znode_t *zp = NULL;

	ASSERT(vfsp);
	ASSERT(osname);

	/*
	 * Initialize the zfs-specific filesystem structure.
	 * Should probably make this a kmem cache, shuffle fields,
	 * and just bzero up to z_hold_mtx[].
	 */
	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_vfs = vfsp;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_assign = TXG_NOWAIT;
	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));
	rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);

	/* Initialize the generic filesystem structure. */
	vfsp->vfs_bcount = 0;
	vfsp->vfs_data = NULL;

	if (zfs_create_unique_device(&mount_dev) == -1) {
		error = ENODEV;
		goto out;
	}
	ASSERT(vfs_devismounted(mount_dev) == 0);

	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
	    NULL))
		goto out;

	vfsp->vfs_dev = mount_dev;
	vfsp->vfs_fstype = zfsfstype;
	vfsp->vfs_bsize = recordsize;
	vfsp->vfs_flag |= VFS_NOTRUNC;
	vfsp->vfs_data = zfsvfs;

	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
		goto out;

	if (readonly)
		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
	else
		mode = DS_MODE_PRIMARY;

	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
	if (error == EROFS) {
		/* Pool is read-only; retry the open read-only. */
		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
		    &zfsvfs->z_os);
	}

	if (error)
		goto out;

	if (error = zfs_init_fs(zfsvfs, &zp, cr))
		goto out;

	/* The call to zfs_init_fs leaves the vnode held, release it here. */
	VN_RELE(ZTOV(zp));

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		/* Snapshots mount read-only with atime off; no ZIL replay. */
		ASSERT(mode & DS_MODE_READONLY);
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		zfsvfs->z_issnap = B_TRUE;
	} else {
		error = zfs_register_callbacks(vfsp);
		if (error)
			goto out;

		/*
		 * Start a delete thread running.
		 */
		(void) zfs_delete_thread_target(zfsvfs, 1);

		/*
		 * Parse and replay the intent log.
		 */
		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
		    zfs_replay_vector, (void (*)(void *))zfs_delete_wait_empty);

		if (!zil_disable)
			zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
	}

	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	if (error) {
		if (zfsvfs->z_os)
			dmu_objset_close(zfsvfs->z_os);
		kmem_free(zfsvfs, sizeof (zfsvfs_t));
	} else {
		atomic_add_32(&zfs_active_fs_count, 1);
	}

	return (error);

}

void
zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
{
	objset_t *os = zfsvfs->z_os;
	struct dsl_dataset *ds;

	/*
	 * Unregister properties.
	 */
	if (!dmu_objset_is_snapshot(os)) {
		ds = dmu_objset_ds(os);
		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "aclinherit",
		    acl_inherit_changed_cb, zfsvfs) == 0);
	}
}

/*
 * Mount/remount/unmount the root filesystem.  Handles ROOT_INIT (first
 * mount of the boot dataset named by zfs_bootpath), ROOT_REMOUNT
 * (read-only -> read-write transition), and ROOT_UNMOUNT (shutdown sync);
 * anything else returns ENOTSUP.
 */
static int
zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
{
	int error = 0;
	int ret = 0;
	static int zfsrootdone = 0;
	zfsvfs_t *zfsvfs = NULL;
	znode_t *zp = NULL;
	vnode_t *vp = NULL;

	ASSERT(vfsp);

	/*
	 * The filesystem that we mount as root is defined in
	 * /etc/system using the zfsroot variable.  The value defined
	 * there is copied early in startup code to zfs_bootpath
	 * (defined in modsysfile.c).
	 */
	if (why == ROOT_INIT) {
		if (zfsrootdone++)
			return (EBUSY);

		/*
		 * This needs to be done here, so that when we return from
		 * mountroot, the vfs resource name will be set correctly.
		 */
		if (snprintf(rootfs.bo_name, BO_MAXOBJNAME, "%s", zfs_bootpath)
		    >= BO_MAXOBJNAME)
			return (ENAMETOOLONG);

		if (error = vfs_lock(vfsp))
			return (error);

		if (error = zfs_domount(vfsp, zfs_bootpath, CRED()))
			goto out;

		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
		ASSERT(zfsvfs);
		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp))
			goto out;

		vp = ZTOV(zp);
		mutex_enter(&vp->v_lock);
		vp->v_flag |= VROOT;
		mutex_exit(&vp->v_lock);
		rootvp = vp;

		/*
		 * The zfs_zget call above returns with a hold on vp, we release
		 * it here.
		 */
		VN_RELE(vp);

		/*
		 * Mount root as readonly initially, it will be remounted
		 * read/write by /lib/svc/method/fs-usr.
		 */
		readonly_changed_cb(vfsp->vfs_data, B_TRUE);
		vfs_add((struct vnode *)0, vfsp,
		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
out:
		vfs_unlock(vfsp);
		ret = (error) ? error : 0;
		return (ret);

	} else if (why == ROOT_REMOUNT) {

		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
		vfsp->vfs_flag |= VFS_REMOUNT;
		return (zfs_refresh_properties(vfsp));

	} else if (why == ROOT_UNMOUNT) {
		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
		(void) zfs_sync(vfsp, 0, 0);
		return (0);
	}

	/*
	 * if "why" is equal to anything else other than ROOT_INIT,
	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
	 */
	return (ENOTSUP);
}

/*
 * VFS_MOUNT entry point.  Validates the mount point and flags, enforces
 * zone visibility, then hands off to zfs_domount().  MS_REMOUNT only
 * refreshes temporary properties.
 */
/*ARGSUSED*/
static int
zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
	char *osname;
	pathname_t spn;
	int error = 0;
	uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
	    UIO_SYSSPACE : UIO_USERSPACE;
	int canwrite;

	if (mvp->v_type != VDIR)
		return (ENOTDIR);

	mutex_enter(&mvp->v_lock);
	if ((uap->flags & MS_REMOUNT) == 0 &&
	    (uap->flags & MS_OVERLAY) == 0 &&
	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
		mutex_exit(&mvp->v_lock);
		return (EBUSY);
	}
	mutex_exit(&mvp->v_lock);

	/*
	 * ZFS does not support passing unparsed data in via MS_DATA.
	 * Users should use the MS_OPTIONSTR interface; this means
	 * that all option parsing is already done and the options struct
	 * can be interrogated.
	 */
	if ((uap->flags & MS_DATA) && uap->datalen > 0)
		return (EINVAL);

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (uap->flags & MS_REMOUNT) {
		return (zfs_refresh_properties(vfsp));
	}

	/*
	 * Get the objset name (the "special" mount argument).
	 */
	if (error = pn_get(uap->spec, fromspace, &spn))
		return (error);

	osname = spn.pn_path;

	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
		goto out;

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		error = EPERM;
		goto out;
	}

	error = zfs_domount(vfsp, osname, cr);

out:
	pn_free(&spn);
	return (error);
}

/*
 * VFS_STATVFS entry point: report space and object counts for this
 * filesystem in statvfs64 form.
 */
static int
zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	dmu_objset_stats_t dstats;
	dev32_t d32;

	ZFS_ENTER(zfsvfs);

	dmu_objset_stats(zfsvfs->z_os, &dstats);

	/*
	 * The underlying storage pool actually uses multiple block sizes.
	 * We report the fragsize as the smallest block size we support,
	 * and we report our blocksize as the filesystem's maximum blocksize.
	 */
	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
	statp->f_bsize = zfsvfs->z_max_blksz;

	/*
	 * The following report "total" blocks of various kinds in the
	 * file system, but reported in terms of f_frsize - the
	 * "fragment" size.
	 */

	statp->f_blocks =
	    (dstats.dds_space_refd + dstats.dds_available) >> SPA_MINBLOCKSHIFT;
	statp->f_bfree = dstats.dds_available >> SPA_MINBLOCKSHIFT;
	statp->f_bavail = statp->f_bfree; /* no root reservation */

	/*
	 * statvfs() should really be called statufs(), because it assumes
	 * static metadata.  ZFS doesn't preallocate files, so the best
	 * we can do is report the max that could possibly fit in f_files,
	 * and that minus the number actually used in f_ffree.
	 * For f_ffree, report the smaller of the number of object available
	 * and the number of blocks (each object will take at least a block).
	 */
	statp->f_ffree = MIN(dstats.dds_objects_avail, statp->f_bfree);
	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
	statp->f_files = statp->f_ffree + dstats.dds_objects_used;

	(void) cmpldev(&d32, vfsp->vfs_dev);
	statp->f_fsid = d32;

	/*
	 * We're a zfs filesystem.
	 */
	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);

	statp->f_flag = vf_to_stf(vfsp->vfs_flag);

	statp->f_namemax = ZFS_MAXNAMELEN;

	/*
	 * We have all of 32 characters to stuff a string here.
	 * Is there anything useful we could/should provide?
	 */
	bzero(statp->f_fstr, sizeof (statp->f_fstr));

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * VFS_ROOT entry point: return a held vnode for the filesystem root.
 */
static int
zfs_root(vfs_t *vfsp, vnode_t **vpp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *rootzp;
	int error;

	ZFS_ENTER(zfsvfs);

	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
	if (error == 0)
		*vpp = ZTOV(rootzp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * VFS_UNMOUNT entry point.  Handles both normal unmount (fails EBUSY if
 * vnodes are still active) and forced unmount (MS_FORCE: waits for all
 * in-flight vops to drain via z_op_cnt, then tears the objset down).
 */
/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	int ret;

	if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
		return (ret);


	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL &&
	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
		return (ret);

	if (fflag & MS_FORCE) {
		vfsp->vfs_flag |= VFS_UNMOUNTED;
		zfsvfs->z_unmounted1 = B_TRUE;

		/*
		 * Wait for all zfs threads to leave zfs.
		 * Grabbing a rwlock as reader in all vops and
		 * as writer here doesn't work because it too easy to get
		 * multiple reader enters as zfs can re-enter itself.
		 * This can lead to deadlock if there is an intervening
		 * rw_enter as writer.
		 * So a file system threads ref count (z_op_cnt) is used.
		 * A polling loop on z_op_cnt may seem inefficient, but
		 * - this saves all threads on exit from having to grab a
		 *   mutex in order to cv_signal
		 * - only occurs on forced unmount in the rare case when
		 *   there are outstanding threads within the file system.
		 */
		while (zfsvfs->z_op_cnt) {
			delay(1);
		}

		zfs_objset_close(zfsvfs);

		return (0);
	}
	/*
	 * Stop all delete threads.
	 */
	(void) zfs_delete_thread_target(zfsvfs, 0);

	/*
	 * Check the number of active vnodes in the file system.
	 * Our count is maintained in the vfs structure, but the number
	 * is off by 1 to indicate a hold on the vfs structure itself.
	 *
	 * The '.zfs' directory maintains a reference of its own, and any active
	 * references underneath are reflected in the vnode count.
	 */
	if (zfsvfs->z_ctldir == NULL) {
		if (vfsp->vfs_count > 1) {
			/* Busy: restart the delete thread before failing. */
			if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0)
				(void) zfs_delete_thread_target(zfsvfs, 1);
			return (EBUSY);
		}
	} else {
		if (vfsp->vfs_count > 2 ||
		    (zfsvfs->z_ctldir->v_count > 1 && !(fflag & MS_FORCE))) {
			if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0)
				(void) zfs_delete_thread_target(zfsvfs, 1);
			return (EBUSY);
		}
	}

	vfsp->vfs_flag |= VFS_UNMOUNTED;
	zfs_objset_close(zfsvfs);

	return (0);
}

/*
 * VFS_VGET entry point: translate an NFS-style file identifier into a
 * held vnode.  Long fids additionally encode an objset id and route
 * through the '.zfs' control directory for snapshot lookups.
 */
static int
zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *zp;
	uint64_t object = 0;
	uint64_t fid_gen = 0;
	uint64_t gen_mask;
	uint64_t zp_gen;
	int i, err;

	*vpp = NULL;

	ZFS_ENTER(zfsvfs);

	if (fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t *zlfid = (zfid_long_t *)fidp;
		uint64_t objsetid = 0;
		uint64_t setgen = 0;

		/* Reassemble objset id and generation from the byte arrays. */
		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		ZFS_EXIT(zfsvfs);

		/* May swap zfsvfs to the snapshot's own zfsvfs. */
		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (EINVAL);
		ZFS_ENTER(zfsvfs);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t *zfid = (zfid_short_t *)fidp;

		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/* A zero fid_gen means we are in the .zfs control directories */
	if (fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
		*vpp = zfsvfs->z_ctldir;
		ASSERT(*vpp != NULL);
		if (object == ZFSCTL_INO_SNAPDIR) {
			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
			    0, NULL, NULL) == 0);
		} else {
			VN_HOLD(*vpp);
		}
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/* 'i' is left at sizeof (zf_gen) by the loop above. */
	gen_mask = -1ULL >> (64 - 8 * i);

	/*
	 * NOTE(review): fid_gen and zp_gen are uint64_t but are printed
	 * with %u in these dprintf()s — format/argument mismatch; confirm
	 * against the dprintf implementation.
	 */
	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
	if (err = zfs_zget(zfsvfs, object, &zp)) {
		ZFS_EXIT(zfsvfs);
		return (err);
	}
	zp_gen = zp->z_phys->zp_gen & gen_mask;
	if (zp_gen == 0)
		zp_gen = 1;
	if (zp->z_reap || zp_gen != fid_gen) {
		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
		VN_RELE(ZTOV(zp));
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	*vpp = ZTOV(zp);
	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Tear down the objset for an unmounting filesystem: stop delete
 * threads, release held znodes/dbufs, unregister property callbacks,
 * close the ZIL, evict dbufs, close the objset, and destroy '.zfs'.
 * The ordering here is delicate — see the inline comments.
 */
static void
zfs_objset_close(zfsvfs_t *zfsvfs)
{
	zfs_delete_t *zd = &zfsvfs->z_delete_head;
	znode_t *zp, *nextzp;
	objset_t *os = zfsvfs->z_os;

	/*
	 * Stop all delete threads.
	 */
	(void) zfs_delete_thread_target(zfsvfs, 0);

	/*
	 * For forced unmount, at this point all vops except zfs_inactive
	 * are erroring EIO.  We need to now suspend zfs_inactive threads
	 * while we are freeing dbufs before switching zfs_inactive
	 * to use behaviour without a objset.
	 */
	rw_enter(&zfsvfs->z_um_lock, RW_WRITER);

	/*
	 * Release all delete in progress znodes
	 * They will be processed when the file system remounts.
	 */
	mutex_enter(&zd->z_mutex);
	while (zp = list_head(&zd->z_znodes)) {
		list_remove(&zd->z_znodes, zp);
		zp->z_dbuf_held = 0;
		dmu_buf_rele(zp->z_dbuf, NULL);
	}
	mutex_exit(&zd->z_mutex);

	/*
	 * Release all holds on dbufs
	 * Note, although we have stopped all other vop threads and
	 * zfs_inactive(), the dmu can callback via znode_pageout_func()
	 * which can zfs_znode_free() the znode.
	 * So we lock z_all_znodes; search the list for a held
	 * dbuf; drop the lock (we know zp can't disappear if we hold
	 * a dbuf lock; then regrab the lock and restart.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
		nextzp = list_next(&zfsvfs->z_all_znodes, zp);
		if (zp->z_dbuf_held) {
			/* dbufs should only be held when force unmounting */
			zp->z_dbuf_held = 0;
			mutex_exit(&zfsvfs->z_znodes_lock);
			dmu_buf_rele(zp->z_dbuf, NULL);
			/* Start again */
			mutex_enter(&zfsvfs->z_znodes_lock);
			nextzp = list_head(&zfsvfs->z_all_znodes);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * Unregister properties.
	 */
	if (!dmu_objset_is_snapshot(os))
		zfs_unregister_callbacks(zfsvfs);

	/*
	 * Switch zfs_inactive to behaviour without an objset.
	 * It just tosses cached pages and frees the znode & vnode.
	 * Then re-enable zfs_inactive threads in that new behaviour.
	 */
	zfsvfs->z_unmounted2 = B_TRUE;
	rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */

	/*
	 * Close the zil.  Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	/*
	 * Evict all dbufs so that cached znodes will be freed
	 */
	if (dmu_objset_evict_dbufs(os, 1)) {
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
		(void) dmu_objset_evict_dbufs(os, 0);
	}

	/*
	 * Finally close the objset
	 */
	dmu_objset_close(os);

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);

}

/*
 * VFS_FREEVFS entry point: free the zfsvfs and drop the active-fs count
 * that keeps this module loaded.
 */
static void
zfs_freevfs(vfs_t *vfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	kmem_free(zfsvfs, sizeof (zfsvfs_t));

	atomic_add_32(&zfs_active_fs_count, -1);
}

/*
 * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
 * so we can't safely do any non-idempotent initialization here.
 * Leave that to zfs_init() and zfs_fini(), which are called
 * from the module's _init() and _fini() entry points.
 */
/*ARGSUSED*/
static int
zfs_vfsinit(int fstype, char *name)
{
	int error;

	zfsfstype = fstype;

	/*
	 * Setup vfsops and vnodeops tables.
	 */
	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
	if (error != 0) {
		cmn_err(CE_WARN, "zfs: bad vfs ops template");
	}

	error = zfs_create_op_tables();
	if (error) {
		zfs_remove_op_tables();
		cmn_err(CE_WARN, "zfs: bad vnode ops template");
		(void) vfs_freevfsops_by_type(zfsfstype);
		return (error);
	}

	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Unique major number for all zfs mounts.
	 * If we run out of 32-bit minors, we'll getudev() another major.
	 */
	zfs_major = ddi_name_to_major(ZFS_DRIVER);
	zfs_minor = ZFS_MIN_MINOR;

	return (0);
}

void
zfs_init(void)
{
	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();
}

void
zfs_fini(void)
{
	zfsctl_fini();
	zfs_znode_fini();
}

/*
 * Report whether any ZFS filesystem is mounted; nonzero blocks module
 * unload.
 */
int
zfs_busy(void)
{
	return (zfs_active_fs_count != 0);
}

/* Module registration: filesystem switch entry and modlfs linkage. */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	MNTTYPE_ZFS,
	zfs_vfsinit,
	VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS,
	&zfs_mntopts
};

struct modlfs zfs_modlfs = {
	&mod_fsops, "ZFS filesystem version 1", &vfw
};