1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 #include <sys/types.h> 43 #include <sys/t_lock.h> 44 #include <sys/param.h> 45 #include <sys/errno.h> 46 #include <sys/user.h> 47 #include <sys/fstyp.h> 48 #include <sys/kmem.h> 49 #include <sys/systm.h> 50 #include <sys/proc.h> 51 #include <sys/mount.h> 52 #include <sys/vfs.h> 53 #include <sys/vfs_opreg.h> 54 #include <sys/fem.h> 55 #include <sys/mntent.h> 56 #include <sys/stat.h> 57 #include <sys/statvfs.h> 58 #include <sys/statfs.h> 59 #include <sys/cred.h> 60 #include <sys/vnode.h> 61 #include <sys/rwstlock.h> 62 #include <sys/dnlc.h> 63 #include <sys/file.h> 64 #include <sys/time.h> 65 #include <sys/atomic.h> 66 #include <sys/cmn_err.h> 67 #include <sys/buf.h> 68 #include <sys/swap.h> 69 #include <sys/debug.h> 70 #include <sys/vnode.h> 71 #include <sys/modctl.h> 72 #include <sys/ddi.h> 73 #include <sys/pathname.h> 74 #include <sys/bootconf.h> 75 #include <sys/dumphdr.h> 76 #include <sys/dc_ki.h> 77 #include <sys/poll.h> 78 #include <sys/sunddi.h> 79 #include <sys/sysmacros.h> 80 #include <sys/zone.h> 81 #include <sys/policy.h> 82 #include <sys/ctfs.h> 83 #include <sys/objfs.h> 84 #include <sys/console.h> 85 #include <sys/reboot.h> 86 #include <sys/attr.h> 87 #include <sys/spa.h> 88 89 #include <vm/page.h> 90 91 #include <fs/fs_subr.h> 92 93 /* Private interfaces to create vopstats-related data structures */ 94 extern void initialize_vopstats(vopstats_t *); 95 extern vopstats_t *get_fstype_vopstats(struct vfs *, struct vfssw *); 96 extern vsk_anchor_t *get_vskstat_anchor(struct vfs *); 97 98 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int); 99 static void vfs_setmntopt_nolock(mntopts_t *, const char *, 100 const char *, int, int); 101 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **); 102 static void vfs_freemnttab(struct vfs *); 103 static void vfs_freeopt(mntopt_t *); 104 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *); 105 static void vfs_swapopttbl(mntopts_t *, mntopts_t *); 106 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int); 107 static void vfs_createopttbl_extend(mntopts_t *, const char *, 108 const mntopts_t *); 109 static char **vfs_copycancelopt_extend(char **const, int); 110 static void vfs_freecancelopt(char **); 111 static void getrootfs(char **, char **); 112 static int getmacpath(dev_info_t *, void *); 113 static void vfs_mnttabvp_setup(void); 114 115 struct ipmnt { 116 struct ipmnt *mip_next; 117 dev_t mip_dev; 118 struct vfs *mip_vfsp; 119 }; 120 121 static kmutex_t vfs_miplist_mutex; 122 static struct ipmnt *vfs_miplist = NULL; 123 static struct ipmnt *vfs_miplist_end = NULL; 124 125 static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */ 126 127 /* 128 * VFS global data. 129 */ 130 vnode_t *rootdir; /* pointer to root inode vnode. */ 131 vnode_t *devicesdir; /* pointer to inode of devices root */ 132 vnode_t *devdir; /* pointer to inode of dev root */ 133 134 char *server_rootpath; /* root path for diskless clients */ 135 char *server_hostname; /* hostname of diskless server */ 136 137 static struct vfs root; 138 static struct vfs devices; 139 static struct vfs dev; 140 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */ 141 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */ 142 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */ 143 /* must be power of 2! */ 144 timespec_t vfs_mnttab_ctime; /* mnttab created time */ 145 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */ 146 char *vfs_dummyfstype = "\0"; 147 struct pollhead vfs_pollhd; /* for mnttab pollers */ 148 struct vnode *vfs_mntdummyvp; /* to fake mnttab read/write for file events */ 149 int mntfstype; /* will be set once mnt fs is mounted */ 150 151 /* 152 * Table for generic options recognized in the VFS layer and acted 153 * on at this level before parsing file system specific options. 154 * The nosuid option is stronger than any of the devices and setuid 155 * options, so those are canceled when nosuid is seen. 156 * 157 * All options which are added here need to be added to the 158 * list of standard options in usr/src/cmd/fs.d/fslib.c as well. 159 */ 160 /* 161 * VFS Mount options table 162 */ 163 static char *ro_cancel[] = { MNTOPT_RW, NULL }; 164 static char *rw_cancel[] = { MNTOPT_RO, NULL }; 165 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL }; 166 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES, 167 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL }; 168 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL }; 169 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL }; 170 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL }; 171 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL }; 172 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL }; 173 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL }; 174 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL }; 175 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL }; 176 177 static const mntopt_t mntopts[] = { 178 /* 179 * option name cancel options default arg flags 180 */ 181 { MNTOPT_REMOUNT, NULL, NULL, 182 MO_NODISPLAY, (void *)0 }, 183 { MNTOPT_RO, ro_cancel, NULL, 0, 184 (void *)0 }, 185 { MNTOPT_RW, rw_cancel, NULL, 0, 186 (void *)0 }, 187 { MNTOPT_SUID, suid_cancel, NULL, 0, 188 (void *)0 }, 189 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0, 190 (void *)0 }, 191 { MNTOPT_DEVICES, devices_cancel, NULL, 0, 192 (void *)0 }, 193 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0, 194 (void *)0 }, 195 { MNTOPT_SETUID, setuid_cancel, NULL, 0, 196 (void *)0 }, 197 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0, 198 (void *)0 }, 199 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0, 200 (void *)0 }, 201 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0, 202 (void *)0 }, 203 { MNTOPT_EXEC, exec_cancel, NULL, 0, 204 (void *)0 }, 205 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0, 206 (void *)0 }, 207 }; 208 209 const mntopts_t vfs_mntopts = { 210 sizeof (mntopts) / sizeof (mntopt_t), 211 (mntopt_t *)&mntopts[0] 212 }; 213 214 /* 215 * File system operation dispatch functions. 216 */ 217 218 int 219 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 220 { 221 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr); 222 } 223 224 int 225 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr) 226 { 227 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr); 228 } 229 230 int 231 fsop_root(vfs_t *vfsp, vnode_t **vpp) 232 { 233 refstr_t *mntpt; 234 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp); 235 /* 236 * Make sure this root has a path. With lofs, it is possible to have 237 * a NULL mountpoint. 238 */ 239 if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { 240 mntpt = vfs_getmntpoint(vfsp); 241 vn_setpath_str(*vpp, refstr_value(mntpt), 242 strlen(refstr_value(mntpt))); 243 refstr_rele(mntpt); 244 } 245 246 return (ret); 247 } 248 249 int 250 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp) 251 { 252 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp); 253 } 254 255 int 256 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr) 257 { 258 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr); 259 } 260 261 int 262 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 263 { 264 /* 265 * In order to handle system attribute fids in a manner 266 * transparent to the underlying fs, we embed the fid for 267 * the sysattr parent object in the sysattr fid and tack on 268 * some extra bytes that only the sysattr layer knows about. 269 * 270 * This guarantees that sysattr fids are larger than other fids 271 * for this vfs. If the vfs supports sysattrs (implied 272 * by VFSFT_XVATTR support), we cannot have a size collision 273 * with XATTR_FIDSZ. 274 */ 275 if (vfs_has_feature(vfsp, VFSFT_XVATTR) && 276 fidp->fid_len == XATTR_FIDSZ) 277 return (xattr_dir_vget(vfsp, vpp, fidp)); 278 279 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp); 280 } 281 282 int 283 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason) 284 { 285 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason); 286 } 287 288 void 289 fsop_freefs(vfs_t *vfsp) 290 { 291 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp); 292 } 293 294 int 295 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate) 296 { 297 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate)); 298 } 299 300 int 301 fsop_sync_by_kind(int fstype, short flag, cred_t *cr) 302 { 303 ASSERT((fstype >= 0) && (fstype < nfstype)); 304 305 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype])) 306 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr); 307 else 308 return (ENOTSUP); 309 } 310 311 /* 312 * File system initialization. vfs_setfsops() must be called from a file 313 * system's init routine. 314 */ 315 316 static int 317 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual, 318 int *unused_ops) 319 { 320 static const fs_operation_trans_def_t vfs_ops_table[] = { 321 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount), 322 fs_nosys, fs_nosys, 323 324 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount), 325 fs_nosys, fs_nosys, 326 327 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root), 328 fs_nosys, fs_nosys, 329 330 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs), 331 fs_nosys, fs_nosys, 332 333 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync), 334 (fs_generic_func_p) fs_sync, 335 (fs_generic_func_p) fs_sync, /* No errors allowed */ 336 337 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget), 338 fs_nosys, fs_nosys, 339 340 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot), 341 fs_nosys, fs_nosys, 342 343 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs), 344 (fs_generic_func_p)fs_freevfs, 345 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */ 346 347 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate), 348 (fs_generic_func_p)fs_nosys, 349 (fs_generic_func_p)fs_nosys, 350 351 NULL, 0, NULL, NULL 352 }; 353 354 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template)); 355 } 356 357 void 358 zfs_boot_init() { 359 360 if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0) 361 spa_boot_init(); 362 } 363 364 int 365 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual) 366 { 367 int error; 368 int unused_ops; 369 370 /* 371 * Verify that fstype refers to a valid fs. Note that 372 * 0 is valid since it's used to set "stray" ops. 373 */ 374 if ((fstype < 0) || (fstype >= nfstype)) 375 return (EINVAL); 376 377 if (!ALLOCATED_VFSSW(&vfssw[fstype])) 378 return (EINVAL); 379 380 /* Set up the operations vector. */ 381 382 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops); 383 384 if (error != 0) 385 return (error); 386 387 vfssw[fstype].vsw_flag |= VSW_INSTALLED; 388 389 if (actual != NULL) 390 *actual = &vfssw[fstype].vsw_vfsops; 391 392 #if DEBUG 393 if (unused_ops != 0) 394 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied " 395 "but not used", vfssw[fstype].vsw_name, unused_ops); 396 #endif 397 398 return (0); 399 } 400 401 int 402 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual) 403 { 404 int error; 405 int unused_ops; 406 407 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP); 408 409 error = fs_copyfsops(template, *actual, &unused_ops); 410 if (error != 0) { 411 kmem_free(*actual, sizeof (vfsops_t)); 412 *actual = NULL; 413 return (error); 414 } 415 416 return (0); 417 } 418 419 /* 420 * Free a vfsops structure created as a result of vfs_makefsops(). 421 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use 422 * vfs_freevfsops_by_type(). 423 */ 424 void 425 vfs_freevfsops(vfsops_t *vfsops) 426 { 427 kmem_free(vfsops, sizeof (vfsops_t)); 428 } 429 430 /* 431 * Since the vfsops structure is part of the vfssw table and wasn't 432 * really allocated, we're not really freeing anything. We keep 433 * the name for consistency with vfs_freevfsops(). We do, however, 434 * need to take care of a little bookkeeping. 435 * NOTE: For a vfsops structure created by vfs_setfsops(), use 436 * vfs_freevfsops_by_type(). 437 */ 438 int 439 vfs_freevfsops_by_type(int fstype) 440 { 441 442 /* Verify that fstype refers to a loaded fs (and not fsid 0). */ 443 if ((fstype <= 0) || (fstype >= nfstype)) 444 return (EINVAL); 445 446 WLOCK_VFSSW(); 447 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) { 448 WUNLOCK_VFSSW(); 449 return (EINVAL); 450 } 451 452 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED; 453 WUNLOCK_VFSSW(); 454 455 return (0); 456 } 457 458 /* Support routines used to reference vfs_op */ 459 460 /* Set the operations vector for a vfs */ 461 void 462 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops) 463 { 464 vfsops_t *op; 465 466 ASSERT(vfsp != NULL); 467 ASSERT(vfsops != NULL); 468 469 op = vfsp->vfs_op; 470 membar_consumer(); 471 if (vfsp->vfs_femhead == NULL && 472 casptr(&vfsp->vfs_op, op, vfsops) == op) { 473 return; 474 } 475 fsem_setvfsops(vfsp, vfsops); 476 } 477 478 /* Retrieve the operations vector for a vfs */ 479 vfsops_t * 480 vfs_getops(vfs_t *vfsp) 481 { 482 vfsops_t *op; 483 484 ASSERT(vfsp != NULL); 485 486 op = vfsp->vfs_op; 487 membar_consumer(); 488 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) { 489 return (op); 490 } else { 491 return (fsem_getvfsops(vfsp)); 492 } 493 } 494 495 /* 496 * Returns non-zero (1) if the vfsops matches that of the vfs. 497 * Returns zero (0) if not. 498 */ 499 int 500 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops) 501 { 502 return (vfs_getops(vfsp) == vfsops); 503 } 504 505 /* 506 * Returns non-zero (1) if the file system has installed a non-default, 507 * non-error vfs_sync routine. Returns zero (0) otherwise. 508 */ 509 int 510 vfs_can_sync(vfs_t *vfsp) 511 { 512 /* vfs_sync() routine is not the default/error function */ 513 return (vfs_getops(vfsp)->vfs_sync != fs_sync); 514 } 515 516 /* 517 * Initialize a vfs structure. 518 */ 519 void 520 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data) 521 { 522 /* Other initialization has been moved to vfs_alloc() */ 523 vfsp->vfs_count = 0; 524 vfsp->vfs_next = vfsp; 525 vfsp->vfs_prev = vfsp; 526 vfsp->vfs_zone_next = vfsp; 527 vfsp->vfs_zone_prev = vfsp; 528 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL); 529 vfsimpl_setup(vfsp); 530 vfsp->vfs_data = (data); 531 vfs_setops((vfsp), (op)); 532 } 533 534 /* 535 * Allocate and initialize the vfs implementation private data 536 * structure, vfs_impl_t. 537 */ 538 void 539 vfsimpl_setup(vfs_t *vfsp) 540 { 541 int i; 542 543 if (vfsp->vfs_implp != NULL) { 544 return; 545 } 546 547 vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP); 548 /* Note that these are #define'd in vfs.h */ 549 vfsp->vfs_vskap = NULL; 550 vfsp->vfs_fstypevsp = NULL; 551 552 /* Set size of counted array, then zero the array */ 553 vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1; 554 for (i = 1; i < VFS_FEATURE_MAXSZ; i++) { 555 vfsp->vfs_featureset[i] = 0; 556 } 557 } 558 559 /* 560 * Release the vfs_impl_t structure, if it exists. Some unbundled 561 * filesystems may not use the newer version of vfs and thus 562 * would not contain this implementation private data structure. 563 */ 564 void 565 vfsimpl_teardown(vfs_t *vfsp) 566 { 567 vfs_impl_t *vip = vfsp->vfs_implp; 568 569 if (vip == NULL) 570 return; 571 572 kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t)); 573 vfsp->vfs_implp = NULL; 574 } 575 576 /* 577 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs, 578 * fstatvfs, and sysfs moved to common/syscall. 579 */ 580 581 /* 582 * Update every mounted file system. We call the vfs_sync operation of 583 * each file system type, passing it a NULL vfsp to indicate that all 584 * mounted file systems of that type should be updated. 585 */ 586 void 587 vfs_sync(int flag) 588 { 589 struct vfssw *vswp; 590 RLOCK_VFSSW(); 591 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 592 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 593 vfs_refvfssw(vswp); 594 RUNLOCK_VFSSW(); 595 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag, 596 CRED()); 597 vfs_unrefvfssw(vswp); 598 RLOCK_VFSSW(); 599 } 600 } 601 RUNLOCK_VFSSW(); 602 } 603 604 void 605 sync(void) 606 { 607 vfs_sync(0); 608 } 609 610 /* 611 * External routines. 612 */ 613 614 krwlock_t vfssw_lock; /* lock accesses to vfssw */ 615 616 /* 617 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(), 618 * but otherwise should be accessed only via vfs_list_lock() and 619 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list. 620 */ 621 static krwlock_t vfslist; 622 623 /* 624 * Mount devfs on /devices. This is done right after root is mounted 625 * to provide device access support for the system 626 */ 627 static void 628 vfs_mountdevices(void) 629 { 630 struct vfssw *vsw; 631 struct vnode *mvp; 632 struct mounta mounta = { /* fake mounta for devfs_mount() */ 633 NULL, 634 NULL, 635 MS_SYSSPACE, 636 NULL, 637 NULL, 638 0, 639 NULL, 640 0 641 }; 642 643 /* 644 * _init devfs module to fill in the vfssw 645 */ 646 if (modload("fs", "devfs") == -1) 647 panic("Cannot _init devfs module"); 648 649 /* 650 * Hold vfs 651 */ 652 RLOCK_VFSSW(); 653 vsw = vfs_getvfsswbyname("devfs"); 654 VFS_INIT(&devices, &vsw->vsw_vfsops, NULL); 655 VFS_HOLD(&devices); 656 657 /* 658 * Locate mount point 659 */ 660 if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 661 panic("Cannot find /devices"); 662 663 /* 664 * Perform the mount of /devices 665 */ 666 if (VFS_MOUNT(&devices, mvp, &mounta, CRED())) 667 panic("Cannot mount /devices"); 668 669 RUNLOCK_VFSSW(); 670 671 /* 672 * Set appropriate members and add to vfs list for mnttab display 673 */ 674 vfs_setresource(&devices, "/devices"); 675 vfs_setmntpoint(&devices, "/devices"); 676 677 /* 678 * Hold the root of /devices so it won't go away 679 */ 680 if (VFS_ROOT(&devices, &devicesdir)) 681 panic("vfs_mountdevices: not devices root"); 682 683 if (vfs_lock(&devices) != 0) { 684 VN_RELE(devicesdir); 685 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices"); 686 return; 687 } 688 689 if (vn_vfswlock(mvp) != 0) { 690 vfs_unlock(&devices); 691 VN_RELE(devicesdir); 692 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices"); 693 return; 694 } 695 696 vfs_add(mvp, &devices, 0); 697 vn_vfsunlock(mvp); 698 vfs_unlock(&devices); 699 VN_RELE(devicesdir); 700 } 701 702 /* 703 * mount the first instance of /dev to root and remain mounted 704 */ 705 static void 706 vfs_mountdev1(void) 707 { 708 struct vfssw *vsw; 709 struct vnode *mvp; 710 struct mounta mounta = { /* fake mounta for sdev_mount() */ 711 NULL, 712 NULL, 713 MS_SYSSPACE | MS_OVERLAY, 714 NULL, 715 NULL, 716 0, 717 NULL, 718 0 719 }; 720 721 /* 722 * _init dev module to fill in the vfssw 723 */ 724 if (modload("fs", "dev") == -1) 725 cmn_err(CE_PANIC, "Cannot _init dev module\n"); 726 727 /* 728 * Hold vfs 729 */ 730 RLOCK_VFSSW(); 731 vsw = vfs_getvfsswbyname("dev"); 732 VFS_INIT(&dev, &vsw->vsw_vfsops, NULL); 733 VFS_HOLD(&dev); 734 735 /* 736 * Locate mount point 737 */ 738 if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 739 cmn_err(CE_PANIC, "Cannot find /dev\n"); 740 741 /* 742 * Perform the mount of /dev 743 */ 744 if (VFS_MOUNT(&dev, mvp, &mounta, CRED())) 745 cmn_err(CE_PANIC, "Cannot mount /dev 1\n"); 746 747 RUNLOCK_VFSSW(); 748 749 /* 750 * Set appropriate members and add to vfs list for mnttab display 751 */ 752 vfs_setresource(&dev, "/dev"); 753 vfs_setmntpoint(&dev, "/dev"); 754 755 /* 756 * Hold the root of /dev so it won't go away 757 */ 758 if (VFS_ROOT(&dev, &devdir)) 759 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root"); 760 761 if (vfs_lock(&dev) != 0) { 762 VN_RELE(devdir); 763 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev"); 764 return; 765 } 766 767 if (vn_vfswlock(mvp) != 0) { 768 vfs_unlock(&dev); 769 VN_RELE(devdir); 770 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev"); 771 return; 772 } 773 774 vfs_add(mvp, &dev, 0); 775 vn_vfsunlock(mvp); 776 vfs_unlock(&dev); 777 VN_RELE(devdir); 778 } 779 780 /* 781 * Mount required filesystem. This is done right after root is mounted. 782 */ 783 static void 784 vfs_mountfs(char *module, char *spec, char *path) 785 { 786 struct vnode *mvp; 787 struct mounta mounta; 788 vfs_t *vfsp; 789 790 mounta.flags = MS_SYSSPACE | MS_DATA; 791 mounta.fstype = module; 792 mounta.spec = spec; 793 mounta.dir = path; 794 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) { 795 cmn_err(CE_WARN, "Cannot find %s", path); 796 return; 797 } 798 if (domount(NULL, &mounta, mvp, CRED(), &vfsp)) 799 cmn_err(CE_WARN, "Cannot mount %s", path); 800 else 801 VFS_RELE(vfsp); 802 VN_RELE(mvp); 803 } 804 805 /* 806 * vfs_mountroot is called by main() to mount the root filesystem. 807 */ 808 void 809 vfs_mountroot(void) 810 { 811 struct vnode *rvp = NULL; 812 char *path; 813 size_t plen; 814 struct vfssw *vswp; 815 816 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL); 817 rw_init(&vfslist, NULL, RW_DEFAULT, NULL); 818 819 /* 820 * Alloc the vfs hash bucket array and locks 821 */ 822 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP); 823 824 /* 825 * Call machine-dependent routine "rootconf" to choose a root 826 * file system type. 827 */ 828 if (rootconf()) 829 panic("vfs_mountroot: cannot mount root"); 830 /* 831 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir 832 * to point to it. These are used by lookuppn() so that it 833 * knows where to start from ('/' or '.'). 834 */ 835 vfs_setmntpoint(rootvfs, "/"); 836 if (VFS_ROOT(rootvfs, &rootdir)) 837 panic("vfs_mountroot: no root vnode"); 838 PTOU(curproc)->u_cdir = rootdir; 839 VN_HOLD(PTOU(curproc)->u_cdir); 840 PTOU(curproc)->u_rdir = NULL; 841 842 /* 843 * Setup the global zone's rootvp, now that it exists. 844 */ 845 global_zone->zone_rootvp = rootdir; 846 VN_HOLD(global_zone->zone_rootvp); 847 848 /* 849 * Notify the module code that it can begin using the 850 * root filesystem instead of the boot program's services. 851 */ 852 modrootloaded = 1; 853 854 /* 855 * Special handling for a ZFS root file system. 856 */ 857 zfs_boot_init(); 858 859 /* 860 * Set up mnttab information for root 861 */ 862 vfs_setresource(rootvfs, rootfs.bo_name); 863 864 /* 865 * Notify cluster software that the root filesystem is available. 866 */ 867 clboot_mountroot(); 868 869 /* Now that we're all done with the root FS, set up its vopstats */ 870 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) { 871 /* Set flag for statistics collection */ 872 if (vswp->vsw_flag & VSW_STATS) { 873 initialize_vopstats(&rootvfs->vfs_vopstats); 874 rootvfs->vfs_flag |= VFS_STATS; 875 rootvfs->vfs_fstypevsp = 876 get_fstype_vopstats(rootvfs, vswp); 877 rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs); 878 } 879 vfs_unrefvfssw(vswp); 880 } 881 882 /* 883 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab, 884 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc. 885 */ 886 vfs_mountdevices(); 887 vfs_mountdev1(); 888 889 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT); 890 vfs_mountfs("proc", "/proc", "/proc"); 891 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); 892 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); 893 vfs_mountfs("objfs", "objfs", OBJFS_ROOT); 894 895 if (getzoneid() == GLOBAL_ZONEID) { 896 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab"); 897 } 898 899 #ifdef __sparc 900 /* 901 * This bit of magic can go away when we convert sparc to 902 * the new boot architecture based on ramdisk. 903 * 904 * Booting off a mirrored root volume: 905 * At this point, we have booted and mounted root on a 906 * single component of the mirror. Complete the boot 907 * by configuring SVM and converting the root to the 908 * dev_t of the mirrored root device. This dev_t conversion 909 * only works because the underlying device doesn't change. 910 */ 911 if (root_is_svm) { 912 if (svm_rootconf()) { 913 panic("vfs_mountroot: cannot remount root"); 914 } 915 916 /* 917 * mnttab should reflect the new root device 918 */ 919 vfs_lock_wait(rootvfs); 920 vfs_setresource(rootvfs, rootfs.bo_name); 921 vfs_unlock(rootvfs); 922 } 923 #endif /* __sparc */ 924 925 /* 926 * Look up the root device via devfs so that a dv_node is 927 * created for it. The vnode is never VN_RELE()ed. 928 * We allocate more than MAXPATHLEN so that the 929 * buffer passed to i_ddi_prompath_to_devfspath() is 930 * exactly MAXPATHLEN (the function expects a buffer 931 * of that length). 932 */ 933 plen = strlen("/devices"); 934 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); 935 (void) strcpy(path, "/devices"); 936 937 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) 938 != DDI_SUCCESS || 939 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { 940 941 /* NUL terminate in case "path" has garbage */ 942 path[plen + MAXPATHLEN - 1] = '\0'; 943 #ifdef DEBUG 944 cmn_err(CE_WARN, "!Cannot lookup root device: %s", path); 945 #endif 946 } 947 kmem_free(path, plen + MAXPATHLEN); 948 vfs_mnttabvp_setup(); 949 } 950 951 /* 952 * If remount failed and we're in a zone we need to check for the zone 953 * root path and strip it before the call to vfs_setpath(). 954 * 955 * If strpath doesn't begin with the zone_rootpath the original 956 * strpath is returned unchanged. 957 */ 958 static const char * 959 stripzonepath(const char *strpath) 960 { 961 char *str1, *str2; 962 int i; 963 zone_t *zonep = curproc->p_zone; 964 965 if (zonep->zone_rootpath == NULL || strpath == NULL) { 966 return (NULL); 967 } 968 969 /* 970 * we check for the end of the string at one past the 971 * current position because the zone_rootpath always 972 * ends with "/" but we don't want to strip that off. 973 */ 974 str1 = zonep->zone_rootpath; 975 str2 = (char *)strpath; 976 ASSERT(str1[0] != '\0'); 977 for (i = 0; str1[i + 1] != '\0'; i++) { 978 if (str1[i] != str2[i]) 979 return ((char *)strpath); 980 } 981 return (&str2[i]); 982 } 983 984 /* 985 * Common mount code. Called from the system call entry point, from autofs, 986 * nfsv4 trigger mounts, and from pxfs. 987 * 988 * Takes the effective file system type, mount arguments, the mount point 989 * vnode, flags specifying whether the mount is a remount and whether it 990 * should be entered into the vfs list, and credentials. Fills in its vfspp 991 * parameter with the mounted file system instance's vfs. 992 * 993 * Note that the effective file system type is specified as a string. It may 994 * be null, in which case it's determined from the mount arguments, and may 995 * differ from the type specified in the mount arguments; this is a hook to 996 * allow interposition when instantiating file system instances. 997 * 998 * The caller is responsible for releasing its own hold on the mount point 999 * vp (this routine does its own hold when necessary). 1000 * Also note that for remounts, the mount point vp should be the vnode for 1001 * the root of the file system rather than the vnode that the file system 1002 * is mounted on top of. 1003 */ 1004 int 1005 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, 1006 struct vfs **vfspp) 1007 { 1008 struct vfssw *vswp; 1009 vfsops_t *vfsops; 1010 struct vfs *vfsp; 1011 struct vnode *bvp; 1012 dev_t bdev = 0; 1013 mntopts_t mnt_mntopts; 1014 int error = 0; 1015 int copyout_error = 0; 1016 int ovflags; 1017 char *opts = uap->optptr; 1018 char *inargs = opts; 1019 int optlen = uap->optlen; 1020 int remount; 1021 int rdonly; 1022 int nbmand = 0; 1023 int delmip = 0; 1024 int addmip = 0; 1025 int splice = ((uap->flags & MS_NOSPLICE) == 0); 1026 int fromspace = (uap->flags & MS_SYSSPACE) ? 1027 UIO_SYSSPACE : UIO_USERSPACE; 1028 char *resource = NULL, *mountpt = NULL; 1029 refstr_t *oldresource, *oldmntpt; 1030 struct pathname pn, rpn; 1031 vsk_anchor_t *vskap; 1032 1033 /* 1034 * The v_flag value for the mount point vp is permanently set 1035 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine 1036 * for mount point locking. 1037 */ 1038 mutex_enter(&vp->v_lock); 1039 vp->v_flag |= VVFSLOCK; 1040 mutex_exit(&vp->v_lock); 1041 1042 mnt_mntopts.mo_count = 0; 1043 /* 1044 * Find the ops vector to use to invoke the file system-specific mount 1045 * method. If the fsname argument is non-NULL, use it directly. 1046 * Otherwise, dig the file system type information out of the mount 1047 * arguments. 1048 * 1049 * A side effect is to hold the vfssw entry. 1050 * 1051 * Mount arguments can be specified in several ways, which are 1052 * distinguished by flag bit settings. The preferred way is to set 1053 * MS_OPTIONSTR, indicating an 8 argument mount with the file system 1054 * type supplied as a character string and the last two arguments 1055 * being a pointer to a character buffer and the size of the buffer. 1056 * On entry, the buffer holds a null terminated list of options; on 1057 * return, the string is the list of options the file system 1058 * recognized. If MS_DATA is set arguments five and six point to a 1059 * block of binary data which the file system interprets. 1060 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA 1061 * consistently with these conventions. To handle them, we check to 1062 * see whether the pointer to the file system name has a numeric value 1063 * less than 256. If so, we treat it as an index. 1064 */ 1065 if (fsname != NULL) { 1066 if ((vswp = vfs_getvfssw(fsname)) == NULL) { 1067 return (EINVAL); 1068 } 1069 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) { 1070 size_t n; 1071 uint_t fstype; 1072 char name[FSTYPSZ]; 1073 1074 if ((fstype = (uintptr_t)uap->fstype) < 256) { 1075 RLOCK_VFSSW(); 1076 if (fstype == 0 || fstype >= nfstype || 1077 !ALLOCATED_VFSSW(&vfssw[fstype])) { 1078 RUNLOCK_VFSSW(); 1079 return (EINVAL); 1080 } 1081 (void) strcpy(name, vfssw[fstype].vsw_name); 1082 RUNLOCK_VFSSW(); 1083 if ((vswp = vfs_getvfssw(name)) == NULL) 1084 return (EINVAL); 1085 } else { 1086 /* 1087 * Handle either kernel or user address space. 1088 */ 1089 if (uap->flags & MS_SYSSPACE) { 1090 error = copystr(uap->fstype, name, 1091 FSTYPSZ, &n); 1092 } else { 1093 error = copyinstr(uap->fstype, name, 1094 FSTYPSZ, &n); 1095 } 1096 if (error) { 1097 if (error == ENAMETOOLONG) 1098 return (EINVAL); 1099 return (error); 1100 } 1101 if ((vswp = vfs_getvfssw(name)) == NULL) 1102 return (EINVAL); 1103 } 1104 } else { 1105 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL) 1106 return (EINVAL); 1107 } 1108 if (!VFS_INSTALLED(vswp)) 1109 return (EINVAL); 1110 vfsops = &vswp->vsw_vfsops; 1111 1112 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts); 1113 /* 1114 * Fetch mount options and parse them for generic vfs options 1115 */ 1116 if (uap->flags & MS_OPTIONSTR) { 1117 /* 1118 * Limit the buffer size 1119 */ 1120 if (optlen < 0 || optlen > MAX_MNTOPT_STR) { 1121 error = EINVAL; 1122 goto errout; 1123 } 1124 if ((uap->flags & MS_SYSSPACE) == 0) { 1125 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 1126 inargs[0] = '\0'; 1127 if (optlen) { 1128 error = copyinstr(opts, inargs, (size_t)optlen, 1129 NULL); 1130 if (error) { 1131 goto errout; 1132 } 1133 } 1134 } 1135 vfs_parsemntopts(&mnt_mntopts, inargs, 0); 1136 } 1137 /* 1138 * Flag bits override the options string. 1139 */ 1140 if (uap->flags & MS_REMOUNT) 1141 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0); 1142 if (uap->flags & MS_RDONLY) 1143 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0); 1144 if (uap->flags & MS_NOSUID) 1145 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 1146 1147 /* 1148 * Check if this is a remount; must be set in the option string and 1149 * the file system must support a remount option. 1150 */ 1151 if (remount = vfs_optionisset_nolock(&mnt_mntopts, 1152 MNTOPT_REMOUNT, NULL)) { 1153 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) { 1154 error = ENOTSUP; 1155 goto errout; 1156 } 1157 uap->flags |= MS_REMOUNT; 1158 } 1159 1160 /* 1161 * uap->flags and vfs_optionisset() should agree. 1162 */ 1163 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) { 1164 uap->flags |= MS_RDONLY; 1165 } 1166 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) { 1167 uap->flags |= MS_NOSUID; 1168 } 1169 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL); 1170 ASSERT(splice || !remount); 1171 /* 1172 * If we are splicing the fs into the namespace, 1173 * perform mount point checks. 1174 * 1175 * We want to resolve the path for the mount point to eliminate 1176 * '.' and ".." and symlinks in mount points; we can't do the 1177 * same for the resource string, since it would turn 1178 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do 1179 * this before grabbing vn_vfswlock(), because otherwise we 1180 * would deadlock with lookuppn(). 1181 */ 1182 if (splice) { 1183 ASSERT(vp->v_count > 0); 1184 1185 /* 1186 * Pick up mount point and device from appropriate space. 1187 */ 1188 if (pn_get(uap->spec, fromspace, &pn) == 0) { 1189 resource = kmem_alloc(pn.pn_pathlen + 1, 1190 KM_SLEEP); 1191 (void) strcpy(resource, pn.pn_path); 1192 pn_free(&pn); 1193 } 1194 /* 1195 * Do a lookupname prior to taking the 1196 * writelock. Mark this as completed if 1197 * successful for later cleanup and addition to 1198 * the mount in progress table. 1199 */ 1200 if ((uap->flags & MS_GLOBAL) == 0 && 1201 lookupname(uap->spec, fromspace, 1202 FOLLOW, NULL, &bvp) == 0) { 1203 addmip = 1; 1204 } 1205 1206 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) { 1207 pathname_t *pnp; 1208 1209 if (*pn.pn_path != '/') { 1210 error = EINVAL; 1211 pn_free(&pn); 1212 goto errout; 1213 } 1214 pn_alloc(&rpn); 1215 /* 1216 * Kludge to prevent autofs from deadlocking with 1217 * itself when it calls domount(). 1218 * 1219 * If autofs is calling, it is because it is doing 1220 * (autofs) mounts in the process of an NFS mount. A 1221 * lookuppn() here would cause us to block waiting for 1222 * said NFS mount to complete, which can't since this 1223 * is the thread that was supposed to doing it. 1224 */ 1225 if (fromspace == UIO_USERSPACE) { 1226 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL, 1227 NULL)) == 0) { 1228 pnp = &rpn; 1229 } else { 1230 /* 1231 * The file disappeared or otherwise 1232 * became inaccessible since we opened 1233 * it; might as well fail the mount 1234 * since the mount point is no longer 1235 * accessible. 1236 */ 1237 pn_free(&rpn); 1238 pn_free(&pn); 1239 goto errout; 1240 } 1241 } else { 1242 pnp = &pn; 1243 } 1244 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP); 1245 (void) strcpy(mountpt, pnp->pn_path); 1246 1247 /* 1248 * If the addition of the zone's rootpath 1249 * would push us over a total path length 1250 * of MAXPATHLEN, we fail the mount with 1251 * ENAMETOOLONG, which is what we would have 1252 * gotten if we were trying to perform the same 1253 * mount in the global zone. 1254 * 1255 * strlen() doesn't count the trailing 1256 * '\0', but zone_rootpathlen counts both a 1257 * trailing '/' and the terminating '\0'. 1258 */ 1259 if ((curproc->p_zone->zone_rootpathlen - 1 + 1260 strlen(mountpt)) > MAXPATHLEN || 1261 (resource != NULL && 1262 (curproc->p_zone->zone_rootpathlen - 1 + 1263 strlen(resource)) > MAXPATHLEN)) { 1264 error = ENAMETOOLONG; 1265 } 1266 1267 pn_free(&rpn); 1268 pn_free(&pn); 1269 } 1270 1271 if (error) 1272 goto errout; 1273 1274 /* 1275 * Prevent path name resolution from proceeding past 1276 * the mount point. 1277 */ 1278 if (vn_vfswlock(vp) != 0) { 1279 error = EBUSY; 1280 goto errout; 1281 } 1282 1283 /* 1284 * Verify that it's legitimate to establish a mount on 1285 * the prospective mount point. 1286 */ 1287 if (vn_mountedvfs(vp) != NULL) { 1288 /* 1289 * The mount point lock was obtained after some 1290 * other thread raced through and established a mount. 1291 */ 1292 vn_vfsunlock(vp); 1293 error = EBUSY; 1294 goto errout; 1295 } 1296 if (vp->v_flag & VNOMOUNT) { 1297 vn_vfsunlock(vp); 1298 error = EINVAL; 1299 goto errout; 1300 } 1301 } 1302 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) { 1303 uap->dataptr = NULL; 1304 uap->datalen = 0; 1305 } 1306 1307 /* 1308 * If this is a remount, we don't want to create a new VFS. 1309 * Instead, we pass the existing one with a remount flag. 1310 */ 1311 if (remount) { 1312 /* 1313 * Confirm that the mount point is the root vnode of the 1314 * file system that is being remounted. 1315 * This can happen if the user specifies a different 1316 * mount point directory pathname in the (re)mount command. 1317 * 1318 * Code below can only be reached if splice is true, so it's 1319 * safe to do vn_vfsunlock() here. 1320 */ 1321 if ((vp->v_flag & VROOT) == 0) { 1322 vn_vfsunlock(vp); 1323 error = ENOENT; 1324 goto errout; 1325 } 1326 /* 1327 * Disallow making file systems read-only unless file system 1328 * explicitly allows it in its vfssw. Ignore other flags. 1329 */ 1330 if (rdonly && vn_is_readonly(vp) == 0 && 1331 (vswp->vsw_flag & VSW_CANRWRO) == 0) { 1332 vn_vfsunlock(vp); 1333 error = EINVAL; 1334 goto errout; 1335 } 1336 /* 1337 * Disallow changing the NBMAND disposition of the file 1338 * system on remounts. 1339 */ 1340 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) || 1341 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) { 1342 vn_vfsunlock(vp); 1343 error = EINVAL; 1344 goto errout; 1345 } 1346 vfsp = vp->v_vfsp; 1347 ovflags = vfsp->vfs_flag; 1348 vfsp->vfs_flag |= VFS_REMOUNT; 1349 vfsp->vfs_flag &= ~VFS_RDONLY; 1350 } else { 1351 vfsp = vfs_alloc(KM_SLEEP); 1352 VFS_INIT(vfsp, vfsops, NULL); 1353 } 1354 1355 VFS_HOLD(vfsp); 1356 1357 /* 1358 * The vfs_reflock is not used anymore the code below explicitly 1359 * holds it preventing others accesing it directly. 1360 */ 1361 if ((sema_tryp(&vfsp->vfs_reflock) == 0) && 1362 !(vfsp->vfs_flag & VFS_REMOUNT)) 1363 cmn_err(CE_WARN, 1364 "mount type %s couldn't get vfs_reflock", vswp->vsw_name); 1365 1366 /* 1367 * Lock the vfs. If this is a remount we want to avoid spurious umount 1368 * failures that happen as a side-effect of fsflush() and other mount 1369 * and unmount operations that might be going on simultaneously and 1370 * may have locked the vfs currently. To not return EBUSY immediately 1371 * here we use vfs_lock_wait() instead vfs_lock() for the remount case. 1372 */ 1373 if (!remount) { 1374 if (error = vfs_lock(vfsp)) { 1375 vfsp->vfs_flag = ovflags; 1376 if (splice) 1377 vn_vfsunlock(vp); 1378 vfs_free(vfsp); 1379 goto errout; 1380 } 1381 } else { 1382 vfs_lock_wait(vfsp); 1383 } 1384 1385 /* 1386 * Add device to mount in progress table, global mounts require special 1387 * handling. It is possible that we have already done the lookupname 1388 * on a spliced, non-global fs. If so, we don't want to do it again 1389 * since we cannot do a lookupname after taking the 1390 * wlock above. This case is for a non-spliced, non-global filesystem. 1391 */ 1392 if (!addmip) { 1393 if ((uap->flags & MS_GLOBAL) == 0 && 1394 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) { 1395 addmip = 1; 1396 } 1397 } 1398 1399 if (addmip) { 1400 bdev = bvp->v_rdev; 1401 VN_RELE(bvp); 1402 vfs_addmip(bdev, vfsp); 1403 addmip = 0; 1404 delmip = 1; 1405 } 1406 /* 1407 * Invalidate cached entry for the mount point. 1408 */ 1409 if (splice) 1410 dnlc_purge_vp(vp); 1411 1412 /* 1413 * If have an option string but the filesystem doesn't supply a 1414 * prototype options table, create a table with the global 1415 * options and sufficient room to accept all the options in the 1416 * string. Then parse the passed in option string 1417 * accepting all the options in the string. This gives us an 1418 * option table with all the proper cancel properties for the 1419 * global options. 1420 * 1421 * Filesystems that supply a prototype options table are handled 1422 * earlier in this function. 1423 */ 1424 if (uap->flags & MS_OPTIONSTR) { 1425 if (!(vswp->vsw_flag & VSW_HASPROTO)) { 1426 mntopts_t tmp_mntopts; 1427 1428 tmp_mntopts.mo_count = 0; 1429 vfs_createopttbl_extend(&tmp_mntopts, inargs, 1430 &mnt_mntopts); 1431 vfs_parsemntopts(&tmp_mntopts, inargs, 1); 1432 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts); 1433 vfs_freeopttbl(&tmp_mntopts); 1434 } 1435 } 1436 1437 /* 1438 * Serialize with zone creations. 1439 */ 1440 mount_in_progress(); 1441 /* 1442 * Instantiate (or reinstantiate) the file system. If appropriate, 1443 * splice it into the file system name space. 1444 * 1445 * We want VFS_MOUNT() to be able to override the vfs_resource 1446 * string if necessary (ie, mntfs), and also for a remount to 1447 * change the same (necessary when remounting '/' during boot). 1448 * So we set up vfs_mntpt and vfs_resource to what we think they 1449 * should be, then hand off control to VFS_MOUNT() which can 1450 * override this. 1451 * 1452 * For safety's sake, when changing vfs_resource or vfs_mntpt of 1453 * a vfs which is on the vfs list (i.e. during a remount), we must 1454 * never set those fields to NULL. Several bits of code make 1455 * assumptions that the fields are always valid. 1456 */ 1457 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1458 if (remount) { 1459 if ((oldresource = vfsp->vfs_resource) != NULL) 1460 refstr_hold(oldresource); 1461 if ((oldmntpt = vfsp->vfs_mntpt) != NULL) 1462 refstr_hold(oldmntpt); 1463 } 1464 vfs_setresource(vfsp, resource); 1465 vfs_setmntpoint(vfsp, mountpt); 1466 1467 /* 1468 * going to mount on this vnode, so notify. 1469 */ 1470 vnevent_mountedover(vp, NULL); 1471 error = VFS_MOUNT(vfsp, vp, uap, credp); 1472 1473 if (uap->flags & MS_RDONLY) 1474 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1475 if (uap->flags & MS_NOSUID) 1476 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0); 1477 if (uap->flags & MS_GLOBAL) 1478 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0); 1479 1480 if (error) { 1481 if (remount) { 1482 /* put back pre-remount options */ 1483 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1484 vfs_setmntpoint(vfsp, (stripzonepath( 1485 refstr_value(oldmntpt)))); 1486 if (oldmntpt) 1487 refstr_rele(oldmntpt); 1488 vfs_setresource(vfsp, (stripzonepath( 1489 refstr_value(oldresource)))); 1490 if (oldresource) 1491 refstr_rele(oldresource); 1492 vfsp->vfs_flag = ovflags; 1493 vfs_unlock(vfsp); 1494 VFS_RELE(vfsp); 1495 } else { 1496 vfs_unlock(vfsp); 1497 vfs_freemnttab(vfsp); 1498 vfs_free(vfsp); 1499 } 1500 } else { 1501 /* 1502 * Set the mount time to now 1503 */ 1504 vfsp->vfs_mtime = ddi_get_time(); 1505 if (remount) { 1506 vfsp->vfs_flag &= ~VFS_REMOUNT; 1507 if (oldresource) 1508 refstr_rele(oldresource); 1509 if (oldmntpt) 1510 refstr_rele(oldmntpt); 1511 } else if (splice) { 1512 /* 1513 * Link vfsp into the name space at the mount 1514 * point. Vfs_add() is responsible for 1515 * holding the mount point which will be 1516 * released when vfs_remove() is called. 1517 */ 1518 vfs_add(vp, vfsp, uap->flags); 1519 } else { 1520 /* 1521 * Hold the reference to file system which is 1522 * not linked into the name space. 1523 */ 1524 vfsp->vfs_zone = NULL; 1525 VFS_HOLD(vfsp); 1526 vfsp->vfs_vnodecovered = NULL; 1527 } 1528 /* 1529 * Set flags for global options encountered 1530 */ 1531 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 1532 vfsp->vfs_flag |= VFS_RDONLY; 1533 else 1534 vfsp->vfs_flag &= ~VFS_RDONLY; 1535 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 1536 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES); 1537 } else { 1538 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 1539 vfsp->vfs_flag |= VFS_NODEVICES; 1540 else 1541 vfsp->vfs_flag &= ~VFS_NODEVICES; 1542 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 1543 vfsp->vfs_flag |= VFS_NOSETUID; 1544 else 1545 vfsp->vfs_flag &= ~VFS_NOSETUID; 1546 } 1547 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 1548 vfsp->vfs_flag |= VFS_NBMAND; 1549 else 1550 vfsp->vfs_flag &= ~VFS_NBMAND; 1551 1552 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 1553 vfsp->vfs_flag |= VFS_XATTR; 1554 else 1555 vfsp->vfs_flag &= ~VFS_XATTR; 1556 1557 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 1558 vfsp->vfs_flag |= VFS_NOEXEC; 1559 else 1560 vfsp->vfs_flag &= ~VFS_NOEXEC; 1561 1562 /* 1563 * Now construct the output option string of options 1564 * we recognized. 1565 */ 1566 if (uap->flags & MS_OPTIONSTR) { 1567 vfs_list_read_lock(); 1568 copyout_error = vfs_buildoptionstr( 1569 &vfsp->vfs_mntopts, inargs, optlen); 1570 vfs_list_unlock(); 1571 if (copyout_error == 0 && 1572 (uap->flags & MS_SYSSPACE) == 0) { 1573 copyout_error = copyoutstr(inargs, opts, 1574 optlen, NULL); 1575 } 1576 } 1577 1578 /* 1579 * If this isn't a remount, set up the vopstats before 1580 * anyone can touch this. We only allow spliced file 1581 * systems (file systems which are in the namespace) to 1582 * have the VFS_STATS flag set. 1583 * NOTE: PxFS mounts the underlying file system with 1584 * MS_NOSPLICE set and copies those vfs_flags to its private 1585 * vfs structure. As a result, PxFS should never have 1586 * the VFS_STATS flag or else we might access the vfs 1587 * statistics-related fields prior to them being 1588 * properly initialized. 1589 */ 1590 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) { 1591 initialize_vopstats(&vfsp->vfs_vopstats); 1592 /* 1593 * We need to set vfs_vskap to NULL because there's 1594 * a chance it won't be set below. This is checked 1595 * in teardown_vopstats() so we can't have garbage. 1596 */ 1597 vfsp->vfs_vskap = NULL; 1598 vfsp->vfs_flag |= VFS_STATS; 1599 vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp); 1600 } 1601 1602 if (vswp->vsw_flag & VSW_XID) 1603 vfsp->vfs_flag |= VFS_XID; 1604 1605 vfs_unlock(vfsp); 1606 } 1607 mount_completed(); 1608 if (splice) 1609 vn_vfsunlock(vp); 1610 1611 if ((error == 0) && (copyout_error == 0)) { 1612 if (!remount) { 1613 /* 1614 * Don't call get_vskstat_anchor() while holding 1615 * locks since it allocates memory and calls 1616 * VFS_STATVFS(). For NFS, the latter can generate 1617 * an over-the-wire call. 1618 */ 1619 vskap = get_vskstat_anchor(vfsp); 1620 /* Only take the lock if we have something to do */ 1621 if (vskap != NULL) { 1622 vfs_lock_wait(vfsp); 1623 if (vfsp->vfs_flag & VFS_STATS) { 1624 vfsp->vfs_vskap = vskap; 1625 } 1626 vfs_unlock(vfsp); 1627 } 1628 } 1629 /* Return vfsp to caller. */ 1630 *vfspp = vfsp; 1631 } 1632 errout: 1633 vfs_freeopttbl(&mnt_mntopts); 1634 if (resource != NULL) 1635 kmem_free(resource, strlen(resource) + 1); 1636 if (mountpt != NULL) 1637 kmem_free(mountpt, strlen(mountpt) + 1); 1638 /* 1639 * It is possible we errored prior to adding to mount in progress 1640 * table. Must free vnode we acquired with successful lookupname. 1641 */ 1642 if (addmip) 1643 VN_RELE(bvp); 1644 if (delmip) 1645 vfs_delmip(vfsp); 1646 ASSERT(vswp != NULL); 1647 vfs_unrefvfssw(vswp); 1648 if (inargs != opts) 1649 kmem_free(inargs, MAX_MNTOPT_STR); 1650 if (copyout_error) { 1651 VFS_RELE(vfsp); 1652 error = copyout_error; 1653 } 1654 return (error); 1655 } 1656 1657 static void 1658 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath) 1659 { 1660 size_t len; 1661 refstr_t *ref; 1662 zone_t *zone = curproc->p_zone; 1663 char *sp; 1664 int have_list_lock = 0; 1665 1666 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp)); 1667 1668 /* 1669 * New path must be less than MAXPATHLEN because mntfs 1670 * will only display up to MAXPATHLEN bytes. This is currently 1671 * safe, because domount() uses pn_get(), and other callers 1672 * similarly cap the size to fewer than MAXPATHLEN bytes. 1673 */ 1674 1675 ASSERT(strlen(newpath) < MAXPATHLEN); 1676 1677 /* mntfs requires consistency while vfs list lock is held */ 1678 1679 if (VFS_ON_LIST(vfsp)) { 1680 have_list_lock = 1; 1681 vfs_list_lock(); 1682 } 1683 1684 if (*refp != NULL) 1685 refstr_rele(*refp); 1686 1687 /* Do we need to modify the path? */ 1688 1689 if (zone == global_zone || *newpath != '/') { 1690 ref = refstr_alloc(newpath); 1691 goto out; 1692 } 1693 1694 /* 1695 * Truncate the trailing '/' in the zoneroot, and merge 1696 * in the zone's rootpath with the "newpath" (resource 1697 * or mountpoint) passed in. 1698 * 1699 * The size of the required buffer is thus the size of 1700 * the buffer required for the passed-in newpath 1701 * (strlen(newpath) + 1), plus the size of the buffer 1702 * required to hold zone_rootpath (zone_rootpathlen) 1703 * minus one for one of the now-superfluous NUL 1704 * terminations, minus one for the trailing '/'. 1705 * 1706 * That gives us: 1707 * 1708 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1 1709 * 1710 * Which is what we have below. 1711 */ 1712 1713 len = strlen(newpath) + zone->zone_rootpathlen - 1; 1714 sp = kmem_alloc(len, KM_SLEEP); 1715 1716 /* 1717 * Copy everything including the trailing slash, which 1718 * we then overwrite with the NUL character. 1719 */ 1720 1721 (void) strcpy(sp, zone->zone_rootpath); 1722 sp[zone->zone_rootpathlen - 2] = '\0'; 1723 (void) strcat(sp, newpath); 1724 1725 ref = refstr_alloc(sp); 1726 kmem_free(sp, len); 1727 out: 1728 *refp = ref; 1729 1730 if (have_list_lock) { 1731 vfs_mnttab_modtimeupd(); 1732 vfs_list_unlock(); 1733 } 1734 } 1735 1736 /* 1737 * Record a mounted resource name in a vfs structure. 1738 * If vfsp is already mounted, caller must hold the vfs lock. 1739 */ 1740 void 1741 vfs_setresource(struct vfs *vfsp, const char *resource) 1742 { 1743 if (resource == NULL || resource[0] == '\0') 1744 resource = VFS_NORESOURCE; 1745 vfs_setpath(vfsp, &vfsp->vfs_resource, resource); 1746 } 1747 1748 /* 1749 * Record a mount point name in a vfs structure. 1750 * If vfsp is already mounted, caller must hold the vfs lock. 1751 */ 1752 void 1753 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt) 1754 { 1755 if (mntpt == NULL || mntpt[0] == '\0') 1756 mntpt = VFS_NOMNTPT; 1757 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt); 1758 } 1759 1760 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */ 1761 1762 refstr_t * 1763 vfs_getresource(const struct vfs *vfsp) 1764 { 1765 refstr_t *resource; 1766 1767 vfs_list_read_lock(); 1768 resource = vfsp->vfs_resource; 1769 refstr_hold(resource); 1770 vfs_list_unlock(); 1771 1772 return (resource); 1773 } 1774 1775 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */ 1776 1777 refstr_t * 1778 vfs_getmntpoint(const struct vfs *vfsp) 1779 { 1780 refstr_t *mntpt; 1781 1782 vfs_list_read_lock(); 1783 mntpt = vfsp->vfs_mntpt; 1784 refstr_hold(mntpt); 1785 vfs_list_unlock(); 1786 1787 return (mntpt); 1788 } 1789 1790 /* 1791 * Create an empty options table with enough empty slots to hold all 1792 * The options in the options string passed as an argument. 1793 * Potentially prepend another options table. 1794 * 1795 * Note: caller is responsible for locking the vfs list, if needed, 1796 * to protect mops. 1797 */ 1798 static void 1799 vfs_createopttbl_extend(mntopts_t *mops, const char *opts, 1800 const mntopts_t *mtmpl) 1801 { 1802 const char *s = opts; 1803 uint_t count; 1804 1805 if (opts == NULL || *opts == '\0') { 1806 count = 0; 1807 } else { 1808 count = 1; 1809 1810 /* 1811 * Count number of options in the string 1812 */ 1813 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) { 1814 count++; 1815 s++; 1816 } 1817 } 1818 vfs_copyopttbl_extend(mtmpl, mops, count); 1819 } 1820 1821 /* 1822 * Create an empty options table with enough empty slots to hold all 1823 * The options in the options string passed as an argument. 1824 * 1825 * This function is *not* for general use by filesystems. 1826 * 1827 * Note: caller is responsible for locking the vfs list, if needed, 1828 * to protect mops. 1829 */ 1830 void 1831 vfs_createopttbl(mntopts_t *mops, const char *opts) 1832 { 1833 vfs_createopttbl_extend(mops, opts, NULL); 1834 } 1835 1836 1837 /* 1838 * Swap two mount options tables 1839 */ 1840 static void 1841 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2) 1842 { 1843 uint_t tmpcnt; 1844 mntopt_t *tmplist; 1845 1846 tmpcnt = optbl2->mo_count; 1847 tmplist = optbl2->mo_list; 1848 optbl2->mo_count = optbl1->mo_count; 1849 optbl2->mo_list = optbl1->mo_list; 1850 optbl1->mo_count = tmpcnt; 1851 optbl1->mo_list = tmplist; 1852 } 1853 1854 static void 1855 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2) 1856 { 1857 vfs_list_lock(); 1858 vfs_swapopttbl_nolock(optbl1, optbl2); 1859 vfs_mnttab_modtimeupd(); 1860 vfs_list_unlock(); 1861 } 1862 1863 static char ** 1864 vfs_copycancelopt_extend(char **const moc, int extend) 1865 { 1866 int i = 0; 1867 int j; 1868 char **result; 1869 1870 if (moc != NULL) { 1871 for (; moc[i] != NULL; i++) 1872 /* count number of options to cancel */; 1873 } 1874 1875 if (i + extend == 0) 1876 return (NULL); 1877 1878 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP); 1879 1880 for (j = 0; j < i; j++) { 1881 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP); 1882 (void) strcpy(result[j], moc[j]); 1883 } 1884 for (; j <= i + extend; j++) 1885 result[j] = NULL; 1886 1887 return (result); 1888 } 1889 1890 static void 1891 vfs_copyopt(const mntopt_t *s, mntopt_t *d) 1892 { 1893 char *sp, *dp; 1894 1895 d->mo_flags = s->mo_flags; 1896 d->mo_data = s->mo_data; 1897 sp = s->mo_name; 1898 if (sp != NULL) { 1899 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1900 (void) strcpy(dp, sp); 1901 d->mo_name = dp; 1902 } else { 1903 d->mo_name = NULL; /* should never happen */ 1904 } 1905 1906 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0); 1907 1908 sp = s->mo_arg; 1909 if (sp != NULL) { 1910 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1911 (void) strcpy(dp, sp); 1912 d->mo_arg = dp; 1913 } else { 1914 d->mo_arg = NULL; 1915 } 1916 } 1917 1918 /* 1919 * Copy a mount options table, possibly allocating some spare 1920 * slots at the end. It is permissible to copy_extend the NULL table. 1921 */ 1922 static void 1923 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra) 1924 { 1925 uint_t i, count; 1926 mntopt_t *motbl; 1927 1928 /* 1929 * Clear out any existing stuff in the options table being initialized 1930 */ 1931 vfs_freeopttbl(dmo); 1932 count = (smo == NULL) ? 0 : smo->mo_count; 1933 if ((count + extra) == 0) /* nothing to do */ 1934 return; 1935 dmo->mo_count = count + extra; 1936 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP); 1937 dmo->mo_list = motbl; 1938 for (i = 0; i < count; i++) { 1939 vfs_copyopt(&smo->mo_list[i], &motbl[i]); 1940 } 1941 for (i = count; i < count + extra; i++) { 1942 motbl[i].mo_flags = MO_EMPTY; 1943 } 1944 } 1945 1946 /* 1947 * Copy a mount options table. 1948 * 1949 * This function is *not* for general use by filesystems. 1950 * 1951 * Note: caller is responsible for locking the vfs list, if needed, 1952 * to protect smo and dmo. 1953 */ 1954 void 1955 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo) 1956 { 1957 vfs_copyopttbl_extend(smo, dmo, 0); 1958 } 1959 1960 static char ** 1961 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2) 1962 { 1963 int c1 = 0; 1964 int c2 = 0; 1965 char **result; 1966 char **sp1, **sp2, **dp; 1967 1968 /* 1969 * First we count both lists of cancel options. 1970 * If either is NULL or has no elements, we return a copy of 1971 * the other. 1972 */ 1973 if (mop1->mo_cancel != NULL) { 1974 for (; mop1->mo_cancel[c1] != NULL; c1++) 1975 /* count cancel options in mop1 */; 1976 } 1977 1978 if (c1 == 0) 1979 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0)); 1980 1981 if (mop2->mo_cancel != NULL) { 1982 for (; mop2->mo_cancel[c2] != NULL; c2++) 1983 /* count cancel options in mop2 */; 1984 } 1985 1986 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2); 1987 1988 if (c2 == 0) 1989 return (result); 1990 1991 /* 1992 * When we get here, we've got two sets of cancel options; 1993 * we need to merge the two sets. We know that the result 1994 * array has "c1+c2+1" entries and in the end we might shrink 1995 * it. 1996 * Result now has a copy of the c1 entries from mop1; we'll 1997 * now lookup all the entries of mop2 in mop1 and copy it if 1998 * it is unique. 1999 * This operation is O(n^2) but it's only called once per 2000 * filesystem per duplicate option. This is a situation 2001 * which doesn't arise with the filesystems in ON and 2002 * n is generally 1. 2003 */ 2004 2005 dp = &result[c1]; 2006 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) { 2007 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) { 2008 if (strcmp(*sp1, *sp2) == 0) 2009 break; 2010 } 2011 if (*sp1 == NULL) { 2012 /* 2013 * Option *sp2 not found in mop1, so copy it. 2014 * The calls to vfs_copycancelopt_extend() 2015 * guarantee that there's enough room. 2016 */ 2017 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP); 2018 (void) strcpy(*dp++, *sp2); 2019 } 2020 } 2021 if (dp != &result[c1+c2]) { 2022 size_t bytes = (dp - result + 1) * sizeof (char *); 2023 char **nres = kmem_alloc(bytes, KM_SLEEP); 2024 2025 bcopy(result, nres, bytes); 2026 kmem_free(result, (c1 + c2 + 1) * sizeof (char *)); 2027 result = nres; 2028 } 2029 return (result); 2030 } 2031 2032 /* 2033 * Merge two mount option tables (outer and inner) into one. This is very 2034 * similar to "merging" global variables and automatic variables in C. 2035 * 2036 * This isn't (and doesn't have to be) fast. 2037 * 2038 * This function is *not* for general use by filesystems. 2039 * 2040 * Note: caller is responsible for locking the vfs list, if needed, 2041 * to protect omo, imo & dmo. 2042 */ 2043 void 2044 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo) 2045 { 2046 uint_t i, count; 2047 mntopt_t *mop, *motbl; 2048 uint_t freeidx; 2049 2050 /* 2051 * First determine how much space we need to allocate. 2052 */ 2053 count = omo->mo_count; 2054 for (i = 0; i < imo->mo_count; i++) { 2055 if (imo->mo_list[i].mo_flags & MO_EMPTY) 2056 continue; 2057 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL) 2058 count++; 2059 } 2060 ASSERT(count >= omo->mo_count && 2061 count <= omo->mo_count + imo->mo_count); 2062 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP); 2063 for (i = 0; i < omo->mo_count; i++) 2064 vfs_copyopt(&omo->mo_list[i], &motbl[i]); 2065 freeidx = omo->mo_count; 2066 for (i = 0; i < imo->mo_count; i++) { 2067 if (imo->mo_list[i].mo_flags & MO_EMPTY) 2068 continue; 2069 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) { 2070 char **newcanp; 2071 uint_t index = mop - omo->mo_list; 2072 2073 newcanp = vfs_mergecancelopts(mop, &motbl[index]); 2074 2075 vfs_freeopt(&motbl[index]); 2076 vfs_copyopt(&imo->mo_list[i], &motbl[index]); 2077 2078 vfs_freecancelopt(motbl[index].mo_cancel); 2079 motbl[index].mo_cancel = newcanp; 2080 } else { 2081 /* 2082 * If it's a new option, just copy it over to the first 2083 * free location. 2084 */ 2085 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]); 2086 } 2087 } 2088 dmo->mo_count = count; 2089 dmo->mo_list = motbl; 2090 } 2091 2092 /* 2093 * Functions to set and clear mount options in a mount options table. 2094 */ 2095 2096 /* 2097 * Clear a mount option, if it exists. 2098 * 2099 * The update_mnttab arg indicates whether mops is part of a vfs that is on 2100 * the vfs list. 2101 */ 2102 static void 2103 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab) 2104 { 2105 struct mntopt *mop; 2106 uint_t i, count; 2107 2108 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 2109 2110 count = mops->mo_count; 2111 for (i = 0; i < count; i++) { 2112 mop = &mops->mo_list[i]; 2113 2114 if (mop->mo_flags & MO_EMPTY) 2115 continue; 2116 if (strcmp(opt, mop->mo_name)) 2117 continue; 2118 mop->mo_flags &= ~MO_SET; 2119 if (mop->mo_arg != NULL) { 2120 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2121 } 2122 mop->mo_arg = NULL; 2123 if (update_mnttab) 2124 vfs_mnttab_modtimeupd(); 2125 break; 2126 } 2127 } 2128 2129 void 2130 vfs_clearmntopt(struct vfs *vfsp, const char *opt) 2131 { 2132 int gotlock = 0; 2133 2134 if (VFS_ON_LIST(vfsp)) { 2135 gotlock = 1; 2136 vfs_list_lock(); 2137 } 2138 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock); 2139 if (gotlock) 2140 vfs_list_unlock(); 2141 } 2142 2143 2144 /* 2145 * Set a mount option on. If it's not found in the table, it's silently 2146 * ignored. If the option has MO_IGNORE set, it is still set unless the 2147 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag 2148 * bits can be used to toggle the MO_NODISPLAY bit for the option. 2149 * If the VFS_CREATEOPT flag bit is set then the first option slot with 2150 * MO_EMPTY set is created as the option passed in. 2151 * 2152 * The update_mnttab arg indicates whether mops is part of a vfs that is on 2153 * the vfs list. 2154 */ 2155 static void 2156 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt, 2157 const char *arg, int flags, int update_mnttab) 2158 { 2159 mntopt_t *mop; 2160 uint_t i, count; 2161 char *sp; 2162 2163 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 2164 2165 if (flags & VFS_CREATEOPT) { 2166 if (vfs_hasopt(mops, opt) != NULL) { 2167 flags &= ~VFS_CREATEOPT; 2168 } 2169 } 2170 count = mops->mo_count; 2171 for (i = 0; i < count; i++) { 2172 mop = &mops->mo_list[i]; 2173 2174 if (mop->mo_flags & MO_EMPTY) { 2175 if ((flags & VFS_CREATEOPT) == 0) 2176 continue; 2177 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP); 2178 (void) strcpy(sp, opt); 2179 mop->mo_name = sp; 2180 if (arg != NULL) 2181 mop->mo_flags = MO_HASVALUE; 2182 else 2183 mop->mo_flags = 0; 2184 } else if (strcmp(opt, mop->mo_name)) { 2185 continue; 2186 } 2187 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT)) 2188 break; 2189 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) { 2190 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP); 2191 (void) strcpy(sp, arg); 2192 } else { 2193 sp = NULL; 2194 } 2195 if (mop->mo_arg != NULL) 2196 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2197 mop->mo_arg = sp; 2198 if (flags & VFS_DISPLAY) 2199 mop->mo_flags &= ~MO_NODISPLAY; 2200 if (flags & VFS_NODISPLAY) 2201 mop->mo_flags |= MO_NODISPLAY; 2202 mop->mo_flags |= MO_SET; 2203 if (mop->mo_cancel != NULL) { 2204 char **cp; 2205 2206 for (cp = mop->mo_cancel; *cp != NULL; cp++) 2207 vfs_clearmntopt_nolock(mops, *cp, 0); 2208 } 2209 if (update_mnttab) 2210 vfs_mnttab_modtimeupd(); 2211 break; 2212 } 2213 } 2214 2215 void 2216 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags) 2217 { 2218 int gotlock = 0; 2219 2220 if (VFS_ON_LIST(vfsp)) { 2221 gotlock = 1; 2222 vfs_list_lock(); 2223 } 2224 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock); 2225 if (gotlock) 2226 vfs_list_unlock(); 2227 } 2228 2229 2230 /* 2231 * Add a "tag" option to a mounted file system's options list. 2232 * 2233 * Note: caller is responsible for locking the vfs list, if needed, 2234 * to protect mops. 2235 */ 2236 static mntopt_t * 2237 vfs_addtag(mntopts_t *mops, const char *tag) 2238 { 2239 uint_t count; 2240 mntopt_t *mop, *motbl; 2241 2242 count = mops->mo_count + 1; 2243 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP); 2244 if (mops->mo_count) { 2245 size_t len = (count - 1) * sizeof (mntopt_t); 2246 2247 bcopy(mops->mo_list, motbl, len); 2248 kmem_free(mops->mo_list, len); 2249 } 2250 mops->mo_count = count; 2251 mops->mo_list = motbl; 2252 mop = &motbl[count - 1]; 2253 mop->mo_flags = MO_TAG; 2254 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP); 2255 (void) strcpy(mop->mo_name, tag); 2256 return (mop); 2257 } 2258 2259 /* 2260 * Allow users to set arbitrary "tags" in a vfs's mount options. 2261 * Broader use within the kernel is discouraged. 2262 */ 2263 int 2264 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2265 cred_t *cr) 2266 { 2267 vfs_t *vfsp; 2268 mntopts_t *mops; 2269 mntopt_t *mop; 2270 int found = 0; 2271 dev_t dev = makedevice(major, minor); 2272 int err = 0; 2273 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 2274 2275 /* 2276 * Find the desired mounted file system 2277 */ 2278 vfs_list_lock(); 2279 vfsp = rootvfs; 2280 do { 2281 if (vfsp->vfs_dev == dev && 2282 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2283 found = 1; 2284 break; 2285 } 2286 vfsp = vfsp->vfs_next; 2287 } while (vfsp != rootvfs); 2288 2289 if (!found) { 2290 err = EINVAL; 2291 goto out; 2292 } 2293 err = secpolicy_fs_config(cr, vfsp); 2294 if (err != 0) 2295 goto out; 2296 2297 mops = &vfsp->vfs_mntopts; 2298 /* 2299 * Add tag if it doesn't already exist 2300 */ 2301 if ((mop = vfs_hasopt(mops, tag)) == NULL) { 2302 int len; 2303 2304 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR); 2305 len = strlen(buf); 2306 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) { 2307 err = ENAMETOOLONG; 2308 goto out; 2309 } 2310 mop = vfs_addtag(mops, tag); 2311 } 2312 if ((mop->mo_flags & MO_TAG) == 0) { 2313 err = EINVAL; 2314 goto out; 2315 } 2316 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1); 2317 out: 2318 vfs_list_unlock(); 2319 kmem_free(buf, MAX_MNTOPT_STR); 2320 return (err); 2321 } 2322 2323 /* 2324 * Allow users to remove arbitrary "tags" in a vfs's mount options. 2325 * Broader use within the kernel is discouraged. 2326 */ 2327 int 2328 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2329 cred_t *cr) 2330 { 2331 vfs_t *vfsp; 2332 mntopt_t *mop; 2333 int found = 0; 2334 dev_t dev = makedevice(major, minor); 2335 int err = 0; 2336 2337 /* 2338 * Find the desired mounted file system 2339 */ 2340 vfs_list_lock(); 2341 vfsp = rootvfs; 2342 do { 2343 if (vfsp->vfs_dev == dev && 2344 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2345 found = 1; 2346 break; 2347 } 2348 vfsp = vfsp->vfs_next; 2349 } while (vfsp != rootvfs); 2350 2351 if (!found) { 2352 err = EINVAL; 2353 goto out; 2354 } 2355 err = secpolicy_fs_config(cr, vfsp); 2356 if (err != 0) 2357 goto out; 2358 2359 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) { 2360 err = EINVAL; 2361 goto out; 2362 } 2363 if ((mop->mo_flags & MO_TAG) == 0) { 2364 err = EINVAL; 2365 goto out; 2366 } 2367 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1); 2368 out: 2369 vfs_list_unlock(); 2370 return (err); 2371 } 2372 2373 /* 2374 * Function to parse an option string and fill in a mount options table. 2375 * Unknown options are silently ignored. The input option string is modified 2376 * by replacing separators with nulls. If the create flag is set, options 2377 * not found in the table are just added on the fly. The table must have 2378 * an option slot marked MO_EMPTY to add an option on the fly. 2379 * 2380 * This function is *not* for general use by filesystems. 2381 * 2382 * Note: caller is responsible for locking the vfs list, if needed, 2383 * to protect mops.. 2384 */ 2385 void 2386 vfs_parsemntopts(mntopts_t *mops, char *osp, int create) 2387 { 2388 char *s = osp, *p, *nextop, *valp, *cp, *ep; 2389 int setflg = VFS_NOFORCEOPT; 2390 2391 if (osp == NULL) 2392 return; 2393 while (*s != '\0') { 2394 p = strchr(s, ','); /* find next option */ 2395 if (p == NULL) { 2396 cp = NULL; 2397 p = s + strlen(s); 2398 } else { 2399 cp = p; /* save location of comma */ 2400 *p++ = '\0'; /* mark end and point to next option */ 2401 } 2402 nextop = p; 2403 p = strchr(s, '='); /* look for value */ 2404 if (p == NULL) { 2405 valp = NULL; /* no value supplied */ 2406 } else { 2407 ep = p; /* save location of equals */ 2408 *p++ = '\0'; /* end option and point to value */ 2409 valp = p; 2410 } 2411 /* 2412 * set option into options table 2413 */ 2414 if (create) 2415 setflg |= VFS_CREATEOPT; 2416 vfs_setmntopt_nolock(mops, s, valp, setflg, 0); 2417 if (cp != NULL) 2418 *cp = ','; /* restore the comma */ 2419 if (valp != NULL) 2420 *ep = '='; /* restore the equals */ 2421 s = nextop; 2422 } 2423 } 2424 2425 /* 2426 * Function to inquire if an option exists in a mount options table. 2427 * Returns a pointer to the option if it exists, else NULL. 2428 * 2429 * This function is *not* for general use by filesystems. 2430 * 2431 * Note: caller is responsible for locking the vfs list, if needed, 2432 * to protect mops. 2433 */ 2434 struct mntopt * 2435 vfs_hasopt(const mntopts_t *mops, const char *opt) 2436 { 2437 struct mntopt *mop; 2438 uint_t i, count; 2439 2440 count = mops->mo_count; 2441 for (i = 0; i < count; i++) { 2442 mop = &mops->mo_list[i]; 2443 2444 if (mop->mo_flags & MO_EMPTY) 2445 continue; 2446 if (strcmp(opt, mop->mo_name) == 0) 2447 return (mop); 2448 } 2449 return (NULL); 2450 } 2451 2452 /* 2453 * Function to inquire if an option is set in a mount options table. 2454 * Returns non-zero if set and fills in the arg pointer with a pointer to 2455 * the argument string or NULL if there is no argument string. 2456 */ 2457 static int 2458 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp) 2459 { 2460 struct mntopt *mop; 2461 uint_t i, count; 2462 2463 count = mops->mo_count; 2464 for (i = 0; i < count; i++) { 2465 mop = &mops->mo_list[i]; 2466 2467 if (mop->mo_flags & MO_EMPTY) 2468 continue; 2469 if (strcmp(opt, mop->mo_name)) 2470 continue; 2471 if ((mop->mo_flags & MO_SET) == 0) 2472 return (0); 2473 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0) 2474 *argp = mop->mo_arg; 2475 return (1); 2476 } 2477 return (0); 2478 } 2479 2480 2481 int 2482 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp) 2483 { 2484 int ret; 2485 2486 vfs_list_read_lock(); 2487 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp); 2488 vfs_list_unlock(); 2489 return (ret); 2490 } 2491 2492 2493 /* 2494 * Construct a comma separated string of the options set in the given 2495 * mount table, return the string in the given buffer. Return non-zero if 2496 * the buffer would overflow. 2497 * 2498 * This function is *not* for general use by filesystems. 2499 * 2500 * Note: caller is responsible for locking the vfs list, if needed, 2501 * to protect mp. 2502 */ 2503 int 2504 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len) 2505 { 2506 char *cp; 2507 uint_t i; 2508 2509 buf[0] = '\0'; 2510 cp = buf; 2511 for (i = 0; i < mp->mo_count; i++) { 2512 struct mntopt *mop; 2513 2514 mop = &mp->mo_list[i]; 2515 if (mop->mo_flags & MO_SET) { 2516 int optlen, comma = 0; 2517 2518 if (buf[0] != '\0') 2519 comma = 1; 2520 optlen = strlen(mop->mo_name); 2521 if (strlen(buf) + comma + optlen + 1 > len) 2522 goto err; 2523 if (comma) 2524 *cp++ = ','; 2525 (void) strcpy(cp, mop->mo_name); 2526 cp += optlen; 2527 /* 2528 * Append option value if there is one 2529 */ 2530 if (mop->mo_arg != NULL) { 2531 int arglen; 2532 2533 arglen = strlen(mop->mo_arg); 2534 if (strlen(buf) + arglen + 2 > len) 2535 goto err; 2536 *cp++ = '='; 2537 (void) strcpy(cp, mop->mo_arg); 2538 cp += arglen; 2539 } 2540 } 2541 } 2542 return (0); 2543 err: 2544 return (EOVERFLOW); 2545 } 2546 2547 static void 2548 vfs_freecancelopt(char **moc) 2549 { 2550 if (moc != NULL) { 2551 int ccnt = 0; 2552 char **cp; 2553 2554 for (cp = moc; *cp != NULL; cp++) { 2555 kmem_free(*cp, strlen(*cp) + 1); 2556 ccnt++; 2557 } 2558 kmem_free(moc, (ccnt + 1) * sizeof (char *)); 2559 } 2560 } 2561 2562 static void 2563 vfs_freeopt(mntopt_t *mop) 2564 { 2565 if (mop->mo_name != NULL) 2566 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1); 2567 2568 vfs_freecancelopt(mop->mo_cancel); 2569 2570 if (mop->mo_arg != NULL) 2571 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2572 } 2573 2574 /* 2575 * Free a mount options table 2576 * 2577 * This function is *not* for general use by filesystems. 2578 * 2579 * Note: caller is responsible for locking the vfs list, if needed, 2580 * to protect mp. 2581 */ 2582 void 2583 vfs_freeopttbl(mntopts_t *mp) 2584 { 2585 uint_t i, count; 2586 2587 count = mp->mo_count; 2588 for (i = 0; i < count; i++) { 2589 vfs_freeopt(&mp->mo_list[i]); 2590 } 2591 if (count) { 2592 kmem_free(mp->mo_list, sizeof (mntopt_t) * count); 2593 mp->mo_count = 0; 2594 mp->mo_list = NULL; 2595 } 2596 } 2597 2598 2599 /* ARGSUSED */ 2600 static int 2601 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, 2602 caller_context_t *ct) 2603 { 2604 return (0); 2605 } 2606 2607 /* ARGSUSED */ 2608 static int 2609 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, 2610 caller_context_t *ct) 2611 { 2612 return (0); 2613 } 2614 2615 /* 2616 * The dummy vnode is currently used only by file events notification 2617 * module which is just interested in the timestamps. 2618 */ 2619 /* ARGSUSED */ 2620 static int 2621 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2622 caller_context_t *ct) 2623 { 2624 bzero(vap, sizeof (vattr_t)); 2625 vap->va_type = VREG; 2626 vap->va_nlink = 1; 2627 vap->va_ctime = vfs_mnttab_ctime; 2628 /* 2629 * it is ok to just copy mtime as the time will be monotonically 2630 * increasing. 2631 */ 2632 vap->va_mtime = vfs_mnttab_mtime; 2633 vap->va_atime = vap->va_mtime; 2634 return (0); 2635 } 2636 2637 static void 2638 vfs_mnttabvp_setup(void) 2639 { 2640 vnode_t *tvp; 2641 vnodeops_t *vfs_mntdummyvnops; 2642 const fs_operation_def_t mnt_dummyvnodeops_template[] = { 2643 VOPNAME_READ, { .vop_read = vfs_mntdummyread }, 2644 VOPNAME_WRITE, { .vop_write = vfs_mntdummywrite }, 2645 VOPNAME_GETATTR, { .vop_getattr = vfs_mntdummygetattr }, 2646 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 2647 NULL, NULL 2648 }; 2649 2650 if (vn_make_ops("mnttab", mnt_dummyvnodeops_template, 2651 &vfs_mntdummyvnops) != 0) { 2652 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed"); 2653 /* Shouldn't happen, but not bad enough to panic */ 2654 return; 2655 } 2656 2657 /* 2658 * A global dummy vnode is allocated to represent mntfs files. 2659 * The mntfs file (/etc/mnttab) can be monitored for file events 2660 * and receive an event when mnttab changes. Dummy VOP calls 2661 * will be made on this vnode. The file events notification module 2662 * intercepts this vnode and delivers relevant events. 2663 */ 2664 tvp = vn_alloc(KM_SLEEP); 2665 tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE; 2666 vn_setops(tvp, vfs_mntdummyvnops); 2667 tvp->v_type = VREG; 2668 /* 2669 * The mnt dummy ops do not reference v_data. 2670 * No other module intercepting this vnode should either. 2671 * Just set it to point to itself. 2672 */ 2673 tvp->v_data = (caddr_t)tvp; 2674 tvp->v_vfsp = rootvfs; 2675 vfs_mntdummyvp = tvp; 2676 } 2677 2678 /* 2679 * performs fake read/write ops 2680 */ 2681 static void 2682 vfs_mnttab_rwop(int rw) 2683 { 2684 struct uio uio; 2685 struct iovec iov; 2686 char buf[1]; 2687 2688 if (vfs_mntdummyvp == NULL) 2689 return; 2690 2691 bzero(&uio, sizeof (uio)); 2692 bzero(&iov, sizeof (iov)); 2693 iov.iov_base = buf; 2694 iov.iov_len = 0; 2695 uio.uio_iov = &iov; 2696 uio.uio_iovcnt = 1; 2697 uio.uio_loffset = 0; 2698 uio.uio_segflg = UIO_SYSSPACE; 2699 uio.uio_resid = 0; 2700 if (rw) { 2701 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL); 2702 } else { 2703 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL); 2704 } 2705 } 2706 2707 /* 2708 * Generate a write operation. 2709 */ 2710 void 2711 vfs_mnttab_writeop(void) 2712 { 2713 vfs_mnttab_rwop(1); 2714 } 2715 2716 /* 2717 * Generate a read operation. 2718 */ 2719 void 2720 vfs_mnttab_readop(void) 2721 { 2722 vfs_mnttab_rwop(0); 2723 } 2724 2725 /* 2726 * Free any mnttab information recorded in the vfs struct. 2727 * The vfs must not be on the vfs list. 2728 */ 2729 static void 2730 vfs_freemnttab(struct vfs *vfsp) 2731 { 2732 ASSERT(!VFS_ON_LIST(vfsp)); 2733 2734 /* 2735 * Free device and mount point information 2736 */ 2737 if (vfsp->vfs_mntpt != NULL) { 2738 refstr_rele(vfsp->vfs_mntpt); 2739 vfsp->vfs_mntpt = NULL; 2740 } 2741 if (vfsp->vfs_resource != NULL) { 2742 refstr_rele(vfsp->vfs_resource); 2743 vfsp->vfs_resource = NULL; 2744 } 2745 /* 2746 * Now free mount options information 2747 */ 2748 vfs_freeopttbl(&vfsp->vfs_mntopts); 2749 } 2750 2751 /* 2752 * Return the last mnttab modification time 2753 */ 2754 void 2755 vfs_mnttab_modtime(timespec_t *ts) 2756 { 2757 ASSERT(RW_LOCK_HELD(&vfslist)); 2758 *ts = vfs_mnttab_mtime; 2759 } 2760 2761 /* 2762 * See if mnttab is changed 2763 */ 2764 void 2765 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp) 2766 { 2767 int changed; 2768 2769 *phpp = (struct pollhead *)NULL; 2770 2771 /* 2772 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime. 2773 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe 2774 * to not grab the vfs list lock because tv_sec is monotonically 2775 * increasing. 2776 */ 2777 2778 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) || 2779 (old->tv_sec != vfs_mnttab_mtime.tv_sec); 2780 if (!changed) { 2781 *phpp = &vfs_pollhd; 2782 } 2783 } 2784 2785 /* 2786 * Update the mnttab modification time and wake up any waiters for 2787 * mnttab changes 2788 */ 2789 void 2790 vfs_mnttab_modtimeupd() 2791 { 2792 hrtime_t oldhrt, newhrt; 2793 2794 ASSERT(RW_WRITE_HELD(&vfslist)); 2795 oldhrt = ts2hrt(&vfs_mnttab_mtime); 2796 gethrestime(&vfs_mnttab_mtime); 2797 newhrt = ts2hrt(&vfs_mnttab_mtime); 2798 if (oldhrt == (hrtime_t)0) 2799 vfs_mnttab_ctime = vfs_mnttab_mtime; 2800 /* 2801 * Attempt to provide unique mtime (like uniqtime but not). 2802 */ 2803 if (newhrt == oldhrt) { 2804 newhrt++; 2805 hrt2ts(newhrt, &vfs_mnttab_mtime); 2806 } 2807 pollwakeup(&vfs_pollhd, (short)POLLRDBAND); 2808 vfs_mnttab_writeop(); 2809 } 2810 2811 int 2812 dounmount(struct vfs *vfsp, int flag, cred_t *cr) 2813 { 2814 vnode_t *coveredvp; 2815 int error; 2816 extern void teardown_vopstats(vfs_t *); 2817 2818 /* 2819 * Get covered vnode. This will be NULL if the vfs is not linked 2820 * into the file system name space (i.e., domount() with MNT_NOSPICE). 2821 */ 2822 coveredvp = vfsp->vfs_vnodecovered; 2823 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp)); 2824 2825 /* 2826 * Purge all dnlc entries for this vfs. 2827 */ 2828 (void) dnlc_purge_vfsp(vfsp, 0); 2829 2830 /* For forcible umount, skip VFS_SYNC() since it may hang */ 2831 if ((flag & MS_FORCE) == 0) 2832 (void) VFS_SYNC(vfsp, 0, cr); 2833 2834 /* 2835 * Lock the vfs to maintain fs status quo during unmount. This 2836 * has to be done after the sync because ufs_update tries to acquire 2837 * the vfs_reflock. 2838 */ 2839 vfs_lock_wait(vfsp); 2840 2841 if (error = VFS_UNMOUNT(vfsp, flag, cr)) { 2842 vfs_unlock(vfsp); 2843 if (coveredvp != NULL) 2844 vn_vfsunlock(coveredvp); 2845 } else if (coveredvp != NULL) { 2846 teardown_vopstats(vfsp); 2847 /* 2848 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered) 2849 * when it frees vfsp so we do a VN_HOLD() so we can 2850 * continue to use coveredvp afterwards. 2851 */ 2852 VN_HOLD(coveredvp); 2853 vfs_remove(vfsp); 2854 vn_vfsunlock(coveredvp); 2855 VN_RELE(coveredvp); 2856 } else { 2857 teardown_vopstats(vfsp); 2858 /* 2859 * Release the reference to vfs that is not linked 2860 * into the name space. 2861 */ 2862 vfs_unlock(vfsp); 2863 VFS_RELE(vfsp); 2864 } 2865 return (error); 2866 } 2867 2868 2869 /* 2870 * Vfs_unmountall() is called by uadmin() to unmount all 2871 * mounted file systems (except the root file system) during shutdown. 2872 * It follows the existing locking protocol when traversing the vfs list 2873 * to sync and unmount vfses. Even though there should be no 2874 * other thread running while the system is shutting down, it is prudent 2875 * to still follow the locking protocol. 2876 */ 2877 void 2878 vfs_unmountall(void) 2879 { 2880 struct vfs *vfsp; 2881 struct vfs *prev_vfsp = NULL; 2882 int error; 2883 2884 /* 2885 * Toss all dnlc entries now so that the per-vfs sync 2886 * and unmount operations don't have to slog through 2887 * a bunch of uninteresting vnodes over and over again. 2888 */ 2889 dnlc_purge(); 2890 2891 vfs_list_lock(); 2892 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) { 2893 prev_vfsp = vfsp->vfs_prev; 2894 2895 if (vfs_lock(vfsp) != 0) 2896 continue; 2897 error = vn_vfswlock(vfsp->vfs_vnodecovered); 2898 vfs_unlock(vfsp); 2899 if (error) 2900 continue; 2901 2902 vfs_list_unlock(); 2903 2904 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED()); 2905 (void) dounmount(vfsp, 0, CRED()); 2906 2907 /* 2908 * Since we dropped the vfslist lock above we must 2909 * verify that next_vfsp still exists, else start over. 2910 */ 2911 vfs_list_lock(); 2912 for (vfsp = rootvfs->vfs_prev; 2913 vfsp != rootvfs; vfsp = vfsp->vfs_prev) 2914 if (vfsp == prev_vfsp) 2915 break; 2916 if (vfsp == rootvfs && prev_vfsp != rootvfs) 2917 prev_vfsp = rootvfs->vfs_prev; 2918 } 2919 vfs_list_unlock(); 2920 } 2921 2922 /* 2923 * Called to add an entry to the end of the vfs mount in progress list 2924 */ 2925 void 2926 vfs_addmip(dev_t dev, struct vfs *vfsp) 2927 { 2928 struct ipmnt *mipp; 2929 2930 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP); 2931 mipp->mip_next = NULL; 2932 mipp->mip_dev = dev; 2933 mipp->mip_vfsp = vfsp; 2934 mutex_enter(&vfs_miplist_mutex); 2935 if (vfs_miplist_end != NULL) 2936 vfs_miplist_end->mip_next = mipp; 2937 else 2938 vfs_miplist = mipp; 2939 vfs_miplist_end = mipp; 2940 mutex_exit(&vfs_miplist_mutex); 2941 } 2942 2943 /* 2944 * Called to remove an entry from the mount in progress list 2945 * Either because the mount completed or it failed. 2946 */ 2947 void 2948 vfs_delmip(struct vfs *vfsp) 2949 { 2950 struct ipmnt *mipp, *mipprev; 2951 2952 mutex_enter(&vfs_miplist_mutex); 2953 mipprev = NULL; 2954 for (mipp = vfs_miplist; 2955 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) { 2956 mipprev = mipp; 2957 } 2958 if (mipp == NULL) 2959 return; /* shouldn't happen */ 2960 if (mipp == vfs_miplist_end) 2961 vfs_miplist_end = mipprev; 2962 if (mipprev == NULL) 2963 vfs_miplist = mipp->mip_next; 2964 else 2965 mipprev->mip_next = mipp->mip_next; 2966 mutex_exit(&vfs_miplist_mutex); 2967 kmem_free(mipp, sizeof (struct ipmnt)); 2968 } 2969 2970 /* 2971 * vfs_add is called by a specific filesystem's mount routine to add 2972 * the new vfs into the vfs list/hash and to cover the mounted-on vnode. 2973 * The vfs should already have been locked by the caller. 2974 * 2975 * coveredvp is NULL if this is the root. 2976 */ 2977 void 2978 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag) 2979 { 2980 int newflag; 2981 2982 ASSERT(vfs_lock_held(vfsp)); 2983 VFS_HOLD(vfsp); 2984 newflag = vfsp->vfs_flag; 2985 if (mflag & MS_RDONLY) 2986 newflag |= VFS_RDONLY; 2987 else 2988 newflag &= ~VFS_RDONLY; 2989 if (mflag & MS_NOSUID) 2990 newflag |= (VFS_NOSETUID|VFS_NODEVICES); 2991 else 2992 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES); 2993 if (mflag & MS_NOMNTTAB) 2994 newflag |= VFS_NOMNTTAB; 2995 else 2996 newflag &= ~VFS_NOMNTTAB; 2997 2998 if (coveredvp != NULL) { 2999 ASSERT(vn_vfswlock_held(coveredvp)); 3000 coveredvp->v_vfsmountedhere = vfsp; 3001 VN_HOLD(coveredvp); 3002 } 3003 vfsp->vfs_vnodecovered = coveredvp; 3004 vfsp->vfs_flag = newflag; 3005 3006 vfs_list_add(vfsp); 3007 } 3008 3009 /* 3010 * Remove a vfs from the vfs list, null out the pointer from the 3011 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer 3012 * from the vfs to the covered vnode (vfs_vnodecovered). Release the 3013 * reference to the vfs and to the covered vnode. 3014 * 3015 * Called from dounmount after it's confirmed with the file system 3016 * that the unmount is legal. 3017 */ 3018 void 3019 vfs_remove(struct vfs *vfsp) 3020 { 3021 vnode_t *vp; 3022 3023 ASSERT(vfs_lock_held(vfsp)); 3024 3025 /* 3026 * Can't unmount root. Should never happen because fs will 3027 * be busy. 3028 */ 3029 if (vfsp == rootvfs) 3030 panic("vfs_remove: unmounting root"); 3031 3032 vfs_list_remove(vfsp); 3033 3034 /* 3035 * Unhook from the file system name space. 3036 */ 3037 vp = vfsp->vfs_vnodecovered; 3038 ASSERT(vn_vfswlock_held(vp)); 3039 vp->v_vfsmountedhere = NULL; 3040 vfsp->vfs_vnodecovered = NULL; 3041 VN_RELE(vp); 3042 3043 /* 3044 * Release lock and wakeup anybody waiting. 3045 */ 3046 vfs_unlock(vfsp); 3047 VFS_RELE(vfsp); 3048 } 3049 3050 /* 3051 * Lock a filesystem to prevent access to it while mounting, 3052 * unmounting and syncing. Return EBUSY immediately if lock 3053 * can't be acquired. 3054 */ 3055 int 3056 vfs_lock(vfs_t *vfsp) 3057 { 3058 vn_vfslocks_entry_t *vpvfsentry; 3059 3060 vpvfsentry = vn_vfslocks_getlock(vfsp); 3061 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) 3062 return (0); 3063 3064 vn_vfslocks_rele(vpvfsentry); 3065 return (EBUSY); 3066 } 3067 3068 int 3069 vfs_rlock(vfs_t *vfsp) 3070 { 3071 vn_vfslocks_entry_t *vpvfsentry; 3072 3073 vpvfsentry = vn_vfslocks_getlock(vfsp); 3074 3075 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) 3076 return (0); 3077 3078 vn_vfslocks_rele(vpvfsentry); 3079 return (EBUSY); 3080 } 3081 3082 void 3083 vfs_lock_wait(vfs_t *vfsp) 3084 { 3085 vn_vfslocks_entry_t *vpvfsentry; 3086 3087 vpvfsentry = vn_vfslocks_getlock(vfsp); 3088 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER); 3089 } 3090 3091 void 3092 vfs_rlock_wait(vfs_t *vfsp) 3093 { 3094 vn_vfslocks_entry_t *vpvfsentry; 3095 3096 vpvfsentry = vn_vfslocks_getlock(vfsp); 3097 rwst_enter(&vpvfsentry->ve_lock, RW_READER); 3098 } 3099 3100 /* 3101 * Unlock a locked filesystem. 3102 */ 3103 void 3104 vfs_unlock(vfs_t *vfsp) 3105 { 3106 vn_vfslocks_entry_t *vpvfsentry; 3107 3108 /* 3109 * vfs_unlock will mimic sema_v behaviour to fix 4748018. 3110 * And these changes should remain for the patch changes as it is. 3111 */ 3112 if (panicstr) 3113 return; 3114 3115 /* 3116 * ve_refcount needs to be dropped twice here. 3117 * 1. To release refernce after a call to vfs_locks_getlock() 3118 * 2. To release the reference from the locking routines like 3119 * vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,. 3120 */ 3121 3122 vpvfsentry = vn_vfslocks_getlock(vfsp); 3123 vn_vfslocks_rele(vpvfsentry); 3124 3125 rwst_exit(&vpvfsentry->ve_lock); 3126 vn_vfslocks_rele(vpvfsentry); 3127 } 3128 3129 /* 3130 * Utility routine that allows a filesystem to construct its 3131 * fsid in "the usual way" - by munging some underlying dev_t and 3132 * the filesystem type number into the 64-bit fsid. Note that 3133 * this implicitly relies on dev_t persistence to make filesystem 3134 * id's persistent. 3135 * 3136 * There's nothing to prevent an individual fs from constructing its 3137 * fsid in a different way, and indeed they should. 3138 * 3139 * Since we want fsids to be 32-bit quantities (so that they can be 3140 * exported identically by either 32-bit or 64-bit APIs, as well as 3141 * the fact that fsid's are "known" to NFS), we compress the device 3142 * number given down to 32-bits, and panic if that isn't possible. 3143 */ 3144 void 3145 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val) 3146 { 3147 if (!cmpldev((dev32_t *)&fsi->val[0], dev)) 3148 panic("device number too big for fsid!"); 3149 fsi->val[1] = val; 3150 } 3151 3152 int 3153 vfs_lock_held(vfs_t *vfsp) 3154 { 3155 int held; 3156 vn_vfslocks_entry_t *vpvfsentry; 3157 3158 /* 3159 * vfs_lock_held will mimic sema_held behaviour 3160 * if panicstr is set. And these changes should remain 3161 * for the patch changes as it is. 3162 */ 3163 if (panicstr) 3164 return (1); 3165 3166 vpvfsentry = vn_vfslocks_getlock(vfsp); 3167 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); 3168 3169 vn_vfslocks_rele(vpvfsentry); 3170 return (held); 3171 } 3172 3173 struct _kthread * 3174 vfs_lock_owner(vfs_t *vfsp) 3175 { 3176 struct _kthread *owner; 3177 vn_vfslocks_entry_t *vpvfsentry; 3178 3179 /* 3180 * vfs_wlock_held will mimic sema_held behaviour 3181 * if panicstr is set. And these changes should remain 3182 * for the patch changes as it is. 3183 */ 3184 if (panicstr) 3185 return (NULL); 3186 3187 vpvfsentry = vn_vfslocks_getlock(vfsp); 3188 owner = rwst_owner(&vpvfsentry->ve_lock); 3189 3190 vn_vfslocks_rele(vpvfsentry); 3191 return (owner); 3192 } 3193 3194 /* 3195 * vfs list locking. 3196 * 3197 * Rather than manipulate the vfslist lock directly, we abstract into lock 3198 * and unlock routines to allow the locking implementation to be changed for 3199 * clustering. 3200 * 3201 * Whenever the vfs list is modified through its hash links, the overall list 3202 * lock must be obtained before locking the relevant hash bucket. But to see 3203 * whether a given vfs is on the list, it suffices to obtain the lock for the 3204 * hash bucket without getting the overall list lock. (See getvfs() below.) 3205 */ 3206 3207 void 3208 vfs_list_lock() 3209 { 3210 rw_enter(&vfslist, RW_WRITER); 3211 } 3212 3213 void 3214 vfs_list_read_lock() 3215 { 3216 rw_enter(&vfslist, RW_READER); 3217 } 3218 3219 void 3220 vfs_list_unlock() 3221 { 3222 rw_exit(&vfslist); 3223 } 3224 3225 /* 3226 * Low level worker routines for adding entries to and removing entries from 3227 * the vfs list. 3228 */ 3229 3230 static void 3231 vfs_hash_add(struct vfs *vfsp, int insert_at_head) 3232 { 3233 int vhno; 3234 struct vfs **hp; 3235 dev_t dev; 3236 3237 ASSERT(RW_WRITE_HELD(&vfslist)); 3238 3239 dev = expldev(vfsp->vfs_fsid.val[0]); 3240 vhno = VFSHASH(getmajor(dev), getminor(dev)); 3241 3242 mutex_enter(&rvfs_list[vhno].rvfs_lock); 3243 3244 /* 3245 * Link into the hash table, inserting it at the end, so that LOFS 3246 * with the same fsid as UFS (or other) file systems will not hide the 3247 * UFS. 3248 */ 3249 if (insert_at_head) { 3250 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head; 3251 rvfs_list[vhno].rvfs_head = vfsp; 3252 } else { 3253 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL; 3254 hp = &(*hp)->vfs_hash) 3255 continue; 3256 /* 3257 * hp now contains the address of the pointer to update 3258 * to effect the insertion. 3259 */ 3260 vfsp->vfs_hash = NULL; 3261 *hp = vfsp; 3262 } 3263 3264 rvfs_list[vhno].rvfs_len++; 3265 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3266 } 3267 3268 3269 static void 3270 vfs_hash_remove(struct vfs *vfsp) 3271 { 3272 int vhno; 3273 struct vfs *tvfsp; 3274 dev_t dev; 3275 3276 ASSERT(RW_WRITE_HELD(&vfslist)); 3277 3278 dev = expldev(vfsp->vfs_fsid.val[0]); 3279 vhno = VFSHASH(getmajor(dev), getminor(dev)); 3280 3281 mutex_enter(&rvfs_list[vhno].rvfs_lock); 3282 3283 /* 3284 * Remove from hash. 3285 */ 3286 if (rvfs_list[vhno].rvfs_head == vfsp) { 3287 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash; 3288 rvfs_list[vhno].rvfs_len--; 3289 goto foundit; 3290 } 3291 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL; 3292 tvfsp = tvfsp->vfs_hash) { 3293 if (tvfsp->vfs_hash == vfsp) { 3294 tvfsp->vfs_hash = vfsp->vfs_hash; 3295 rvfs_list[vhno].rvfs_len--; 3296 goto foundit; 3297 } 3298 } 3299 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash"); 3300 3301 foundit: 3302 3303 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3304 } 3305 3306 3307 void 3308 vfs_list_add(struct vfs *vfsp) 3309 { 3310 zone_t *zone; 3311 3312 /* 3313 * The zone that owns the mount is the one that performed the mount. 3314 * Note that this isn't necessarily the same as the zone mounted into. 3315 * The corresponding zone_rele() will be done when the vfs_t is 3316 * being free'd. 3317 */ 3318 vfsp->vfs_zone = curproc->p_zone; 3319 zone_hold(vfsp->vfs_zone); 3320 3321 /* 3322 * Find the zone mounted into, and put this mount on its vfs list. 3323 */ 3324 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3325 ASSERT(zone != NULL); 3326 /* 3327 * Special casing for the root vfs. This structure is allocated 3328 * statically and hooked onto rootvfs at link time. During the 3329 * vfs_mountroot call at system startup time, the root file system's 3330 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct 3331 * as argument. The code below must detect and handle this special 3332 * case. The only apparent justification for this special casing is 3333 * to ensure that the root file system appears at the head of the 3334 * list. 3335 * 3336 * XXX: I'm assuming that it's ok to do normal list locking when 3337 * adding the entry for the root file system (this used to be 3338 * done with no locks held). 3339 */ 3340 vfs_list_lock(); 3341 /* 3342 * Link into the vfs list proper. 3343 */ 3344 if (vfsp == &root) { 3345 /* 3346 * Assert: This vfs is already on the list as its first entry. 3347 * Thus, there's nothing to do. 3348 */ 3349 ASSERT(rootvfs == vfsp); 3350 /* 3351 * Add it to the head of the global zone's vfslist. 3352 */ 3353 ASSERT(zone == global_zone); 3354 ASSERT(zone->zone_vfslist == NULL); 3355 zone->zone_vfslist = vfsp; 3356 } else { 3357 /* 3358 * Link to end of list using vfs_prev (as rootvfs is now a 3359 * doubly linked circular list) so list is in mount order for 3360 * mnttab use. 3361 */ 3362 rootvfs->vfs_prev->vfs_next = vfsp; 3363 vfsp->vfs_prev = rootvfs->vfs_prev; 3364 rootvfs->vfs_prev = vfsp; 3365 vfsp->vfs_next = rootvfs; 3366 3367 /* 3368 * Do it again for the zone-private list (which may be NULL). 3369 */ 3370 if (zone->zone_vfslist == NULL) { 3371 ASSERT(zone != global_zone); 3372 zone->zone_vfslist = vfsp; 3373 } else { 3374 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp; 3375 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev; 3376 zone->zone_vfslist->vfs_zone_prev = vfsp; 3377 vfsp->vfs_zone_next = zone->zone_vfslist; 3378 } 3379 } 3380 3381 /* 3382 * Link into the hash table, inserting it at the end, so that LOFS 3383 * with the same fsid as UFS (or other) file systems will not hide 3384 * the UFS. 3385 */ 3386 vfs_hash_add(vfsp, 0); 3387 3388 /* 3389 * update the mnttab modification time 3390 */ 3391 vfs_mnttab_modtimeupd(); 3392 vfs_list_unlock(); 3393 zone_rele(zone); 3394 } 3395 3396 void 3397 vfs_list_remove(struct vfs *vfsp) 3398 { 3399 zone_t *zone; 3400 3401 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3402 ASSERT(zone != NULL); 3403 /* 3404 * Callers are responsible for preventing attempts to unmount the 3405 * root. 3406 */ 3407 ASSERT(vfsp != rootvfs); 3408 3409 vfs_list_lock(); 3410 3411 /* 3412 * Remove from hash. 3413 */ 3414 vfs_hash_remove(vfsp); 3415 3416 /* 3417 * Remove from vfs list. 3418 */ 3419 vfsp->vfs_prev->vfs_next = vfsp->vfs_next; 3420 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev; 3421 vfsp->vfs_next = vfsp->vfs_prev = NULL; 3422 3423 /* 3424 * Remove from zone-specific vfs list. 3425 */ 3426 if (zone->zone_vfslist == vfsp) 3427 zone->zone_vfslist = vfsp->vfs_zone_next; 3428 3429 if (vfsp->vfs_zone_next == vfsp) { 3430 ASSERT(vfsp->vfs_zone_prev == vfsp); 3431 ASSERT(zone->zone_vfslist == vfsp); 3432 zone->zone_vfslist = NULL; 3433 } 3434 3435 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next; 3436 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev; 3437 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL; 3438 3439 /* 3440 * update the mnttab modification time 3441 */ 3442 vfs_mnttab_modtimeupd(); 3443 vfs_list_unlock(); 3444 zone_rele(zone); 3445 } 3446 3447 struct vfs * 3448 getvfs(fsid_t *fsid) 3449 { 3450 struct vfs *vfsp; 3451 int val0 = fsid->val[0]; 3452 int val1 = fsid->val[1]; 3453 dev_t dev = expldev(val0); 3454 int vhno = VFSHASH(getmajor(dev), getminor(dev)); 3455 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock; 3456 3457 mutex_enter(hmp); 3458 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) { 3459 if (vfsp->vfs_fsid.val[0] == val0 && 3460 vfsp->vfs_fsid.val[1] == val1) { 3461 VFS_HOLD(vfsp); 3462 mutex_exit(hmp); 3463 return (vfsp); 3464 } 3465 } 3466 mutex_exit(hmp); 3467 return (NULL); 3468 } 3469 3470 /* 3471 * Search the vfs mount in progress list for a specified device/vfs entry. 3472 * Returns 0 if the first entry in the list that the device matches has the 3473 * given vfs pointer as well. If the device matches but a different vfs 3474 * pointer is encountered in the list before the given vfs pointer then 3475 * a 1 is returned. 3476 */ 3477 3478 int 3479 vfs_devmounting(dev_t dev, struct vfs *vfsp) 3480 { 3481 int retval = 0; 3482 struct ipmnt *mipp; 3483 3484 mutex_enter(&vfs_miplist_mutex); 3485 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) { 3486 if (mipp->mip_dev == dev) { 3487 if (mipp->mip_vfsp != vfsp) 3488 retval = 1; 3489 break; 3490 } 3491 } 3492 mutex_exit(&vfs_miplist_mutex); 3493 return (retval); 3494 } 3495 3496 /* 3497 * Search the vfs list for a specified device. Returns 1, if entry is found 3498 * or 0 if no suitable entry is found. 3499 */ 3500 3501 int 3502 vfs_devismounted(dev_t dev) 3503 { 3504 struct vfs *vfsp; 3505 int found; 3506 3507 vfs_list_read_lock(); 3508 vfsp = rootvfs; 3509 found = 0; 3510 do { 3511 if (vfsp->vfs_dev == dev) { 3512 found = 1; 3513 break; 3514 } 3515 vfsp = vfsp->vfs_next; 3516 } while (vfsp != rootvfs); 3517 3518 vfs_list_unlock(); 3519 return (found); 3520 } 3521 3522 /* 3523 * Search the vfs list for a specified device. Returns a pointer to it 3524 * or NULL if no suitable entry is found. The caller of this routine 3525 * is responsible for releasing the returned vfs pointer. 3526 */ 3527 struct vfs * 3528 vfs_dev2vfsp(dev_t dev) 3529 { 3530 struct vfs *vfsp; 3531 int found; 3532 3533 vfs_list_read_lock(); 3534 vfsp = rootvfs; 3535 found = 0; 3536 do { 3537 /* 3538 * The following could be made more efficient by making 3539 * the entire loop use vfs_zone_next if the call is from 3540 * a zone. The only callers, however, ustat(2) and 3541 * umount2(2), don't seem to justify the added 3542 * complexity at present. 3543 */ 3544 if (vfsp->vfs_dev == dev && 3545 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt), 3546 curproc->p_zone)) { 3547 VFS_HOLD(vfsp); 3548 found = 1; 3549 break; 3550 } 3551 vfsp = vfsp->vfs_next; 3552 } while (vfsp != rootvfs); 3553 vfs_list_unlock(); 3554 return (found ? vfsp: NULL); 3555 } 3556 3557 /* 3558 * Search the vfs list for a specified mntpoint. Returns a pointer to it 3559 * or NULL if no suitable entry is found. The caller of this routine 3560 * is responsible for releasing the returned vfs pointer. 3561 * 3562 * Note that if multiple mntpoints match, the last one matching is 3563 * returned in an attempt to return the "top" mount when overlay 3564 * mounts are covering the same mount point. This is accomplished by starting 3565 * at the end of the list and working our way backwards, stopping at the first 3566 * matching mount. 3567 */ 3568 struct vfs * 3569 vfs_mntpoint2vfsp(const char *mp) 3570 { 3571 struct vfs *vfsp; 3572 struct vfs *retvfsp = NULL; 3573 zone_t *zone = curproc->p_zone; 3574 struct vfs *list; 3575 3576 vfs_list_read_lock(); 3577 if (getzoneid() == GLOBAL_ZONEID) { 3578 /* 3579 * The global zone may see filesystems in any zone. 3580 */ 3581 vfsp = rootvfs->vfs_prev; 3582 do { 3583 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) { 3584 retvfsp = vfsp; 3585 break; 3586 } 3587 vfsp = vfsp->vfs_prev; 3588 } while (vfsp != rootvfs->vfs_prev); 3589 } else if ((list = zone->zone_vfslist) != NULL) { 3590 const char *mntpt; 3591 3592 vfsp = list->vfs_zone_prev; 3593 do { 3594 mntpt = refstr_value(vfsp->vfs_mntpt); 3595 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); 3596 if (strcmp(mntpt, mp) == 0) { 3597 retvfsp = vfsp; 3598 break; 3599 } 3600 vfsp = vfsp->vfs_zone_prev; 3601 } while (vfsp != list->vfs_zone_prev); 3602 } 3603 if (retvfsp) 3604 VFS_HOLD(retvfsp); 3605 vfs_list_unlock(); 3606 return (retvfsp); 3607 } 3608 3609 /* 3610 * Search the vfs list for a specified vfsops. 3611 * if vfs entry is found then return 1, else 0. 3612 */ 3613 int 3614 vfs_opsinuse(vfsops_t *ops) 3615 { 3616 struct vfs *vfsp; 3617 int found; 3618 3619 vfs_list_read_lock(); 3620 vfsp = rootvfs; 3621 found = 0; 3622 do { 3623 if (vfs_getops(vfsp) == ops) { 3624 found = 1; 3625 break; 3626 } 3627 vfsp = vfsp->vfs_next; 3628 } while (vfsp != rootvfs); 3629 vfs_list_unlock(); 3630 return (found); 3631 } 3632 3633 /* 3634 * Allocate an entry in vfssw for a file system type 3635 */ 3636 struct vfssw * 3637 allocate_vfssw(char *type) 3638 { 3639 struct vfssw *vswp; 3640 3641 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) { 3642 /* 3643 * The vfssw table uses the empty string to identify an 3644 * available entry; we cannot add any type which has 3645 * a leading NUL. The string length is limited to 3646 * the size of the st_fstype array in struct stat. 3647 */ 3648 return (NULL); 3649 } 3650 3651 ASSERT(VFSSW_WRITE_LOCKED()); 3652 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) 3653 if (!ALLOCATED_VFSSW(vswp)) { 3654 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP); 3655 (void) strcpy(vswp->vsw_name, type); 3656 ASSERT(vswp->vsw_count == 0); 3657 vswp->vsw_count = 1; 3658 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL); 3659 return (vswp); 3660 } 3661 return (NULL); 3662 } 3663 3664 /* 3665 * Impose additional layer of translation between vfstype names 3666 * and module names in the filesystem. 3667 */ 3668 static char * 3669 vfs_to_modname(char *vfstype) 3670 { 3671 if (strcmp(vfstype, "proc") == 0) { 3672 vfstype = "procfs"; 3673 } else if (strcmp(vfstype, "fd") == 0) { 3674 vfstype = "fdfs"; 3675 } else if (strncmp(vfstype, "nfs", 3) == 0) { 3676 vfstype = "nfs"; 3677 } 3678 3679 return (vfstype); 3680 } 3681 3682 /* 3683 * Find a vfssw entry given a file system type name. 3684 * Try to autoload the filesystem if it's not found. 3685 * If it's installed, return the vfssw locked to prevent unloading. 3686 */ 3687 struct vfssw * 3688 vfs_getvfssw(char *type) 3689 { 3690 struct vfssw *vswp; 3691 char *modname; 3692 3693 RLOCK_VFSSW(); 3694 vswp = vfs_getvfsswbyname(type); 3695 modname = vfs_to_modname(type); 3696 3697 if (rootdir == NULL) { 3698 /* 3699 * If we haven't yet loaded the root file system, then our 3700 * _init won't be called until later. Allocate vfssw entry, 3701 * because mod_installfs won't be called. 3702 */ 3703 if (vswp == NULL) { 3704 RUNLOCK_VFSSW(); 3705 WLOCK_VFSSW(); 3706 if ((vswp = vfs_getvfsswbyname(type)) == NULL) { 3707 if ((vswp = allocate_vfssw(type)) == NULL) { 3708 WUNLOCK_VFSSW(); 3709 return (NULL); 3710 } 3711 } 3712 WUNLOCK_VFSSW(); 3713 RLOCK_VFSSW(); 3714 } 3715 if (!VFS_INSTALLED(vswp)) { 3716 RUNLOCK_VFSSW(); 3717 (void) modloadonly("fs", modname); 3718 } else 3719 RUNLOCK_VFSSW(); 3720 return (vswp); 3721 } 3722 3723 /* 3724 * Try to load the filesystem. Before calling modload(), we drop 3725 * our lock on the VFS switch table, and pick it up after the 3726 * module is loaded. However, there is a potential race: the 3727 * module could be unloaded after the call to modload() completes 3728 * but before we pick up the lock and drive on. Therefore, 3729 * we keep reloading the module until we've loaded the module 3730 * _and_ we have the lock on the VFS switch table. 3731 */ 3732 while (vswp == NULL || !VFS_INSTALLED(vswp)) { 3733 RUNLOCK_VFSSW(); 3734 if (modload("fs", modname) == -1) 3735 return (NULL); 3736 RLOCK_VFSSW(); 3737 if (vswp == NULL) 3738 if ((vswp = vfs_getvfsswbyname(type)) == NULL) 3739 break; 3740 } 3741 RUNLOCK_VFSSW(); 3742 3743 return (vswp); 3744 } 3745 3746 /* 3747 * Find a vfssw entry given a file system type name. 3748 */ 3749 struct vfssw * 3750 vfs_getvfsswbyname(char *type) 3751 { 3752 struct vfssw *vswp; 3753 3754 ASSERT(VFSSW_LOCKED()); 3755 if (type == NULL || *type == '\0') 3756 return (NULL); 3757 3758 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3759 if (strcmp(type, vswp->vsw_name) == 0) { 3760 vfs_refvfssw(vswp); 3761 return (vswp); 3762 } 3763 } 3764 3765 return (NULL); 3766 } 3767 3768 /* 3769 * Find a vfssw entry given a set of vfsops. 3770 */ 3771 struct vfssw * 3772 vfs_getvfsswbyvfsops(vfsops_t *vfsops) 3773 { 3774 struct vfssw *vswp; 3775 3776 RLOCK_VFSSW(); 3777 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3778 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) { 3779 vfs_refvfssw(vswp); 3780 RUNLOCK_VFSSW(); 3781 return (vswp); 3782 } 3783 } 3784 RUNLOCK_VFSSW(); 3785 3786 return (NULL); 3787 } 3788 3789 /* 3790 * Reference a vfssw entry. 3791 */ 3792 void 3793 vfs_refvfssw(struct vfssw *vswp) 3794 { 3795 3796 mutex_enter(&vswp->vsw_lock); 3797 vswp->vsw_count++; 3798 mutex_exit(&vswp->vsw_lock); 3799 } 3800 3801 /* 3802 * Unreference a vfssw entry. 3803 */ 3804 void 3805 vfs_unrefvfssw(struct vfssw *vswp) 3806 { 3807 3808 mutex_enter(&vswp->vsw_lock); 3809 vswp->vsw_count--; 3810 mutex_exit(&vswp->vsw_lock); 3811 } 3812 3813 int sync_timeout = 30; /* timeout for syncing a page during panic */ 3814 int sync_timeleft; /* portion of sync_timeout remaining */ 3815 3816 static int sync_retries = 20; /* number of retries when not making progress */ 3817 static int sync_triesleft; /* portion of sync_retries remaining */ 3818 3819 static pgcnt_t old_pgcnt, new_pgcnt; 3820 static int new_bufcnt, old_bufcnt; 3821 3822 /* 3823 * Sync all of the mounted filesystems, and then wait for the actual i/o to 3824 * complete. We wait by counting the number of dirty pages and buffers, 3825 * pushing them out using bio_busy() and page_busy(), and then counting again. 3826 * This routine is used during both the uadmin A_SHUTDOWN code as well as 3827 * the SYNC phase of the panic code (see comments in panic.c). It should only 3828 * be used after some higher-level mechanism has quiesced the system so that 3829 * new writes are not being initiated while we are waiting for completion. 3830 * 3831 * To ensure finite running time, our algorithm uses two timeout mechanisms: 3832 * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and 3833 * sync_triesleft (a progress counter used by the vfs_syncall() loop below). 3834 * Together these ensure that syncing completes if our i/o paths are stuck. 3835 * The counters are declared above so they can be found easily in the debugger. 3836 * 3837 * The sync_timeleft counter is reset by bio_busy() and page_busy() using the 3838 * vfs_syncprogress() subroutine whenever we make progress through the lists of 3839 * pages and buffers. It is decremented and expired by the deadman() cyclic. 3840 * When vfs_syncall() decides it is done, we disable the deadman() counter by 3841 * setting sync_timeleft to zero. This timer guards against vfs_syncall() 3842 * deadlocking or hanging inside of a broken filesystem or driver routine. 3843 * 3844 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make 3845 * sync_retries consecutive calls to bio_busy() and page_busy() without 3846 * decreasing either the number of dirty buffers or dirty pages below the 3847 * lowest count we have seen so far, we give up and return from vfs_syncall(). 3848 * 3849 * Each loop iteration ends with a call to delay() one second to allow time for 3850 * i/o completion and to permit the user time to read our progress messages. 3851 */ 3852 void 3853 vfs_syncall(void) 3854 { 3855 if (rootdir == NULL && !modrootloaded) 3856 return; /* panic during boot - no filesystems yet */ 3857 3858 printf("syncing file systems..."); 3859 vfs_syncprogress(); 3860 sync(); 3861 3862 vfs_syncprogress(); 3863 sync_triesleft = sync_retries; 3864 3865 old_bufcnt = new_bufcnt = INT_MAX; 3866 old_pgcnt = new_pgcnt = ULONG_MAX; 3867 3868 while (sync_triesleft > 0) { 3869 old_bufcnt = MIN(old_bufcnt, new_bufcnt); 3870 old_pgcnt = MIN(old_pgcnt, new_pgcnt); 3871 3872 new_bufcnt = bio_busy(B_TRUE); 3873 new_pgcnt = page_busy(B_TRUE); 3874 vfs_syncprogress(); 3875 3876 if (new_bufcnt == 0 && new_pgcnt == 0) 3877 break; 3878 3879 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt) 3880 sync_triesleft = sync_retries; 3881 else 3882 sync_triesleft--; 3883 3884 if (new_bufcnt) 3885 printf(" [%d]", new_bufcnt); 3886 if (new_pgcnt) 3887 printf(" %lu", new_pgcnt); 3888 3889 delay(hz); 3890 } 3891 3892 if (new_bufcnt != 0 || new_pgcnt != 0) 3893 printf(" done (not all i/o completed)\n"); 3894 else 3895 printf(" done\n"); 3896 3897 sync_timeleft = 0; 3898 delay(hz); 3899 } 3900 3901 /* 3902 * If we are in the middle of the sync phase of panic, reset sync_timeleft to 3903 * sync_timeout to indicate that we are making progress and the deadman() 3904 * omnipresent cyclic should not yet time us out. Note that it is safe to 3905 * store to sync_timeleft here since the deadman() is firing at high-level 3906 * on top of us. If we are racing with the deadman(), either the deadman() 3907 * will decrement the old value and then we will reset it, or we will 3908 * reset it and then the deadman() will immediately decrement it. In either 3909 * case, correct behavior results. 3910 */ 3911 void 3912 vfs_syncprogress(void) 3913 { 3914 if (panicstr) 3915 sync_timeleft = sync_timeout; 3916 } 3917 3918 /* 3919 * Map VFS flags to statvfs flags. These shouldn't really be separate 3920 * flags at all. 3921 */ 3922 uint_t 3923 vf_to_stf(uint_t vf) 3924 { 3925 uint_t stf = 0; 3926 3927 if (vf & VFS_RDONLY) 3928 stf |= ST_RDONLY; 3929 if (vf & VFS_NOSETUID) 3930 stf |= ST_NOSUID; 3931 if (vf & VFS_NOTRUNC) 3932 stf |= ST_NOTRUNC; 3933 3934 return (stf); 3935 } 3936 3937 /* 3938 * Entries for (illegal) fstype 0. 3939 */ 3940 /* ARGSUSED */ 3941 int 3942 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr) 3943 { 3944 cmn_err(CE_PANIC, "stray vfs operation"); 3945 return (0); 3946 } 3947 3948 /* 3949 * Entries for (illegal) fstype 0. 3950 */ 3951 int 3952 vfsstray(void) 3953 { 3954 cmn_err(CE_PANIC, "stray vfs operation"); 3955 return (0); 3956 } 3957 3958 /* 3959 * Support for dealing with forced UFS unmount and its interaction with 3960 * LOFS. Could be used by any filesystem. 3961 * See bug 1203132. 3962 */ 3963 int 3964 vfs_EIO(void) 3965 { 3966 return (EIO); 3967 } 3968 3969 /* 3970 * We've gotta define the op for sync separately, since the compiler gets 3971 * confused if we mix and match ANSI and normal style prototypes when 3972 * a "short" argument is present and spits out a warning. 3973 */ 3974 /*ARGSUSED*/ 3975 int 3976 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr) 3977 { 3978 return (EIO); 3979 } 3980 3981 vfs_t EIO_vfs; 3982 vfsops_t *EIO_vfsops; 3983 3984 /* 3985 * Called from startup() to initialize all loaded vfs's 3986 */ 3987 void 3988 vfsinit(void) 3989 { 3990 struct vfssw *vswp; 3991 int error; 3992 extern int vopstats_enabled; 3993 extern void vopstats_startup(); 3994 3995 static const fs_operation_def_t EIO_vfsops_template[] = { 3996 VFSNAME_MOUNT, { .error = vfs_EIO }, 3997 VFSNAME_UNMOUNT, { .error = vfs_EIO }, 3998 VFSNAME_ROOT, { .error = vfs_EIO }, 3999 VFSNAME_STATVFS, { .error = vfs_EIO }, 4000 VFSNAME_SYNC, { .vfs_sync = vfs_EIO_sync }, 4001 VFSNAME_VGET, { .error = vfs_EIO }, 4002 VFSNAME_MOUNTROOT, { .error = vfs_EIO }, 4003 VFSNAME_FREEVFS, { .error = vfs_EIO }, 4004 VFSNAME_VNSTATE, { .error = vfs_EIO }, 4005 NULL, NULL 4006 }; 4007 4008 static const fs_operation_def_t stray_vfsops_template[] = { 4009 VFSNAME_MOUNT, { .error = vfsstray }, 4010 VFSNAME_UNMOUNT, { .error = vfsstray }, 4011 VFSNAME_ROOT, { .error = vfsstray }, 4012 VFSNAME_STATVFS, { .error = vfsstray }, 4013 VFSNAME_SYNC, { .vfs_sync = vfsstray_sync }, 4014 VFSNAME_VGET, { .error = vfsstray }, 4015 VFSNAME_MOUNTROOT, { .error = vfsstray }, 4016 VFSNAME_FREEVFS, { .error = vfsstray }, 4017 VFSNAME_VNSTATE, { .error = vfsstray }, 4018 NULL, NULL 4019 }; 4020 4021 /* Create vfs cache */ 4022 vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs), 4023 sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0); 4024 4025 /* Initialize the vnode cache (file systems may use it during init). */ 4026 vn_create_cache(); 4027 4028 /* Setup event monitor framework */ 4029 fem_init(); 4030 4031 /* Initialize the dummy stray file system type. */ 4032 error = vfs_setfsops(0, stray_vfsops_template, NULL); 4033 4034 /* Initialize the dummy EIO file system. */ 4035 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops); 4036 if (error != 0) { 4037 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template"); 4038 /* Shouldn't happen, but not bad enough to panic */ 4039 } 4040 4041 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL); 4042 4043 /* 4044 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup 4045 * on this vfs can immediately notice it's invalid. 4046 */ 4047 EIO_vfs.vfs_flag |= VFS_UNMOUNTED; 4048 4049 /* 4050 * Call the init routines of non-loadable filesystems only. 4051 * Filesystems which are loaded as separate modules will be 4052 * initialized by the module loading code instead. 4053 */ 4054 4055 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 4056 RLOCK_VFSSW(); 4057 if (vswp->vsw_init != NULL) 4058 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name); 4059 RUNLOCK_VFSSW(); 4060 } 4061 4062 vopstats_startup(); 4063 4064 if (vopstats_enabled) { 4065 /* EIO_vfs can collect stats, but we don't retrieve them */ 4066 initialize_vopstats(&EIO_vfs.vfs_vopstats); 4067 EIO_vfs.vfs_fstypevsp = NULL; 4068 EIO_vfs.vfs_vskap = NULL; 4069 EIO_vfs.vfs_flag |= VFS_STATS; 4070 } 4071 4072 xattr_init(); 4073 } 4074 4075 vfs_t * 4076 vfs_alloc(int kmflag) 4077 { 4078 vfs_t *vfsp; 4079 4080 vfsp = kmem_cache_alloc(vfs_cache, kmflag); 4081 4082 /* 4083 * Do the simplest initialization here. 4084 * Everything else gets done in vfs_init() 4085 */ 4086 bzero(vfsp, sizeof (vfs_t)); 4087 return (vfsp); 4088 } 4089 4090 void 4091 vfs_free(vfs_t *vfsp) 4092 { 4093 /* 4094 * One would be tempted to assert that "vfsp->vfs_count == 0". 4095 * The problem is that this gets called out of domount() with 4096 * a partially initialized vfs and a vfs_count of 1. This is 4097 * also called from vfs_rele() with a vfs_count of 0. We can't 4098 * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully 4099 * returned. This is because VFS_MOUNT() fully initializes the 4100 * vfs structure and its associated data. VFS_RELE() will call 4101 * VFS_FREEVFS() which may panic the system if the data structures 4102 * aren't fully initialized from a successful VFS_MOUNT()). 4103 */ 4104 4105 /* If FEM was in use, make sure everything gets cleaned up */ 4106 if (vfsp->vfs_femhead) { 4107 ASSERT(vfsp->vfs_femhead->femh_list == NULL); 4108 mutex_destroy(&vfsp->vfs_femhead->femh_lock); 4109 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead))); 4110 vfsp->vfs_femhead = NULL; 4111 } 4112 4113 if (vfsp->vfs_implp) 4114 vfsimpl_teardown(vfsp); 4115 sema_destroy(&vfsp->vfs_reflock); 4116 kmem_cache_free(vfs_cache, vfsp); 4117 } 4118 4119 /* 4120 * Increments the vfs reference count by one atomically. 4121 */ 4122 void 4123 vfs_hold(vfs_t *vfsp) 4124 { 4125 atomic_add_32(&vfsp->vfs_count, 1); 4126 ASSERT(vfsp->vfs_count != 0); 4127 } 4128 4129 /* 4130 * Decrements the vfs reference count by one atomically. When 4131 * vfs reference count becomes zero, it calls the file system 4132 * specific vfs_freevfs() to free up the resources. 4133 */ 4134 void 4135 vfs_rele(vfs_t *vfsp) 4136 { 4137 ASSERT(vfsp->vfs_count != 0); 4138 if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) { 4139 VFS_FREEVFS(vfsp); 4140 if (vfsp->vfs_zone) 4141 zone_rele(vfsp->vfs_zone); 4142 vfs_freemnttab(vfsp); 4143 vfs_free(vfsp); 4144 } 4145 } 4146 4147 /* 4148 * Generic operations vector support. 4149 * 4150 * This is used to build operations vectors for both the vfs and vnode. 4151 * It's normally called only when a file system is loaded. 4152 * 4153 * There are many possible algorithms for this, including the following: 4154 * 4155 * (1) scan the list of known operations; for each, see if the file system 4156 * includes an entry for it, and fill it in as appropriate. 4157 * 4158 * (2) set up defaults for all known operations. scan the list of ops 4159 * supplied by the file system; for each which is both supplied and 4160 * known, fill it in. 4161 * 4162 * (3) sort the lists of known ops & supplied ops; scan the list, filling 4163 * in entries as we go. 4164 * 4165 * we choose (1) for simplicity, and because performance isn't critical here. 4166 * note that (2) could be sped up using a precomputed hash table on known ops. 4167 * (3) could be faster than either, but only if the lists were very large or 4168 * supplied in sorted order. 4169 * 4170 */ 4171 4172 int 4173 fs_build_vector(void *vector, int *unused_ops, 4174 const fs_operation_trans_def_t *translation, 4175 const fs_operation_def_t *operations) 4176 { 4177 int i, num_trans, num_ops, used; 4178 4179 /* 4180 * Count the number of translations and the number of supplied 4181 * operations. 4182 */ 4183 4184 { 4185 const fs_operation_trans_def_t *p; 4186 4187 for (num_trans = 0, p = translation; 4188 p->name != NULL; 4189 num_trans++, p++) 4190 ; 4191 } 4192 4193 { 4194 const fs_operation_def_t *p; 4195 4196 for (num_ops = 0, p = operations; 4197 p->name != NULL; 4198 num_ops++, p++) 4199 ; 4200 } 4201 4202 /* Walk through each operation known to our caller. There will be */ 4203 /* one entry in the supplied "translation table" for each. */ 4204 4205 used = 0; 4206 4207 for (i = 0; i < num_trans; i++) { 4208 int j, found; 4209 char *curname; 4210 fs_generic_func_p result; 4211 fs_generic_func_p *location; 4212 4213 curname = translation[i].name; 4214 4215 /* Look for a matching operation in the list supplied by the */ 4216 /* file system. */ 4217 4218 found = 0; 4219 4220 for (j = 0; j < num_ops; j++) { 4221 if (strcmp(operations[j].name, curname) == 0) { 4222 used++; 4223 found = 1; 4224 break; 4225 } 4226 } 4227 4228 /* 4229 * If the file system is using a "placeholder" for default 4230 * or error functions, grab the appropriate function out of 4231 * the translation table. If the file system didn't supply 4232 * this operation at all, use the default function. 4233 */ 4234 4235 if (found) { 4236 result = operations[j].func.fs_generic; 4237 if (result == fs_default) { 4238 result = translation[i].defaultFunc; 4239 } else if (result == fs_error) { 4240 result = translation[i].errorFunc; 4241 } else if (result == NULL) { 4242 /* Null values are PROHIBITED */ 4243 return (EINVAL); 4244 } 4245 } else { 4246 result = translation[i].defaultFunc; 4247 } 4248 4249 /* Now store the function into the operations vector. */ 4250 4251 location = (fs_generic_func_p *) 4252 (((char *)vector) + translation[i].offset); 4253 4254 *location = result; 4255 } 4256 4257 *unused_ops = num_ops - used; 4258 4259 return (0); 4260 } 4261 4262 /* Placeholder functions, should never be called. */ 4263 4264 int 4265 fs_error(void) 4266 { 4267 cmn_err(CE_PANIC, "fs_error called"); 4268 return (0); 4269 } 4270 4271 int 4272 fs_default(void) 4273 { 4274 cmn_err(CE_PANIC, "fs_default called"); 4275 return (0); 4276 } 4277 4278 #ifdef __sparc 4279 4280 /* 4281 * Part of the implementation of booting off a mirrored root 4282 * involves a change of dev_t for the root device. To 4283 * accomplish this, first remove the existing hash table 4284 * entry for the root device, convert to the new dev_t, 4285 * then re-insert in the hash table at the head of the list. 4286 */ 4287 void 4288 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype) 4289 { 4290 vfs_list_lock(); 4291 4292 vfs_hash_remove(vfsp); 4293 4294 vfsp->vfs_dev = ndev; 4295 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype); 4296 4297 vfs_hash_add(vfsp, 1); 4298 4299 vfs_list_unlock(); 4300 } 4301 4302 #else /* x86 NEWBOOT */ 4303 4304 #if defined(__x86) 4305 extern int hvmboot_rootconf(); 4306 #endif /* __x86 */ 4307 4308 int 4309 rootconf() 4310 { 4311 int error; 4312 struct vfssw *vsw; 4313 extern void pm_init(); 4314 char *fstyp, *fsmod; 4315 4316 getrootfs(&fstyp, &fsmod); 4317 4318 #if defined(__x86) 4319 /* 4320 * hvmboot_rootconf() is defined in the hvm_bootstrap misc module, 4321 * which lives in /platform/i86hvm, and hence is only available when 4322 * booted in an x86 hvm environment. If the hvm_bootstrap misc module 4323 * is not available then the modstub for this function will return 0. 4324 * If the hvm_bootstrap misc module is available it will be loaded 4325 * and hvmboot_rootconf() will be invoked. 4326 */ 4327 if (error = hvmboot_rootconf()) 4328 return (error); 4329 #endif /* __x86 */ 4330 4331 if (error = clboot_rootconf()) 4332 return (error); 4333 4334 if (modload("fs", fsmod) == -1) 4335 panic("Cannot _init %s module", fsmod); 4336 4337 RLOCK_VFSSW(); 4338 vsw = vfs_getvfsswbyname(fstyp); 4339 RUNLOCK_VFSSW(); 4340 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0); 4341 VFS_HOLD(rootvfs); 4342 4343 /* always mount readonly first */ 4344 rootvfs->vfs_flag |= VFS_RDONLY; 4345 4346 pm_init(); 4347 4348 if (netboot) 4349 (void) strplumb(); 4350 4351 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT); 4352 vfs_unrefvfssw(vsw); 4353 rootdev = rootvfs->vfs_dev; 4354 4355 if (error) 4356 panic("cannot mount root path %s", rootfs.bo_name); 4357 return (error); 4358 } 4359 4360 /* 4361 * XXX this is called by nfs only and should probably be removed 4362 * If booted with ASKNAME, prompt on the console for a filesystem 4363 * name and return it. 4364 */ 4365 void 4366 getfsname(char *askfor, char *name, size_t namelen) 4367 { 4368 if (boothowto & RB_ASKNAME) { 4369 printf("%s name: ", askfor); 4370 console_gets(name, namelen); 4371 } 4372 } 4373 4374 /* 4375 * If server_path exists, then we are booting a diskless 4376 * client. Otherwise, we default to ufs. Zfs should perhaps be 4377 * another property. 4378 */ 4379 static void 4380 getrootfs(char **fstypp, char **fsmodp) 4381 { 4382 extern char *strplumb_get_netdev_path(void); 4383 char *propstr = NULL; 4384 4385 /* check fstype property; it should be nfsdyn for diskless */ 4386 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4387 DDI_PROP_DONTPASS, "fstype", &propstr) 4388 == DDI_SUCCESS) { 4389 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME); 4390 ddi_prop_free(propstr); 4391 4392 /* 4393 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set, 4394 * assume the type of this root filesystem is 'zfs'. 4395 */ 4396 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4397 DDI_PROP_DONTPASS, "zfs-bootfs", &propstr) 4398 == DDI_SUCCESS) { 4399 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME); 4400 ddi_prop_free(propstr); 4401 } 4402 4403 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) { 4404 *fstypp = *fsmodp = rootfs.bo_fstype; 4405 return; 4406 } 4407 4408 ++netboot; 4409 /* 4410 * check if path to network interface is specified in bootpath 4411 * or by a hypervisor domain configuration file. 4412 * XXPV - enable strlumb_get_netdev_path() 4413 */ 4414 if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS, 4415 "xpv-nfsroot")) { 4416 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0"); 4417 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4418 DDI_PROP_DONTPASS, "bootpath", &propstr) 4419 == DDI_SUCCESS) { 4420 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME); 4421 ddi_prop_free(propstr); 4422 } else { 4423 /* attempt to determine netdev_path via boot_mac address */ 4424 netdev_path = strplumb_get_netdev_path(); 4425 if (netdev_path == NULL) 4426 panic("cannot find boot network interface"); 4427 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME); 4428 } 4429 *fstypp = rootfs.bo_fstype; 4430 *fsmodp = "nfs"; 4431 } 4432 #endif 4433 4434 /* 4435 * VFS feature routines 4436 */ 4437 4438 #define VFTINDEX(feature) (((feature) >> 32) & 0xFFFFFFFF) 4439 #define VFTBITS(feature) ((feature) & 0xFFFFFFFFLL) 4440 4441 /* Register a feature in the vfs */ 4442 void 4443 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature) 4444 { 4445 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */ 4446 if (vfsp->vfs_implp == NULL) 4447 return; 4448 4449 vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature); 4450 } 4451 4452 /* 4453 * Query a vfs for a feature. 4454 * Returns 1 if feature is present, 0 if not 4455 */ 4456 int 4457 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature) 4458 { 4459 int ret = 0; 4460 4461 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */ 4462 if (vfsp->vfs_implp == NULL) 4463 return (ret); 4464 4465 if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature)) 4466 ret = 1; 4467 4468 return (ret); 4469 } 4470 4471 /* 4472 * Propagate feature set from one vfs to another 4473 */ 4474 void 4475 vfs_propagate_features(vfs_t *from, vfs_t *to) 4476 { 4477 int i; 4478 4479 if (to->vfs_implp == NULL || from->vfs_implp == NULL) 4480 return; 4481 4482 for (i = 1; i <= to->vfs_featureset[0]; i++) { 4483 to->vfs_featureset[i] = from->vfs_featureset[i]; 4484 } 4485 } 4486