1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 
37 */ 38 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 #include <sys/types.h> 43 #include <sys/t_lock.h> 44 #include <sys/param.h> 45 #include <sys/errno.h> 46 #include <sys/user.h> 47 #include <sys/fstyp.h> 48 #include <sys/kmem.h> 49 #include <sys/systm.h> 50 #include <sys/proc.h> 51 #include <sys/mount.h> 52 #include <sys/vfs.h> 53 #include <sys/fem.h> 54 #include <sys/mntent.h> 55 #include <sys/stat.h> 56 #include <sys/statvfs.h> 57 #include <sys/statfs.h> 58 #include <sys/cred.h> 59 #include <sys/vnode.h> 60 #include <sys/rwstlock.h> 61 #include <sys/dnlc.h> 62 #include <sys/file.h> 63 #include <sys/time.h> 64 #include <sys/atomic.h> 65 #include <sys/cmn_err.h> 66 #include <sys/buf.h> 67 #include <sys/swap.h> 68 #include <sys/debug.h> 69 #include <sys/vnode.h> 70 #include <sys/modctl.h> 71 #include <sys/ddi.h> 72 #include <sys/pathname.h> 73 #include <sys/bootconf.h> 74 #include <sys/dumphdr.h> 75 #include <sys/dc_ki.h> 76 #include <sys/poll.h> 77 #include <sys/sunddi.h> 78 #include <sys/sysmacros.h> 79 #include <sys/zone.h> 80 #include <sys/policy.h> 81 #include <sys/ctfs.h> 82 #include <sys/objfs.h> 83 #include <sys/console.h> 84 #include <sys/reboot.h> 85 86 #include <vm/page.h> 87 88 #include <fs/fs_subr.h> 89 90 /* Private interfaces to create vopstats-related data structures */ 91 extern void initialize_vopstats(vopstats_t *); 92 extern vopstats_t *get_fstype_vopstats(struct vfs *, struct vfssw *); 93 extern vsk_anchor_t *get_vskstat_anchor(struct vfs *); 94 95 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int); 96 static void vfs_setmntopt_nolock(mntopts_t *, const char *, 97 const char *, int, int); 98 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **); 99 static void vfs_freemnttab(struct vfs *); 100 static void vfs_freeopt(mntopt_t *); 101 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *); 102 static void vfs_swapopttbl(mntopts_t *, mntopts_t *); 103 static void vfs_copyopttbl_extend(const mntopts_t 
*, mntopts_t *, int); 104 static void vfs_createopttbl_extend(mntopts_t *, const char *, 105 const mntopts_t *); 106 static char **vfs_copycancelopt_extend(char **const, int); 107 static void vfs_freecancelopt(char **); 108 static char *getrootfs(void); 109 static int getmacpath(dev_info_t *, void *); 110 111 struct ipmnt { 112 struct ipmnt *mip_next; 113 dev_t mip_dev; 114 struct vfs *mip_vfsp; 115 }; 116 117 static kmutex_t vfs_miplist_mutex; 118 static struct ipmnt *vfs_miplist = NULL; 119 static struct ipmnt *vfs_miplist_end = NULL; 120 121 /* 122 * VFS global data. 123 */ 124 vnode_t *rootdir; /* pointer to root inode vnode. */ 125 vnode_t *devicesdir; /* pointer to inode of devices root */ 126 127 char *server_rootpath; /* root path for diskless clients */ 128 char *server_hostname; /* hostname of diskless server */ 129 130 static struct vfs root; 131 static struct vfs devices; 132 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */ 133 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */ 134 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */ 135 /* must be power of 2! */ 136 timespec_t vfs_mnttab_ctime; /* mnttab created time */ 137 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */ 138 char *vfs_dummyfstype = "\0"; 139 struct pollhead vfs_pollhd; /* for mnttab pollers */ 140 141 /* 142 * Table for generic options recognized in the VFS layer and acted 143 * on at this level before parsing file system specific options. 144 * The nosuid option is stronger than any of the devices and setuid 145 * options, so those are canceled when nosuid is seen. 146 * 147 * All options which are added here need to be added to the 148 * list of standard options in usr/src/cmd/fs.d/fslib.c as well. 
149 */ 150 /* 151 * VFS Mount options table 152 */ 153 static char *ro_cancel[] = { MNTOPT_RW, NULL }; 154 static char *rw_cancel[] = { MNTOPT_RO, NULL }; 155 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL }; 156 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES, 157 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL }; 158 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL }; 159 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL }; 160 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL }; 161 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL }; 162 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL }; 163 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL }; 164 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL }; 165 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL }; 166 167 static const mntopt_t mntopts[] = { 168 /* 169 * option name cancel options default arg flags 170 */ 171 { MNTOPT_REMOUNT, NULL, NULL, 172 MO_NODISPLAY, (void *)0 }, 173 { MNTOPT_RO, ro_cancel, NULL, 0, 174 (void *)0 }, 175 { MNTOPT_RW, rw_cancel, NULL, 0, 176 (void *)0 }, 177 { MNTOPT_SUID, suid_cancel, NULL, 0, 178 (void *)0 }, 179 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0, 180 (void *)0 }, 181 { MNTOPT_DEVICES, devices_cancel, NULL, 0, 182 (void *)0 }, 183 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0, 184 (void *)0 }, 185 { MNTOPT_SETUID, setuid_cancel, NULL, 0, 186 (void *)0 }, 187 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0, 188 (void *)0 }, 189 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0, 190 (void *)0 }, 191 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0, 192 (void *)0 }, 193 { MNTOPT_EXEC, exec_cancel, NULL, 0, 194 (void *)0 }, 195 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0, 196 (void *)0 }, 197 }; 198 199 const mntopts_t vfs_mntopts = { 200 sizeof (mntopts) / sizeof (mntopt_t), 201 (mntopt_t *)&mntopts[0] 202 }; 203 204 /* 205 * File system operation dispatch functions. 
206 */ 207 208 int 209 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 210 { 211 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr); 212 } 213 214 int 215 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr) 216 { 217 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr); 218 } 219 220 int 221 fsop_root(vfs_t *vfsp, vnode_t **vpp) 222 { 223 refstr_t *mntpt; 224 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp); 225 /* 226 * Make sure this root has a path. With lofs, it is possible to have 227 * a NULL mountpoint. 228 */ 229 if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { 230 mntpt = vfs_getmntpoint(vfsp); 231 vn_setpath_str(*vpp, refstr_value(mntpt), 232 strlen(refstr_value(mntpt))); 233 refstr_rele(mntpt); 234 } 235 236 return (ret); 237 } 238 239 int 240 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp) 241 { 242 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp); 243 } 244 245 int 246 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr) 247 { 248 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr); 249 } 250 251 int 252 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 253 { 254 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp); 255 } 256 257 int 258 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason) 259 { 260 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason); 261 } 262 263 void 264 fsop_freefs(vfs_t *vfsp) 265 { 266 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp); 267 } 268 269 int 270 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate) 271 { 272 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate)); 273 } 274 275 int 276 fsop_sync_by_kind(int fstype, short flag, cred_t *cr) 277 { 278 ASSERT((fstype >= 0) && (fstype < nfstype)); 279 280 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype])) 281 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr); 282 else 283 return (ENOTSUP); 284 } 285 286 /* 287 * File system initialization. vfs_setfsops() must be called from a file 288 * system's init routine. 
289 */ 290 291 static int 292 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual, 293 int *unused_ops) 294 { 295 static const fs_operation_trans_def_t vfs_ops_table[] = { 296 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount), 297 fs_nosys, fs_nosys, 298 299 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount), 300 fs_nosys, fs_nosys, 301 302 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root), 303 fs_nosys, fs_nosys, 304 305 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs), 306 fs_nosys, fs_nosys, 307 308 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync), 309 (fs_generic_func_p) fs_sync, 310 (fs_generic_func_p) fs_sync, /* No errors allowed */ 311 312 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget), 313 fs_nosys, fs_nosys, 314 315 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot), 316 fs_nosys, fs_nosys, 317 318 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs), 319 (fs_generic_func_p)fs_freevfs, 320 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */ 321 322 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate), 323 (fs_generic_func_p)fs_nosys, 324 (fs_generic_func_p)fs_nosys, 325 326 NULL, 0, NULL, NULL 327 }; 328 329 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template)); 330 } 331 332 int 333 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual) 334 { 335 int error; 336 int unused_ops; 337 338 /* Verify that fstype refers to a loaded fs (and not fsid 0). */ 339 340 if ((fstype <= 0) || (fstype >= nfstype)) 341 return (EINVAL); 342 343 if (!ALLOCATED_VFSSW(&vfssw[fstype])) 344 return (EINVAL); 345 346 /* Set up the operations vector. 
*/ 347 348 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops); 349 350 if (error != 0) 351 return (error); 352 353 vfssw[fstype].vsw_flag |= VSW_INSTALLED; 354 355 if (actual != NULL) 356 *actual = &vfssw[fstype].vsw_vfsops; 357 358 #if DEBUG 359 if (unused_ops != 0) 360 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied " 361 "but not used", vfssw[fstype].vsw_name, unused_ops); 362 #endif 363 364 return (0); 365 } 366 367 int 368 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual) 369 { 370 int error; 371 int unused_ops; 372 373 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP); 374 375 error = fs_copyfsops(template, *actual, &unused_ops); 376 if (error != 0) { 377 kmem_free(*actual, sizeof (vfsops_t)); 378 *actual = NULL; 379 return (error); 380 } 381 382 return (0); 383 } 384 385 /* 386 * Free a vfsops structure created as a result of vfs_makefsops(). 387 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use 388 * vfs_freevfsops_by_type(). 389 */ 390 void 391 vfs_freevfsops(vfsops_t *vfsops) 392 { 393 kmem_free(vfsops, sizeof (vfsops_t)); 394 } 395 396 /* 397 * Since the vfsops structure is part of the vfssw table and wasn't 398 * really allocated, we're not really freeing anything. We keep 399 * the name for consistency with vfs_freevfsops(). We do, however, 400 * need to take care of a little bookkeeping. 401 * NOTE: For a vfsops structure created by vfs_setfsops(), use 402 * vfs_freevfsops_by_type(). 403 */ 404 int 405 vfs_freevfsops_by_type(int fstype) 406 { 407 408 /* Verify that fstype refers to a loaded fs (and not fsid 0). 
*/ 409 if ((fstype <= 0) || (fstype >= nfstype)) 410 return (EINVAL); 411 412 WLOCK_VFSSW(); 413 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) { 414 WUNLOCK_VFSSW(); 415 return (EINVAL); 416 } 417 418 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED; 419 WUNLOCK_VFSSW(); 420 421 return (0); 422 } 423 424 /* Support routines used to reference vfs_op */ 425 426 /* Set the operations vector for a vfs */ 427 void 428 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops) 429 { 430 vfsops_t *op; 431 432 ASSERT(vfsp != NULL); 433 ASSERT(vfsops != NULL); 434 435 op = vfsp->vfs_op; 436 membar_consumer(); 437 if (vfsp->vfs_femhead == NULL && 438 casptr(&vfsp->vfs_op, op, vfsops) == op) { 439 return; 440 } 441 fsem_setvfsops(vfsp, vfsops); 442 } 443 444 /* Retrieve the operations vector for a vfs */ 445 vfsops_t * 446 vfs_getops(vfs_t *vfsp) 447 { 448 vfsops_t *op; 449 450 ASSERT(vfsp != NULL); 451 452 op = vfsp->vfs_op; 453 membar_consumer(); 454 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) { 455 return (op); 456 } else { 457 return (fsem_getvfsops(vfsp)); 458 } 459 } 460 461 /* 462 * Returns non-zero (1) if the vfsops matches that of the vfs. 463 * Returns zero (0) if not. 464 */ 465 int 466 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops) 467 { 468 return (vfs_getops(vfsp) == vfsops); 469 } 470 471 /* 472 * Returns non-zero (1) if the file system has installed a non-default, 473 * non-error vfs_sync routine. Returns zero (0) otherwise. 474 */ 475 int 476 vfs_can_sync(vfs_t *vfsp) 477 { 478 /* vfs_sync() routine is not the default/error function */ 479 return (vfs_getops(vfsp)->vfs_sync != fs_sync); 480 } 481 482 /* 483 * Initialize a vfs structure. 
484 */ 485 void 486 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data) 487 { 488 vfsp->vfs_count = 0; 489 vfsp->vfs_next = vfsp; 490 vfsp->vfs_prev = vfsp; 491 vfsp->vfs_zone_next = vfsp; 492 vfsp->vfs_zone_prev = vfsp; 493 vfsp->vfs_flag = 0; 494 vfsp->vfs_data = (data); 495 vfsp->vfs_resource = NULL; 496 vfsp->vfs_mntpt = NULL; 497 vfsp->vfs_mntopts.mo_count = 0; 498 vfsp->vfs_mntopts.mo_list = NULL; 499 vfsp->vfs_femhead = NULL; 500 vfsp->vfs_zone = NULL; 501 /* 502 * Note: Don't initialize vfs_vskap, vfs_fstypevsp since it 503 * could be a problem for unbundled file systems. 504 */ 505 vfs_setops((vfsp), (op)); 506 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL); 507 } 508 509 510 /* 511 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs, 512 * fstatvfs, and sysfs moved to common/syscall. 513 */ 514 515 /* 516 * Update every mounted file system. We call the vfs_sync operation of 517 * each file system type, passing it a NULL vfsp to indicate that all 518 * mounted file systems of that type should be updated. 519 */ 520 void 521 vfs_sync(int flag) 522 { 523 struct vfssw *vswp; 524 RLOCK_VFSSW(); 525 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 526 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 527 vfs_refvfssw(vswp); 528 RUNLOCK_VFSSW(); 529 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag, 530 CRED()); 531 vfs_unrefvfssw(vswp); 532 RLOCK_VFSSW(); 533 } 534 } 535 RUNLOCK_VFSSW(); 536 } 537 538 void 539 sync(void) 540 { 541 vfs_sync(0); 542 } 543 544 /* 545 * External routines. 546 */ 547 548 krwlock_t vfssw_lock; /* lock accesses to vfssw */ 549 550 /* 551 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(), 552 * but otherwise should be accessed only via vfs_list_lock() and 553 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list. 554 */ 555 static krwlock_t vfslist; 556 557 /* 558 * Mount devfs on /devices. 
This is done right after root is mounted 559 * to provide device access support for the system 560 */ 561 static void 562 vfs_mountdevices(void) 563 { 564 struct vfssw *vsw; 565 struct vnode *mvp; 566 struct mounta mounta = { /* fake mounta for devfs_mount() */ 567 NULL, 568 NULL, 569 MS_SYSSPACE, 570 NULL, 571 NULL, 572 0, 573 NULL, 574 0 575 }; 576 577 /* 578 * _init devfs module to fill in the vfssw 579 */ 580 if (modload("fs", "devfs") == -1) 581 cmn_err(CE_PANIC, "Cannot _init devfs module\n"); 582 583 /* 584 * Hold vfs 585 */ 586 RLOCK_VFSSW(); 587 vsw = vfs_getvfsswbyname("devfs"); 588 VFS_INIT(&devices, &vsw->vsw_vfsops, NULL); 589 VFS_HOLD(&devices); 590 591 /* 592 * Locate mount point 593 */ 594 if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 595 cmn_err(CE_PANIC, "Cannot find /devices\n"); 596 597 /* 598 * Perform the mount of /devices 599 */ 600 if (VFS_MOUNT(&devices, mvp, &mounta, CRED())) 601 cmn_err(CE_PANIC, "Cannot mount /devices\n"); 602 603 RUNLOCK_VFSSW(); 604 605 /* 606 * Set appropriate members and add to vfs list for mnttab display 607 */ 608 vfs_setresource(&devices, "/devices"); 609 vfs_setmntpoint(&devices, "/devices"); 610 611 /* 612 * Hold the root of /devices so it won't go away 613 */ 614 if (VFS_ROOT(&devices, &devicesdir)) 615 cmn_err(CE_PANIC, "vfs_mountdevices: not devices root"); 616 VN_HOLD(devicesdir); 617 618 if (vfs_lock(&devices) != 0) { 619 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices"); 620 return; 621 } 622 623 if (vn_vfswlock(mvp) != 0) { 624 vfs_unlock(&devices); 625 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices"); 626 return; 627 } 628 629 vfs_add(mvp, &devices, 0); 630 vn_vfsunlock(mvp); 631 vfs_unlock(&devices); 632 } 633 634 /* 635 * Mount required filesystem. This is done right after root is mounted. 
636 */ 637 static void 638 vfs_mountfs(char *module, char *spec, char *path) 639 { 640 struct vnode *mvp; 641 struct mounta mounta; 642 vfs_t *vfsp; 643 644 mounta.flags = MS_SYSSPACE | MS_DATA; 645 mounta.fstype = module; 646 mounta.spec = spec; 647 mounta.dir = path; 648 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) { 649 cmn_err(CE_WARN, "Cannot find %s\n", path); 650 return; 651 } 652 if (domount(NULL, &mounta, mvp, CRED(), &vfsp)) 653 cmn_err(CE_WARN, "Cannot mount %s\n", path); 654 else 655 VFS_RELE(vfsp); 656 VN_RELE(mvp); 657 } 658 659 /* 660 * vfs_mountroot is called by main() to mount the root filesystem. 661 */ 662 void 663 vfs_mountroot(void) 664 { 665 struct vnode *rvp = NULL; 666 char *path; 667 size_t plen; 668 struct vfssw *vswp; 669 670 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL); 671 rw_init(&vfslist, NULL, RW_DEFAULT, NULL); 672 673 /* 674 * Alloc the vfs hash bucket array and locks 675 */ 676 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP); 677 678 /* 679 * Call machine-dependent routine "rootconf" to choose a root 680 * file system type. 681 */ 682 if (rootconf()) 683 cmn_err(CE_PANIC, "vfs_mountroot: cannot mount root"); 684 /* 685 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir 686 * to point to it. These are used by lookuppn() so that it 687 * knows where to start from ('/' or '.'). 688 */ 689 vfs_setmntpoint(rootvfs, "/"); 690 if (VFS_ROOT(rootvfs, &rootdir)) 691 cmn_err(CE_PANIC, "vfs_mountroot: no root vnode"); 692 u.u_cdir = rootdir; 693 VN_HOLD(u.u_cdir); 694 u.u_rdir = NULL; 695 696 /* 697 * Setup the global zone's rootvp, now that it exists. 698 */ 699 global_zone->zone_rootvp = rootdir; 700 VN_HOLD(global_zone->zone_rootvp); 701 702 /* 703 * Notify the module code that it can begin using the 704 * root filesystem instead of the boot program's services. 
705 */ 706 modrootloaded = 1; 707 /* 708 * Set up mnttab information for root 709 */ 710 vfs_setresource(rootvfs, rootfs.bo_name); 711 712 /* 713 * Notify cluster software that the root filesystem is available. 714 */ 715 clboot_mountroot(); 716 717 /* Now that we're all done with the root FS, set up its vopstats */ 718 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) { 719 /* Set flag for statistics collection */ 720 if (vswp->vsw_flag & VSW_STATS) { 721 initialize_vopstats(&rootvfs->vfs_vopstats); 722 rootvfs->vfs_flag |= VFS_STATS; 723 rootvfs->vfs_fstypevsp = 724 get_fstype_vopstats(rootvfs, vswp); 725 rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs); 726 } 727 vfs_unrefvfssw(vswp); 728 } 729 730 /* 731 * Mount /devices, /system/contract, /etc/mnttab, /etc/svc/volatile, 732 * /system/object, and /proc. 733 */ 734 vfs_mountdevices(); 735 736 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT); 737 vfs_mountfs("proc", "/proc", "/proc"); 738 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); 739 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); 740 vfs_mountfs("objfs", "objfs", OBJFS_ROOT); 741 742 #ifdef __sparc 743 /* 744 * This bit of magic can go away when we convert sparc to 745 * the new boot architecture based on ramdisk. 746 * 747 * Booting off a mirrored root volume: 748 * At this point, we have booted and mounted root on a 749 * single component of the mirror. Complete the boot 750 * by configuring SVM and converting the root to the 751 * dev_t of the mirrored root device. This dev_t conversion 752 * only works because the underlying device doesn't change. 
753 */ 754 if (root_is_svm) { 755 if (svm_rootconf()) { 756 cmn_err(CE_PANIC, "vfs_mountroot: cannot remount root"); 757 } 758 759 /* 760 * mnttab should reflect the new root device 761 */ 762 vfs_lock_wait(rootvfs); 763 vfs_setresource(rootvfs, rootfs.bo_name); 764 vfs_unlock(rootvfs); 765 } 766 #endif /* __sparc */ 767 768 /* 769 * Look up the root device via devfs so that a dv_node is 770 * created for it. The vnode is never VN_RELE()ed. 771 * We allocate more than MAXPATHLEN so that the 772 * buffer passed to i_ddi_prompath_to_devfspath() is 773 * exactly MAXPATHLEN (the function expects a buffer 774 * of that length). 775 */ 776 plen = strlen("/devices"); 777 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); 778 (void) strcpy(path, "/devices"); 779 780 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) 781 != DDI_SUCCESS || 782 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { 783 784 /* NUL terminate in case "path" has garbage */ 785 path[plen + MAXPATHLEN - 1] = '\0'; 786 #ifdef DEBUG 787 cmn_err(CE_WARN, "!Cannot lookup root device: %s", path); 788 #endif 789 } 790 kmem_free(path, plen + MAXPATHLEN); 791 } 792 793 /* 794 * If remount failed and we're in a zone we need to check for the zone 795 * root path and strip it before the call to vfs_setpath(). 796 * 797 * If strpath doesn't begin with the zone_rootpath the original 798 * strpath is returned unchanged. 799 */ 800 static const char * 801 stripzonepath(const char *strpath) 802 { 803 char *str1, *str2; 804 int i; 805 zone_t *zonep = curproc->p_zone; 806 807 if (zonep->zone_rootpath == NULL || strpath == NULL) { 808 return (NULL); 809 } 810 811 /* 812 * we check for the end of the string at one past the 813 * current position because the zone_rootpath always 814 * ends with "/" but we don't want to strip that off. 
815 */ 816 str1 = zonep->zone_rootpath; 817 str2 = (char *)strpath; 818 ASSERT(str1[0] != '\0'); 819 for (i = 0; str1[i + 1] != '\0'; i++) { 820 if (str1[i] != str2[i]) 821 return ((char *)strpath); 822 } 823 return (&str2[i]); 824 } 825 826 /* 827 * Common mount code. Called from the system call entry point, from autofs, 828 * and from pxfs. 829 * 830 * Takes the effective file system type, mount arguments, the mount point 831 * vnode, flags specifying whether the mount is a remount and whether it 832 * should be entered into the vfs list, and credentials. Fills in its vfspp 833 * parameter with the mounted file system instance's vfs. 834 * 835 * Note that the effective file system type is specified as a string. It may 836 * be null, in which case it's determined from the mount arguments, and may 837 * differ from the type specified in the mount arguments; this is a hook to 838 * allow interposition when instantiating file system instances. 839 * 840 * The caller is responsible for releasing its own hold on the mount point 841 * vp (this routine does its own hold when necessary). 842 * Also note that for remounts, the mount point vp should be the vnode for 843 * the root of the file system rather than the vnode that the file system 844 * is mounted on top of. 845 */ 846 int 847 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, 848 struct vfs **vfspp) 849 { 850 struct vfssw *vswp; 851 vfsops_t *vfsops; 852 struct vfs *vfsp; 853 struct vnode *bvp; 854 dev_t bdev = 0; 855 mntopts_t mnt_mntopts; 856 int error = 0; 857 int copyout_error = 0; 858 int ovflags; 859 char *opts = uap->optptr; 860 char *inargs = opts; 861 int optlen = uap->optlen; 862 int remount; 863 int rdonly; 864 int nbmand = 0; 865 int delmip = 0; 866 int addmip = 0; 867 int splice = ((uap->flags & MS_NOSPLICE) == 0); 868 int fromspace = (uap->flags & MS_SYSSPACE) ? 
869 UIO_SYSSPACE : UIO_USERSPACE; 870 char *resource = NULL, *mountpt = NULL; 871 refstr_t *oldresource, *oldmntpt; 872 struct pathname pn, rpn; 873 vsk_anchor_t *vskap; 874 875 /* 876 * The v_flag value for the mount point vp is permanently set 877 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine 878 * for mount point locking. 879 */ 880 mutex_enter(&vp->v_lock); 881 vp->v_flag |= VVFSLOCK; 882 mutex_exit(&vp->v_lock); 883 884 mnt_mntopts.mo_count = 0; 885 /* 886 * Find the ops vector to use to invoke the file system-specific mount 887 * method. If the fsname argument is non-NULL, use it directly. 888 * Otherwise, dig the file system type information out of the mount 889 * arguments. 890 * 891 * A side effect is to hold the vfssw entry. 892 * 893 * Mount arguments can be specified in several ways, which are 894 * distinguished by flag bit settings. The preferred way is to set 895 * MS_OPTIONSTR, indicating an 8 argument mount with the file system 896 * type supplied as a character string and the last two arguments 897 * being a pointer to a character buffer and the size of the buffer. 898 * On entry, the buffer holds a null terminated list of options; on 899 * return, the string is the list of options the file system 900 * recognized. If MS_DATA is set arguments five and six point to a 901 * block of binary data which the file system interprets. 902 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA 903 * consistently with these conventions. To handle them, we check to 904 * see whether the pointer to the file system name has a numeric value 905 * less than 256. If so, we treat it as an index. 
906 */ 907 if (fsname != NULL) { 908 if ((vswp = vfs_getvfssw(fsname)) == NULL) { 909 return (EINVAL); 910 } 911 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) { 912 size_t n; 913 uint_t fstype; 914 char name[FSTYPSZ]; 915 916 if ((fstype = (uintptr_t)uap->fstype) < 256) { 917 RLOCK_VFSSW(); 918 if (fstype == 0 || fstype >= nfstype || 919 !ALLOCATED_VFSSW(&vfssw[fstype])) { 920 RUNLOCK_VFSSW(); 921 return (EINVAL); 922 } 923 (void) strcpy(name, vfssw[fstype].vsw_name); 924 RUNLOCK_VFSSW(); 925 if ((vswp = vfs_getvfssw(name)) == NULL) 926 return (EINVAL); 927 } else { 928 /* 929 * Handle either kernel or user address space. 930 */ 931 if (uap->flags & MS_SYSSPACE) { 932 error = copystr(uap->fstype, name, 933 FSTYPSZ, &n); 934 } else { 935 error = copyinstr(uap->fstype, name, 936 FSTYPSZ, &n); 937 } 938 if (error) { 939 if (error == ENAMETOOLONG) 940 return (EINVAL); 941 return (error); 942 } 943 if ((vswp = vfs_getvfssw(name)) == NULL) 944 return (EINVAL); 945 } 946 } else { 947 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL) 948 return (EINVAL); 949 } 950 if (!VFS_INSTALLED(vswp)) 951 return (EINVAL); 952 vfsops = &vswp->vsw_vfsops; 953 954 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts); 955 /* 956 * Fetch mount options and parse them for generic vfs options 957 */ 958 if (uap->flags & MS_OPTIONSTR) { 959 /* 960 * Limit the buffer size 961 */ 962 if (optlen < 0 || optlen > MAX_MNTOPT_STR) { 963 error = EINVAL; 964 goto errout; 965 } 966 if ((uap->flags & MS_SYSSPACE) == 0) { 967 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 968 inargs[0] = '\0'; 969 if (optlen) { 970 error = copyinstr(opts, inargs, (size_t)optlen, 971 NULL); 972 if (error) { 973 goto errout; 974 } 975 } 976 } 977 vfs_parsemntopts(&mnt_mntopts, inargs, 0); 978 } 979 /* 980 * Flag bits override the options string. 
981 */ 982 if (uap->flags & MS_REMOUNT) 983 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0); 984 if (uap->flags & MS_RDONLY) 985 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0); 986 if (uap->flags & MS_NOSUID) 987 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 988 989 /* 990 * Check if this is a remount; must be set in the option string and 991 * the file system must support a remount option. 992 */ 993 if (remount = vfs_optionisset_nolock(&mnt_mntopts, 994 MNTOPT_REMOUNT, NULL)) { 995 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) { 996 error = ENOTSUP; 997 goto errout; 998 } 999 uap->flags |= MS_REMOUNT; 1000 } 1001 1002 /* 1003 * uap->flags and vfs_optionisset() should agree. 1004 */ 1005 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) { 1006 uap->flags |= MS_RDONLY; 1007 } 1008 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) { 1009 uap->flags |= MS_NOSUID; 1010 } 1011 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL); 1012 ASSERT(splice || !remount); 1013 /* 1014 * If we are splicing the fs into the namespace, 1015 * perform mount point checks. 1016 * 1017 * We want to resolve the path for the mount point to eliminate 1018 * '.' and ".." and symlinks in mount points; we can't do the 1019 * same for the resource string, since it would turn 1020 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do 1021 * this before grabbing vn_vfswlock(), because otherwise we 1022 * would deadlock with lookuppn(). 1023 */ 1024 if (splice) { 1025 ASSERT(vp->v_count > 0); 1026 1027 /* 1028 * Pick up mount point and device from appropriate space. 1029 */ 1030 if (pn_get(uap->spec, fromspace, &pn) == 0) { 1031 resource = kmem_alloc(pn.pn_pathlen + 1, 1032 KM_SLEEP); 1033 (void) strcpy(resource, pn.pn_path); 1034 pn_free(&pn); 1035 } 1036 /* 1037 * Do a lookupname prior to taking the 1038 * writelock. 
Mark this as completed if 1039 * successful for later cleanup and addition to 1040 * the mount in progress table. 1041 */ 1042 if ((uap->flags & MS_GLOBAL) == 0 && 1043 lookupname(uap->spec, fromspace, 1044 FOLLOW, NULL, &bvp) == 0) { 1045 addmip = 1; 1046 } 1047 1048 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) { 1049 pathname_t *pnp; 1050 1051 if (*pn.pn_path != '/') { 1052 error = EINVAL; 1053 pn_free(&pn); 1054 goto errout; 1055 } 1056 pn_alloc(&rpn); 1057 /* 1058 * Kludge to prevent autofs from deadlocking with 1059 * itself when it calls domount(). 1060 * 1061 * If autofs is calling, it is because it is doing 1062 * (autofs) mounts in the process of an NFS mount. A 1063 * lookuppn() here would cause us to block waiting for 1064 * said NFS mount to complete, which can't since this 1065 * is the thread that was supposed to doing it. 1066 */ 1067 if (fromspace == UIO_USERSPACE) { 1068 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL, 1069 NULL)) == 0) { 1070 pnp = &rpn; 1071 } else { 1072 /* 1073 * The file disappeared or otherwise 1074 * became inaccessible since we opened 1075 * it; might as well fail the mount 1076 * since the mount point is no longer 1077 * accessible. 1078 */ 1079 pn_free(&rpn); 1080 pn_free(&pn); 1081 goto errout; 1082 } 1083 } else { 1084 pnp = &pn; 1085 } 1086 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP); 1087 (void) strcpy(mountpt, pnp->pn_path); 1088 1089 /* 1090 * If the addition of the zone's rootpath 1091 * would push us over a total path length 1092 * of MAXPATHLEN, we fail the mount with 1093 * ENAMETOOLONG, which is what we would have 1094 * gotten if we were trying to perform the same 1095 * mount in the global zone. 1096 * 1097 * strlen() doesn't count the trailing 1098 * '\0', but zone_rootpathlen counts both a 1099 * trailing '/' and the terminating '\0'. 
1100 */ 1101 if ((curproc->p_zone->zone_rootpathlen - 1 + 1102 strlen(mountpt)) > MAXPATHLEN || 1103 (resource != NULL && 1104 (curproc->p_zone->zone_rootpathlen - 1 + 1105 strlen(resource)) > MAXPATHLEN)) { 1106 error = ENAMETOOLONG; 1107 } 1108 1109 pn_free(&rpn); 1110 pn_free(&pn); 1111 } 1112 1113 if (error) 1114 goto errout; 1115 1116 /* 1117 * Prevent path name resolution from proceeding past 1118 * the mount point. 1119 */ 1120 if (vn_vfswlock(vp) != 0) { 1121 error = EBUSY; 1122 goto errout; 1123 } 1124 1125 /* 1126 * Verify that it's legitimate to establish a mount on 1127 * the prospective mount point. 1128 */ 1129 if (vn_mountedvfs(vp) != NULL) { 1130 /* 1131 * The mount point lock was obtained after some 1132 * other thread raced through and established a mount. 1133 */ 1134 vn_vfsunlock(vp); 1135 error = EBUSY; 1136 goto errout; 1137 } 1138 if (vp->v_flag & VNOMOUNT) { 1139 vn_vfsunlock(vp); 1140 error = EINVAL; 1141 goto errout; 1142 } 1143 } 1144 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) { 1145 uap->dataptr = NULL; 1146 uap->datalen = 0; 1147 } 1148 1149 /* 1150 * If this is a remount, we don't want to create a new VFS. 1151 * Instead, we pass the existing one with a remount flag. 1152 */ 1153 if (remount) { 1154 /* 1155 * Confirm that the mount point is the root vnode of the 1156 * file system that is being remounted. 1157 * This can happen if the user specifies a different 1158 * mount point directory pathname in the (re)mount command. 1159 * 1160 * Code below can only be reached if splice is true, so it's 1161 * safe to do vn_vfsunlock() here. 1162 */ 1163 if ((vp->v_flag & VROOT) == 0) { 1164 vn_vfsunlock(vp); 1165 error = ENOENT; 1166 goto errout; 1167 } 1168 /* 1169 * Disallow making file systems read-only unless file system 1170 * explicitly allows it in its vfssw. Ignore other flags. 
1171 */ 1172 if (rdonly && vn_is_readonly(vp) == 0 && 1173 (vswp->vsw_flag & VSW_CANRWRO) == 0) { 1174 vn_vfsunlock(vp); 1175 error = EINVAL; 1176 goto errout; 1177 } 1178 /* 1179 * Changing the NBMAND setting on remounts is permitted 1180 * but logged since it can lead to unexpected behavior. 1181 * We also counsel against using it for / and /usr. 1182 */ 1183 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) || 1184 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) { 1185 cmn_err(CE_WARN, "domount: nbmand turned %s via " 1186 "remounting %s", nbmand ? "on" : "off", 1187 refstr_value(vp->v_vfsp->vfs_mntpt)); 1188 } 1189 vfsp = vp->v_vfsp; 1190 ovflags = vfsp->vfs_flag; 1191 vfsp->vfs_flag |= VFS_REMOUNT; 1192 vfsp->vfs_flag &= ~VFS_RDONLY; 1193 } else { 1194 vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP); 1195 VFS_INIT(vfsp, vfsops, NULL); 1196 } 1197 1198 VFS_HOLD(vfsp); 1199 1200 /* 1201 * The vfs_reflock is not used anymore the code below explicitly 1202 * holds it preventing others accesing it directly. 1203 */ 1204 if ((sema_tryp(&vfsp->vfs_reflock) == 0) && 1205 !(vfsp->vfs_flag & VFS_REMOUNT)) 1206 cmn_err(CE_WARN, 1207 "mount type %s couldn't get vfs_reflock\n", vswp->vsw_name); 1208 1209 /* 1210 * Lock the vfs. If this is a remount we want to avoid spurious umount 1211 * failures that happen as a side-effect of fsflush() and other mount 1212 * and unmount operations that might be going on simultaneously and 1213 * may have locked the vfs currently. To not return EBUSY immediately 1214 * here we use vfs_lock_wait() instead vfs_lock() for the remount case. 1215 */ 1216 if (!remount) { 1217 if (error = vfs_lock(vfsp)) { 1218 vfsp->vfs_flag = ovflags; 1219 if (splice) 1220 vn_vfsunlock(vp); 1221 kmem_free(vfsp, sizeof (struct vfs)); 1222 goto errout; 1223 } 1224 } else { 1225 vfs_lock_wait(vfsp); 1226 } 1227 1228 /* 1229 * Add device to mount in progress table, global mounts require special 1230 * handling. 
It is possible that we have already done the lookupname 1231 * on a spliced, non-global fs. If so, we don't want to do it again 1232 * since we cannot do a lookupname after taking the 1233 * wlock above. This case is for a non-spliced, non-global filesystem. 1234 */ 1235 if (!addmip) { 1236 if ((uap->flags & MS_GLOBAL) == 0 && 1237 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) { 1238 addmip = 1; 1239 } 1240 } 1241 1242 if (addmip) { 1243 bdev = bvp->v_rdev; 1244 VN_RELE(bvp); 1245 vfs_addmip(bdev, vfsp); 1246 addmip = 0; 1247 delmip = 1; 1248 } 1249 /* 1250 * Invalidate cached entry for the mount point. 1251 */ 1252 if (splice) 1253 dnlc_purge_vp(vp); 1254 1255 /* 1256 * If have an option string but the filesystem doesn't supply a 1257 * prototype options table, create a table with the global 1258 * options and sufficient room to accept all the options in the 1259 * string. Then parse the passed in option string 1260 * accepting all the options in the string. This gives us an 1261 * option table with all the proper cancel properties for the 1262 * global options. 1263 * 1264 * Filesystems that supply a prototype options table are handled 1265 * earlier in this function. 1266 */ 1267 if (uap->flags & MS_OPTIONSTR) { 1268 if (!(vswp->vsw_flag & VSW_HASPROTO)) { 1269 mntopts_t tmp_mntopts; 1270 1271 tmp_mntopts.mo_count = 0; 1272 vfs_createopttbl_extend(&tmp_mntopts, inargs, 1273 &mnt_mntopts); 1274 vfs_parsemntopts(&tmp_mntopts, inargs, 1); 1275 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts); 1276 vfs_freeopttbl(&tmp_mntopts); 1277 } 1278 } 1279 1280 /* 1281 * Serialize with zone creations. 1282 */ 1283 mount_in_progress(); 1284 /* 1285 * Instantiate (or reinstantiate) the file system. If appropriate, 1286 * splice it into the file system name space. 
1287 * 1288 * We want VFS_MOUNT() to be able to override the vfs_resource 1289 * string if necessary (ie, mntfs), and also for a remount to 1290 * change the same (necessary when remounting '/' during boot). 1291 * So we set up vfs_mntpt and vfs_resource to what we think they 1292 * should be, then hand off control to VFS_MOUNT() which can 1293 * override this. 1294 * 1295 * For safety's sake, when changing vfs_resource or vfs_mntpt of 1296 * a vfs which is on the vfs list (i.e. during a remount), we must 1297 * never set those fields to NULL. Several bits of code make 1298 * assumptions that the fields are always valid. 1299 */ 1300 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1301 if (remount) { 1302 if ((oldresource = vfsp->vfs_resource) != NULL) 1303 refstr_hold(oldresource); 1304 if ((oldmntpt = vfsp->vfs_mntpt) != NULL) 1305 refstr_hold(oldmntpt); 1306 } 1307 vfs_setresource(vfsp, resource); 1308 vfs_setmntpoint(vfsp, mountpt); 1309 1310 error = VFS_MOUNT(vfsp, vp, uap, credp); 1311 1312 if (uap->flags & MS_RDONLY) 1313 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1314 if (uap->flags & MS_NOSUID) 1315 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0); 1316 if (uap->flags & MS_GLOBAL) 1317 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0); 1318 1319 if (error) { 1320 if (remount) { 1321 /* put back pre-remount options */ 1322 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1323 vfs_setmntpoint(vfsp, (stripzonepath( 1324 refstr_value(oldmntpt)))); 1325 if (oldmntpt) 1326 refstr_rele(oldmntpt); 1327 vfs_setresource(vfsp, (stripzonepath( 1328 refstr_value(oldresource)))); 1329 if (oldresource) 1330 refstr_rele(oldresource); 1331 vfsp->vfs_flag = ovflags; 1332 vfs_unlock(vfsp); 1333 VFS_RELE(vfsp); 1334 } else { 1335 vfs_unlock(vfsp); 1336 vfs_freemnttab(vfsp); 1337 kmem_free(vfsp, sizeof (struct vfs)); 1338 } 1339 } else { 1340 /* 1341 * Set the mount time to now 1342 */ 1343 vfsp->vfs_mtime = ddi_get_time(); 1344 if (remount) { 1345 vfsp->vfs_flag &= ~VFS_REMOUNT; 1346 
if (oldresource) 1347 refstr_rele(oldresource); 1348 if (oldmntpt) 1349 refstr_rele(oldmntpt); 1350 } else if (splice) { 1351 /* 1352 * Link vfsp into the name space at the mount 1353 * point. Vfs_add() is responsible for 1354 * holding the mount point which will be 1355 * released when vfs_remove() is called. 1356 */ 1357 vfs_add(vp, vfsp, uap->flags); 1358 } else { 1359 /* 1360 * Hold the reference to file system which is 1361 * not linked into the name space. 1362 */ 1363 vfsp->vfs_zone = NULL; 1364 VFS_HOLD(vfsp); 1365 vfsp->vfs_vnodecovered = NULL; 1366 } 1367 /* 1368 * Set flags for global options encountered 1369 */ 1370 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 1371 vfsp->vfs_flag |= VFS_RDONLY; 1372 else 1373 vfsp->vfs_flag &= ~VFS_RDONLY; 1374 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 1375 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES); 1376 } else { 1377 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 1378 vfsp->vfs_flag |= VFS_NODEVICES; 1379 else 1380 vfsp->vfs_flag &= ~VFS_NODEVICES; 1381 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 1382 vfsp->vfs_flag |= VFS_NOSETUID; 1383 else 1384 vfsp->vfs_flag &= ~VFS_NOSETUID; 1385 } 1386 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 1387 vfsp->vfs_flag |= VFS_NBMAND; 1388 else 1389 vfsp->vfs_flag &= ~VFS_NBMAND; 1390 1391 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 1392 vfsp->vfs_flag |= VFS_XATTR; 1393 else 1394 vfsp->vfs_flag &= ~VFS_XATTR; 1395 1396 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 1397 vfsp->vfs_flag |= VFS_NOEXEC; 1398 else 1399 vfsp->vfs_flag &= ~VFS_NOEXEC; 1400 1401 /* 1402 * Now construct the output option string of options 1403 * we recognized. 
1404 */ 1405 if (uap->flags & MS_OPTIONSTR) { 1406 vfs_list_read_lock(); 1407 copyout_error = vfs_buildoptionstr( 1408 &vfsp->vfs_mntopts, inargs, optlen); 1409 vfs_list_unlock(); 1410 if (copyout_error == 0 && 1411 (uap->flags & MS_SYSSPACE) == 0) { 1412 copyout_error = copyoutstr(inargs, opts, 1413 optlen, NULL); 1414 } 1415 } 1416 1417 /* 1418 * If this isn't a remount, set up the vopstats before 1419 * anyone can touch this 1420 */ 1421 if (!remount && vswp->vsw_flag & VSW_STATS) { 1422 initialize_vopstats(&vfsp->vfs_vopstats); 1423 /* 1424 * We need to set vfs_vskap to NULL because there's 1425 * a chance it won't be set below. This is checked 1426 * in teardown_vopstats() so we can't have garbage. 1427 */ 1428 vfsp->vfs_vskap = NULL; 1429 vfsp->vfs_flag |= VFS_STATS; 1430 vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp); 1431 } 1432 1433 vfs_unlock(vfsp); 1434 } 1435 mount_completed(); 1436 if (splice) 1437 vn_vfsunlock(vp); 1438 1439 if ((error == 0) && (copyout_error == 0)) { 1440 if (!remount) { 1441 /* 1442 * Don't call get_vskstat_anchor() while holding 1443 * locks since it allocates memory and calls 1444 * VFS_STATVFS(). For NFS, the latter can generate 1445 * an over-the-wire call. 1446 */ 1447 vskap = get_vskstat_anchor(vfsp); 1448 /* Only take the lock if we have something to do */ 1449 if (vskap != NULL) { 1450 vfs_lock_wait(vfsp); 1451 if (vfsp->vfs_flag & VFS_STATS) { 1452 vfsp->vfs_vskap = vskap; 1453 } 1454 vfs_unlock(vfsp); 1455 } 1456 } 1457 /* Return vfsp to caller. */ 1458 *vfspp = vfsp; 1459 } 1460 errout: 1461 vfs_freeopttbl(&mnt_mntopts); 1462 if (resource != NULL) 1463 kmem_free(resource, strlen(resource) + 1); 1464 if (mountpt != NULL) 1465 kmem_free(mountpt, strlen(mountpt) + 1); 1466 /* 1467 * It is possible we errored prior to adding to mount in progress 1468 * table. Must free vnode we acquired with successful lookupname. 
1469 */ 1470 if (addmip) 1471 VN_RELE(bvp); 1472 if (delmip) 1473 vfs_delmip(vfsp); 1474 ASSERT(vswp != NULL); 1475 vfs_unrefvfssw(vswp); 1476 if (inargs != opts) 1477 kmem_free(inargs, MAX_MNTOPT_STR); 1478 if (copyout_error) { 1479 VFS_RELE(vfsp); 1480 error = copyout_error; 1481 } 1482 return (error); 1483 } 1484 1485 static void 1486 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath) 1487 { 1488 size_t len; 1489 refstr_t *ref; 1490 zone_t *zone = curproc->p_zone; 1491 char *sp; 1492 int have_list_lock = 0; 1493 1494 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp)); 1495 1496 /* 1497 * New path must be less than MAXPATHLEN because mntfs 1498 * will only display up to MAXPATHLEN bytes. This is currently 1499 * safe, because domount() uses pn_get(), and other callers 1500 * similarly cap the size to fewer than MAXPATHLEN bytes. 1501 */ 1502 1503 ASSERT(strlen(newpath) < MAXPATHLEN); 1504 1505 /* mntfs requires consistency while vfs list lock is held */ 1506 1507 if (VFS_ON_LIST(vfsp)) { 1508 have_list_lock = 1; 1509 vfs_list_lock(); 1510 } 1511 1512 if (*refp != NULL) 1513 refstr_rele(*refp); 1514 1515 /* Do we need to modify the path? */ 1516 1517 if (zone == global_zone || *newpath != '/') { 1518 ref = refstr_alloc(newpath); 1519 goto out; 1520 } 1521 1522 /* 1523 * Truncate the trailing '/' in the zoneroot, and merge 1524 * in the zone's rootpath with the "newpath" (resource 1525 * or mountpoint) passed in. 1526 * 1527 * The size of the required buffer is thus the size of 1528 * the buffer required for the passed-in newpath 1529 * (strlen(newpath) + 1), plus the size of the buffer 1530 * required to hold zone_rootpath (zone_rootpathlen) 1531 * minus one for one of the now-superfluous NUL 1532 * terminations, minus one for the trailing '/'. 1533 * 1534 * That gives us: 1535 * 1536 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1 1537 * 1538 * Which is what we have below. 
1539 */ 1540 1541 len = strlen(newpath) + zone->zone_rootpathlen - 1; 1542 sp = kmem_alloc(len, KM_SLEEP); 1543 1544 /* 1545 * Copy everything including the trailing slash, which 1546 * we then overwrite with the NUL character. 1547 */ 1548 1549 (void) strcpy(sp, zone->zone_rootpath); 1550 sp[zone->zone_rootpathlen - 2] = '\0'; 1551 (void) strcat(sp, newpath); 1552 1553 ref = refstr_alloc(sp); 1554 kmem_free(sp, len); 1555 out: 1556 *refp = ref; 1557 1558 if (have_list_lock) { 1559 vfs_mnttab_modtimeupd(); 1560 vfs_list_unlock(); 1561 } 1562 } 1563 1564 /* 1565 * Record a mounted resource name in a vfs structure. 1566 * If vfsp is already mounted, caller must hold the vfs lock. 1567 */ 1568 void 1569 vfs_setresource(struct vfs *vfsp, const char *resource) 1570 { 1571 if (resource == NULL || resource[0] == '\0') 1572 resource = VFS_NORESOURCE; 1573 vfs_setpath(vfsp, &vfsp->vfs_resource, resource); 1574 } 1575 1576 /* 1577 * Record a mount point name in a vfs structure. 1578 * If vfsp is already mounted, caller must hold the vfs lock. 1579 */ 1580 void 1581 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt) 1582 { 1583 if (mntpt == NULL || mntpt[0] == '\0') 1584 mntpt = VFS_NOMNTPT; 1585 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt); 1586 } 1587 1588 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */ 1589 1590 refstr_t * 1591 vfs_getresource(const struct vfs *vfsp) 1592 { 1593 refstr_t *resource; 1594 1595 vfs_list_read_lock(); 1596 resource = vfsp->vfs_resource; 1597 refstr_hold(resource); 1598 vfs_list_unlock(); 1599 1600 return (resource); 1601 } 1602 1603 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. 
*/ 1604 1605 refstr_t * 1606 vfs_getmntpoint(const struct vfs *vfsp) 1607 { 1608 refstr_t *mntpt; 1609 1610 vfs_list_read_lock(); 1611 mntpt = vfsp->vfs_mntpt; 1612 refstr_hold(mntpt); 1613 vfs_list_unlock(); 1614 1615 return (mntpt); 1616 } 1617 1618 /* 1619 * Create an empty options table with enough empty slots to hold all 1620 * The options in the options string passed as an argument. 1621 * Potentially prepend another options table. 1622 * 1623 * Note: caller is responsible for locking the vfs list, if needed, 1624 * to protect mops. 1625 */ 1626 static void 1627 vfs_createopttbl_extend(mntopts_t *mops, const char *opts, 1628 const mntopts_t *mtmpl) 1629 { 1630 const char *s = opts; 1631 uint_t count; 1632 1633 if (opts == NULL || *opts == '\0') { 1634 count = 0; 1635 } else { 1636 count = 1; 1637 1638 /* 1639 * Count number of options in the string 1640 */ 1641 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) { 1642 count++; 1643 s++; 1644 } 1645 } 1646 vfs_copyopttbl_extend(mtmpl, mops, count); 1647 } 1648 1649 /* 1650 * Create an empty options table with enough empty slots to hold all 1651 * The options in the options string passed as an argument. 1652 * 1653 * This function is *not* for general use by filesystems. 1654 * 1655 * Note: caller is responsible for locking the vfs list, if needed, 1656 * to protect mops. 
1657 */ 1658 void 1659 vfs_createopttbl(mntopts_t *mops, const char *opts) 1660 { 1661 vfs_createopttbl_extend(mops, opts, NULL); 1662 } 1663 1664 1665 /* 1666 * Swap two mount options tables 1667 */ 1668 static void 1669 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2) 1670 { 1671 uint_t tmpcnt; 1672 mntopt_t *tmplist; 1673 1674 tmpcnt = optbl2->mo_count; 1675 tmplist = optbl2->mo_list; 1676 optbl2->mo_count = optbl1->mo_count; 1677 optbl2->mo_list = optbl1->mo_list; 1678 optbl1->mo_count = tmpcnt; 1679 optbl1->mo_list = tmplist; 1680 } 1681 1682 static void 1683 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2) 1684 { 1685 vfs_list_lock(); 1686 vfs_swapopttbl_nolock(optbl1, optbl2); 1687 vfs_mnttab_modtimeupd(); 1688 vfs_list_unlock(); 1689 } 1690 1691 static char ** 1692 vfs_copycancelopt_extend(char **const moc, int extend) 1693 { 1694 int i = 0; 1695 int j; 1696 char **result; 1697 1698 if (moc != NULL) { 1699 for (; moc[i] != NULL; i++) 1700 /* count number of options to cancel */; 1701 } 1702 1703 if (i + extend == 0) 1704 return (NULL); 1705 1706 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP); 1707 1708 for (j = 0; j < i; j++) { 1709 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP); 1710 (void) strcpy(result[j], moc[j]); 1711 } 1712 for (; j <= i + extend; j++) 1713 result[j] = NULL; 1714 1715 return (result); 1716 } 1717 1718 static void 1719 vfs_copyopt(const mntopt_t *s, mntopt_t *d) 1720 { 1721 char *sp, *dp; 1722 1723 d->mo_flags = s->mo_flags; 1724 d->mo_data = s->mo_data; 1725 sp = s->mo_name; 1726 if (sp != NULL) { 1727 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1728 (void) strcpy(dp, sp); 1729 d->mo_name = dp; 1730 } else { 1731 d->mo_name = NULL; /* should never happen */ 1732 } 1733 1734 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0); 1735 1736 sp = s->mo_arg; 1737 if (sp != NULL) { 1738 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1739 (void) strcpy(dp, sp); 1740 d->mo_arg = dp; 1741 } else { 1742 
d->mo_arg = NULL; 1743 } 1744 } 1745 1746 /* 1747 * Copy a mount options table, possibly allocating some spare 1748 * slots at the end. It is permissible to copy_extend the NULL table. 1749 */ 1750 static void 1751 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra) 1752 { 1753 uint_t i, count; 1754 mntopt_t *motbl; 1755 1756 /* 1757 * Clear out any existing stuff in the options table being initialized 1758 */ 1759 vfs_freeopttbl(dmo); 1760 count = (smo == NULL) ? 0 : smo->mo_count; 1761 if ((count + extra) == 0) /* nothing to do */ 1762 return; 1763 dmo->mo_count = count + extra; 1764 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP); 1765 dmo->mo_list = motbl; 1766 for (i = 0; i < count; i++) { 1767 vfs_copyopt(&smo->mo_list[i], &motbl[i]); 1768 } 1769 for (i = count; i < count + extra; i++) { 1770 motbl[i].mo_flags = MO_EMPTY; 1771 } 1772 } 1773 1774 /* 1775 * Copy a mount options table. 1776 * 1777 * This function is *not* for general use by filesystems. 1778 * 1779 * Note: caller is responsible for locking the vfs list, if needed, 1780 * to protect smo and dmo. 1781 */ 1782 void 1783 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo) 1784 { 1785 vfs_copyopttbl_extend(smo, dmo, 0); 1786 } 1787 1788 static char ** 1789 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2) 1790 { 1791 int c1 = 0; 1792 int c2 = 0; 1793 char **result; 1794 char **sp1, **sp2, **dp; 1795 1796 /* 1797 * First we count both lists of cancel options. 1798 * If either is NULL or has no elements, we return a copy of 1799 * the other. 
1800 */ 1801 if (mop1->mo_cancel != NULL) { 1802 for (; mop1->mo_cancel[c1] != NULL; c1++) 1803 /* count cancel options in mop1 */; 1804 } 1805 1806 if (c1 == 0) 1807 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0)); 1808 1809 if (mop2->mo_cancel != NULL) { 1810 for (; mop2->mo_cancel[c2] != NULL; c2++) 1811 /* count cancel options in mop2 */; 1812 } 1813 1814 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2); 1815 1816 if (c2 == 0) 1817 return (result); 1818 1819 /* 1820 * When we get here, we've got two sets of cancel options; 1821 * we need to merge the two sets. We know that the result 1822 * array has "c1+c2+1" entries and in the end we might shrink 1823 * it. 1824 * Result now has a copy of the c1 entries from mop1; we'll 1825 * now lookup all the entries of mop2 in mop1 and copy it if 1826 * it is unique. 1827 * This operation is O(n^2) but it's only called once per 1828 * filesystem per duplicate option. This is a situation 1829 * which doesn't arise with the filesystems in ON and 1830 * n is generally 1. 1831 */ 1832 1833 dp = &result[c1]; 1834 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) { 1835 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) { 1836 if (strcmp(*sp1, *sp2) == 0) 1837 break; 1838 } 1839 if (*sp1 == NULL) { 1840 /* 1841 * Option *sp2 not found in mop1, so copy it. 1842 * The calls to vfs_copycancelopt_extend() 1843 * guarantee that there's enough room. 1844 */ 1845 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP); 1846 (void) strcpy(*dp++, *sp2); 1847 } 1848 } 1849 if (dp != &result[c1+c2]) { 1850 size_t bytes = (dp - result + 1) * sizeof (char *); 1851 char **nres = kmem_alloc(bytes, KM_SLEEP); 1852 1853 bcopy(result, nres, bytes); 1854 kmem_free(result, (c1 + c2 + 1) * sizeof (char *)); 1855 result = nres; 1856 } 1857 return (result); 1858 } 1859 1860 /* 1861 * Merge two mount option tables (outer and inner) into one. This is very 1862 * similar to "merging" global variables and automatic variables in C. 
1863 * 1864 * This isn't (and doesn't have to be) fast. 1865 * 1866 * This function is *not* for general use by filesystems. 1867 * 1868 * Note: caller is responsible for locking the vfs list, if needed, 1869 * to protect omo, imo & dmo. 1870 */ 1871 void 1872 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo) 1873 { 1874 uint_t i, count; 1875 mntopt_t *mop, *motbl; 1876 uint_t freeidx; 1877 1878 /* 1879 * First determine how much space we need to allocate. 1880 */ 1881 count = omo->mo_count; 1882 for (i = 0; i < imo->mo_count; i++) { 1883 if (imo->mo_list[i].mo_flags & MO_EMPTY) 1884 continue; 1885 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL) 1886 count++; 1887 } 1888 ASSERT(count >= omo->mo_count && 1889 count <= omo->mo_count + imo->mo_count); 1890 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP); 1891 for (i = 0; i < omo->mo_count; i++) 1892 vfs_copyopt(&omo->mo_list[i], &motbl[i]); 1893 freeidx = omo->mo_count; 1894 for (i = 0; i < imo->mo_count; i++) { 1895 if (imo->mo_list[i].mo_flags & MO_EMPTY) 1896 continue; 1897 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) { 1898 char **newcanp; 1899 uint_t index = mop - omo->mo_list; 1900 1901 newcanp = vfs_mergecancelopts(mop, &motbl[index]); 1902 1903 vfs_freeopt(&motbl[index]); 1904 vfs_copyopt(&imo->mo_list[i], &motbl[index]); 1905 1906 vfs_freecancelopt(motbl[index].mo_cancel); 1907 motbl[index].mo_cancel = newcanp; 1908 } else { 1909 /* 1910 * If it's a new option, just copy it over to the first 1911 * free location. 1912 */ 1913 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]); 1914 } 1915 } 1916 dmo->mo_count = count; 1917 dmo->mo_list = motbl; 1918 } 1919 1920 /* 1921 * Functions to set and clear mount options in a mount options table. 1922 */ 1923 1924 /* 1925 * Clear a mount option, if it exists. 1926 * 1927 * The update_mnttab arg indicates whether mops is part of a vfs that is on 1928 * the vfs list. 
1929 */ 1930 static void 1931 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab) 1932 { 1933 struct mntopt *mop; 1934 uint_t i, count; 1935 1936 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 1937 1938 count = mops->mo_count; 1939 for (i = 0; i < count; i++) { 1940 mop = &mops->mo_list[i]; 1941 1942 if (mop->mo_flags & MO_EMPTY) 1943 continue; 1944 if (strcmp(opt, mop->mo_name)) 1945 continue; 1946 mop->mo_flags &= ~MO_SET; 1947 if (mop->mo_arg != NULL) { 1948 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 1949 } 1950 mop->mo_arg = NULL; 1951 if (update_mnttab) 1952 vfs_mnttab_modtimeupd(); 1953 break; 1954 } 1955 } 1956 1957 void 1958 vfs_clearmntopt(struct vfs *vfsp, const char *opt) 1959 { 1960 int gotlock = 0; 1961 1962 if (VFS_ON_LIST(vfsp)) { 1963 gotlock = 1; 1964 vfs_list_lock(); 1965 } 1966 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock); 1967 if (gotlock) 1968 vfs_list_unlock(); 1969 } 1970 1971 1972 /* 1973 * Set a mount option on. If it's not found in the table, it's silently 1974 * ignored. If the option has MO_IGNORE set, it is still set unless the 1975 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag 1976 * bits can be used to toggle the MO_NODISPLAY bit for the option. 1977 * If the VFS_CREATEOPT flag bit is set then the first option slot with 1978 * MO_EMPTY set is created as the option passed in. 1979 * 1980 * The update_mnttab arg indicates whether mops is part of a vfs that is on 1981 * the vfs list. 
1982 */ 1983 static void 1984 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt, 1985 const char *arg, int flags, int update_mnttab) 1986 { 1987 mntopt_t *mop; 1988 uint_t i, count; 1989 char *sp; 1990 1991 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 1992 1993 if (flags & VFS_CREATEOPT) { 1994 if (vfs_hasopt(mops, opt) != NULL) { 1995 flags &= ~VFS_CREATEOPT; 1996 } 1997 } 1998 count = mops->mo_count; 1999 for (i = 0; i < count; i++) { 2000 mop = &mops->mo_list[i]; 2001 2002 if (mop->mo_flags & MO_EMPTY) { 2003 if ((flags & VFS_CREATEOPT) == 0) 2004 continue; 2005 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP); 2006 (void) strcpy(sp, opt); 2007 mop->mo_name = sp; 2008 if (arg != NULL) 2009 mop->mo_flags = MO_HASVALUE; 2010 else 2011 mop->mo_flags = 0; 2012 } else if (strcmp(opt, mop->mo_name)) { 2013 continue; 2014 } 2015 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT)) 2016 break; 2017 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) { 2018 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP); 2019 (void) strcpy(sp, arg); 2020 } else { 2021 sp = NULL; 2022 } 2023 if (mop->mo_arg != NULL) 2024 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2025 mop->mo_arg = sp; 2026 if (flags & VFS_DISPLAY) 2027 mop->mo_flags &= ~MO_NODISPLAY; 2028 if (flags & VFS_NODISPLAY) 2029 mop->mo_flags |= MO_NODISPLAY; 2030 mop->mo_flags |= MO_SET; 2031 if (mop->mo_cancel != NULL) { 2032 char **cp; 2033 2034 for (cp = mop->mo_cancel; *cp != NULL; cp++) 2035 vfs_clearmntopt_nolock(mops, *cp, 0); 2036 } 2037 if (update_mnttab) 2038 vfs_mnttab_modtimeupd(); 2039 break; 2040 } 2041 } 2042 2043 void 2044 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags) 2045 { 2046 int gotlock = 0; 2047 2048 if (VFS_ON_LIST(vfsp)) { 2049 gotlock = 1; 2050 vfs_list_lock(); 2051 } 2052 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock); 2053 if (gotlock) 2054 vfs_list_unlock(); 2055 } 2056 2057 2058 /* 2059 * Add a "tag" option to a mounted file system's 
options list. 2060 * 2061 * Note: caller is responsible for locking the vfs list, if needed, 2062 * to protect mops. 2063 */ 2064 static mntopt_t * 2065 vfs_addtag(mntopts_t *mops, const char *tag) 2066 { 2067 uint_t count; 2068 mntopt_t *mop, *motbl; 2069 2070 count = mops->mo_count + 1; 2071 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP); 2072 if (mops->mo_count) { 2073 size_t len = (count - 1) * sizeof (mntopt_t); 2074 2075 bcopy(mops->mo_list, motbl, len); 2076 kmem_free(mops->mo_list, len); 2077 } 2078 mops->mo_count = count; 2079 mops->mo_list = motbl; 2080 mop = &motbl[count - 1]; 2081 mop->mo_flags = MO_TAG; 2082 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP); 2083 (void) strcpy(mop->mo_name, tag); 2084 return (mop); 2085 } 2086 2087 /* 2088 * Allow users to set arbitrary "tags" in a vfs's mount options. 2089 * Broader use within the kernel is discouraged. 2090 */ 2091 int 2092 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2093 cred_t *cr) 2094 { 2095 vfs_t *vfsp; 2096 mntopts_t *mops; 2097 mntopt_t *mop; 2098 int found = 0; 2099 dev_t dev = makedevice(major, minor); 2100 int err = 0; 2101 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 2102 2103 /* 2104 * Find the desired mounted file system 2105 */ 2106 vfs_list_lock(); 2107 vfsp = rootvfs; 2108 do { 2109 if (vfsp->vfs_dev == dev && 2110 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2111 found = 1; 2112 break; 2113 } 2114 vfsp = vfsp->vfs_next; 2115 } while (vfsp != rootvfs); 2116 2117 if (!found) { 2118 err = EINVAL; 2119 goto out; 2120 } 2121 err = secpolicy_fs_config(cr, vfsp); 2122 if (err != 0) 2123 goto out; 2124 2125 mops = &vfsp->vfs_mntopts; 2126 /* 2127 * Add tag if it doesn't already exist 2128 */ 2129 if ((mop = vfs_hasopt(mops, tag)) == NULL) { 2130 int len; 2131 2132 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR); 2133 len = strlen(buf); 2134 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) { 2135 err = ENAMETOOLONG; 2136 goto out; 2137 
} 2138 mop = vfs_addtag(mops, tag); 2139 } 2140 if ((mop->mo_flags & MO_TAG) == 0) { 2141 err = EINVAL; 2142 goto out; 2143 } 2144 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1); 2145 out: 2146 vfs_list_unlock(); 2147 kmem_free(buf, MAX_MNTOPT_STR); 2148 return (err); 2149 } 2150 2151 /* 2152 * Allow users to remove arbitrary "tags" in a vfs's mount options. 2153 * Broader use within the kernel is discouraged. 2154 */ 2155 int 2156 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2157 cred_t *cr) 2158 { 2159 vfs_t *vfsp; 2160 mntopt_t *mop; 2161 int found = 0; 2162 dev_t dev = makedevice(major, minor); 2163 int err = 0; 2164 2165 /* 2166 * Find the desired mounted file system 2167 */ 2168 vfs_list_lock(); 2169 vfsp = rootvfs; 2170 do { 2171 if (vfsp->vfs_dev == dev && 2172 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2173 found = 1; 2174 break; 2175 } 2176 vfsp = vfsp->vfs_next; 2177 } while (vfsp != rootvfs); 2178 2179 if (!found) { 2180 err = EINVAL; 2181 goto out; 2182 } 2183 err = secpolicy_fs_config(cr, vfsp); 2184 if (err != 0) 2185 goto out; 2186 2187 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) { 2188 err = EINVAL; 2189 goto out; 2190 } 2191 if ((mop->mo_flags & MO_TAG) == 0) { 2192 err = EINVAL; 2193 goto out; 2194 } 2195 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1); 2196 out: 2197 vfs_list_unlock(); 2198 return (err); 2199 } 2200 2201 /* 2202 * Function to parse an option string and fill in a mount options table. 2203 * Unknown options are silently ignored. The input option string is modified 2204 * by replacing separators with nulls. If the create flag is set, options 2205 * not found in the table are just added on the fly. The table must have 2206 * an option slot marked MO_EMPTY to add an option on the fly. 2207 * 2208 * This function is *not* for general use by filesystems. 2209 * 2210 * Note: caller is responsible for locking the vfs list, if needed, 2211 * to protect mops.. 
2212 */ 2213 void 2214 vfs_parsemntopts(mntopts_t *mops, char *osp, int create) 2215 { 2216 char *s = osp, *p, *nextop, *valp, *cp, *ep; 2217 int setflg = VFS_NOFORCEOPT; 2218 2219 if (osp == NULL) 2220 return; 2221 while (*s != '\0') { 2222 p = strchr(s, ','); /* find next option */ 2223 if (p == NULL) { 2224 cp = NULL; 2225 p = s + strlen(s); 2226 } else { 2227 cp = p; /* save location of comma */ 2228 *p++ = '\0'; /* mark end and point to next option */ 2229 } 2230 nextop = p; 2231 p = strchr(s, '='); /* look for value */ 2232 if (p == NULL) { 2233 valp = NULL; /* no value supplied */ 2234 } else { 2235 ep = p; /* save location of equals */ 2236 *p++ = '\0'; /* end option and point to value */ 2237 valp = p; 2238 } 2239 /* 2240 * set option into options table 2241 */ 2242 if (create) 2243 setflg |= VFS_CREATEOPT; 2244 vfs_setmntopt_nolock(mops, s, valp, setflg, 0); 2245 if (cp != NULL) 2246 *cp = ','; /* restore the comma */ 2247 if (valp != NULL) 2248 *ep = '='; /* restore the equals */ 2249 s = nextop; 2250 } 2251 } 2252 2253 /* 2254 * Function to inquire if an option exists in a mount options table. 2255 * Returns a pointer to the option if it exists, else NULL. 2256 * 2257 * This function is *not* for general use by filesystems. 2258 * 2259 * Note: caller is responsible for locking the vfs list, if needed, 2260 * to protect mops. 2261 */ 2262 struct mntopt * 2263 vfs_hasopt(const mntopts_t *mops, const char *opt) 2264 { 2265 struct mntopt *mop; 2266 uint_t i, count; 2267 2268 count = mops->mo_count; 2269 for (i = 0; i < count; i++) { 2270 mop = &mops->mo_list[i]; 2271 2272 if (mop->mo_flags & MO_EMPTY) 2273 continue; 2274 if (strcmp(opt, mop->mo_name) == 0) 2275 return (mop); 2276 } 2277 return (NULL); 2278 } 2279 2280 /* 2281 * Function to inquire if an option is set in a mount options table. 2282 * Returns non-zero if set and fills in the arg pointer with a pointer to 2283 * the argument string or NULL if there is no argument string. 
2284 */ 2285 static int 2286 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp) 2287 { 2288 struct mntopt *mop; 2289 uint_t i, count; 2290 2291 count = mops->mo_count; 2292 for (i = 0; i < count; i++) { 2293 mop = &mops->mo_list[i]; 2294 2295 if (mop->mo_flags & MO_EMPTY) 2296 continue; 2297 if (strcmp(opt, mop->mo_name)) 2298 continue; 2299 if ((mop->mo_flags & MO_SET) == 0) 2300 return (0); 2301 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0) 2302 *argp = mop->mo_arg; 2303 return (1); 2304 } 2305 return (0); 2306 } 2307 2308 2309 int 2310 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp) 2311 { 2312 int ret; 2313 2314 vfs_list_read_lock(); 2315 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp); 2316 vfs_list_unlock(); 2317 return (ret); 2318 } 2319 2320 2321 /* 2322 * Construct a comma separated string of the options set in the given 2323 * mount table, return the string in the given buffer. Return non-zero if 2324 * the buffer would overflow. 2325 * 2326 * This function is *not* for general use by filesystems. 2327 * 2328 * Note: caller is responsible for locking the vfs list, if needed, 2329 * to protect mp. 
2330 */ 2331 int 2332 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len) 2333 { 2334 char *cp; 2335 uint_t i; 2336 2337 buf[0] = '\0'; 2338 cp = buf; 2339 for (i = 0; i < mp->mo_count; i++) { 2340 struct mntopt *mop; 2341 2342 mop = &mp->mo_list[i]; 2343 if (mop->mo_flags & MO_SET) { 2344 int optlen, comma = 0; 2345 2346 if (buf[0] != '\0') 2347 comma = 1; 2348 optlen = strlen(mop->mo_name); 2349 if (strlen(buf) + comma + optlen + 1 > len) 2350 goto err; 2351 if (comma) 2352 *cp++ = ','; 2353 (void) strcpy(cp, mop->mo_name); 2354 cp += optlen; 2355 /* 2356 * Append option value if there is one 2357 */ 2358 if (mop->mo_arg != NULL) { 2359 int arglen; 2360 2361 arglen = strlen(mop->mo_arg); 2362 if (strlen(buf) + arglen + 2 > len) 2363 goto err; 2364 *cp++ = '='; 2365 (void) strcpy(cp, mop->mo_arg); 2366 cp += arglen; 2367 } 2368 } 2369 } 2370 return (0); 2371 err: 2372 return (EOVERFLOW); 2373 } 2374 2375 static void 2376 vfs_freecancelopt(char **moc) 2377 { 2378 if (moc != NULL) { 2379 int ccnt = 0; 2380 char **cp; 2381 2382 for (cp = moc; *cp != NULL; cp++) { 2383 kmem_free(*cp, strlen(*cp) + 1); 2384 ccnt++; 2385 } 2386 kmem_free(moc, (ccnt + 1) * sizeof (char *)); 2387 } 2388 } 2389 2390 static void 2391 vfs_freeopt(mntopt_t *mop) 2392 { 2393 if (mop->mo_name != NULL) 2394 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1); 2395 2396 vfs_freecancelopt(mop->mo_cancel); 2397 2398 if (mop->mo_arg != NULL) 2399 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2400 } 2401 2402 /* 2403 * Free a mount options table 2404 * 2405 * This function is *not* for general use by filesystems. 2406 * 2407 * Note: caller is responsible for locking the vfs list, if needed, 2408 * to protect mp. 
2409 */ 2410 void 2411 vfs_freeopttbl(mntopts_t *mp) 2412 { 2413 uint_t i, count; 2414 2415 count = mp->mo_count; 2416 for (i = 0; i < count; i++) { 2417 vfs_freeopt(&mp->mo_list[i]); 2418 } 2419 if (count) { 2420 kmem_free(mp->mo_list, sizeof (mntopt_t) * count); 2421 mp->mo_count = 0; 2422 mp->mo_list = NULL; 2423 } 2424 } 2425 2426 /* 2427 * Free any mnttab information recorded in the vfs struct. 2428 * The vfs must not be on the vfs list. 2429 */ 2430 static void 2431 vfs_freemnttab(struct vfs *vfsp) 2432 { 2433 ASSERT(!VFS_ON_LIST(vfsp)); 2434 2435 /* 2436 * Free device and mount point information 2437 */ 2438 if (vfsp->vfs_mntpt != NULL) { 2439 refstr_rele(vfsp->vfs_mntpt); 2440 vfsp->vfs_mntpt = NULL; 2441 } 2442 if (vfsp->vfs_resource != NULL) { 2443 refstr_rele(vfsp->vfs_resource); 2444 vfsp->vfs_resource = NULL; 2445 } 2446 /* 2447 * Now free mount options information 2448 */ 2449 vfs_freeopttbl(&vfsp->vfs_mntopts); 2450 } 2451 2452 /* 2453 * Return the last mnttab modification time 2454 */ 2455 void 2456 vfs_mnttab_modtime(timespec_t *ts) 2457 { 2458 ASSERT(RW_LOCK_HELD(&vfslist)); 2459 *ts = vfs_mnttab_mtime; 2460 } 2461 2462 /* 2463 * See if mnttab is changed 2464 */ 2465 void 2466 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp) 2467 { 2468 int changed; 2469 2470 *phpp = (struct pollhead *)NULL; 2471 2472 /* 2473 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime. 2474 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe 2475 * to not grab the vfs list lock because tv_sec is monotonically 2476 * increasing. 
2477 */ 2478 2479 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) || 2480 (old->tv_sec != vfs_mnttab_mtime.tv_sec); 2481 if (!changed) { 2482 *phpp = &vfs_pollhd; 2483 } 2484 } 2485 2486 /* 2487 * Update the mnttab modification time and wake up any waiters for 2488 * mnttab changes 2489 */ 2490 void 2491 vfs_mnttab_modtimeupd() 2492 { 2493 hrtime_t oldhrt, newhrt; 2494 2495 ASSERT(RW_WRITE_HELD(&vfslist)); 2496 oldhrt = ts2hrt(&vfs_mnttab_mtime); 2497 gethrestime(&vfs_mnttab_mtime); 2498 newhrt = ts2hrt(&vfs_mnttab_mtime); 2499 if (oldhrt == (hrtime_t)0) 2500 vfs_mnttab_ctime = vfs_mnttab_mtime; 2501 /* 2502 * Attempt to provide unique mtime (like uniqtime but not). 2503 */ 2504 if (newhrt == oldhrt) { 2505 newhrt++; 2506 hrt2ts(newhrt, &vfs_mnttab_mtime); 2507 } 2508 pollwakeup(&vfs_pollhd, (short)POLLRDBAND); 2509 } 2510 2511 int 2512 dounmount(struct vfs *vfsp, int flag, cred_t *cr) 2513 { 2514 vnode_t *coveredvp; 2515 int error; 2516 extern void teardown_vopstats(vfs_t *); 2517 2518 /* 2519 * Get covered vnode. This will be NULL if the vfs is not linked 2520 * into the file system name space (i.e., domount() with MNT_NOSPICE). 2521 */ 2522 coveredvp = vfsp->vfs_vnodecovered; 2523 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp)); 2524 2525 /* 2526 * Purge all dnlc entries for this vfs. 2527 */ 2528 (void) dnlc_purge_vfsp(vfsp, 0); 2529 2530 /* For forcible umount, skip VFS_SYNC() since it may hang */ 2531 if ((flag & MS_FORCE) == 0) 2532 (void) VFS_SYNC(vfsp, 0, cr); 2533 2534 /* 2535 * Lock the vfs to maintain fs status quo during unmount. This 2536 * has to be done after the sync because ufs_update tries to acquire 2537 * the vfs_reflock. 
2538 */ 2539 vfs_lock_wait(vfsp); 2540 2541 if (error = VFS_UNMOUNT(vfsp, flag, cr)) { 2542 vfs_unlock(vfsp); 2543 if (coveredvp != NULL) 2544 vn_vfsunlock(coveredvp); 2545 } else if (coveredvp != NULL) { 2546 teardown_vopstats(vfsp); 2547 /* 2548 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered) 2549 * when it frees vfsp so we do a VN_HOLD() so we can 2550 * continue to use coveredvp afterwards. 2551 */ 2552 VN_HOLD(coveredvp); 2553 vfs_remove(vfsp); 2554 vn_vfsunlock(coveredvp); 2555 VN_RELE(coveredvp); 2556 } else { 2557 teardown_vopstats(vfsp); 2558 /* 2559 * Release the reference to vfs that is not linked 2560 * into the name space. 2561 */ 2562 vfs_unlock(vfsp); 2563 VFS_RELE(vfsp); 2564 } 2565 return (error); 2566 } 2567 2568 2569 /* 2570 * Vfs_unmountall() is called by uadmin() to unmount all 2571 * mounted file systems (except the root file system) during shutdown. 2572 * It follows the existing locking protocol when traversing the vfs list 2573 * to sync and unmount vfses. Even though there should be no 2574 * other thread running while the system is shutting down, it is prudent 2575 * to still follow the locking protocol. 2576 */ 2577 void 2578 vfs_unmountall(void) 2579 { 2580 struct vfs *vfsp; 2581 struct vfs *prev_vfsp = NULL; 2582 int error; 2583 2584 /* 2585 * Toss all dnlc entries now so that the per-vfs sync 2586 * and unmount operations don't have to slog through 2587 * a bunch of uninteresting vnodes over and over again. 
2588 */ 2589 dnlc_purge(); 2590 2591 vfs_list_lock(); 2592 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) { 2593 prev_vfsp = vfsp->vfs_prev; 2594 2595 if (vfs_lock(vfsp) != 0) 2596 continue; 2597 error = vn_vfswlock(vfsp->vfs_vnodecovered); 2598 vfs_unlock(vfsp); 2599 if (error) 2600 continue; 2601 2602 vfs_list_unlock(); 2603 2604 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED()); 2605 (void) dounmount(vfsp, 0, CRED()); 2606 2607 /* 2608 * Since we dropped the vfslist lock above we must 2609 * verify that next_vfsp still exists, else start over. 2610 */ 2611 vfs_list_lock(); 2612 for (vfsp = rootvfs->vfs_prev; 2613 vfsp != rootvfs; vfsp = vfsp->vfs_prev) 2614 if (vfsp == prev_vfsp) 2615 break; 2616 if (vfsp == rootvfs && prev_vfsp != rootvfs) 2617 prev_vfsp = rootvfs->vfs_prev; 2618 } 2619 vfs_list_unlock(); 2620 } 2621 2622 /* 2623 * Called to add an entry to the end of the vfs mount in progress list 2624 */ 2625 void 2626 vfs_addmip(dev_t dev, struct vfs *vfsp) 2627 { 2628 struct ipmnt *mipp; 2629 2630 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP); 2631 mipp->mip_next = NULL; 2632 mipp->mip_dev = dev; 2633 mipp->mip_vfsp = vfsp; 2634 mutex_enter(&vfs_miplist_mutex); 2635 if (vfs_miplist_end != NULL) 2636 vfs_miplist_end->mip_next = mipp; 2637 else 2638 vfs_miplist = mipp; 2639 vfs_miplist_end = mipp; 2640 mutex_exit(&vfs_miplist_mutex); 2641 } 2642 2643 /* 2644 * Called to remove an entry from the mount in progress list 2645 * Either because the mount completed or it failed. 
2646 */ 2647 void 2648 vfs_delmip(struct vfs *vfsp) 2649 { 2650 struct ipmnt *mipp, *mipprev; 2651 2652 mutex_enter(&vfs_miplist_mutex); 2653 mipprev = NULL; 2654 for (mipp = vfs_miplist; 2655 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) { 2656 mipprev = mipp; 2657 } 2658 if (mipp == NULL) 2659 return; /* shouldn't happen */ 2660 if (mipp == vfs_miplist_end) 2661 vfs_miplist_end = mipprev; 2662 if (mipprev == NULL) 2663 vfs_miplist = mipp->mip_next; 2664 else 2665 mipprev->mip_next = mipp->mip_next; 2666 mutex_exit(&vfs_miplist_mutex); 2667 kmem_free(mipp, sizeof (struct ipmnt)); 2668 } 2669 2670 /* 2671 * vfs_add is called by a specific filesystem's mount routine to add 2672 * the new vfs into the vfs list/hash and to cover the mounted-on vnode. 2673 * The vfs should already have been locked by the caller. 2674 * 2675 * coveredvp is NULL if this is the root. 2676 */ 2677 void 2678 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag) 2679 { 2680 int newflag; 2681 2682 ASSERT(vfs_lock_held(vfsp)); 2683 VFS_HOLD(vfsp); 2684 newflag = vfsp->vfs_flag; 2685 if (mflag & MS_RDONLY) 2686 newflag |= VFS_RDONLY; 2687 else 2688 newflag &= ~VFS_RDONLY; 2689 if (mflag & MS_NOSUID) 2690 newflag |= (VFS_NOSETUID|VFS_NODEVICES); 2691 else 2692 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES); 2693 if (mflag & MS_NOMNTTAB) 2694 newflag |= VFS_NOMNTTAB; 2695 else 2696 newflag &= ~VFS_NOMNTTAB; 2697 2698 if (coveredvp != NULL) { 2699 ASSERT(vn_vfswlock_held(coveredvp)); 2700 coveredvp->v_vfsmountedhere = vfsp; 2701 VN_HOLD(coveredvp); 2702 } 2703 vfsp->vfs_vnodecovered = coveredvp; 2704 vfsp->vfs_flag = newflag; 2705 2706 vfs_list_add(vfsp); 2707 } 2708 2709 /* 2710 * Remove a vfs from the vfs list, null out the pointer from the 2711 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer 2712 * from the vfs to the covered vnode (vfs_vnodecovered). Release the 2713 * reference to the vfs and to the covered vnode. 
2714 * 2715 * Called from dounmount after it's confirmed with the file system 2716 * that the unmount is legal. 2717 */ 2718 void 2719 vfs_remove(struct vfs *vfsp) 2720 { 2721 vnode_t *vp; 2722 2723 ASSERT(vfs_lock_held(vfsp)); 2724 2725 /* 2726 * Can't unmount root. Should never happen because fs will 2727 * be busy. 2728 */ 2729 if (vfsp == rootvfs) 2730 cmn_err(CE_PANIC, "vfs_remove: unmounting root"); 2731 2732 vfs_list_remove(vfsp); 2733 2734 /* 2735 * Unhook from the file system name space. 2736 */ 2737 vp = vfsp->vfs_vnodecovered; 2738 ASSERT(vn_vfswlock_held(vp)); 2739 vp->v_vfsmountedhere = NULL; 2740 vfsp->vfs_vnodecovered = NULL; 2741 VN_RELE(vp); 2742 2743 /* 2744 * Release lock and wakeup anybody waiting. 2745 */ 2746 vfs_unlock(vfsp); 2747 VFS_RELE(vfsp); 2748 } 2749 2750 /* 2751 * Lock a filesystem to prevent access to it while mounting, 2752 * unmounting and syncing. Return EBUSY immediately if lock 2753 * can't be acquired. 2754 */ 2755 int 2756 vfs_lock(vfs_t *vfsp) 2757 { 2758 vn_vfslocks_entry_t *vpvfsentry; 2759 2760 vpvfsentry = vn_vfslocks_getlock(vfsp); 2761 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) 2762 return (0); 2763 2764 vn_vfslocks_rele(vpvfsentry); 2765 return (EBUSY); 2766 } 2767 2768 int 2769 vfs_rlock(vfs_t *vfsp) 2770 { 2771 vn_vfslocks_entry_t *vpvfsentry; 2772 2773 vpvfsentry = vn_vfslocks_getlock(vfsp); 2774 2775 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) 2776 return (0); 2777 2778 vn_vfslocks_rele(vpvfsentry); 2779 return (EBUSY); 2780 } 2781 2782 void 2783 vfs_lock_wait(vfs_t *vfsp) 2784 { 2785 vn_vfslocks_entry_t *vpvfsentry; 2786 2787 vpvfsentry = vn_vfslocks_getlock(vfsp); 2788 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER); 2789 } 2790 2791 void 2792 vfs_rlock_wait(vfs_t *vfsp) 2793 { 2794 vn_vfslocks_entry_t *vpvfsentry; 2795 2796 vpvfsentry = vn_vfslocks_getlock(vfsp); 2797 rwst_enter(&vpvfsentry->ve_lock, RW_READER); 2798 } 2799 2800 /* 2801 * Unlock a locked filesystem. 
2802 */ 2803 void 2804 vfs_unlock(vfs_t *vfsp) 2805 { 2806 vn_vfslocks_entry_t *vpvfsentry; 2807 2808 /* 2809 * vfs_unlock will mimic sema_v behaviour to fix 4748018. 2810 * And these changes should remain for the patch changes as it is. 2811 */ 2812 if (panicstr) 2813 return; 2814 2815 /* 2816 * ve_refcount needs to be dropped twice here. 2817 * 1. To release refernce after a call to vfs_locks_getlock() 2818 * 2. To release the reference from the locking routines like 2819 * vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,. 2820 */ 2821 2822 vpvfsentry = vn_vfslocks_getlock(vfsp); 2823 vn_vfslocks_rele(vpvfsentry); 2824 2825 rwst_exit(&vpvfsentry->ve_lock); 2826 vn_vfslocks_rele(vpvfsentry); 2827 } 2828 2829 /* 2830 * Utility routine that allows a filesystem to construct its 2831 * fsid in "the usual way" - by munging some underlying dev_t and 2832 * the filesystem type number into the 64-bit fsid. Note that 2833 * this implicitly relies on dev_t persistence to make filesystem 2834 * id's persistent. 2835 * 2836 * There's nothing to prevent an individual fs from constructing its 2837 * fsid in a different way, and indeed they should. 2838 * 2839 * Since we want fsids to be 32-bit quantities (so that they can be 2840 * exported identically by either 32-bit or 64-bit APIs, as well as 2841 * the fact that fsid's are "known" to NFS), we compress the device 2842 * number given down to 32-bits, and panic if that isn't possible. 2843 */ 2844 void 2845 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val) 2846 { 2847 if (!cmpldev((dev32_t *)&fsi->val[0], dev)) 2848 panic("device number too big for fsid!"); 2849 fsi->val[1] = val; 2850 } 2851 2852 int 2853 vfs_lock_held(vfs_t *vfsp) 2854 { 2855 int held; 2856 vn_vfslocks_entry_t *vpvfsentry; 2857 2858 /* 2859 * vfs_lock_held will mimic sema_held behaviour 2860 * if panicstr is set. And these changes should remain 2861 * for the patch changes as it is. 
2862 */ 2863 if (panicstr) 2864 return (1); 2865 2866 vpvfsentry = vn_vfslocks_getlock(vfsp); 2867 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); 2868 2869 vn_vfslocks_rele(vpvfsentry); 2870 return (held); 2871 } 2872 2873 struct _kthread * 2874 vfs_lock_owner(vfs_t *vfsp) 2875 { 2876 struct _kthread *owner; 2877 vn_vfslocks_entry_t *vpvfsentry; 2878 2879 /* 2880 * vfs_wlock_held will mimic sema_held behaviour 2881 * if panicstr is set. And these changes should remain 2882 * for the patch changes as it is. 2883 */ 2884 if (panicstr) 2885 return (NULL); 2886 2887 vpvfsentry = vn_vfslocks_getlock(vfsp); 2888 owner = rwst_owner(&vpvfsentry->ve_lock); 2889 2890 vn_vfslocks_rele(vpvfsentry); 2891 return (owner); 2892 } 2893 2894 /* 2895 * vfs list locking. 2896 * 2897 * Rather than manipulate the vfslist lock directly, we abstract into lock 2898 * and unlock routines to allow the locking implementation to be changed for 2899 * clustering. 2900 * 2901 * Whenever the vfs list is modified through its hash links, the overall list 2902 * lock must be obtained before locking the relevant hash bucket. But to see 2903 * whether a given vfs is on the list, it suffices to obtain the lock for the 2904 * hash bucket without getting the overall list lock. (See getvfs() below.) 2905 */ 2906 2907 void 2908 vfs_list_lock() 2909 { 2910 rw_enter(&vfslist, RW_WRITER); 2911 } 2912 2913 void 2914 vfs_list_read_lock() 2915 { 2916 rw_enter(&vfslist, RW_READER); 2917 } 2918 2919 void 2920 vfs_list_unlock() 2921 { 2922 rw_exit(&vfslist); 2923 } 2924 2925 /* 2926 * Low level worker routines for adding entries to and removing entries from 2927 * the vfs list. 
2928 */ 2929 2930 static void 2931 vfs_hash_add(struct vfs *vfsp, int insert_at_head) 2932 { 2933 int vhno; 2934 struct vfs **hp; 2935 dev_t dev; 2936 2937 ASSERT(RW_WRITE_HELD(&vfslist)); 2938 2939 dev = expldev(vfsp->vfs_fsid.val[0]); 2940 vhno = VFSHASH(getmajor(dev), getminor(dev)); 2941 2942 mutex_enter(&rvfs_list[vhno].rvfs_lock); 2943 2944 /* 2945 * Link into the hash table, inserting it at the end, so that LOFS 2946 * with the same fsid as UFS (or other) file systems will not hide the 2947 * UFS. 2948 */ 2949 if (insert_at_head) { 2950 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head; 2951 rvfs_list[vhno].rvfs_head = vfsp; 2952 } else { 2953 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL; 2954 hp = &(*hp)->vfs_hash) 2955 continue; 2956 /* 2957 * hp now contains the address of the pointer to update 2958 * to effect the insertion. 2959 */ 2960 vfsp->vfs_hash = NULL; 2961 *hp = vfsp; 2962 } 2963 2964 rvfs_list[vhno].rvfs_len++; 2965 mutex_exit(&rvfs_list[vhno].rvfs_lock); 2966 } 2967 2968 2969 static void 2970 vfs_hash_remove(struct vfs *vfsp) 2971 { 2972 int vhno; 2973 struct vfs *tvfsp; 2974 dev_t dev; 2975 2976 ASSERT(RW_WRITE_HELD(&vfslist)); 2977 2978 dev = expldev(vfsp->vfs_fsid.val[0]); 2979 vhno = VFSHASH(getmajor(dev), getminor(dev)); 2980 2981 mutex_enter(&rvfs_list[vhno].rvfs_lock); 2982 2983 /* 2984 * Remove from hash. 
2985 */ 2986 if (rvfs_list[vhno].rvfs_head == vfsp) { 2987 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash; 2988 rvfs_list[vhno].rvfs_len--; 2989 goto foundit; 2990 } 2991 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL; 2992 tvfsp = tvfsp->vfs_hash) { 2993 if (tvfsp->vfs_hash == vfsp) { 2994 tvfsp->vfs_hash = vfsp->vfs_hash; 2995 rvfs_list[vhno].rvfs_len--; 2996 goto foundit; 2997 } 2998 } 2999 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash"); 3000 3001 foundit: 3002 3003 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3004 } 3005 3006 3007 void 3008 vfs_list_add(struct vfs *vfsp) 3009 { 3010 zone_t *zone; 3011 3012 /* 3013 * The zone that owns the mount is the one that performed the mount. 3014 * Note that this isn't necessarily the same as the zone mounted into. 3015 * The corresponding zone_rele() will be done when the vfs_t is 3016 * being free'd. 3017 */ 3018 vfsp->vfs_zone = curproc->p_zone; 3019 zone_hold(vfsp->vfs_zone); 3020 3021 /* 3022 * Find the zone mounted into, and put this mount on its vfs list. 3023 */ 3024 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3025 ASSERT(zone != NULL); 3026 /* 3027 * Special casing for the root vfs. This structure is allocated 3028 * statically and hooked onto rootvfs at link time. During the 3029 * vfs_mountroot call at system startup time, the root file system's 3030 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct 3031 * as argument. The code below must detect and handle this special 3032 * case. The only apparent justification for this special casing is 3033 * to ensure that the root file system appears at the head of the 3034 * list. 3035 * 3036 * XXX: I'm assuming that it's ok to do normal list locking when 3037 * adding the entry for the root file system (this used to be 3038 * done with no locks held). 3039 */ 3040 vfs_list_lock(); 3041 /* 3042 * Link into the vfs list proper. 
3043 */ 3044 if (vfsp == &root) { 3045 /* 3046 * Assert: This vfs is already on the list as its first entry. 3047 * Thus, there's nothing to do. 3048 */ 3049 ASSERT(rootvfs == vfsp); 3050 /* 3051 * Add it to the head of the global zone's vfslist. 3052 */ 3053 ASSERT(zone == global_zone); 3054 ASSERT(zone->zone_vfslist == NULL); 3055 zone->zone_vfslist = vfsp; 3056 } else { 3057 /* 3058 * Link to end of list using vfs_prev (as rootvfs is now a 3059 * doubly linked circular list) so list is in mount order for 3060 * mnttab use. 3061 */ 3062 rootvfs->vfs_prev->vfs_next = vfsp; 3063 vfsp->vfs_prev = rootvfs->vfs_prev; 3064 rootvfs->vfs_prev = vfsp; 3065 vfsp->vfs_next = rootvfs; 3066 3067 /* 3068 * Do it again for the zone-private list (which may be NULL). 3069 */ 3070 if (zone->zone_vfslist == NULL) { 3071 ASSERT(zone != global_zone); 3072 zone->zone_vfslist = vfsp; 3073 } else { 3074 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp; 3075 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev; 3076 zone->zone_vfslist->vfs_zone_prev = vfsp; 3077 vfsp->vfs_zone_next = zone->zone_vfslist; 3078 } 3079 } 3080 3081 /* 3082 * Link into the hash table, inserting it at the end, so that LOFS 3083 * with the same fsid as UFS (or other) file systems will not hide 3084 * the UFS. 3085 */ 3086 vfs_hash_add(vfsp, 0); 3087 3088 /* 3089 * update the mnttab modification time 3090 */ 3091 vfs_mnttab_modtimeupd(); 3092 vfs_list_unlock(); 3093 zone_rele(zone); 3094 } 3095 3096 void 3097 vfs_list_remove(struct vfs *vfsp) 3098 { 3099 zone_t *zone; 3100 3101 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3102 ASSERT(zone != NULL); 3103 /* 3104 * Callers are responsible for preventing attempts to unmount the 3105 * root. 3106 */ 3107 ASSERT(vfsp != rootvfs); 3108 3109 vfs_list_lock(); 3110 3111 /* 3112 * Remove from hash. 3113 */ 3114 vfs_hash_remove(vfsp); 3115 3116 /* 3117 * Remove from vfs list. 
3118 */ 3119 vfsp->vfs_prev->vfs_next = vfsp->vfs_next; 3120 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev; 3121 vfsp->vfs_next = vfsp->vfs_prev = NULL; 3122 3123 /* 3124 * Remove from zone-specific vfs list. 3125 */ 3126 if (zone->zone_vfslist == vfsp) 3127 zone->zone_vfslist = vfsp->vfs_zone_next; 3128 3129 if (vfsp->vfs_zone_next == vfsp) { 3130 ASSERT(vfsp->vfs_zone_prev == vfsp); 3131 ASSERT(zone->zone_vfslist == vfsp); 3132 zone->zone_vfslist = NULL; 3133 } 3134 3135 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next; 3136 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev; 3137 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL; 3138 3139 /* 3140 * update the mnttab modification time 3141 */ 3142 vfs_mnttab_modtimeupd(); 3143 vfs_list_unlock(); 3144 zone_rele(zone); 3145 } 3146 3147 struct vfs * 3148 getvfs(fsid_t *fsid) 3149 { 3150 struct vfs *vfsp; 3151 int val0 = fsid->val[0]; 3152 int val1 = fsid->val[1]; 3153 dev_t dev = expldev(val0); 3154 int vhno = VFSHASH(getmajor(dev), getminor(dev)); 3155 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock; 3156 3157 mutex_enter(hmp); 3158 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) { 3159 if (vfsp->vfs_fsid.val[0] == val0 && 3160 vfsp->vfs_fsid.val[1] == val1) { 3161 VFS_HOLD(vfsp); 3162 mutex_exit(hmp); 3163 return (vfsp); 3164 } 3165 } 3166 mutex_exit(hmp); 3167 return (NULL); 3168 } 3169 3170 /* 3171 * Search the vfs mount in progress list for a specified device/vfs entry. 3172 * Returns 0 if the first entry in the list that the device matches has the 3173 * given vfs pointer as well. If the device matches but a different vfs 3174 * pointer is encountered in the list before the given vfs pointer then 3175 * a 1 is returned. 
3176 */ 3177 3178 int 3179 vfs_devmounting(dev_t dev, struct vfs *vfsp) 3180 { 3181 int retval = 0; 3182 struct ipmnt *mipp; 3183 3184 mutex_enter(&vfs_miplist_mutex); 3185 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) { 3186 if (mipp->mip_dev == dev) { 3187 if (mipp->mip_vfsp != vfsp) 3188 retval = 1; 3189 break; 3190 } 3191 } 3192 mutex_exit(&vfs_miplist_mutex); 3193 return (retval); 3194 } 3195 3196 /* 3197 * Search the vfs list for a specified device. Returns 1, if entry is found 3198 * or 0 if no suitable entry is found. 3199 */ 3200 3201 int 3202 vfs_devismounted(dev_t dev) 3203 { 3204 struct vfs *vfsp; 3205 int found; 3206 3207 vfs_list_read_lock(); 3208 vfsp = rootvfs; 3209 found = 0; 3210 do { 3211 if (vfsp->vfs_dev == dev) { 3212 found = 1; 3213 break; 3214 } 3215 vfsp = vfsp->vfs_next; 3216 } while (vfsp != rootvfs); 3217 3218 vfs_list_unlock(); 3219 return (found); 3220 } 3221 3222 /* 3223 * Search the vfs list for a specified device. Returns a pointer to it 3224 * or NULL if no suitable entry is found. The caller of this routine 3225 * is responsible for releasing the returned vfs pointer. 3226 */ 3227 struct vfs * 3228 vfs_dev2vfsp(dev_t dev) 3229 { 3230 struct vfs *vfsp; 3231 int found; 3232 3233 vfs_list_read_lock(); 3234 vfsp = rootvfs; 3235 found = 0; 3236 do { 3237 /* 3238 * The following could be made more efficient by making 3239 * the entire loop use vfs_zone_next if the call is from 3240 * a zone. The only callers, however, ustat(2) and 3241 * umount2(2), don't seem to justify the added 3242 * complexity at present. 3243 */ 3244 if (vfsp->vfs_dev == dev && 3245 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt), 3246 curproc->p_zone)) { 3247 VFS_HOLD(vfsp); 3248 found = 1; 3249 break; 3250 } 3251 vfsp = vfsp->vfs_next; 3252 } while (vfsp != rootvfs); 3253 vfs_list_unlock(); 3254 return (found ? vfsp: NULL); 3255 } 3256 3257 /* 3258 * Search the vfs list for a specified mntpoint. 
Returns a pointer to it 3259 * or NULL if no suitable entry is found. The caller of this routine 3260 * is responsible for releasing the returned vfs pointer. 3261 * 3262 * Note that if multiple mntpoints match, the last one matching is 3263 * returned in an attempt to return the "top" mount when overlay 3264 * mounts are covering the same mount point. This is accomplished by starting 3265 * at the end of the list and working our way backwards, stopping at the first 3266 * matching mount. 3267 */ 3268 struct vfs * 3269 vfs_mntpoint2vfsp(const char *mp) 3270 { 3271 struct vfs *vfsp; 3272 struct vfs *retvfsp = NULL; 3273 zone_t *zone = curproc->p_zone; 3274 struct vfs *list; 3275 3276 vfs_list_read_lock(); 3277 if (getzoneid() == GLOBAL_ZONEID) { 3278 /* 3279 * The global zone may see filesystems in any zone. 3280 */ 3281 vfsp = rootvfs->vfs_prev; 3282 do { 3283 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) { 3284 retvfsp = vfsp; 3285 break; 3286 } 3287 vfsp = vfsp->vfs_prev; 3288 } while (vfsp != rootvfs->vfs_prev); 3289 } else if ((list = zone->zone_vfslist) != NULL) { 3290 const char *mntpt; 3291 3292 vfsp = list->vfs_zone_prev; 3293 do { 3294 mntpt = refstr_value(vfsp->vfs_mntpt); 3295 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); 3296 if (strcmp(mntpt, mp) == 0) { 3297 retvfsp = vfsp; 3298 break; 3299 } 3300 vfsp = vfsp->vfs_zone_prev; 3301 } while (vfsp != list->vfs_zone_prev); 3302 } 3303 if (retvfsp) 3304 VFS_HOLD(retvfsp); 3305 vfs_list_unlock(); 3306 return (retvfsp); 3307 } 3308 3309 /* 3310 * Search the vfs list for a specified vfsops. 3311 * if vfs entry is found then return 1, else 0. 
3312 */ 3313 int 3314 vfs_opsinuse(vfsops_t *ops) 3315 { 3316 struct vfs *vfsp; 3317 int found; 3318 3319 vfs_list_read_lock(); 3320 vfsp = rootvfs; 3321 found = 0; 3322 do { 3323 if (vfs_getops(vfsp) == ops) { 3324 found = 1; 3325 break; 3326 } 3327 vfsp = vfsp->vfs_next; 3328 } while (vfsp != rootvfs); 3329 vfs_list_unlock(); 3330 return (found); 3331 } 3332 3333 /* 3334 * Allocate an entry in vfssw for a file system type 3335 */ 3336 struct vfssw * 3337 allocate_vfssw(char *type) 3338 { 3339 struct vfssw *vswp; 3340 3341 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) { 3342 /* 3343 * The vfssw table uses the empty string to identify an 3344 * available entry; we cannot add any type which has 3345 * a leading NUL. The string length is limited to 3346 * the size of the st_fstype array in struct stat. 3347 */ 3348 return (NULL); 3349 } 3350 3351 ASSERT(VFSSW_WRITE_LOCKED()); 3352 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) 3353 if (!ALLOCATED_VFSSW(vswp)) { 3354 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP); 3355 (void) strcpy(vswp->vsw_name, type); 3356 ASSERT(vswp->vsw_count == 0); 3357 vswp->vsw_count = 1; 3358 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL); 3359 return (vswp); 3360 } 3361 return (NULL); 3362 } 3363 3364 /* 3365 * Impose additional layer of translation between vfstype names 3366 * and module names in the filesystem. 3367 */ 3368 static char * 3369 vfs_to_modname(char *vfstype) 3370 { 3371 if (strcmp(vfstype, "proc") == 0) { 3372 vfstype = "procfs"; 3373 } else if (strcmp(vfstype, "fd") == 0) { 3374 vfstype = "fdfs"; 3375 } else if (strncmp(vfstype, "nfs", 3) == 0) { 3376 vfstype = "nfs"; 3377 } 3378 3379 return (vfstype); 3380 } 3381 3382 /* 3383 * Find a vfssw entry given a file system type name. 3384 * Try to autoload the filesystem if it's not found. 3385 * If it's installed, return the vfssw locked to prevent unloading. 
3386 */ 3387 struct vfssw * 3388 vfs_getvfssw(char *type) 3389 { 3390 struct vfssw *vswp; 3391 char *modname; 3392 3393 RLOCK_VFSSW(); 3394 vswp = vfs_getvfsswbyname(type); 3395 modname = vfs_to_modname(type); 3396 3397 if (rootdir == NULL) { 3398 /* 3399 * If we haven't yet loaded the root file system, then our 3400 * _init won't be called until later. Allocate vfssw entry, 3401 * because mod_installfs won't be called. 3402 */ 3403 if (vswp == NULL) { 3404 RUNLOCK_VFSSW(); 3405 WLOCK_VFSSW(); 3406 if ((vswp = vfs_getvfsswbyname(type)) == NULL) { 3407 if ((vswp = allocate_vfssw(type)) == NULL) { 3408 WUNLOCK_VFSSW(); 3409 return (NULL); 3410 } 3411 } 3412 WUNLOCK_VFSSW(); 3413 RLOCK_VFSSW(); 3414 } 3415 if (!VFS_INSTALLED(vswp)) { 3416 RUNLOCK_VFSSW(); 3417 (void) modloadonly("fs", modname); 3418 } else 3419 RUNLOCK_VFSSW(); 3420 return (vswp); 3421 } 3422 3423 /* 3424 * Try to load the filesystem. Before calling modload(), we drop 3425 * our lock on the VFS switch table, and pick it up after the 3426 * module is loaded. However, there is a potential race: the 3427 * module could be unloaded after the call to modload() completes 3428 * but before we pick up the lock and drive on. Therefore, 3429 * we keep reloading the module until we've loaded the module 3430 * _and_ we have the lock on the VFS switch table. 3431 */ 3432 while (vswp == NULL || !VFS_INSTALLED(vswp)) { 3433 RUNLOCK_VFSSW(); 3434 if (modload("fs", modname) == -1) 3435 return (NULL); 3436 RLOCK_VFSSW(); 3437 if (vswp == NULL) 3438 if ((vswp = vfs_getvfsswbyname(type)) == NULL) 3439 break; 3440 } 3441 RUNLOCK_VFSSW(); 3442 3443 return (vswp); 3444 } 3445 3446 /* 3447 * Find a vfssw entry given a file system type name. 
3448 */ 3449 struct vfssw * 3450 vfs_getvfsswbyname(char *type) 3451 { 3452 struct vfssw *vswp; 3453 3454 ASSERT(VFSSW_LOCKED()); 3455 if (type == NULL || *type == '\0') 3456 return (NULL); 3457 3458 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3459 if (strcmp(type, vswp->vsw_name) == 0) { 3460 vfs_refvfssw(vswp); 3461 return (vswp); 3462 } 3463 } 3464 3465 return (NULL); 3466 } 3467 3468 /* 3469 * Find a vfssw entry given a set of vfsops. 3470 */ 3471 struct vfssw * 3472 vfs_getvfsswbyvfsops(vfsops_t *vfsops) 3473 { 3474 struct vfssw *vswp; 3475 3476 RLOCK_VFSSW(); 3477 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3478 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) { 3479 vfs_refvfssw(vswp); 3480 RUNLOCK_VFSSW(); 3481 return (vswp); 3482 } 3483 } 3484 RUNLOCK_VFSSW(); 3485 3486 return (NULL); 3487 } 3488 3489 /* 3490 * Reference a vfssw entry. 3491 */ 3492 void 3493 vfs_refvfssw(struct vfssw *vswp) 3494 { 3495 3496 mutex_enter(&vswp->vsw_lock); 3497 vswp->vsw_count++; 3498 mutex_exit(&vswp->vsw_lock); 3499 } 3500 3501 /* 3502 * Unreference a vfssw entry. 3503 */ 3504 void 3505 vfs_unrefvfssw(struct vfssw *vswp) 3506 { 3507 3508 mutex_enter(&vswp->vsw_lock); 3509 vswp->vsw_count--; 3510 mutex_exit(&vswp->vsw_lock); 3511 } 3512 3513 int sync_timeout = 30; /* timeout for syncing a page during panic */ 3514 int sync_timeleft; /* portion of sync_timeout remaining */ 3515 3516 static int sync_retries = 20; /* number of retries when not making progress */ 3517 static int sync_triesleft; /* portion of sync_retries remaining */ 3518 3519 static pgcnt_t old_pgcnt, new_pgcnt; 3520 static int new_bufcnt, old_bufcnt; 3521 3522 /* 3523 * Sync all of the mounted filesystems, and then wait for the actual i/o to 3524 * complete. We wait by counting the number of dirty pages and buffers, 3525 * pushing them out using bio_busy() and page_busy(), and then counting again. 
 * This routine is used by both the uadmin A_SHUTDOWN code and the SYNC
 * phase of the panic code (see comments in panic.c). It should only
 * be used after some higher-level mechanism has quiesced the system so that
 * new writes are not being initiated while we are waiting for completion.
 *
 * To ensure finite running time, our algorithm uses two timeout mechanisms:
 * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
 * sync_triesleft (a progress counter used by the vfs_syncall() loop below).
 * Together these ensure that syncing completes even if our i/o paths are
 * stuck. The counters are declared above so they can be found easily in the
 * debugger.
 *
 * The sync_timeleft counter is reset by bio_busy() and page_busy() using the
 * vfs_syncprogress() subroutine whenever we make progress through the lists of
 * pages and buffers. It is decremented and expired by the deadman() cyclic.
 * When vfs_syncall() decides it is done, we disable the deadman() counter by
 * setting sync_timeleft to zero. This timer guards against vfs_syncall()
 * deadlocking or hanging inside of a broken filesystem or driver routine.
 *
 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make
 * sync_retries consecutive calls to bio_busy() and page_busy() without
 * decreasing either the number of dirty buffers or dirty pages below the
 * lowest count we have seen so far, we give up and return from vfs_syncall().
 *
 * Each loop iteration ends with a one-second delay() to allow time for
 * i/o completion and to permit the user time to read our progress messages.
3551 */ 3552 void 3553 vfs_syncall(void) 3554 { 3555 if (rootdir == NULL && !modrootloaded) 3556 return; /* panic during boot - no filesystems yet */ 3557 3558 printf("syncing file systems..."); 3559 vfs_syncprogress(); 3560 sync(); 3561 3562 vfs_syncprogress(); 3563 sync_triesleft = sync_retries; 3564 3565 old_bufcnt = new_bufcnt = INT_MAX; 3566 old_pgcnt = new_pgcnt = ULONG_MAX; 3567 3568 while (sync_triesleft > 0) { 3569 old_bufcnt = MIN(old_bufcnt, new_bufcnt); 3570 old_pgcnt = MIN(old_pgcnt, new_pgcnt); 3571 3572 new_bufcnt = bio_busy(B_TRUE); 3573 new_pgcnt = page_busy(B_TRUE); 3574 vfs_syncprogress(); 3575 3576 if (new_bufcnt == 0 && new_pgcnt == 0) 3577 break; 3578 3579 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt) 3580 sync_triesleft = sync_retries; 3581 else 3582 sync_triesleft--; 3583 3584 if (new_bufcnt) 3585 printf(" [%d]", new_bufcnt); 3586 if (new_pgcnt) 3587 printf(" %lu", new_pgcnt); 3588 3589 delay(hz); 3590 } 3591 3592 if (new_bufcnt != 0 || new_pgcnt != 0) 3593 printf(" done (not all i/o completed)\n"); 3594 else 3595 printf(" done\n"); 3596 3597 sync_timeleft = 0; 3598 delay(hz); 3599 } 3600 3601 /* 3602 * If we are in the middle of the sync phase of panic, reset sync_timeleft to 3603 * sync_timeout to indicate that we are making progress and the deadman() 3604 * omnipresent cyclic should not yet time us out. Note that it is safe to 3605 * store to sync_timeleft here since the deadman() is firing at high-level 3606 * on top of us. If we are racing with the deadman(), either the deadman() 3607 * will decrement the old value and then we will reset it, or we will 3608 * reset it and then the deadman() will immediately decrement it. In either 3609 * case, correct behavior results. 3610 */ 3611 void 3612 vfs_syncprogress(void) 3613 { 3614 if (panicstr) 3615 sync_timeleft = sync_timeout; 3616 } 3617 3618 /* 3619 * Map VFS flags to statvfs flags. These shouldn't really be separate 3620 * flags at all. 
3621 */ 3622 uint_t 3623 vf_to_stf(uint_t vf) 3624 { 3625 uint_t stf = 0; 3626 3627 if (vf & VFS_RDONLY) 3628 stf |= ST_RDONLY; 3629 if (vf & VFS_NOSETUID) 3630 stf |= ST_NOSUID; 3631 if (vf & VFS_NOTRUNC) 3632 stf |= ST_NOTRUNC; 3633 3634 return (stf); 3635 } 3636 3637 /* 3638 * Use old-style function prototype for vfsstray() so 3639 * that we can use it anywhere in the vfsops structure. 3640 */ 3641 int vfsstray(); 3642 3643 /* 3644 * Entries for (illegal) fstype 0. 3645 */ 3646 /* ARGSUSED */ 3647 int 3648 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr) 3649 { 3650 cmn_err(CE_PANIC, "stray vfs operation"); 3651 return (0); 3652 } 3653 3654 vfsops_t vfs_strayops = { 3655 vfsstray, 3656 vfsstray, 3657 vfsstray, 3658 vfsstray, 3659 vfsstray_sync, 3660 vfsstray, 3661 vfsstray, 3662 vfsstray 3663 }; 3664 3665 /* 3666 * Entries for (illegal) fstype 0. 3667 */ 3668 int 3669 vfsstray(void) 3670 { 3671 cmn_err(CE_PANIC, "stray vfs operation"); 3672 return (0); 3673 } 3674 3675 /* 3676 * Support for dealing with forced UFS unmount and its interaction with 3677 * LOFS. Could be used by any filesystem. 3678 * See bug 1203132. 3679 */ 3680 int 3681 vfs_EIO(void) 3682 { 3683 return (EIO); 3684 } 3685 3686 /* 3687 * We've gotta define the op for sync separately, since the compiler gets 3688 * confused if we mix and match ANSI and normal style prototypes when 3689 * a "short" argument is present and spits out a warning. 
3690 */ 3691 /*ARGSUSED*/ 3692 int 3693 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr) 3694 { 3695 return (EIO); 3696 } 3697 3698 vfs_t EIO_vfs; 3699 vfsops_t *EIO_vfsops; 3700 3701 /* 3702 * Called from startup() to initialize all loaded vfs's 3703 */ 3704 void 3705 vfsinit(void) 3706 { 3707 struct vfssw *vswp; 3708 int error; 3709 extern int vopstats_enabled; 3710 extern void vopstats_startup(); 3711 3712 static const fs_operation_def_t EIO_vfsops_template[] = { 3713 VFSNAME_MOUNT, vfs_EIO, 3714 VFSNAME_UNMOUNT, vfs_EIO, 3715 VFSNAME_ROOT, vfs_EIO, 3716 VFSNAME_STATVFS, vfs_EIO, 3717 VFSNAME_SYNC, (fs_generic_func_p) vfs_EIO_sync, 3718 VFSNAME_VGET, vfs_EIO, 3719 VFSNAME_MOUNTROOT, vfs_EIO, 3720 VFSNAME_FREEVFS, vfs_EIO, 3721 VFSNAME_VNSTATE, vfs_EIO, 3722 NULL, NULL 3723 }; 3724 3725 3726 /* Initialize the vnode cache (file systems may use it during init). */ 3727 3728 vn_create_cache(); 3729 3730 /* Setup event monitor framework */ 3731 3732 fem_init(); 3733 3734 /* Initialize the dummy stray file system type. */ 3735 3736 vfssw[0].vsw_vfsops = vfs_strayops; 3737 3738 /* Initialize the dummy EIO file system. */ 3739 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops); 3740 if (error != 0) { 3741 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template"); 3742 /* Shouldn't happen, but not bad enough to panic */ 3743 } 3744 3745 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL); 3746 3747 /* 3748 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup 3749 * on this vfs can immediately notice it's invalid. 3750 */ 3751 EIO_vfs.vfs_flag |= VFS_UNMOUNTED; 3752 3753 /* 3754 * Call the init routines of non-loadable filesystems only. 3755 * Filesystems which are loaded as separate modules will be 3756 * initialized by the module loading code instead. 
3757 */ 3758 3759 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3760 RLOCK_VFSSW(); 3761 if (vswp->vsw_init != NULL) 3762 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name); 3763 RUNLOCK_VFSSW(); 3764 } 3765 3766 vopstats_startup(); 3767 3768 if (vopstats_enabled) { 3769 /* EIO_vfs can collect stats, but we don't retrieve them */ 3770 initialize_vopstats(&EIO_vfs.vfs_vopstats); 3771 EIO_vfs.vfs_fstypevsp = NULL; 3772 EIO_vfs.vfs_vskap = NULL; 3773 EIO_vfs.vfs_flag |= VFS_STATS; 3774 } 3775 } 3776 3777 /* 3778 * Increments the vfs reference count by one atomically. 3779 */ 3780 void 3781 vfs_hold(vfs_t *vfsp) 3782 { 3783 atomic_add_32(&vfsp->vfs_count, 1); 3784 ASSERT(vfsp->vfs_count != 0); 3785 } 3786 3787 /* 3788 * Decrements the vfs reference count by one atomically. When 3789 * vfs reference count becomes zero, it calls the file system 3790 * specific vfs_freevfs() to free up the resources. 3791 */ 3792 void 3793 vfs_rele(vfs_t *vfsp) 3794 { 3795 ASSERT(vfsp->vfs_count != 0); 3796 if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) { 3797 VFS_FREEVFS(vfsp); 3798 if (vfsp->vfs_zone) 3799 zone_rele(vfsp->vfs_zone); 3800 vfs_freemnttab(vfsp); 3801 sema_destroy(&vfsp->vfs_reflock); 3802 kmem_free(vfsp, sizeof (*vfsp)); 3803 } 3804 } 3805 3806 /* 3807 * Generic operations vector support. 3808 * 3809 * This is used to build operations vectors for both the vfs and vnode. 3810 * It's normally called only when a file system is loaded. 3811 * 3812 * There are many possible algorithms for this, including the following: 3813 * 3814 * (1) scan the list of known operations; for each, see if the file system 3815 * includes an entry for it, and fill it in as appropriate. 3816 * 3817 * (2) set up defaults for all known operations. scan the list of ops 3818 * supplied by the file system; for each which is both supplied and 3819 * known, fill it in. 3820 * 3821 * (3) sort the lists of known ops & supplied ops; scan the list, filling 3822 * in entries as we go. 
3823 * 3824 * we choose (1) for simplicity, and because performance isn't critical here. 3825 * note that (2) could be sped up using a precomputed hash table on known ops. 3826 * (3) could be faster than either, but only if the lists were very large or 3827 * supplied in sorted order. 3828 * 3829 */ 3830 3831 int 3832 fs_build_vector(void *vector, int *unused_ops, 3833 const fs_operation_trans_def_t *translation, 3834 const fs_operation_def_t *operations) 3835 { 3836 int i, num_trans, num_ops, used; 3837 3838 /* Count the number of translations and the number of supplied */ 3839 /* operations. */ 3840 3841 { 3842 const fs_operation_trans_def_t *p; 3843 3844 for (num_trans = 0, p = translation; 3845 p->name != NULL; 3846 num_trans++, p++) 3847 ; 3848 } 3849 3850 { 3851 const fs_operation_def_t *p; 3852 3853 for (num_ops = 0, p = operations; 3854 p->name != NULL; 3855 num_ops++, p++) 3856 ; 3857 } 3858 3859 /* Walk through each operation known to our caller. There will be */ 3860 /* one entry in the supplied "translation table" for each. */ 3861 3862 used = 0; 3863 3864 for (i = 0; i < num_trans; i++) { 3865 int j, found; 3866 char *curname; 3867 fs_generic_func_p result; 3868 fs_generic_func_p *location; 3869 3870 curname = translation[i].name; 3871 3872 /* Look for a matching operation in the list supplied by the */ 3873 /* file system. */ 3874 3875 found = 0; 3876 3877 for (j = 0; j < num_ops; j++) { 3878 if (strcmp(operations[j].name, curname) == 0) { 3879 used++; 3880 found = 1; 3881 break; 3882 } 3883 } 3884 3885 /* If the file system is using a "placeholder" for default */ 3886 /* or error functions, grab the appropriate function out of */ 3887 /* the translation table. If the file system didn't supply */ 3888 /* this operation at all, use the default function. 
*/ 3889 3890 if (found) { 3891 result = operations[j].func; 3892 if (result == fs_default) { 3893 result = translation[i].defaultFunc; 3894 } else if (result == fs_error) { 3895 result = translation[i].errorFunc; 3896 } else if (result == NULL) { 3897 /* Null values are PROHIBITED */ 3898 return (EINVAL); 3899 } 3900 } else { 3901 result = translation[i].defaultFunc; 3902 } 3903 3904 /* Now store the function into the operations vector. */ 3905 3906 location = (fs_generic_func_p *) 3907 (((char *)vector) + translation[i].offset); 3908 3909 *location = result; 3910 } 3911 3912 *unused_ops = num_ops - used; 3913 3914 return (0); 3915 } 3916 3917 /* Placeholder functions, should never be called. */ 3918 3919 int 3920 fs_error(void) 3921 { 3922 cmn_err(CE_PANIC, "fs_error called"); 3923 return (0); 3924 } 3925 3926 int 3927 fs_default(void) 3928 { 3929 cmn_err(CE_PANIC, "fs_default called"); 3930 return (0); 3931 } 3932 3933 #ifdef __sparc 3934 3935 /* 3936 * Part of the implementation of booting off a mirrored root 3937 * involves a change of dev_t for the root device. To 3938 * accomplish this, first remove the existing hash table 3939 * entry for the root device, convert to the new dev_t, 3940 * then re-insert in the hash table at the head of the list. 
3941 */ 3942 void 3943 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype) 3944 { 3945 vfs_list_lock(); 3946 3947 vfs_hash_remove(vfsp); 3948 3949 vfsp->vfs_dev = ndev; 3950 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype); 3951 3952 vfs_hash_add(vfsp, 1); 3953 3954 vfs_list_unlock(); 3955 } 3956 3957 #else /* x86 NEWBOOT */ 3958 3959 int 3960 rootconf() 3961 { 3962 int error; 3963 struct vfssw *vsw; 3964 extern void pm_init(); 3965 char *fstyp; 3966 3967 fstyp = getrootfs(); 3968 3969 if (error = clboot_rootconf()) 3970 return (error); 3971 3972 if (modload("fs", fstyp) == -1) 3973 cmn_err(CE_PANIC, "Cannot _init %s module\n", fstyp); 3974 3975 RLOCK_VFSSW(); 3976 vsw = vfs_getvfsswbyname(fstyp); 3977 RUNLOCK_VFSSW(); 3978 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0); 3979 VFS_HOLD(rootvfs); 3980 3981 /* always mount readonly first */ 3982 rootvfs->vfs_flag |= VFS_RDONLY; 3983 3984 pm_init(); 3985 3986 if (netboot) 3987 (void) strplumb(); 3988 3989 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT); 3990 vfs_unrefvfssw(vsw); 3991 rootdev = rootvfs->vfs_dev; 3992 3993 if (error) 3994 cmn_err(CE_PANIC, "cannot mount root path %s", svm_bootpath); 3995 return (error); 3996 } 3997 3998 /* 3999 * XXX this is called by nfs only and should probably be removed 4000 * If booted with ASKNAME, prompt on the console for a filesystem 4001 * name and return it. 4002 */ 4003 void 4004 getfsname(char *askfor, char *name, size_t namelen) 4005 { 4006 if (boothowto & RB_ASKNAME) { 4007 printf("%s name: ", askfor); 4008 console_gets(name, namelen); 4009 } 4010 } 4011 4012 /* 4013 * If server_path exists, then we are booting a diskless 4014 * client. Otherwise, we default to ufs. Zfs should perhaps be 4015 * another property. 
4016 */ 4017 static char * 4018 getrootfs(void) 4019 { 4020 extern char *strplumb_get_netdev_path(void); 4021 char *propstr = NULL; 4022 4023 /* check fstype property; it should be nfsdyn for diskless */ 4024 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4025 DDI_PROP_DONTPASS, "fstype", &propstr) 4026 == DDI_SUCCESS) { 4027 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME); 4028 ddi_prop_free(propstr); 4029 } 4030 4031 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) 4032 return (rootfs.bo_fstype); 4033 4034 ++netboot; 4035 /* check if path to network interface is specified in bootpath */ 4036 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4037 DDI_PROP_DONTPASS, "bootpath", &propstr) 4038 == DDI_SUCCESS) { 4039 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME); 4040 ddi_prop_free(propstr); 4041 } else { 4042 /* attempt to determine netdev_path via boot_mac address */ 4043 netdev_path = strplumb_get_netdev_path(); 4044 if (netdev_path == NULL) 4045 cmn_err(CE_PANIC, 4046 "Cannot find boot network interface\n"); 4047 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME); 4048 } 4049 return ("nfs"); 4050 } 4051 #endif 4052