1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 
37 */ 38 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 #include <sys/types.h> 43 #include <sys/t_lock.h> 44 #include <sys/param.h> 45 #include <sys/errno.h> 46 #include <sys/user.h> 47 #include <sys/fstyp.h> 48 #include <sys/kmem.h> 49 #include <sys/systm.h> 50 #include <sys/proc.h> 51 #include <sys/mount.h> 52 #include <sys/vfs.h> 53 #include <sys/fem.h> 54 #include <sys/mntent.h> 55 #include <sys/stat.h> 56 #include <sys/statvfs.h> 57 #include <sys/statfs.h> 58 #include <sys/cred.h> 59 #include <sys/vnode.h> 60 #include <sys/rwstlock.h> 61 #include <sys/dnlc.h> 62 #include <sys/file.h> 63 #include <sys/time.h> 64 #include <sys/atomic.h> 65 #include <sys/cmn_err.h> 66 #include <sys/buf.h> 67 #include <sys/swap.h> 68 #include <sys/debug.h> 69 #include <sys/vnode.h> 70 #include <sys/modctl.h> 71 #include <sys/ddi.h> 72 #include <sys/pathname.h> 73 #include <sys/bootconf.h> 74 #include <sys/dumphdr.h> 75 #include <sys/dc_ki.h> 76 #include <sys/poll.h> 77 #include <sys/sunddi.h> 78 #include <sys/sysmacros.h> 79 #include <sys/zone.h> 80 #include <sys/policy.h> 81 #include <sys/ctfs.h> 82 #include <sys/objfs.h> 83 #include <sys/console.h> 84 #include <sys/reboot.h> 85 86 #include <vm/page.h> 87 88 #include <fs/fs_subr.h> 89 90 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int); 91 static void vfs_setmntopt_nolock(mntopts_t *, const char *, 92 const char *, int, int); 93 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **); 94 static void vfs_freemnttab(struct vfs *); 95 static void vfs_freeopt(mntopt_t *); 96 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *); 97 static void vfs_swapopttbl(mntopts_t *, mntopts_t *); 98 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int); 99 static void vfs_createopttbl_extend(mntopts_t *, const char *, 100 const mntopts_t *); 101 static char **vfs_copycancelopt_extend(char **const, int); 102 static void vfs_freecancelopt(char **); 103 static char *getrootfs(void); 104 
static int getmacpath(dev_info_t *, void *); 105 106 struct ipmnt { 107 struct ipmnt *mip_next; 108 dev_t mip_dev; 109 struct vfs *mip_vfsp; 110 }; 111 112 static kmutex_t vfs_miplist_mutex; 113 static struct ipmnt *vfs_miplist = NULL; 114 static struct ipmnt *vfs_miplist_end = NULL; 115 116 /* 117 * VFS global data. 118 */ 119 vnode_t *rootdir; /* pointer to root inode vnode. */ 120 vnode_t *devicesdir; /* pointer to inode of devices root */ 121 122 char *server_rootpath; /* root path for diskless clients */ 123 char *server_hostname; /* hostname of diskless server */ 124 125 static struct vfs root; 126 static struct vfs devices; 127 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */ 128 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */ 129 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */ 130 /* must be power of 2! */ 131 timespec_t vfs_mnttab_ctime; /* mnttab created time */ 132 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */ 133 char *vfs_dummyfstype = "\0"; 134 struct pollhead vfs_pollhd; /* for mnttab pollers */ 135 136 /* 137 * Table for generic options recognized in the VFS layer and acted 138 * on at this level before parsing file system specific options. 139 * The nosuid option is stronger than any of the devices and setuid 140 * options, so those are canceled when nosuid is seen. 141 * 142 * All options which are added here need to be added to the 143 * list of standard options in usr/src/cmd/fs.d/fslib.c as well. 
144 */ 145 /* 146 * VFS Mount options table 147 */ 148 static char *ro_cancel[] = { MNTOPT_RW, NULL }; 149 static char *rw_cancel[] = { MNTOPT_RO, NULL }; 150 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL }; 151 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES, 152 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL }; 153 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL }; 154 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL }; 155 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL }; 156 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL }; 157 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL }; 158 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL }; 159 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL }; 160 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL }; 161 162 static const mntopt_t mntopts[] = { 163 /* 164 * option name cancel options default arg flags 165 */ 166 { MNTOPT_REMOUNT, NULL, NULL, 167 MO_NODISPLAY, (void *)0 }, 168 { MNTOPT_RO, ro_cancel, NULL, 0, 169 (void *)0 }, 170 { MNTOPT_RW, rw_cancel, NULL, 0, 171 (void *)0 }, 172 { MNTOPT_SUID, suid_cancel, NULL, 0, 173 (void *)0 }, 174 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0, 175 (void *)0 }, 176 { MNTOPT_DEVICES, devices_cancel, NULL, 0, 177 (void *)0 }, 178 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0, 179 (void *)0 }, 180 { MNTOPT_SETUID, setuid_cancel, NULL, 0, 181 (void *)0 }, 182 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0, 183 (void *)0 }, 184 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0, 185 (void *)0 }, 186 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0, 187 (void *)0 }, 188 { MNTOPT_EXEC, exec_cancel, NULL, 0, 189 (void *)0 }, 190 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0, 191 (void *)0 }, 192 }; 193 194 const mntopts_t vfs_mntopts = { 195 sizeof (mntopts) / sizeof (mntopt_t), 196 (mntopt_t *)&mntopts[0] 197 }; 198 199 /* 200 * File system operation dispatch functions. 
201 */ 202 203 int 204 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 205 { 206 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr); 207 } 208 209 int 210 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr) 211 { 212 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr); 213 } 214 215 int 216 fsop_root(vfs_t *vfsp, vnode_t **vpp) 217 { 218 refstr_t *mntpt; 219 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp); 220 /* 221 * Make sure this root has a path. With lofs, it is possible to have 222 * a NULL mountpoint. 223 */ 224 if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { 225 mntpt = vfs_getmntpoint(vfsp); 226 vn_setpath_str(*vpp, refstr_value(mntpt), 227 strlen(refstr_value(mntpt))); 228 refstr_rele(mntpt); 229 } 230 231 return (ret); 232 } 233 234 int 235 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp) 236 { 237 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp); 238 } 239 240 int 241 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr) 242 { 243 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr); 244 } 245 246 int 247 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 248 { 249 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp); 250 } 251 252 int 253 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason) 254 { 255 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason); 256 } 257 258 void 259 fsop_freefs(vfs_t *vfsp) 260 { 261 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp); 262 } 263 264 int 265 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate) 266 { 267 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate)); 268 } 269 270 int 271 fsop_sync_by_kind(int fstype, short flag, cred_t *cr) 272 { 273 ASSERT((fstype >= 0) && (fstype < nfstype)); 274 275 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype])) 276 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr); 277 else 278 return (ENOTSUP); 279 } 280 281 /* 282 * File system initialization. vfs_setfsops() must be called from a file 283 * system's init routine. 
284 */ 285 286 static int 287 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual, 288 int *unused_ops) 289 { 290 static const fs_operation_trans_def_t vfs_ops_table[] = { 291 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount), 292 fs_nosys, fs_nosys, 293 294 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount), 295 fs_nosys, fs_nosys, 296 297 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root), 298 fs_nosys, fs_nosys, 299 300 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs), 301 fs_nosys, fs_nosys, 302 303 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync), 304 (fs_generic_func_p) fs_sync, 305 (fs_generic_func_p) fs_sync, /* No errors allowed */ 306 307 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget), 308 fs_nosys, fs_nosys, 309 310 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot), 311 fs_nosys, fs_nosys, 312 313 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs), 314 (fs_generic_func_p)fs_freevfs, 315 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */ 316 317 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate), 318 (fs_generic_func_p)fs_nosys, 319 (fs_generic_func_p)fs_nosys, 320 321 NULL, 0, NULL, NULL 322 }; 323 324 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template)); 325 } 326 327 int 328 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual) 329 { 330 int error; 331 int unused_ops; 332 333 /* Verify that fstype refers to a loaded fs (and not fsid 0). */ 334 335 if ((fstype <= 0) || (fstype >= nfstype)) 336 return (EINVAL); 337 338 if (!ALLOCATED_VFSSW(&vfssw[fstype])) 339 return (EINVAL); 340 341 /* Set up the operations vector. 
*/ 342 343 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops); 344 345 if (error != 0) 346 return (error); 347 348 vfssw[fstype].vsw_flag |= VSW_INSTALLED; 349 350 if (actual != NULL) 351 *actual = &vfssw[fstype].vsw_vfsops; 352 353 #if DEBUG 354 if (unused_ops != 0) 355 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied " 356 "but not used", vfssw[fstype].vsw_name, unused_ops); 357 #endif 358 359 return (0); 360 } 361 362 int 363 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual) 364 { 365 int error; 366 int unused_ops; 367 368 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP); 369 370 error = fs_copyfsops(template, *actual, &unused_ops); 371 if (error != 0) { 372 kmem_free(*actual, sizeof (vfsops_t)); 373 *actual = NULL; 374 return (error); 375 } 376 377 return (0); 378 } 379 380 /* 381 * Free a vfsops structure created as a result of vfs_makefsops(). 382 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use 383 * vfs_freevfsops_by_type(). 384 */ 385 void 386 vfs_freevfsops(vfsops_t *vfsops) 387 { 388 kmem_free(vfsops, sizeof (vfsops_t)); 389 } 390 391 /* 392 * Since the vfsops structure is part of the vfssw table and wasn't 393 * really allocated, we're not really freeing anything. We keep 394 * the name for consistency with vfs_freevfsops(). We do, however, 395 * need to take care of a little bookkeeping. 396 * NOTE: For a vfsops structure created by vfs_setfsops(), use 397 * vfs_freevfsops_by_type(). 398 */ 399 int 400 vfs_freevfsops_by_type(int fstype) 401 { 402 403 /* Verify that fstype refers to a loaded fs (and not fsid 0). 
*/ 404 if ((fstype <= 0) || (fstype >= nfstype)) 405 return (EINVAL); 406 407 WLOCK_VFSSW(); 408 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) { 409 WUNLOCK_VFSSW(); 410 return (EINVAL); 411 } 412 413 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED; 414 WUNLOCK_VFSSW(); 415 416 return (0); 417 } 418 419 /* Support routines used to reference vfs_op */ 420 421 /* Set the operations vector for a vfs */ 422 void 423 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops) 424 { 425 vfsops_t *op; 426 427 ASSERT(vfsp != NULL); 428 ASSERT(vfsops != NULL); 429 430 op = vfsp->vfs_op; 431 membar_consumer(); 432 if (vfsp->vfs_femhead == NULL && 433 casptr(&vfsp->vfs_op, op, vfsops) == op) { 434 return; 435 } 436 fsem_setvfsops(vfsp, vfsops); 437 } 438 439 /* Retrieve the operations vector for a vfs */ 440 vfsops_t * 441 vfs_getops(vfs_t *vfsp) 442 { 443 vfsops_t *op; 444 445 ASSERT(vfsp != NULL); 446 447 op = vfsp->vfs_op; 448 membar_consumer(); 449 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) { 450 return (op); 451 } else { 452 return (fsem_getvfsops(vfsp)); 453 } 454 } 455 456 /* 457 * Returns non-zero (1) if the vfsops matches that of the vfs. 458 * Returns zero (0) if not. 459 */ 460 int 461 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops) 462 { 463 return (vfs_getops(vfsp) == vfsops); 464 } 465 466 /* 467 * Returns non-zero (1) if the file system has installed a non-default, 468 * non-error vfs_sync routine. Returns zero (0) otherwise. 469 */ 470 int 471 vfs_can_sync(vfs_t *vfsp) 472 { 473 /* vfs_sync() routine is not the default/error function */ 474 return (vfs_getops(vfsp)->vfs_sync != fs_sync); 475 } 476 477 /* 478 * Initialize a vfs structure. 
479 */ 480 void 481 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data) 482 { 483 vfsp->vfs_count = 0; 484 vfsp->vfs_next = vfsp; 485 vfsp->vfs_prev = vfsp; 486 vfsp->vfs_zone_next = vfsp; 487 vfsp->vfs_zone_prev = vfsp; 488 vfsp->vfs_flag = 0; 489 vfsp->vfs_data = (data); 490 vfsp->vfs_resource = NULL; 491 vfsp->vfs_mntpt = NULL; 492 vfsp->vfs_mntopts.mo_count = 0; 493 vfsp->vfs_mntopts.mo_list = NULL; 494 vfsp->vfs_femhead = NULL; 495 vfsp->vfs_zone = NULL; 496 /* 497 * Note: Don't initialize vfs_vskap, vfs_fstypevsp since it 498 * could be a problem for unbundled file systems. 499 */ 500 vfs_setops((vfsp), (op)); 501 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL); 502 } 503 504 505 /* 506 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs, 507 * fstatvfs, and sysfs moved to common/syscall. 508 */ 509 510 /* 511 * Update every mounted file system. We call the vfs_sync operation of 512 * each file system type, passing it a NULL vfsp to indicate that all 513 * mounted file systems of that type should be updated. 514 */ 515 void 516 vfs_sync(int flag) 517 { 518 struct vfssw *vswp; 519 RLOCK_VFSSW(); 520 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 521 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 522 vfs_refvfssw(vswp); 523 RUNLOCK_VFSSW(); 524 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag, 525 CRED()); 526 vfs_unrefvfssw(vswp); 527 RLOCK_VFSSW(); 528 } 529 } 530 RUNLOCK_VFSSW(); 531 } 532 533 void 534 sync(void) 535 { 536 vfs_sync(0); 537 } 538 539 /* 540 * External routines. 541 */ 542 543 krwlock_t vfssw_lock; /* lock accesses to vfssw */ 544 545 /* 546 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(), 547 * but otherwise should be accessed only via vfs_list_lock() and 548 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list. 549 */ 550 static krwlock_t vfslist; 551 552 /* 553 * Mount devfs on /devices. 
This is done right after root is mounted
 * to provide device access support for the system
 *
 * Failure to load devfs, to find /devices, to mount it, or to obtain its
 * root vnode panics the system.  Failure to take either of the two locks
 * needed for vfs_add() is only logged, and the function returns without
 * adding the vfs to the vfs list.
 */
static void
vfs_mountdevices(void)
{
	struct vfssw *vswp;
	struct vnode *mntvp;
	struct mounta mounta = {	/* fake mounta for devfs_mount() */
		NULL,
		NULL,
		MS_SYSSPACE,
		NULL,
		NULL,
		0,
		NULL,
		0
	};

	/*
	 * _init devfs module to fill in the vfssw
	 */
	if (modload("fs", "devfs") == -1)
		cmn_err(CE_PANIC, "Cannot _init devfs module\n");

	/*
	 * Hold vfs.  The vfssw read lock is kept until the mount is done.
	 */
	RLOCK_VFSSW();
	vswp = vfs_getvfsswbyname("devfs");
	VFS_INIT(&devices, &vswp->vsw_vfsops, NULL);
	VFS_HOLD(&devices);

	/*
	 * Locate mount point
	 */
	if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mntvp) != 0)
		cmn_err(CE_PANIC, "Cannot find /devices\n");

	/*
	 * Perform the mount of /devices
	 */
	if (VFS_MOUNT(&devices, mntvp, &mounta, CRED()) != 0)
		cmn_err(CE_PANIC, "Cannot mount /devices\n");

	RUNLOCK_VFSSW();

	/*
	 * Set appropriate members and add to vfs list for mnttab display
	 */
	vfs_setresource(&devices, "/devices");
	vfs_setmntpoint(&devices, "/devices");

	/*
	 * Hold the root of /devices so it won't go away
	 */
	if (VFS_ROOT(&devices, &devicesdir) != 0)
		cmn_err(CE_PANIC, "vfs_mountdevices: not devices root");
	VN_HOLD(devicesdir);

	/* vfs_add() is called with both the vfs and the mount point locked. */
	if (vfs_lock(&devices) != 0) {
		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
		return;
	}

	if (vn_vfswlock(mntvp) != 0) {
		vfs_unlock(&devices);
		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
		return;
	}

	vfs_add(mntvp, &devices, 0);
	vn_vfsunlock(mntvp);
	vfs_unlock(&devices);
}

/*
 * Mount required filesystem.  This is done right after root is mounted.
631 */ 632 static void 633 vfs_mountfs(char *module, char *spec, char *path) 634 { 635 struct vnode *mvp; 636 struct mounta mounta; 637 vfs_t *vfsp; 638 639 mounta.flags = MS_SYSSPACE | MS_DATA; 640 mounta.fstype = module; 641 mounta.spec = spec; 642 mounta.dir = path; 643 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) { 644 cmn_err(CE_WARN, "Cannot find %s\n", path); 645 return; 646 } 647 if (domount(NULL, &mounta, mvp, CRED(), &vfsp)) 648 cmn_err(CE_WARN, "Cannot mount %s\n", path); 649 else 650 VFS_RELE(vfsp); 651 VN_RELE(mvp); 652 } 653 654 /* 655 * vfs_mountroot is called by main() to mount the root filesystem. 656 */ 657 void 658 vfs_mountroot(void) 659 { 660 struct vnode *rvp = NULL; 661 char *path; 662 size_t plen; 663 struct vfssw *vswp; 664 extern void setup_vopstats(vfs_t *); 665 666 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL); 667 rw_init(&vfslist, NULL, RW_DEFAULT, NULL); 668 669 /* 670 * Alloc the vfs hash bucket array and locks 671 */ 672 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP); 673 674 /* 675 * Call machine-dependent routine "rootconf" to choose a root 676 * file system type. 677 */ 678 if (rootconf()) 679 cmn_err(CE_PANIC, "vfs_mountroot: cannot mount root"); 680 /* 681 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir 682 * to point to it. These are used by lookuppn() so that it 683 * knows where to start from ('/' or '.'). 684 */ 685 vfs_setmntpoint(rootvfs, "/"); 686 if (VFS_ROOT(rootvfs, &rootdir)) 687 cmn_err(CE_PANIC, "vfs_mountroot: no root vnode"); 688 u.u_cdir = rootdir; 689 VN_HOLD(u.u_cdir); 690 u.u_rdir = NULL; 691 692 /* 693 * Setup the global zone's rootvp, now that it exists. 694 */ 695 global_zone->zone_rootvp = rootdir; 696 VN_HOLD(global_zone->zone_rootvp); 697 698 /* 699 * Notify the module code that it can begin using the 700 * root filesystem instead of the boot program's services. 
701 */ 702 modrootloaded = 1; 703 /* 704 * Set up mnttab information for root 705 */ 706 vfs_setresource(rootvfs, rootfs.bo_name); 707 708 /* 709 * Notify cluster software that the root filesystem is available. 710 */ 711 clboot_mountroot(); 712 713 /* Now that we're all done with the root FS, set up its vopstats */ 714 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) { 715 /* Set flag for statistics collection */ 716 if (vswp->vsw_flag & VSW_STATS) { 717 rootvfs->vfs_flag |= VFS_STATS; 718 } 719 vfs_unrefvfssw(vswp); 720 } 721 setup_vopstats(rootvfs); 722 723 /* 724 * Mount /devices, /system/contract, /etc/mnttab, /etc/svc/volatile, 725 * /system/object, and /proc. 726 */ 727 vfs_mountdevices(); 728 729 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT); 730 vfs_mountfs("proc", "/proc", "/proc"); 731 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); 732 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); 733 vfs_mountfs("objfs", "objfs", OBJFS_ROOT); 734 735 #ifdef __sparc 736 /* 737 * This bit of magic can go away when we convert sparc to 738 * the new boot architecture based on ramdisk. 739 * 740 * Booting off a mirrored root volume: 741 * At this point, we have booted and mounted root on a 742 * single component of the mirror. Complete the boot 743 * by configuring SVM and converting the root to the 744 * dev_t of the mirrored root device. This dev_t conversion 745 * only works because the underlying device doesn't change. 746 */ 747 if (root_is_svm) { 748 if (svm_rootconf()) { 749 cmn_err(CE_PANIC, "vfs_mountroot: cannot remount root"); 750 } 751 752 /* 753 * mnttab should reflect the new root device 754 */ 755 vfs_lock_wait(rootvfs); 756 vfs_setresource(rootvfs, rootfs.bo_name); 757 vfs_unlock(rootvfs); 758 } 759 #endif /* __sparc */ 760 761 /* 762 * Look up the root device via devfs so that a dv_node is 763 * created for it. The vnode is never VN_RELE()ed. 
764 * We allocate more than MAXPATHLEN so that the 765 * buffer passed to i_ddi_prompath_to_devfspath() is 766 * exactly MAXPATHLEN (the function expects a buffer 767 * of that length). 768 */ 769 plen = strlen("/devices"); 770 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); 771 (void) strcpy(path, "/devices"); 772 773 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) 774 != DDI_SUCCESS || 775 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { 776 777 /* NUL terminate in case "path" has garbage */ 778 path[plen + MAXPATHLEN - 1] = '\0'; 779 #ifdef DEBUG 780 cmn_err(CE_WARN, "!Cannot lookup root device: %s", path); 781 #endif 782 } 783 kmem_free(path, plen + MAXPATHLEN); 784 } 785 786 /* 787 * If remount failed and we're in a zone we need to check for the zone 788 * root path and strip it before the call to vfs_setpath(). 789 * 790 * If strpath doesn't begin with the zone_rootpath the original 791 * strpath is returned unchanged. 792 */ 793 static const char * 794 stripzonepath(const char *strpath) 795 { 796 char *str1, *str2; 797 int i; 798 zone_t *zonep = curproc->p_zone; 799 800 if (zonep->zone_rootpath == NULL || strpath == NULL) { 801 return (NULL); 802 } 803 804 /* 805 * we check for the end of the string at one past the 806 * current position because the zone_rootpath always 807 * ends with "/" but we don't want to strip that off. 808 */ 809 str1 = zonep->zone_rootpath; 810 str2 = (char *)strpath; 811 ASSERT(str1[0] != '\0'); 812 for (i = 0; str1[i + 1] != '\0'; i++) { 813 if (str1[i] != str2[i]) 814 return ((char *)strpath); 815 } 816 return (&str2[i]); 817 } 818 819 /* 820 * Common mount code. Called from the system call entry point, from autofs, 821 * and from pxfs. 822 * 823 * Takes the effective file system type, mount arguments, the mount point 824 * vnode, flags specifying whether the mount is a remount and whether it 825 * should be entered into the vfs list, and credentials. 
Fills in its vfspp 826 * parameter with the mounted file system instance's vfs. 827 * 828 * Note that the effective file system type is specified as a string. It may 829 * be null, in which case it's determined from the mount arguments, and may 830 * differ from the type specified in the mount arguments; this is a hook to 831 * allow interposition when instantiating file system instances. 832 * 833 * The caller is responsible for releasing its own hold on the mount point 834 * vp (this routine does its own hold when necessary). 835 * Also note that for remounts, the mount point vp should be the vnode for 836 * the root of the file system rather than the vnode that the file system 837 * is mounted on top of. 838 */ 839 int 840 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, 841 struct vfs **vfspp) 842 { 843 struct vfssw *vswp; 844 vfsops_t *vfsops; 845 struct vfs *vfsp; 846 struct vnode *bvp; 847 dev_t bdev = 0; 848 mntopts_t mnt_mntopts; 849 int error = 0; 850 int copyout_error = 0; 851 int ovflags; 852 char *opts = uap->optptr; 853 char *inargs = opts; 854 int optlen = uap->optlen; 855 int remount; 856 int rdonly; 857 int nbmand = 0; 858 int delmip = 0; 859 int addmip = 0; 860 int splice = ((uap->flags & MS_NOSPLICE) == 0); 861 int fromspace = (uap->flags & MS_SYSSPACE) ? 862 UIO_SYSSPACE : UIO_USERSPACE; 863 char *resource = NULL, *mountpt = NULL; 864 refstr_t *oldresource, *oldmntpt; 865 struct pathname pn, rpn; 866 extern void setup_vopstats(vfs_t *); 867 868 /* 869 * The v_flag value for the mount point vp is permanently set 870 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine 871 * for mount point locking. 872 */ 873 mutex_enter(&vp->v_lock); 874 vp->v_flag |= VVFSLOCK; 875 mutex_exit(&vp->v_lock); 876 877 mnt_mntopts.mo_count = 0; 878 /* 879 * Find the ops vector to use to invoke the file system-specific mount 880 * method. If the fsname argument is non-NULL, use it directly. 
881 * Otherwise, dig the file system type information out of the mount 882 * arguments. 883 * 884 * A side effect is to hold the vfssw entry. 885 * 886 * Mount arguments can be specified in several ways, which are 887 * distinguished by flag bit settings. The preferred way is to set 888 * MS_OPTIONSTR, indicating an 8 argument mount with the file system 889 * type supplied as a character string and the last two arguments 890 * being a pointer to a character buffer and the size of the buffer. 891 * On entry, the buffer holds a null terminated list of options; on 892 * return, the string is the list of options the file system 893 * recognized. If MS_DATA is set arguments five and six point to a 894 * block of binary data which the file system interprets. 895 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA 896 * consistently with these conventions. To handle them, we check to 897 * see whether the pointer to the file system name has a numeric value 898 * less than 256. If so, we treat it as an index. 899 */ 900 if (fsname != NULL) { 901 if ((vswp = vfs_getvfssw(fsname)) == NULL) { 902 return (EINVAL); 903 } 904 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) { 905 size_t n; 906 uint_t fstype; 907 char name[FSTYPSZ]; 908 909 if ((fstype = (uintptr_t)uap->fstype) < 256) { 910 RLOCK_VFSSW(); 911 if (fstype == 0 || fstype >= nfstype || 912 !ALLOCATED_VFSSW(&vfssw[fstype])) { 913 RUNLOCK_VFSSW(); 914 return (EINVAL); 915 } 916 (void) strcpy(name, vfssw[fstype].vsw_name); 917 RUNLOCK_VFSSW(); 918 if ((vswp = vfs_getvfssw(name)) == NULL) 919 return (EINVAL); 920 } else { 921 /* 922 * Handle either kernel or user address space. 
923 */ 924 if (uap->flags & MS_SYSSPACE) { 925 error = copystr(uap->fstype, name, 926 FSTYPSZ, &n); 927 } else { 928 error = copyinstr(uap->fstype, name, 929 FSTYPSZ, &n); 930 } 931 if (error) { 932 if (error == ENAMETOOLONG) 933 return (EINVAL); 934 return (error); 935 } 936 if ((vswp = vfs_getvfssw(name)) == NULL) 937 return (EINVAL); 938 } 939 } else { 940 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL) 941 return (EINVAL); 942 } 943 if (!VFS_INSTALLED(vswp)) 944 return (EINVAL); 945 vfsops = &vswp->vsw_vfsops; 946 947 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts); 948 /* 949 * Fetch mount options and parse them for generic vfs options 950 */ 951 if (uap->flags & MS_OPTIONSTR) { 952 /* 953 * Limit the buffer size 954 */ 955 if (optlen < 0 || optlen > MAX_MNTOPT_STR) { 956 error = EINVAL; 957 goto errout; 958 } 959 if ((uap->flags & MS_SYSSPACE) == 0) { 960 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 961 inargs[0] = '\0'; 962 if (optlen) { 963 error = copyinstr(opts, inargs, (size_t)optlen, 964 NULL); 965 if (error) { 966 goto errout; 967 } 968 } 969 } 970 vfs_parsemntopts(&mnt_mntopts, inargs, 0); 971 } 972 /* 973 * Flag bits override the options string. 974 */ 975 if (uap->flags & MS_REMOUNT) 976 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0); 977 if (uap->flags & MS_RDONLY) 978 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0); 979 if (uap->flags & MS_NOSUID) 980 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 981 982 /* 983 * Check if this is a remount; must be set in the option string and 984 * the file system must support a remount option. 985 */ 986 if (remount = vfs_optionisset_nolock(&mnt_mntopts, 987 MNTOPT_REMOUNT, NULL)) { 988 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) { 989 error = ENOTSUP; 990 goto errout; 991 } 992 uap->flags |= MS_REMOUNT; 993 } 994 995 /* 996 * uap->flags and vfs_optionisset() should agree. 
997 */ 998 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) { 999 uap->flags |= MS_RDONLY; 1000 } 1001 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) { 1002 uap->flags |= MS_NOSUID; 1003 } 1004 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL); 1005 ASSERT(splice || !remount); 1006 /* 1007 * If we are splicing the fs into the namespace, 1008 * perform mount point checks. 1009 * 1010 * We want to resolve the path for the mount point to eliminate 1011 * '.' and ".." and symlinks in mount points; we can't do the 1012 * same for the resource string, since it would turn 1013 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do 1014 * this before grabbing vn_vfswlock(), because otherwise we 1015 * would deadlock with lookuppn(). 1016 */ 1017 if (splice) { 1018 ASSERT(vp->v_count > 0); 1019 1020 /* 1021 * Pick up mount point and device from appropriate space. 1022 */ 1023 if (pn_get(uap->spec, fromspace, &pn) == 0) { 1024 resource = kmem_alloc(pn.pn_pathlen + 1, 1025 KM_SLEEP); 1026 (void) strcpy(resource, pn.pn_path); 1027 pn_free(&pn); 1028 } 1029 /* 1030 * Do a lookupname prior to taking the 1031 * writelock. Mark this as completed if 1032 * successful for later cleanup and addition to 1033 * the mount in progress table. 1034 */ 1035 if ((uap->flags & MS_GLOBAL) == 0 && 1036 lookupname(uap->spec, fromspace, 1037 FOLLOW, NULL, &bvp) == 0) { 1038 addmip = 1; 1039 } 1040 1041 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) { 1042 pathname_t *pnp; 1043 1044 if (*pn.pn_path != '/') { 1045 error = EINVAL; 1046 pn_free(&pn); 1047 goto errout; 1048 } 1049 pn_alloc(&rpn); 1050 /* 1051 * Kludge to prevent autofs from deadlocking with 1052 * itself when it calls domount(). 1053 * 1054 * If autofs is calling, it is because it is doing 1055 * (autofs) mounts in the process of an NFS mount. 
A 1056 * lookuppn() here would cause us to block waiting for 1057 * said NFS mount to complete, which can't since this 1058 * is the thread that was supposed to doing it. 1059 */ 1060 if (fromspace == UIO_USERSPACE) { 1061 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL, 1062 NULL)) == 0) { 1063 pnp = &rpn; 1064 } else { 1065 /* 1066 * The file disappeared or otherwise 1067 * became inaccessible since we opened 1068 * it; might as well fail the mount 1069 * since the mount point is no longer 1070 * accessible. 1071 */ 1072 pn_free(&rpn); 1073 pn_free(&pn); 1074 goto errout; 1075 } 1076 } else { 1077 pnp = &pn; 1078 } 1079 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP); 1080 (void) strcpy(mountpt, pnp->pn_path); 1081 1082 /* 1083 * If the addition of the zone's rootpath 1084 * would push us over a total path length 1085 * of MAXPATHLEN, we fail the mount with 1086 * ENAMETOOLONG, which is what we would have 1087 * gotten if we were trying to perform the same 1088 * mount in the global zone. 1089 * 1090 * strlen() doesn't count the trailing 1091 * '\0', but zone_rootpathlen counts both a 1092 * trailing '/' and the terminating '\0'. 1093 */ 1094 if ((curproc->p_zone->zone_rootpathlen - 1 + 1095 strlen(mountpt)) > MAXPATHLEN || 1096 (resource != NULL && 1097 (curproc->p_zone->zone_rootpathlen - 1 + 1098 strlen(resource)) > MAXPATHLEN)) { 1099 error = ENAMETOOLONG; 1100 } 1101 1102 pn_free(&rpn); 1103 pn_free(&pn); 1104 } 1105 1106 if (error) 1107 goto errout; 1108 1109 /* 1110 * Prevent path name resolution from proceeding past 1111 * the mount point. 1112 */ 1113 if (vn_vfswlock(vp) != 0) { 1114 error = EBUSY; 1115 goto errout; 1116 } 1117 1118 /* 1119 * Verify that it's legitimate to establish a mount on 1120 * the prospective mount point. 1121 */ 1122 if (vn_mountedvfs(vp) != NULL) { 1123 /* 1124 * The mount point lock was obtained after some 1125 * other thread raced through and established a mount. 
1126 */ 1127 vn_vfsunlock(vp); 1128 error = EBUSY; 1129 goto errout; 1130 } 1131 if (vp->v_flag & VNOMOUNT) { 1132 vn_vfsunlock(vp); 1133 error = EINVAL; 1134 goto errout; 1135 } 1136 } 1137 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) { 1138 uap->dataptr = NULL; 1139 uap->datalen = 0; 1140 } 1141 1142 /* 1143 * If this is a remount, we don't want to create a new VFS. 1144 * Instead, we pass the existing one with a remount flag. 1145 */ 1146 if (remount) { 1147 /* 1148 * Confirm that the mount point is the root vnode of the 1149 * file system that is being remounted. 1150 * This can happen if the user specifies a different 1151 * mount point directory pathname in the (re)mount command. 1152 * 1153 * Code below can only be reached if splice is true, so it's 1154 * safe to do vn_vfsunlock() here. 1155 */ 1156 if ((vp->v_flag & VROOT) == 0) { 1157 vn_vfsunlock(vp); 1158 error = ENOENT; 1159 goto errout; 1160 } 1161 /* 1162 * Disallow making file systems read-only unless file system 1163 * explicitly allows it in its vfssw. Ignore other flags. 1164 */ 1165 if (rdonly && vn_is_readonly(vp) == 0 && 1166 (vswp->vsw_flag & VSW_CANRWRO) == 0) { 1167 vn_vfsunlock(vp); 1168 error = EINVAL; 1169 goto errout; 1170 } 1171 /* 1172 * Changing the NBMAND setting on remounts is permitted 1173 * but logged since it can lead to unexpected behavior. 1174 * We also counsel against using it for / and /usr. 1175 */ 1176 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) || 1177 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) { 1178 cmn_err(CE_WARN, "domount: nbmand turned %s via " 1179 "remounting %s", nbmand ? 
"on" : "off", 1180 refstr_value(vp->v_vfsp->vfs_mntpt)); 1181 } 1182 vfsp = vp->v_vfsp; 1183 ovflags = vfsp->vfs_flag; 1184 vfsp->vfs_flag |= VFS_REMOUNT; 1185 vfsp->vfs_flag &= ~VFS_RDONLY; 1186 } else { 1187 vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP); 1188 VFS_INIT(vfsp, vfsops, NULL); 1189 } 1190 1191 VFS_HOLD(vfsp); 1192 1193 /* 1194 * The vfs_reflock is not used anymore the code below explicitly 1195 * holds it preventing others accesing it directly. 1196 */ 1197 if ((sema_tryp(&vfsp->vfs_reflock) == 0) && 1198 !(vfsp->vfs_flag & VFS_REMOUNT)) 1199 cmn_err(CE_WARN, 1200 "mount type %s couldn't get vfs_reflock\n", vswp->vsw_name); 1201 1202 /* 1203 * Lock the vfs. If this is a remount we want to avoid spurious umount 1204 * failures that happen as a side-effect of fsflush() and other mount 1205 * and unmount operations that might be going on simultaneously and 1206 * may have locked the vfs currently. To not return EBUSY immediately 1207 * here we use vfs_lock_wait() instead vfs_lock() for the remount case. 1208 */ 1209 if (!remount) { 1210 if (error = vfs_lock(vfsp)) { 1211 vfsp->vfs_flag = ovflags; 1212 if (splice) 1213 vn_vfsunlock(vp); 1214 kmem_free(vfsp, sizeof (struct vfs)); 1215 goto errout; 1216 } 1217 } else { 1218 vfs_lock_wait(vfsp); 1219 } 1220 1221 /* 1222 * Add device to mount in progress table, global mounts require special 1223 * handling. It is possible that we have already done the lookupname 1224 * on a spliced, non-global fs. If so, we don't want to do it again 1225 * since we cannot do a lookupname after taking the 1226 * wlock above. This case is for a non-spliced, non-global filesystem. 
1227 */ 1228 if (!addmip) { 1229 if ((uap->flags & MS_GLOBAL) == 0 && 1230 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) { 1231 addmip = 1; 1232 } 1233 } 1234 1235 if (addmip) { 1236 bdev = bvp->v_rdev; 1237 VN_RELE(bvp); 1238 vfs_addmip(bdev, vfsp); 1239 addmip = 0; 1240 delmip = 1; 1241 } 1242 /* 1243 * Invalidate cached entry for the mount point. 1244 */ 1245 if (splice) 1246 dnlc_purge_vp(vp); 1247 1248 /* 1249 * If have an option string but the filesystem doesn't supply a 1250 * prototype options table, create a table with the global 1251 * options and sufficient room to accept all the options in the 1252 * string. Then parse the passed in option string 1253 * accepting all the options in the string. This gives us an 1254 * option table with all the proper cancel properties for the 1255 * global options. 1256 * 1257 * Filesystems that supply a prototype options table are handled 1258 * earlier in this function. 1259 */ 1260 if (uap->flags & MS_OPTIONSTR) { 1261 if (!(vswp->vsw_flag & VSW_HASPROTO)) { 1262 mntopts_t tmp_mntopts; 1263 1264 tmp_mntopts.mo_count = 0; 1265 vfs_createopttbl_extend(&tmp_mntopts, inargs, 1266 &mnt_mntopts); 1267 vfs_parsemntopts(&tmp_mntopts, inargs, 1); 1268 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts); 1269 vfs_freeopttbl(&tmp_mntopts); 1270 } 1271 } 1272 1273 /* 1274 * Serialize with zone creations. 1275 */ 1276 mount_in_progress(); 1277 /* 1278 * Instantiate (or reinstantiate) the file system. If appropriate, 1279 * splice it into the file system name space. 1280 * 1281 * We want VFS_MOUNT() to be able to override the vfs_resource 1282 * string if necessary (ie, mntfs), and also for a remount to 1283 * change the same (necessary when remounting '/' during boot). 1284 * So we set up vfs_mntpt and vfs_resource to what we think they 1285 * should be, then hand off control to VFS_MOUNT() which can 1286 * override this. 
1287 * 1288 * For safety's sake, when changing vfs_resource or vfs_mntpt of 1289 * a vfs which is on the vfs list (i.e. during a remount), we must 1290 * never set those fields to NULL. Several bits of code make 1291 * assumptions that the fields are always valid. 1292 */ 1293 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1294 if (remount) { 1295 if ((oldresource = vfsp->vfs_resource) != NULL) 1296 refstr_hold(oldresource); 1297 if ((oldmntpt = vfsp->vfs_mntpt) != NULL) 1298 refstr_hold(oldmntpt); 1299 } 1300 vfs_setresource(vfsp, resource); 1301 vfs_setmntpoint(vfsp, mountpt); 1302 1303 error = VFS_MOUNT(vfsp, vp, uap, credp); 1304 1305 if (uap->flags & MS_RDONLY) 1306 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1307 if (uap->flags & MS_NOSUID) 1308 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0); 1309 if (uap->flags & MS_GLOBAL) 1310 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0); 1311 1312 if (error) { 1313 if (remount) { 1314 /* put back pre-remount options */ 1315 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1316 vfs_setmntpoint(vfsp, (stripzonepath( 1317 refstr_value(oldmntpt)))); 1318 if (oldmntpt) 1319 refstr_rele(oldmntpt); 1320 vfs_setresource(vfsp, (stripzonepath( 1321 refstr_value(oldresource)))); 1322 if (oldresource) 1323 refstr_rele(oldresource); 1324 vfsp->vfs_flag = ovflags; 1325 vfs_unlock(vfsp); 1326 VFS_RELE(vfsp); 1327 } else { 1328 vfs_unlock(vfsp); 1329 vfs_freemnttab(vfsp); 1330 kmem_free(vfsp, sizeof (struct vfs)); 1331 } 1332 } else { 1333 /* 1334 * Set the mount time to now 1335 */ 1336 vfsp->vfs_mtime = ddi_get_time(); 1337 if (remount) { 1338 vfsp->vfs_flag &= ~VFS_REMOUNT; 1339 if (oldresource) 1340 refstr_rele(oldresource); 1341 if (oldmntpt) 1342 refstr_rele(oldmntpt); 1343 } else if (splice) { 1344 /* 1345 * Link vfsp into the name space at the mount 1346 * point. Vfs_add() is responsible for 1347 * holding the mount point which will be 1348 * released when vfs_remove() is called. 
1349 */ 1350 vfs_add(vp, vfsp, uap->flags); 1351 } else { 1352 /* 1353 * Hold the reference to file system which is 1354 * not linked into the name space. 1355 */ 1356 vfsp->vfs_zone = NULL; 1357 VFS_HOLD(vfsp); 1358 vfsp->vfs_vnodecovered = NULL; 1359 } 1360 /* 1361 * Set flags for global options encountered 1362 */ 1363 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 1364 vfsp->vfs_flag |= VFS_RDONLY; 1365 else 1366 vfsp->vfs_flag &= ~VFS_RDONLY; 1367 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 1368 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES); 1369 } else { 1370 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 1371 vfsp->vfs_flag |= VFS_NODEVICES; 1372 else 1373 vfsp->vfs_flag &= ~VFS_NODEVICES; 1374 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 1375 vfsp->vfs_flag |= VFS_NOSETUID; 1376 else 1377 vfsp->vfs_flag &= ~VFS_NOSETUID; 1378 } 1379 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 1380 vfsp->vfs_flag |= VFS_NBMAND; 1381 else 1382 vfsp->vfs_flag &= ~VFS_NBMAND; 1383 1384 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 1385 vfsp->vfs_flag |= VFS_XATTR; 1386 else 1387 vfsp->vfs_flag &= ~VFS_XATTR; 1388 1389 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 1390 vfsp->vfs_flag |= VFS_NOEXEC; 1391 else 1392 vfsp->vfs_flag &= ~VFS_NOEXEC; 1393 1394 /* 1395 * Now construct the output option string of options 1396 * we recognized. 
1397 */ 1398 if (uap->flags & MS_OPTIONSTR) { 1399 vfs_list_read_lock(); 1400 copyout_error = vfs_buildoptionstr( 1401 &vfsp->vfs_mntopts, inargs, optlen); 1402 vfs_list_unlock(); 1403 if (copyout_error == 0 && 1404 (uap->flags & MS_SYSSPACE) == 0) { 1405 copyout_error = copyoutstr(inargs, opts, 1406 optlen, NULL); 1407 } 1408 } 1409 1410 /* Set flag for statistics collection */ 1411 if (vswp->vsw_flag & VSW_STATS) { 1412 vfsp->vfs_flag |= VFS_STATS; 1413 } 1414 1415 vfs_unlock(vfsp); 1416 } 1417 mount_completed(); 1418 if (splice) 1419 vn_vfsunlock(vp); 1420 1421 if ((error == 0) && (copyout_error == 0)) { 1422 /* 1423 * If this isn't a remount, set up the vopstats before 1424 * anyone can touch this 1425 */ 1426 if (!remount) 1427 setup_vopstats(vfsp); 1428 1429 /* Return vfsp to caller. */ 1430 *vfspp = vfsp; 1431 } 1432 errout: 1433 vfs_freeopttbl(&mnt_mntopts); 1434 if (resource != NULL) 1435 kmem_free(resource, strlen(resource) + 1); 1436 if (mountpt != NULL) 1437 kmem_free(mountpt, strlen(mountpt) + 1); 1438 /* 1439 * It is possible we errored prior to adding to mount in progress 1440 * table. Must free vnode we acquired with successful lookupname. 1441 */ 1442 if (addmip) 1443 VN_RELE(bvp); 1444 if (delmip) 1445 vfs_delmip(vfsp); 1446 ASSERT(vswp != NULL); 1447 vfs_unrefvfssw(vswp); 1448 if (inargs != opts) 1449 kmem_free(inargs, MAX_MNTOPT_STR); 1450 if (copyout_error) { 1451 VFS_RELE(vfsp); 1452 error = copyout_error; 1453 } 1454 return (error); 1455 } 1456 1457 static void 1458 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath) 1459 { 1460 size_t len; 1461 refstr_t *ref; 1462 zone_t *zone = curproc->p_zone; 1463 char *sp; 1464 int have_list_lock = 0; 1465 1466 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp)); 1467 1468 /* 1469 * New path must be less than MAXPATHLEN because mntfs 1470 * will only display up to MAXPATHLEN bytes. 
This is currently 1471 * safe, because domount() uses pn_get(), and other callers 1472 * similarly cap the size to fewer than MAXPATHLEN bytes. 1473 */ 1474 1475 ASSERT(strlen(newpath) < MAXPATHLEN); 1476 1477 /* mntfs requires consistency while vfs list lock is held */ 1478 1479 if (VFS_ON_LIST(vfsp)) { 1480 have_list_lock = 1; 1481 vfs_list_lock(); 1482 } 1483 1484 if (*refp != NULL) 1485 refstr_rele(*refp); 1486 1487 /* Do we need to modify the path? */ 1488 1489 if (zone == global_zone || *newpath != '/') { 1490 ref = refstr_alloc(newpath); 1491 goto out; 1492 } 1493 1494 /* 1495 * Truncate the trailing '/' in the zoneroot, and merge 1496 * in the zone's rootpath with the "newpath" (resource 1497 * or mountpoint) passed in. 1498 * 1499 * The size of the required buffer is thus the size of 1500 * the buffer required for the passed-in newpath 1501 * (strlen(newpath) + 1), plus the size of the buffer 1502 * required to hold zone_rootpath (zone_rootpathlen) 1503 * minus one for one of the now-superfluous NUL 1504 * terminations, minus one for the trailing '/'. 1505 * 1506 * That gives us: 1507 * 1508 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1 1509 * 1510 * Which is what we have below. 1511 */ 1512 1513 len = strlen(newpath) + zone->zone_rootpathlen - 1; 1514 sp = kmem_alloc(len, KM_SLEEP); 1515 1516 /* 1517 * Copy everything including the trailing slash, which 1518 * we then overwrite with the NUL character. 1519 */ 1520 1521 (void) strcpy(sp, zone->zone_rootpath); 1522 sp[zone->zone_rootpathlen - 2] = '\0'; 1523 (void) strcat(sp, newpath); 1524 1525 ref = refstr_alloc(sp); 1526 kmem_free(sp, len); 1527 out: 1528 *refp = ref; 1529 1530 if (have_list_lock) { 1531 vfs_mnttab_modtimeupd(); 1532 vfs_list_unlock(); 1533 } 1534 } 1535 1536 /* 1537 * Record a mounted resource name in a vfs structure. 1538 * If vfsp is already mounted, caller must hold the vfs lock. 
1539 */ 1540 void 1541 vfs_setresource(struct vfs *vfsp, const char *resource) 1542 { 1543 if (resource == NULL || resource[0] == '\0') 1544 resource = VFS_NORESOURCE; 1545 vfs_setpath(vfsp, &vfsp->vfs_resource, resource); 1546 } 1547 1548 /* 1549 * Record a mount point name in a vfs structure. 1550 * If vfsp is already mounted, caller must hold the vfs lock. 1551 */ 1552 void 1553 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt) 1554 { 1555 if (mntpt == NULL || mntpt[0] == '\0') 1556 mntpt = VFS_NOMNTPT; 1557 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt); 1558 } 1559 1560 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */ 1561 1562 refstr_t * 1563 vfs_getresource(const struct vfs *vfsp) 1564 { 1565 refstr_t *resource; 1566 1567 vfs_list_read_lock(); 1568 resource = vfsp->vfs_resource; 1569 refstr_hold(resource); 1570 vfs_list_unlock(); 1571 1572 return (resource); 1573 } 1574 1575 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */ 1576 1577 refstr_t * 1578 vfs_getmntpoint(const struct vfs *vfsp) 1579 { 1580 refstr_t *mntpt; 1581 1582 vfs_list_read_lock(); 1583 mntpt = vfsp->vfs_mntpt; 1584 refstr_hold(mntpt); 1585 vfs_list_unlock(); 1586 1587 return (mntpt); 1588 } 1589 1590 /* 1591 * Create an empty options table with enough empty slots to hold all 1592 * The options in the options string passed as an argument. 1593 * Potentially prepend another options table. 1594 * 1595 * Note: caller is responsible for locking the vfs list, if needed, 1596 * to protect mops. 
1597 */ 1598 static void 1599 vfs_createopttbl_extend(mntopts_t *mops, const char *opts, 1600 const mntopts_t *mtmpl) 1601 { 1602 const char *s = opts; 1603 uint_t count; 1604 1605 if (opts == NULL || *opts == '\0') { 1606 count = 0; 1607 } else { 1608 count = 1; 1609 1610 /* 1611 * Count number of options in the string 1612 */ 1613 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) { 1614 count++; 1615 s++; 1616 } 1617 } 1618 vfs_copyopttbl_extend(mtmpl, mops, count); 1619 } 1620 1621 /* 1622 * Create an empty options table with enough empty slots to hold all 1623 * The options in the options string passed as an argument. 1624 * 1625 * This function is *not* for general use by filesystems. 1626 * 1627 * Note: caller is responsible for locking the vfs list, if needed, 1628 * to protect mops. 1629 */ 1630 void 1631 vfs_createopttbl(mntopts_t *mops, const char *opts) 1632 { 1633 vfs_createopttbl_extend(mops, opts, NULL); 1634 } 1635 1636 1637 /* 1638 * Swap two mount options tables 1639 */ 1640 static void 1641 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2) 1642 { 1643 uint_t tmpcnt; 1644 mntopt_t *tmplist; 1645 1646 tmpcnt = optbl2->mo_count; 1647 tmplist = optbl2->mo_list; 1648 optbl2->mo_count = optbl1->mo_count; 1649 optbl2->mo_list = optbl1->mo_list; 1650 optbl1->mo_count = tmpcnt; 1651 optbl1->mo_list = tmplist; 1652 } 1653 1654 static void 1655 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2) 1656 { 1657 vfs_list_lock(); 1658 vfs_swapopttbl_nolock(optbl1, optbl2); 1659 vfs_mnttab_modtimeupd(); 1660 vfs_list_unlock(); 1661 } 1662 1663 static char ** 1664 vfs_copycancelopt_extend(char **const moc, int extend) 1665 { 1666 int i = 0; 1667 int j; 1668 char **result; 1669 1670 if (moc != NULL) { 1671 for (; moc[i] != NULL; i++) 1672 /* count number of options to cancel */; 1673 } 1674 1675 if (i + extend == 0) 1676 return (NULL); 1677 1678 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP); 1679 1680 for (j = 0; j < i; j++) { 
1681 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP); 1682 (void) strcpy(result[j], moc[j]); 1683 } 1684 for (; j <= i + extend; j++) 1685 result[j] = NULL; 1686 1687 return (result); 1688 } 1689 1690 static void 1691 vfs_copyopt(const mntopt_t *s, mntopt_t *d) 1692 { 1693 char *sp, *dp; 1694 1695 d->mo_flags = s->mo_flags; 1696 d->mo_data = s->mo_data; 1697 sp = s->mo_name; 1698 if (sp != NULL) { 1699 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1700 (void) strcpy(dp, sp); 1701 d->mo_name = dp; 1702 } else { 1703 d->mo_name = NULL; /* should never happen */ 1704 } 1705 1706 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0); 1707 1708 sp = s->mo_arg; 1709 if (sp != NULL) { 1710 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1711 (void) strcpy(dp, sp); 1712 d->mo_arg = dp; 1713 } else { 1714 d->mo_arg = NULL; 1715 } 1716 } 1717 1718 /* 1719 * Copy a mount options table, possibly allocating some spare 1720 * slots at the end. It is permissible to copy_extend the NULL table. 1721 */ 1722 static void 1723 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra) 1724 { 1725 uint_t i, count; 1726 mntopt_t *motbl; 1727 1728 /* 1729 * Clear out any existing stuff in the options table being initialized 1730 */ 1731 vfs_freeopttbl(dmo); 1732 count = (smo == NULL) ? 0 : smo->mo_count; 1733 if ((count + extra) == 0) /* nothing to do */ 1734 return; 1735 dmo->mo_count = count + extra; 1736 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP); 1737 dmo->mo_list = motbl; 1738 for (i = 0; i < count; i++) { 1739 vfs_copyopt(&smo->mo_list[i], &motbl[i]); 1740 } 1741 for (i = count; i < count + extra; i++) { 1742 motbl[i].mo_flags = MO_EMPTY; 1743 } 1744 } 1745 1746 /* 1747 * Copy a mount options table. 1748 * 1749 * This function is *not* for general use by filesystems. 1750 * 1751 * Note: caller is responsible for locking the vfs list, if needed, 1752 * to protect smo and dmo. 
1753 */ 1754 void 1755 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo) 1756 { 1757 vfs_copyopttbl_extend(smo, dmo, 0); 1758 } 1759 1760 static char ** 1761 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2) 1762 { 1763 int c1 = 0; 1764 int c2 = 0; 1765 char **result; 1766 char **sp1, **sp2, **dp; 1767 1768 /* 1769 * First we count both lists of cancel options. 1770 * If either is NULL or has no elements, we return a copy of 1771 * the other. 1772 */ 1773 if (mop1->mo_cancel != NULL) { 1774 for (; mop1->mo_cancel[c1] != NULL; c1++) 1775 /* count cancel options in mop1 */; 1776 } 1777 1778 if (c1 == 0) 1779 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0)); 1780 1781 if (mop2->mo_cancel != NULL) { 1782 for (; mop2->mo_cancel[c2] != NULL; c2++) 1783 /* count cancel options in mop2 */; 1784 } 1785 1786 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2); 1787 1788 if (c2 == 0) 1789 return (result); 1790 1791 /* 1792 * When we get here, we've got two sets of cancel options; 1793 * we need to merge the two sets. We know that the result 1794 * array has "c1+c2+1" entries and in the end we might shrink 1795 * it. 1796 * Result now has a copy of the c1 entries from mop1; we'll 1797 * now lookup all the entries of mop2 in mop1 and copy it if 1798 * it is unique. 1799 * This operation is O(n^2) but it's only called once per 1800 * filesystem per duplicate option. This is a situation 1801 * which doesn't arise with the filesystems in ON and 1802 * n is generally 1. 1803 */ 1804 1805 dp = &result[c1]; 1806 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) { 1807 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) { 1808 if (strcmp(*sp1, *sp2) == 0) 1809 break; 1810 } 1811 if (*sp1 == NULL) { 1812 /* 1813 * Option *sp2 not found in mop1, so copy it. 1814 * The calls to vfs_copycancelopt_extend() 1815 * guarantee that there's enough room. 
1816 */ 1817 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP); 1818 (void) strcpy(*dp++, *sp2); 1819 } 1820 } 1821 if (dp != &result[c1+c2]) { 1822 size_t bytes = (dp - result + 1) * sizeof (char *); 1823 char **nres = kmem_alloc(bytes, KM_SLEEP); 1824 1825 bcopy(result, nres, bytes); 1826 kmem_free(result, (c1 + c2 + 1) * sizeof (char *)); 1827 result = nres; 1828 } 1829 return (result); 1830 } 1831 1832 /* 1833 * Merge two mount option tables (outer and inner) into one. This is very 1834 * similar to "merging" global variables and automatic variables in C. 1835 * 1836 * This isn't (and doesn't have to be) fast. 1837 * 1838 * This function is *not* for general use by filesystems. 1839 * 1840 * Note: caller is responsible for locking the vfs list, if needed, 1841 * to protect omo, imo & dmo. 1842 */ 1843 void 1844 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo) 1845 { 1846 uint_t i, count; 1847 mntopt_t *mop, *motbl; 1848 uint_t freeidx; 1849 1850 /* 1851 * First determine how much space we need to allocate. 
1852 */ 1853 count = omo->mo_count; 1854 for (i = 0; i < imo->mo_count; i++) { 1855 if (imo->mo_list[i].mo_flags & MO_EMPTY) 1856 continue; 1857 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL) 1858 count++; 1859 } 1860 ASSERT(count >= omo->mo_count && 1861 count <= omo->mo_count + imo->mo_count); 1862 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP); 1863 for (i = 0; i < omo->mo_count; i++) 1864 vfs_copyopt(&omo->mo_list[i], &motbl[i]); 1865 freeidx = omo->mo_count; 1866 for (i = 0; i < imo->mo_count; i++) { 1867 if (imo->mo_list[i].mo_flags & MO_EMPTY) 1868 continue; 1869 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) { 1870 char **newcanp; 1871 uint_t index = mop - omo->mo_list; 1872 1873 newcanp = vfs_mergecancelopts(mop, &motbl[index]); 1874 1875 vfs_freeopt(&motbl[index]); 1876 vfs_copyopt(&imo->mo_list[i], &motbl[index]); 1877 1878 vfs_freecancelopt(motbl[index].mo_cancel); 1879 motbl[index].mo_cancel = newcanp; 1880 } else { 1881 /* 1882 * If it's a new option, just copy it over to the first 1883 * free location. 1884 */ 1885 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]); 1886 } 1887 } 1888 dmo->mo_count = count; 1889 dmo->mo_list = motbl; 1890 } 1891 1892 /* 1893 * Functions to set and clear mount options in a mount options table. 1894 */ 1895 1896 /* 1897 * Clear a mount option, if it exists. 1898 * 1899 * The update_mnttab arg indicates whether mops is part of a vfs that is on 1900 * the vfs list. 
1901 */ 1902 static void 1903 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab) 1904 { 1905 struct mntopt *mop; 1906 uint_t i, count; 1907 1908 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 1909 1910 count = mops->mo_count; 1911 for (i = 0; i < count; i++) { 1912 mop = &mops->mo_list[i]; 1913 1914 if (mop->mo_flags & MO_EMPTY) 1915 continue; 1916 if (strcmp(opt, mop->mo_name)) 1917 continue; 1918 mop->mo_flags &= ~MO_SET; 1919 if (mop->mo_arg != NULL) { 1920 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 1921 } 1922 mop->mo_arg = NULL; 1923 if (update_mnttab) 1924 vfs_mnttab_modtimeupd(); 1925 break; 1926 } 1927 } 1928 1929 void 1930 vfs_clearmntopt(struct vfs *vfsp, const char *opt) 1931 { 1932 int gotlock = 0; 1933 1934 if (VFS_ON_LIST(vfsp)) { 1935 gotlock = 1; 1936 vfs_list_lock(); 1937 } 1938 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock); 1939 if (gotlock) 1940 vfs_list_unlock(); 1941 } 1942 1943 1944 /* 1945 * Set a mount option on. If it's not found in the table, it's silently 1946 * ignored. If the option has MO_IGNORE set, it is still set unless the 1947 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag 1948 * bits can be used to toggle the MO_NODISPLAY bit for the option. 1949 * If the VFS_CREATEOPT flag bit is set then the first option slot with 1950 * MO_EMPTY set is created as the option passed in. 1951 * 1952 * The update_mnttab arg indicates whether mops is part of a vfs that is on 1953 * the vfs list. 
1954 */ 1955 static void 1956 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt, 1957 const char *arg, int flags, int update_mnttab) 1958 { 1959 mntopt_t *mop; 1960 uint_t i, count; 1961 char *sp; 1962 1963 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 1964 1965 if (flags & VFS_CREATEOPT) { 1966 if (vfs_hasopt(mops, opt) != NULL) { 1967 flags &= ~VFS_CREATEOPT; 1968 } 1969 } 1970 count = mops->mo_count; 1971 for (i = 0; i < count; i++) { 1972 mop = &mops->mo_list[i]; 1973 1974 if (mop->mo_flags & MO_EMPTY) { 1975 if ((flags & VFS_CREATEOPT) == 0) 1976 continue; 1977 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP); 1978 (void) strcpy(sp, opt); 1979 mop->mo_name = sp; 1980 if (arg != NULL) 1981 mop->mo_flags = MO_HASVALUE; 1982 else 1983 mop->mo_flags = 0; 1984 } else if (strcmp(opt, mop->mo_name)) { 1985 continue; 1986 } 1987 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT)) 1988 break; 1989 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) { 1990 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP); 1991 (void) strcpy(sp, arg); 1992 } else { 1993 sp = NULL; 1994 } 1995 if (mop->mo_arg != NULL) 1996 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 1997 mop->mo_arg = sp; 1998 if (flags & VFS_DISPLAY) 1999 mop->mo_flags &= ~MO_NODISPLAY; 2000 if (flags & VFS_NODISPLAY) 2001 mop->mo_flags |= MO_NODISPLAY; 2002 mop->mo_flags |= MO_SET; 2003 if (mop->mo_cancel != NULL) { 2004 char **cp; 2005 2006 for (cp = mop->mo_cancel; *cp != NULL; cp++) 2007 vfs_clearmntopt_nolock(mops, *cp, 0); 2008 } 2009 if (update_mnttab) 2010 vfs_mnttab_modtimeupd(); 2011 break; 2012 } 2013 } 2014 2015 void 2016 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags) 2017 { 2018 int gotlock = 0; 2019 2020 if (VFS_ON_LIST(vfsp)) { 2021 gotlock = 1; 2022 vfs_list_lock(); 2023 } 2024 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock); 2025 if (gotlock) 2026 vfs_list_unlock(); 2027 } 2028 2029 2030 /* 2031 * Add a "tag" option to a mounted file system's 
options list. 2032 * 2033 * Note: caller is responsible for locking the vfs list, if needed, 2034 * to protect mops. 2035 */ 2036 static mntopt_t * 2037 vfs_addtag(mntopts_t *mops, const char *tag) 2038 { 2039 uint_t count; 2040 mntopt_t *mop, *motbl; 2041 2042 count = mops->mo_count + 1; 2043 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP); 2044 if (mops->mo_count) { 2045 size_t len = (count - 1) * sizeof (mntopt_t); 2046 2047 bcopy(mops->mo_list, motbl, len); 2048 kmem_free(mops->mo_list, len); 2049 } 2050 mops->mo_count = count; 2051 mops->mo_list = motbl; 2052 mop = &motbl[count - 1]; 2053 mop->mo_flags = MO_TAG; 2054 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP); 2055 (void) strcpy(mop->mo_name, tag); 2056 return (mop); 2057 } 2058 2059 /* 2060 * Allow users to set arbitrary "tags" in a vfs's mount options. 2061 * Broader use within the kernel is discouraged. 2062 */ 2063 int 2064 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2065 cred_t *cr) 2066 { 2067 vfs_t *vfsp; 2068 mntopts_t *mops; 2069 mntopt_t *mop; 2070 int found = 0; 2071 dev_t dev = makedevice(major, minor); 2072 int err = 0; 2073 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 2074 2075 /* 2076 * Find the desired mounted file system 2077 */ 2078 vfs_list_lock(); 2079 vfsp = rootvfs; 2080 do { 2081 if (vfsp->vfs_dev == dev && 2082 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2083 found = 1; 2084 break; 2085 } 2086 vfsp = vfsp->vfs_next; 2087 } while (vfsp != rootvfs); 2088 2089 if (!found) { 2090 err = EINVAL; 2091 goto out; 2092 } 2093 err = secpolicy_fs_config(cr, vfsp); 2094 if (err != 0) 2095 goto out; 2096 2097 mops = &vfsp->vfs_mntopts; 2098 /* 2099 * Add tag if it doesn't already exist 2100 */ 2101 if ((mop = vfs_hasopt(mops, tag)) == NULL) { 2102 int len; 2103 2104 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR); 2105 len = strlen(buf); 2106 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) { 2107 err = ENAMETOOLONG; 2108 goto out; 2109 
} 2110 mop = vfs_addtag(mops, tag); 2111 } 2112 if ((mop->mo_flags & MO_TAG) == 0) { 2113 err = EINVAL; 2114 goto out; 2115 } 2116 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1); 2117 out: 2118 vfs_list_unlock(); 2119 kmem_free(buf, MAX_MNTOPT_STR); 2120 return (err); 2121 } 2122 2123 /* 2124 * Allow users to remove arbitrary "tags" in a vfs's mount options. 2125 * Broader use within the kernel is discouraged. 2126 */ 2127 int 2128 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2129 cred_t *cr) 2130 { 2131 vfs_t *vfsp; 2132 mntopt_t *mop; 2133 int found = 0; 2134 dev_t dev = makedevice(major, minor); 2135 int err = 0; 2136 2137 /* 2138 * Find the desired mounted file system 2139 */ 2140 vfs_list_lock(); 2141 vfsp = rootvfs; 2142 do { 2143 if (vfsp->vfs_dev == dev && 2144 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2145 found = 1; 2146 break; 2147 } 2148 vfsp = vfsp->vfs_next; 2149 } while (vfsp != rootvfs); 2150 2151 if (!found) { 2152 err = EINVAL; 2153 goto out; 2154 } 2155 err = secpolicy_fs_config(cr, vfsp); 2156 if (err != 0) 2157 goto out; 2158 2159 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) { 2160 err = EINVAL; 2161 goto out; 2162 } 2163 if ((mop->mo_flags & MO_TAG) == 0) { 2164 err = EINVAL; 2165 goto out; 2166 } 2167 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1); 2168 out: 2169 vfs_list_unlock(); 2170 return (err); 2171 } 2172 2173 /* 2174 * Function to parse an option string and fill in a mount options table. 2175 * Unknown options are silently ignored. The input option string is modified 2176 * by replacing separators with nulls. If the create flag is set, options 2177 * not found in the table are just added on the fly. The table must have 2178 * an option slot marked MO_EMPTY to add an option on the fly. 2179 * 2180 * This function is *not* for general use by filesystems. 2181 * 2182 * Note: caller is responsible for locking the vfs list, if needed, 2183 * to protect mops.. 
2184 */ 2185 void 2186 vfs_parsemntopts(mntopts_t *mops, char *osp, int create) 2187 { 2188 char *s = osp, *p, *nextop, *valp, *cp, *ep; 2189 int setflg = VFS_NOFORCEOPT; 2190 2191 if (osp == NULL) 2192 return; 2193 while (*s != '\0') { 2194 p = strchr(s, ','); /* find next option */ 2195 if (p == NULL) { 2196 cp = NULL; 2197 p = s + strlen(s); 2198 } else { 2199 cp = p; /* save location of comma */ 2200 *p++ = '\0'; /* mark end and point to next option */ 2201 } 2202 nextop = p; 2203 p = strchr(s, '='); /* look for value */ 2204 if (p == NULL) { 2205 valp = NULL; /* no value supplied */ 2206 } else { 2207 ep = p; /* save location of equals */ 2208 *p++ = '\0'; /* end option and point to value */ 2209 valp = p; 2210 } 2211 /* 2212 * set option into options table 2213 */ 2214 if (create) 2215 setflg |= VFS_CREATEOPT; 2216 vfs_setmntopt_nolock(mops, s, valp, setflg, 0); 2217 if (cp != NULL) 2218 *cp = ','; /* restore the comma */ 2219 if (valp != NULL) 2220 *ep = '='; /* restore the equals */ 2221 s = nextop; 2222 } 2223 } 2224 2225 /* 2226 * Function to inquire if an option exists in a mount options table. 2227 * Returns a pointer to the option if it exists, else NULL. 2228 * 2229 * This function is *not* for general use by filesystems. 2230 * 2231 * Note: caller is responsible for locking the vfs list, if needed, 2232 * to protect mops. 2233 */ 2234 struct mntopt * 2235 vfs_hasopt(const mntopts_t *mops, const char *opt) 2236 { 2237 struct mntopt *mop; 2238 uint_t i, count; 2239 2240 count = mops->mo_count; 2241 for (i = 0; i < count; i++) { 2242 mop = &mops->mo_list[i]; 2243 2244 if (mop->mo_flags & MO_EMPTY) 2245 continue; 2246 if (strcmp(opt, mop->mo_name) == 0) 2247 return (mop); 2248 } 2249 return (NULL); 2250 } 2251 2252 /* 2253 * Function to inquire if an option is set in a mount options table. 2254 * Returns non-zero if set and fills in the arg pointer with a pointer to 2255 * the argument string or NULL if there is no argument string. 
2256 */ 2257 static int 2258 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp) 2259 { 2260 struct mntopt *mop; 2261 uint_t i, count; 2262 2263 count = mops->mo_count; 2264 for (i = 0; i < count; i++) { 2265 mop = &mops->mo_list[i]; 2266 2267 if (mop->mo_flags & MO_EMPTY) 2268 continue; 2269 if (strcmp(opt, mop->mo_name)) 2270 continue; 2271 if ((mop->mo_flags & MO_SET) == 0) 2272 return (0); 2273 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0) 2274 *argp = mop->mo_arg; 2275 return (1); 2276 } 2277 return (0); 2278 } 2279 2280 2281 int 2282 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp) 2283 { 2284 int ret; 2285 2286 vfs_list_read_lock(); 2287 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp); 2288 vfs_list_unlock(); 2289 return (ret); 2290 } 2291 2292 2293 /* 2294 * Construct a comma separated string of the options set in the given 2295 * mount table, return the string in the given buffer. Return non-zero if 2296 * the buffer would overflow. 2297 * 2298 * This function is *not* for general use by filesystems. 2299 * 2300 * Note: caller is responsible for locking the vfs list, if needed, 2301 * to protect mp. 
2302 */ 2303 int 2304 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len) 2305 { 2306 char *cp; 2307 uint_t i; 2308 2309 buf[0] = '\0'; 2310 cp = buf; 2311 for (i = 0; i < mp->mo_count; i++) { 2312 struct mntopt *mop; 2313 2314 mop = &mp->mo_list[i]; 2315 if (mop->mo_flags & MO_SET) { 2316 int optlen, comma = 0; 2317 2318 if (buf[0] != '\0') 2319 comma = 1; 2320 optlen = strlen(mop->mo_name); 2321 if (strlen(buf) + comma + optlen + 1 > len) 2322 goto err; 2323 if (comma) 2324 *cp++ = ','; 2325 (void) strcpy(cp, mop->mo_name); 2326 cp += optlen; 2327 /* 2328 * Append option value if there is one 2329 */ 2330 if (mop->mo_arg != NULL) { 2331 int arglen; 2332 2333 arglen = strlen(mop->mo_arg); 2334 if (strlen(buf) + arglen + 2 > len) 2335 goto err; 2336 *cp++ = '='; 2337 (void) strcpy(cp, mop->mo_arg); 2338 cp += arglen; 2339 } 2340 } 2341 } 2342 return (0); 2343 err: 2344 return (EOVERFLOW); 2345 } 2346 2347 static void 2348 vfs_freecancelopt(char **moc) 2349 { 2350 if (moc != NULL) { 2351 int ccnt = 0; 2352 char **cp; 2353 2354 for (cp = moc; *cp != NULL; cp++) { 2355 kmem_free(*cp, strlen(*cp) + 1); 2356 ccnt++; 2357 } 2358 kmem_free(moc, (ccnt + 1) * sizeof (char *)); 2359 } 2360 } 2361 2362 static void 2363 vfs_freeopt(mntopt_t *mop) 2364 { 2365 if (mop->mo_name != NULL) 2366 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1); 2367 2368 vfs_freecancelopt(mop->mo_cancel); 2369 2370 if (mop->mo_arg != NULL) 2371 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2372 } 2373 2374 /* 2375 * Free a mount options table 2376 * 2377 * This function is *not* for general use by filesystems. 2378 * 2379 * Note: caller is responsible for locking the vfs list, if needed, 2380 * to protect mp. 
2381 */ 2382 void 2383 vfs_freeopttbl(mntopts_t *mp) 2384 { 2385 uint_t i, count; 2386 2387 count = mp->mo_count; 2388 for (i = 0; i < count; i++) { 2389 vfs_freeopt(&mp->mo_list[i]); 2390 } 2391 if (count) { 2392 kmem_free(mp->mo_list, sizeof (mntopt_t) * count); 2393 mp->mo_count = 0; 2394 mp->mo_list = NULL; 2395 } 2396 } 2397 2398 /* 2399 * Free any mnttab information recorded in the vfs struct. 2400 * The vfs must not be on the vfs list. 2401 */ 2402 static void 2403 vfs_freemnttab(struct vfs *vfsp) 2404 { 2405 ASSERT(!VFS_ON_LIST(vfsp)); 2406 2407 /* 2408 * Free device and mount point information 2409 */ 2410 if (vfsp->vfs_mntpt != NULL) { 2411 refstr_rele(vfsp->vfs_mntpt); 2412 vfsp->vfs_mntpt = NULL; 2413 } 2414 if (vfsp->vfs_resource != NULL) { 2415 refstr_rele(vfsp->vfs_resource); 2416 vfsp->vfs_resource = NULL; 2417 } 2418 /* 2419 * Now free mount options information 2420 */ 2421 vfs_freeopttbl(&vfsp->vfs_mntopts); 2422 } 2423 2424 /* 2425 * Return the last mnttab modification time 2426 */ 2427 void 2428 vfs_mnttab_modtime(timespec_t *ts) 2429 { 2430 ASSERT(RW_LOCK_HELD(&vfslist)); 2431 *ts = vfs_mnttab_mtime; 2432 } 2433 2434 /* 2435 * See if mnttab is changed 2436 */ 2437 void 2438 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp) 2439 { 2440 int changed; 2441 2442 *phpp = (struct pollhead *)NULL; 2443 2444 /* 2445 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime. 2446 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe 2447 * to not grab the vfs list lock because tv_sec is monotonically 2448 * increasing. 
2449 */ 2450 2451 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) || 2452 (old->tv_sec != vfs_mnttab_mtime.tv_sec); 2453 if (!changed) { 2454 *phpp = &vfs_pollhd; 2455 } 2456 } 2457 2458 /* 2459 * Update the mnttab modification time and wake up any waiters for 2460 * mnttab changes 2461 */ 2462 void 2463 vfs_mnttab_modtimeupd() 2464 { 2465 hrtime_t oldhrt, newhrt; 2466 2467 ASSERT(RW_WRITE_HELD(&vfslist)); 2468 oldhrt = ts2hrt(&vfs_mnttab_mtime); 2469 gethrestime(&vfs_mnttab_mtime); 2470 newhrt = ts2hrt(&vfs_mnttab_mtime); 2471 if (oldhrt == (hrtime_t)0) 2472 vfs_mnttab_ctime = vfs_mnttab_mtime; 2473 /* 2474 * Attempt to provide unique mtime (like uniqtime but not). 2475 */ 2476 if (newhrt == oldhrt) { 2477 newhrt++; 2478 hrt2ts(newhrt, &vfs_mnttab_mtime); 2479 } 2480 pollwakeup(&vfs_pollhd, (short)POLLRDBAND); 2481 } 2482 2483 int 2484 dounmount(struct vfs *vfsp, int flag, cred_t *cr) 2485 { 2486 vnode_t *coveredvp; 2487 int error; 2488 extern void teardown_vopstats(vfs_t *); 2489 2490 /* 2491 * Get covered vnode. This will be NULL if the vfs is not linked 2492 * into the file system name space (i.e., domount() with MNT_NOSPICE). 2493 */ 2494 coveredvp = vfsp->vfs_vnodecovered; 2495 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp)); 2496 2497 /* 2498 * Purge all dnlc entries for this vfs. 2499 */ 2500 (void) dnlc_purge_vfsp(vfsp, 0); 2501 2502 /* For forcible umount, skip VFS_SYNC() since it may hang */ 2503 if ((flag & MS_FORCE) == 0) 2504 (void) VFS_SYNC(vfsp, 0, cr); 2505 2506 /* 2507 * Lock the vfs to maintain fs status quo during unmount. This 2508 * has to be done after the sync because ufs_update tries to acquire 2509 * the vfs_reflock. 
2510 */ 2511 vfs_lock_wait(vfsp); 2512 2513 if (error = VFS_UNMOUNT(vfsp, flag, cr)) { 2514 vfs_unlock(vfsp); 2515 if (coveredvp != NULL) 2516 vn_vfsunlock(coveredvp); 2517 } else if (coveredvp != NULL) { 2518 teardown_vopstats(vfsp); 2519 /* 2520 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered) 2521 * when it frees vfsp so we do a VN_HOLD() so we can 2522 * continue to use coveredvp afterwards. 2523 */ 2524 VN_HOLD(coveredvp); 2525 vfs_remove(vfsp); 2526 vn_vfsunlock(coveredvp); 2527 VN_RELE(coveredvp); 2528 } else { 2529 teardown_vopstats(vfsp); 2530 /* 2531 * Release the reference to vfs that is not linked 2532 * into the name space. 2533 */ 2534 vfs_unlock(vfsp); 2535 VFS_RELE(vfsp); 2536 } 2537 return (error); 2538 } 2539 2540 2541 /* 2542 * Vfs_unmountall() is called by uadmin() to unmount all 2543 * mounted file systems (except the root file system) during shutdown. 2544 * It follows the existing locking protocol when traversing the vfs list 2545 * to sync and unmount vfses. Even though there should be no 2546 * other thread running while the system is shutting down, it is prudent 2547 * to still follow the locking protocol. 2548 */ 2549 void 2550 vfs_unmountall(void) 2551 { 2552 struct vfs *vfsp; 2553 struct vfs *prev_vfsp = NULL; 2554 int error; 2555 2556 /* 2557 * Toss all dnlc entries now so that the per-vfs sync 2558 * and unmount operations don't have to slog through 2559 * a bunch of uninteresting vnodes over and over again. 
2560 */ 2561 dnlc_purge(); 2562 2563 vfs_list_lock(); 2564 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) { 2565 prev_vfsp = vfsp->vfs_prev; 2566 2567 if (vfs_lock(vfsp) != 0) 2568 continue; 2569 error = vn_vfswlock(vfsp->vfs_vnodecovered); 2570 vfs_unlock(vfsp); 2571 if (error) 2572 continue; 2573 2574 vfs_list_unlock(); 2575 2576 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED()); 2577 (void) dounmount(vfsp, 0, CRED()); 2578 2579 /* 2580 * Since we dropped the vfslist lock above we must 2581 * verify that next_vfsp still exists, else start over. 2582 */ 2583 vfs_list_lock(); 2584 for (vfsp = rootvfs->vfs_prev; 2585 vfsp != rootvfs; vfsp = vfsp->vfs_prev) 2586 if (vfsp == prev_vfsp) 2587 break; 2588 if (vfsp == rootvfs && prev_vfsp != rootvfs) 2589 prev_vfsp = rootvfs->vfs_prev; 2590 } 2591 vfs_list_unlock(); 2592 } 2593 2594 /* 2595 * Called to add an entry to the end of the vfs mount in progress list 2596 */ 2597 void 2598 vfs_addmip(dev_t dev, struct vfs *vfsp) 2599 { 2600 struct ipmnt *mipp; 2601 2602 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP); 2603 mipp->mip_next = NULL; 2604 mipp->mip_dev = dev; 2605 mipp->mip_vfsp = vfsp; 2606 mutex_enter(&vfs_miplist_mutex); 2607 if (vfs_miplist_end != NULL) 2608 vfs_miplist_end->mip_next = mipp; 2609 else 2610 vfs_miplist = mipp; 2611 vfs_miplist_end = mipp; 2612 mutex_exit(&vfs_miplist_mutex); 2613 } 2614 2615 /* 2616 * Called to remove an entry from the mount in progress list 2617 * Either because the mount completed or it failed. 
2618 */ 2619 void 2620 vfs_delmip(struct vfs *vfsp) 2621 { 2622 struct ipmnt *mipp, *mipprev; 2623 2624 mutex_enter(&vfs_miplist_mutex); 2625 mipprev = NULL; 2626 for (mipp = vfs_miplist; 2627 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) { 2628 mipprev = mipp; 2629 } 2630 if (mipp == NULL) 2631 return; /* shouldn't happen */ 2632 if (mipp == vfs_miplist_end) 2633 vfs_miplist_end = mipprev; 2634 if (mipprev == NULL) 2635 vfs_miplist = mipp->mip_next; 2636 else 2637 mipprev->mip_next = mipp->mip_next; 2638 mutex_exit(&vfs_miplist_mutex); 2639 kmem_free(mipp, sizeof (struct ipmnt)); 2640 } 2641 2642 /* 2643 * vfs_add is called by a specific filesystem's mount routine to add 2644 * the new vfs into the vfs list/hash and to cover the mounted-on vnode. 2645 * The vfs should already have been locked by the caller. 2646 * 2647 * coveredvp is NULL if this is the root. 2648 */ 2649 void 2650 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag) 2651 { 2652 int newflag; 2653 2654 ASSERT(vfs_lock_held(vfsp)); 2655 VFS_HOLD(vfsp); 2656 newflag = vfsp->vfs_flag; 2657 if (mflag & MS_RDONLY) 2658 newflag |= VFS_RDONLY; 2659 else 2660 newflag &= ~VFS_RDONLY; 2661 if (mflag & MS_NOSUID) 2662 newflag |= (VFS_NOSETUID|VFS_NODEVICES); 2663 else 2664 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES); 2665 if (mflag & MS_NOMNTTAB) 2666 newflag |= VFS_NOMNTTAB; 2667 else 2668 newflag &= ~VFS_NOMNTTAB; 2669 2670 if (coveredvp != NULL) { 2671 ASSERT(vn_vfswlock_held(coveredvp)); 2672 coveredvp->v_vfsmountedhere = vfsp; 2673 VN_HOLD(coveredvp); 2674 } 2675 vfsp->vfs_vnodecovered = coveredvp; 2676 vfsp->vfs_flag = newflag; 2677 2678 vfs_list_add(vfsp); 2679 } 2680 2681 /* 2682 * Remove a vfs from the vfs list, null out the pointer from the 2683 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer 2684 * from the vfs to the covered vnode (vfs_vnodecovered). Release the 2685 * reference to the vfs and to the covered vnode. 
2686 * 2687 * Called from dounmount after it's confirmed with the file system 2688 * that the unmount is legal. 2689 */ 2690 void 2691 vfs_remove(struct vfs *vfsp) 2692 { 2693 vnode_t *vp; 2694 2695 ASSERT(vfs_lock_held(vfsp)); 2696 2697 /* 2698 * Can't unmount root. Should never happen because fs will 2699 * be busy. 2700 */ 2701 if (vfsp == rootvfs) 2702 cmn_err(CE_PANIC, "vfs_remove: unmounting root"); 2703 2704 vfs_list_remove(vfsp); 2705 2706 /* 2707 * Unhook from the file system name space. 2708 */ 2709 vp = vfsp->vfs_vnodecovered; 2710 ASSERT(vn_vfswlock_held(vp)); 2711 vp->v_vfsmountedhere = NULL; 2712 vfsp->vfs_vnodecovered = NULL; 2713 VN_RELE(vp); 2714 2715 /* 2716 * Release lock and wakeup anybody waiting. 2717 */ 2718 vfs_unlock(vfsp); 2719 VFS_RELE(vfsp); 2720 } 2721 2722 /* 2723 * Lock a filesystem to prevent access to it while mounting, 2724 * unmounting and syncing. Return EBUSY immediately if lock 2725 * can't be acquired. 2726 */ 2727 int 2728 vfs_lock(vfs_t *vfsp) 2729 { 2730 vn_vfslocks_entry_t *vpvfsentry; 2731 2732 vpvfsentry = vn_vfslocks_getlock(vfsp); 2733 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) 2734 return (0); 2735 2736 vn_vfslocks_rele(vpvfsentry); 2737 return (EBUSY); 2738 } 2739 2740 int 2741 vfs_rlock(vfs_t *vfsp) 2742 { 2743 vn_vfslocks_entry_t *vpvfsentry; 2744 2745 vpvfsentry = vn_vfslocks_getlock(vfsp); 2746 2747 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) 2748 return (0); 2749 2750 vn_vfslocks_rele(vpvfsentry); 2751 return (EBUSY); 2752 } 2753 2754 void 2755 vfs_lock_wait(vfs_t *vfsp) 2756 { 2757 vn_vfslocks_entry_t *vpvfsentry; 2758 2759 vpvfsentry = vn_vfslocks_getlock(vfsp); 2760 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER); 2761 } 2762 2763 void 2764 vfs_rlock_wait(vfs_t *vfsp) 2765 { 2766 vn_vfslocks_entry_t *vpvfsentry; 2767 2768 vpvfsentry = vn_vfslocks_getlock(vfsp); 2769 rwst_enter(&vpvfsentry->ve_lock, RW_READER); 2770 } 2771 2772 /* 2773 * Unlock a locked filesystem. 
2774 */ 2775 void 2776 vfs_unlock(vfs_t *vfsp) 2777 { 2778 vn_vfslocks_entry_t *vpvfsentry; 2779 2780 /* 2781 * vfs_unlock will mimic sema_v behaviour to fix 4748018. 2782 * And these changes should remain for the patch changes as it is. 2783 */ 2784 if (panicstr) 2785 return; 2786 2787 /* 2788 * ve_refcount needs to be dropped twice here. 2789 * 1. To release refernce after a call to vfs_locks_getlock() 2790 * 2. To release the reference from the locking routines like 2791 * vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,. 2792 */ 2793 2794 vpvfsentry = vn_vfslocks_getlock(vfsp); 2795 vn_vfslocks_rele(vpvfsentry); 2796 2797 rwst_exit(&vpvfsentry->ve_lock); 2798 vn_vfslocks_rele(vpvfsentry); 2799 } 2800 2801 /* 2802 * Utility routine that allows a filesystem to construct its 2803 * fsid in "the usual way" - by munging some underlying dev_t and 2804 * the filesystem type number into the 64-bit fsid. Note that 2805 * this implicitly relies on dev_t persistence to make filesystem 2806 * id's persistent. 2807 * 2808 * There's nothing to prevent an individual fs from constructing its 2809 * fsid in a different way, and indeed they should. 2810 * 2811 * Since we want fsids to be 32-bit quantities (so that they can be 2812 * exported identically by either 32-bit or 64-bit APIs, as well as 2813 * the fact that fsid's are "known" to NFS), we compress the device 2814 * number given down to 32-bits, and panic if that isn't possible. 2815 */ 2816 void 2817 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val) 2818 { 2819 if (!cmpldev((dev32_t *)&fsi->val[0], dev)) 2820 panic("device number too big for fsid!"); 2821 fsi->val[1] = val; 2822 } 2823 2824 int 2825 vfs_lock_held(vfs_t *vfsp) 2826 { 2827 int held; 2828 vn_vfslocks_entry_t *vpvfsentry; 2829 2830 /* 2831 * vfs_lock_held will mimic sema_held behaviour 2832 * if panicstr is set. And these changes should remain 2833 * for the patch changes as it is. 
2834 */ 2835 if (panicstr) 2836 return (1); 2837 2838 vpvfsentry = vn_vfslocks_getlock(vfsp); 2839 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); 2840 2841 vn_vfslocks_rele(vpvfsentry); 2842 return (held); 2843 } 2844 2845 struct _kthread * 2846 vfs_lock_owner(vfs_t *vfsp) 2847 { 2848 struct _kthread *owner; 2849 vn_vfslocks_entry_t *vpvfsentry; 2850 2851 /* 2852 * vfs_wlock_held will mimic sema_held behaviour 2853 * if panicstr is set. And these changes should remain 2854 * for the patch changes as it is. 2855 */ 2856 if (panicstr) 2857 return (NULL); 2858 2859 vpvfsentry = vn_vfslocks_getlock(vfsp); 2860 owner = rwst_owner(&vpvfsentry->ve_lock); 2861 2862 vn_vfslocks_rele(vpvfsentry); 2863 return (owner); 2864 } 2865 2866 /* 2867 * vfs list locking. 2868 * 2869 * Rather than manipulate the vfslist lock directly, we abstract into lock 2870 * and unlock routines to allow the locking implementation to be changed for 2871 * clustering. 2872 * 2873 * Whenever the vfs list is modified through its hash links, the overall list 2874 * lock must be obtained before locking the relevant hash bucket. But to see 2875 * whether a given vfs is on the list, it suffices to obtain the lock for the 2876 * hash bucket without getting the overall list lock. (See getvfs() below.) 2877 */ 2878 2879 void 2880 vfs_list_lock() 2881 { 2882 rw_enter(&vfslist, RW_WRITER); 2883 } 2884 2885 void 2886 vfs_list_read_lock() 2887 { 2888 rw_enter(&vfslist, RW_READER); 2889 } 2890 2891 void 2892 vfs_list_unlock() 2893 { 2894 rw_exit(&vfslist); 2895 } 2896 2897 /* 2898 * Low level worker routines for adding entries to and removing entries from 2899 * the vfs list. 
2900 */ 2901 2902 static void 2903 vfs_hash_add(struct vfs *vfsp, int insert_at_head) 2904 { 2905 int vhno; 2906 struct vfs **hp; 2907 dev_t dev; 2908 2909 ASSERT(RW_WRITE_HELD(&vfslist)); 2910 2911 dev = expldev(vfsp->vfs_fsid.val[0]); 2912 vhno = VFSHASH(getmajor(dev), getminor(dev)); 2913 2914 mutex_enter(&rvfs_list[vhno].rvfs_lock); 2915 2916 /* 2917 * Link into the hash table, inserting it at the end, so that LOFS 2918 * with the same fsid as UFS (or other) file systems will not hide the 2919 * UFS. 2920 */ 2921 if (insert_at_head) { 2922 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head; 2923 rvfs_list[vhno].rvfs_head = vfsp; 2924 } else { 2925 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL; 2926 hp = &(*hp)->vfs_hash) 2927 continue; 2928 /* 2929 * hp now contains the address of the pointer to update 2930 * to effect the insertion. 2931 */ 2932 vfsp->vfs_hash = NULL; 2933 *hp = vfsp; 2934 } 2935 2936 rvfs_list[vhno].rvfs_len++; 2937 mutex_exit(&rvfs_list[vhno].rvfs_lock); 2938 } 2939 2940 2941 static void 2942 vfs_hash_remove(struct vfs *vfsp) 2943 { 2944 int vhno; 2945 struct vfs *tvfsp; 2946 dev_t dev; 2947 2948 ASSERT(RW_WRITE_HELD(&vfslist)); 2949 2950 dev = expldev(vfsp->vfs_fsid.val[0]); 2951 vhno = VFSHASH(getmajor(dev), getminor(dev)); 2952 2953 mutex_enter(&rvfs_list[vhno].rvfs_lock); 2954 2955 /* 2956 * Remove from hash. 
2957 */ 2958 if (rvfs_list[vhno].rvfs_head == vfsp) { 2959 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash; 2960 rvfs_list[vhno].rvfs_len--; 2961 goto foundit; 2962 } 2963 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL; 2964 tvfsp = tvfsp->vfs_hash) { 2965 if (tvfsp->vfs_hash == vfsp) { 2966 tvfsp->vfs_hash = vfsp->vfs_hash; 2967 rvfs_list[vhno].rvfs_len--; 2968 goto foundit; 2969 } 2970 } 2971 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash"); 2972 2973 foundit: 2974 2975 mutex_exit(&rvfs_list[vhno].rvfs_lock); 2976 } 2977 2978 2979 void 2980 vfs_list_add(struct vfs *vfsp) 2981 { 2982 zone_t *zone; 2983 2984 /* 2985 * The zone that owns the mount is the one that performed the mount. 2986 * Note that this isn't necessarily the same as the zone mounted into. 2987 * The corresponding zone_rele() will be done when the vfs_t is 2988 * being free'd. 2989 */ 2990 vfsp->vfs_zone = curproc->p_zone; 2991 zone_hold(vfsp->vfs_zone); 2992 2993 /* 2994 * Find the zone mounted into, and put this mount on its vfs list. 2995 */ 2996 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 2997 ASSERT(zone != NULL); 2998 /* 2999 * Special casing for the root vfs. This structure is allocated 3000 * statically and hooked onto rootvfs at link time. During the 3001 * vfs_mountroot call at system startup time, the root file system's 3002 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct 3003 * as argument. The code below must detect and handle this special 3004 * case. The only apparent justification for this special casing is 3005 * to ensure that the root file system appears at the head of the 3006 * list. 3007 * 3008 * XXX: I'm assuming that it's ok to do normal list locking when 3009 * adding the entry for the root file system (this used to be 3010 * done with no locks held). 3011 */ 3012 vfs_list_lock(); 3013 /* 3014 * Link into the vfs list proper. 
3015 */ 3016 if (vfsp == &root) { 3017 /* 3018 * Assert: This vfs is already on the list as its first entry. 3019 * Thus, there's nothing to do. 3020 */ 3021 ASSERT(rootvfs == vfsp); 3022 /* 3023 * Add it to the head of the global zone's vfslist. 3024 */ 3025 ASSERT(zone == global_zone); 3026 ASSERT(zone->zone_vfslist == NULL); 3027 zone->zone_vfslist = vfsp; 3028 } else { 3029 /* 3030 * Link to end of list using vfs_prev (as rootvfs is now a 3031 * doubly linked circular list) so list is in mount order for 3032 * mnttab use. 3033 */ 3034 rootvfs->vfs_prev->vfs_next = vfsp; 3035 vfsp->vfs_prev = rootvfs->vfs_prev; 3036 rootvfs->vfs_prev = vfsp; 3037 vfsp->vfs_next = rootvfs; 3038 3039 /* 3040 * Do it again for the zone-private list (which may be NULL). 3041 */ 3042 if (zone->zone_vfslist == NULL) { 3043 ASSERT(zone != global_zone); 3044 zone->zone_vfslist = vfsp; 3045 } else { 3046 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp; 3047 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev; 3048 zone->zone_vfslist->vfs_zone_prev = vfsp; 3049 vfsp->vfs_zone_next = zone->zone_vfslist; 3050 } 3051 } 3052 3053 /* 3054 * Link into the hash table, inserting it at the end, so that LOFS 3055 * with the same fsid as UFS (or other) file systems will not hide 3056 * the UFS. 3057 */ 3058 vfs_hash_add(vfsp, 0); 3059 3060 /* 3061 * update the mnttab modification time 3062 */ 3063 vfs_mnttab_modtimeupd(); 3064 vfs_list_unlock(); 3065 zone_rele(zone); 3066 } 3067 3068 void 3069 vfs_list_remove(struct vfs *vfsp) 3070 { 3071 zone_t *zone; 3072 3073 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3074 ASSERT(zone != NULL); 3075 /* 3076 * Callers are responsible for preventing attempts to unmount the 3077 * root. 3078 */ 3079 ASSERT(vfsp != rootvfs); 3080 3081 vfs_list_lock(); 3082 3083 /* 3084 * Remove from hash. 3085 */ 3086 vfs_hash_remove(vfsp); 3087 3088 /* 3089 * Remove from vfs list. 
3090 */ 3091 vfsp->vfs_prev->vfs_next = vfsp->vfs_next; 3092 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev; 3093 vfsp->vfs_next = vfsp->vfs_prev = NULL; 3094 3095 /* 3096 * Remove from zone-specific vfs list. 3097 */ 3098 if (zone->zone_vfslist == vfsp) 3099 zone->zone_vfslist = vfsp->vfs_zone_next; 3100 3101 if (vfsp->vfs_zone_next == vfsp) { 3102 ASSERT(vfsp->vfs_zone_prev == vfsp); 3103 ASSERT(zone->zone_vfslist == vfsp); 3104 zone->zone_vfslist = NULL; 3105 } 3106 3107 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next; 3108 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev; 3109 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL; 3110 3111 /* 3112 * update the mnttab modification time 3113 */ 3114 vfs_mnttab_modtimeupd(); 3115 vfs_list_unlock(); 3116 zone_rele(zone); 3117 } 3118 3119 struct vfs * 3120 getvfs(fsid_t *fsid) 3121 { 3122 struct vfs *vfsp; 3123 int val0 = fsid->val[0]; 3124 int val1 = fsid->val[1]; 3125 dev_t dev = expldev(val0); 3126 int vhno = VFSHASH(getmajor(dev), getminor(dev)); 3127 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock; 3128 3129 mutex_enter(hmp); 3130 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) { 3131 if (vfsp->vfs_fsid.val[0] == val0 && 3132 vfsp->vfs_fsid.val[1] == val1) { 3133 VFS_HOLD(vfsp); 3134 mutex_exit(hmp); 3135 return (vfsp); 3136 } 3137 } 3138 mutex_exit(hmp); 3139 return (NULL); 3140 } 3141 3142 /* 3143 * Search the vfs mount in progress list for a specified device/vfs entry. 3144 * Returns 0 if the first entry in the list that the device matches has the 3145 * given vfs pointer as well. If the device matches but a different vfs 3146 * pointer is encountered in the list before the given vfs pointer then 3147 * a 1 is returned. 
3148 */ 3149 3150 int 3151 vfs_devmounting(dev_t dev, struct vfs *vfsp) 3152 { 3153 int retval = 0; 3154 struct ipmnt *mipp; 3155 3156 mutex_enter(&vfs_miplist_mutex); 3157 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) { 3158 if (mipp->mip_dev == dev) { 3159 if (mipp->mip_vfsp != vfsp) 3160 retval = 1; 3161 break; 3162 } 3163 } 3164 mutex_exit(&vfs_miplist_mutex); 3165 return (retval); 3166 } 3167 3168 /* 3169 * Search the vfs list for a specified device. Returns 1, if entry is found 3170 * or 0 if no suitable entry is found. 3171 */ 3172 3173 int 3174 vfs_devismounted(dev_t dev) 3175 { 3176 struct vfs *vfsp; 3177 int found; 3178 3179 vfs_list_read_lock(); 3180 vfsp = rootvfs; 3181 found = 0; 3182 do { 3183 if (vfsp->vfs_dev == dev) { 3184 found = 1; 3185 break; 3186 } 3187 vfsp = vfsp->vfs_next; 3188 } while (vfsp != rootvfs); 3189 3190 vfs_list_unlock(); 3191 return (found); 3192 } 3193 3194 /* 3195 * Search the vfs list for a specified device. Returns a pointer to it 3196 * or NULL if no suitable entry is found. The caller of this routine 3197 * is responsible for releasing the returned vfs pointer. 3198 */ 3199 struct vfs * 3200 vfs_dev2vfsp(dev_t dev) 3201 { 3202 struct vfs *vfsp; 3203 int found; 3204 3205 vfs_list_read_lock(); 3206 vfsp = rootvfs; 3207 found = 0; 3208 do { 3209 /* 3210 * The following could be made more efficient by making 3211 * the entire loop use vfs_zone_next if the call is from 3212 * a zone. The only callers, however, ustat(2) and 3213 * umount2(2), don't seem to justify the added 3214 * complexity at present. 3215 */ 3216 if (vfsp->vfs_dev == dev && 3217 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt), 3218 curproc->p_zone)) { 3219 VFS_HOLD(vfsp); 3220 found = 1; 3221 break; 3222 } 3223 vfsp = vfsp->vfs_next; 3224 } while (vfsp != rootvfs); 3225 vfs_list_unlock(); 3226 return (found ? vfsp: NULL); 3227 } 3228 3229 /* 3230 * Search the vfs list for a specified mntpoint. 
Returns a pointer to it 3231 * or NULL if no suitable entry is found. The caller of this routine 3232 * is responsible for releasing the returned vfs pointer. 3233 * 3234 * Note that if multiple mntpoints match, the last one matching is 3235 * returned in an attempt to return the "top" mount when overlay 3236 * mounts are covering the same mount point. This is accomplished by starting 3237 * at the end of the list and working our way backwards, stopping at the first 3238 * matching mount. 3239 */ 3240 struct vfs * 3241 vfs_mntpoint2vfsp(const char *mp) 3242 { 3243 struct vfs *vfsp; 3244 struct vfs *retvfsp = NULL; 3245 zone_t *zone = curproc->p_zone; 3246 struct vfs *list; 3247 3248 vfs_list_read_lock(); 3249 if (getzoneid() == GLOBAL_ZONEID) { 3250 /* 3251 * The global zone may see filesystems in any zone. 3252 */ 3253 vfsp = rootvfs->vfs_prev; 3254 do { 3255 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) { 3256 retvfsp = vfsp; 3257 break; 3258 } 3259 vfsp = vfsp->vfs_prev; 3260 } while (vfsp != rootvfs->vfs_prev); 3261 } else if ((list = zone->zone_vfslist) != NULL) { 3262 const char *mntpt; 3263 3264 vfsp = list->vfs_zone_prev; 3265 do { 3266 mntpt = refstr_value(vfsp->vfs_mntpt); 3267 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); 3268 if (strcmp(mntpt, mp) == 0) { 3269 retvfsp = vfsp; 3270 break; 3271 } 3272 vfsp = vfsp->vfs_zone_prev; 3273 } while (vfsp != list->vfs_zone_prev); 3274 } 3275 if (retvfsp) 3276 VFS_HOLD(retvfsp); 3277 vfs_list_unlock(); 3278 return (retvfsp); 3279 } 3280 3281 /* 3282 * Search the vfs list for a specified vfsops. 3283 * if vfs entry is found then return 1, else 0. 
3284 */ 3285 int 3286 vfs_opsinuse(vfsops_t *ops) 3287 { 3288 struct vfs *vfsp; 3289 int found; 3290 3291 vfs_list_read_lock(); 3292 vfsp = rootvfs; 3293 found = 0; 3294 do { 3295 if (vfs_getops(vfsp) == ops) { 3296 found = 1; 3297 break; 3298 } 3299 vfsp = vfsp->vfs_next; 3300 } while (vfsp != rootvfs); 3301 vfs_list_unlock(); 3302 return (found); 3303 } 3304 3305 /* 3306 * Allocate an entry in vfssw for a file system type 3307 */ 3308 struct vfssw * 3309 allocate_vfssw(char *type) 3310 { 3311 struct vfssw *vswp; 3312 3313 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) { 3314 /* 3315 * The vfssw table uses the empty string to identify an 3316 * available entry; we cannot add any type which has 3317 * a leading NUL. The string length is limited to 3318 * the size of the st_fstype array in struct stat. 3319 */ 3320 return (NULL); 3321 } 3322 3323 ASSERT(VFSSW_WRITE_LOCKED()); 3324 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) 3325 if (!ALLOCATED_VFSSW(vswp)) { 3326 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP); 3327 (void) strcpy(vswp->vsw_name, type); 3328 ASSERT(vswp->vsw_count == 0); 3329 vswp->vsw_count = 1; 3330 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL); 3331 return (vswp); 3332 } 3333 return (NULL); 3334 } 3335 3336 /* 3337 * Impose additional layer of translation between vfstype names 3338 * and module names in the filesystem. 3339 */ 3340 static char * 3341 vfs_to_modname(char *vfstype) 3342 { 3343 if (strcmp(vfstype, "proc") == 0) { 3344 vfstype = "procfs"; 3345 } else if (strcmp(vfstype, "fd") == 0) { 3346 vfstype = "fdfs"; 3347 } else if (strncmp(vfstype, "nfs", 3) == 0) { 3348 vfstype = "nfs"; 3349 } 3350 3351 return (vfstype); 3352 } 3353 3354 /* 3355 * Find a vfssw entry given a file system type name. 3356 * Try to autoload the filesystem if it's not found. 3357 * If it's installed, return the vfssw locked to prevent unloading. 
3358 */ 3359 struct vfssw * 3360 vfs_getvfssw(char *type) 3361 { 3362 struct vfssw *vswp; 3363 char *modname; 3364 3365 RLOCK_VFSSW(); 3366 vswp = vfs_getvfsswbyname(type); 3367 modname = vfs_to_modname(type); 3368 3369 if (rootdir == NULL) { 3370 /* 3371 * If we haven't yet loaded the root file system, then our 3372 * _init won't be called until later. Allocate vfssw entry, 3373 * because mod_installfs won't be called. 3374 */ 3375 if (vswp == NULL) { 3376 RUNLOCK_VFSSW(); 3377 WLOCK_VFSSW(); 3378 if ((vswp = vfs_getvfsswbyname(type)) == NULL) { 3379 if ((vswp = allocate_vfssw(type)) == NULL) { 3380 WUNLOCK_VFSSW(); 3381 return (NULL); 3382 } 3383 } 3384 WUNLOCK_VFSSW(); 3385 RLOCK_VFSSW(); 3386 } 3387 if (!VFS_INSTALLED(vswp)) { 3388 RUNLOCK_VFSSW(); 3389 (void) modloadonly("fs", modname); 3390 } else 3391 RUNLOCK_VFSSW(); 3392 return (vswp); 3393 } 3394 3395 /* 3396 * Try to load the filesystem. Before calling modload(), we drop 3397 * our lock on the VFS switch table, and pick it up after the 3398 * module is loaded. However, there is a potential race: the 3399 * module could be unloaded after the call to modload() completes 3400 * but before we pick up the lock and drive on. Therefore, 3401 * we keep reloading the module until we've loaded the module 3402 * _and_ we have the lock on the VFS switch table. 3403 */ 3404 while (vswp == NULL || !VFS_INSTALLED(vswp)) { 3405 RUNLOCK_VFSSW(); 3406 if (modload("fs", modname) == -1) 3407 return (NULL); 3408 RLOCK_VFSSW(); 3409 if (vswp == NULL) 3410 if ((vswp = vfs_getvfsswbyname(type)) == NULL) 3411 break; 3412 } 3413 RUNLOCK_VFSSW(); 3414 3415 return (vswp); 3416 } 3417 3418 /* 3419 * Find a vfssw entry given a file system type name. 
3420 */ 3421 struct vfssw * 3422 vfs_getvfsswbyname(char *type) 3423 { 3424 struct vfssw *vswp; 3425 3426 ASSERT(VFSSW_LOCKED()); 3427 if (type == NULL || *type == '\0') 3428 return (NULL); 3429 3430 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3431 if (strcmp(type, vswp->vsw_name) == 0) { 3432 vfs_refvfssw(vswp); 3433 return (vswp); 3434 } 3435 } 3436 3437 return (NULL); 3438 } 3439 3440 /* 3441 * Find a vfssw entry given a set of vfsops. 3442 */ 3443 struct vfssw * 3444 vfs_getvfsswbyvfsops(vfsops_t *vfsops) 3445 { 3446 struct vfssw *vswp; 3447 3448 RLOCK_VFSSW(); 3449 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3450 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) { 3451 vfs_refvfssw(vswp); 3452 RUNLOCK_VFSSW(); 3453 return (vswp); 3454 } 3455 } 3456 RUNLOCK_VFSSW(); 3457 3458 return (NULL); 3459 } 3460 3461 /* 3462 * Reference a vfssw entry. 3463 */ 3464 void 3465 vfs_refvfssw(struct vfssw *vswp) 3466 { 3467 3468 mutex_enter(&vswp->vsw_lock); 3469 vswp->vsw_count++; 3470 mutex_exit(&vswp->vsw_lock); 3471 } 3472 3473 /* 3474 * Unreference a vfssw entry. 3475 */ 3476 void 3477 vfs_unrefvfssw(struct vfssw *vswp) 3478 { 3479 3480 mutex_enter(&vswp->vsw_lock); 3481 vswp->vsw_count--; 3482 mutex_exit(&vswp->vsw_lock); 3483 } 3484 3485 int sync_timeout = 30; /* timeout for syncing a page during panic */ 3486 int sync_timeleft; /* portion of sync_timeout remaining */ 3487 3488 static int sync_retries = 20; /* number of retries when not making progress */ 3489 static int sync_triesleft; /* portion of sync_retries remaining */ 3490 3491 static pgcnt_t old_pgcnt, new_pgcnt; 3492 static int new_bufcnt, old_bufcnt; 3493 3494 /* 3495 * Sync all of the mounted filesystems, and then wait for the actual i/o to 3496 * complete. We wait by counting the number of dirty pages and buffers, 3497 * pushing them out using bio_busy() and page_busy(), and then counting again. 
3498 * This routine is used during both the uadmin A_SHUTDOWN code as well as 3499 * the SYNC phase of the panic code (see comments in panic.c). It should only 3500 * be used after some higher-level mechanism has quiesced the system so that 3501 * new writes are not being initiated while we are waiting for completion. 3502 * 3503 * To ensure finite running time, our algorithm uses two timeout mechanisms: 3504 * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and 3505 * sync_triesleft (a progress counter used by the vfs_syncall() loop below). 3506 * Together these ensure that syncing completes if our i/o paths are stuck. 3507 * The counters are declared above so they can be found easily in the debugger. 3508 * 3509 * The sync_timeleft counter is reset by bio_busy() and page_busy() using the 3510 * vfs_syncprogress() subroutine whenever we make progress through the lists of 3511 * pages and buffers. It is decremented and expired by the deadman() cyclic. 3512 * When vfs_syncall() decides it is done, we disable the deadman() counter by 3513 * setting sync_timeleft to zero. This timer guards against vfs_syncall() 3514 * deadlocking or hanging inside of a broken filesystem or driver routine. 3515 * 3516 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make 3517 * sync_retries consecutive calls to bio_busy() and page_busy() without 3518 * decreasing either the number of dirty buffers or dirty pages below the 3519 * lowest count we have seen so far, we give up and return from vfs_syncall(). 3520 * 3521 * Each loop iteration ends with a call to delay() one second to allow time for 3522 * i/o completion and to permit the user time to read our progress messages. 
3523 */ 3524 void 3525 vfs_syncall(void) 3526 { 3527 if (rootdir == NULL && !modrootloaded) 3528 return; /* panic during boot - no filesystems yet */ 3529 3530 printf("syncing file systems..."); 3531 vfs_syncprogress(); 3532 sync(); 3533 3534 vfs_syncprogress(); 3535 sync_triesleft = sync_retries; 3536 3537 old_bufcnt = new_bufcnt = INT_MAX; 3538 old_pgcnt = new_pgcnt = ULONG_MAX; 3539 3540 while (sync_triesleft > 0) { 3541 old_bufcnt = MIN(old_bufcnt, new_bufcnt); 3542 old_pgcnt = MIN(old_pgcnt, new_pgcnt); 3543 3544 new_bufcnt = bio_busy(B_TRUE); 3545 new_pgcnt = page_busy(B_TRUE); 3546 vfs_syncprogress(); 3547 3548 if (new_bufcnt == 0 && new_pgcnt == 0) 3549 break; 3550 3551 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt) 3552 sync_triesleft = sync_retries; 3553 else 3554 sync_triesleft--; 3555 3556 if (new_bufcnt) 3557 printf(" [%d]", new_bufcnt); 3558 if (new_pgcnt) 3559 printf(" %lu", new_pgcnt); 3560 3561 delay(hz); 3562 } 3563 3564 if (new_bufcnt != 0 || new_pgcnt != 0) 3565 printf(" done (not all i/o completed)\n"); 3566 else 3567 printf(" done\n"); 3568 3569 sync_timeleft = 0; 3570 delay(hz); 3571 } 3572 3573 /* 3574 * If we are in the middle of the sync phase of panic, reset sync_timeleft to 3575 * sync_timeout to indicate that we are making progress and the deadman() 3576 * omnipresent cyclic should not yet time us out. Note that it is safe to 3577 * store to sync_timeleft here since the deadman() is firing at high-level 3578 * on top of us. If we are racing with the deadman(), either the deadman() 3579 * will decrement the old value and then we will reset it, or we will 3580 * reset it and then the deadman() will immediately decrement it. In either 3581 * case, correct behavior results. 3582 */ 3583 void 3584 vfs_syncprogress(void) 3585 { 3586 if (panicstr) 3587 sync_timeleft = sync_timeout; 3588 } 3589 3590 /* 3591 * Map VFS flags to statvfs flags. These shouldn't really be separate 3592 * flags at all. 
3593 */ 3594 uint_t 3595 vf_to_stf(uint_t vf) 3596 { 3597 uint_t stf = 0; 3598 3599 if (vf & VFS_RDONLY) 3600 stf |= ST_RDONLY; 3601 if (vf & VFS_NOSETUID) 3602 stf |= ST_NOSUID; 3603 if (vf & VFS_NOTRUNC) 3604 stf |= ST_NOTRUNC; 3605 3606 return (stf); 3607 } 3608 3609 /* 3610 * Use old-style function prototype for vfsstray() so 3611 * that we can use it anywhere in the vfsops structure. 3612 */ 3613 int vfsstray(); 3614 3615 /* 3616 * Entries for (illegal) fstype 0. 3617 */ 3618 /* ARGSUSED */ 3619 int 3620 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr) 3621 { 3622 cmn_err(CE_PANIC, "stray vfs operation"); 3623 return (0); 3624 } 3625 3626 vfsops_t vfs_strayops = { 3627 vfsstray, 3628 vfsstray, 3629 vfsstray, 3630 vfsstray, 3631 vfsstray_sync, 3632 vfsstray, 3633 vfsstray, 3634 vfsstray 3635 }; 3636 3637 /* 3638 * Entries for (illegal) fstype 0. 3639 */ 3640 int 3641 vfsstray(void) 3642 { 3643 cmn_err(CE_PANIC, "stray vfs operation"); 3644 return (0); 3645 } 3646 3647 /* 3648 * Support for dealing with forced UFS unmount and its interaction with 3649 * LOFS. Could be used by any filesystem. 3650 * See bug 1203132. 3651 */ 3652 int 3653 vfs_EIO(void) 3654 { 3655 return (EIO); 3656 } 3657 3658 /* 3659 * We've gotta define the op for sync separately, since the compiler gets 3660 * confused if we mix and match ANSI and normal style prototypes when 3661 * a "short" argument is present and spits out a warning. 
3662 */ 3663 /*ARGSUSED*/ 3664 int 3665 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr) 3666 { 3667 return (EIO); 3668 } 3669 3670 vfs_t EIO_vfs; 3671 vfsops_t *EIO_vfsops; 3672 3673 /* 3674 * Called from startup() to initialize all loaded vfs's 3675 */ 3676 void 3677 vfsinit(void) 3678 { 3679 struct vfssw *vswp; 3680 int error; 3681 extern void vopstats_startup(); 3682 extern void setup_vopstats(vfs_t *); 3683 3684 static const fs_operation_def_t EIO_vfsops_template[] = { 3685 VFSNAME_MOUNT, vfs_EIO, 3686 VFSNAME_UNMOUNT, vfs_EIO, 3687 VFSNAME_ROOT, vfs_EIO, 3688 VFSNAME_STATVFS, vfs_EIO, 3689 VFSNAME_SYNC, (fs_generic_func_p) vfs_EIO_sync, 3690 VFSNAME_VGET, vfs_EIO, 3691 VFSNAME_MOUNTROOT, vfs_EIO, 3692 VFSNAME_FREEVFS, vfs_EIO, 3693 VFSNAME_VNSTATE, vfs_EIO, 3694 NULL, NULL 3695 }; 3696 3697 3698 /* Initialize the vnode cache (file systems may use it during init). */ 3699 3700 vn_create_cache(); 3701 3702 /* Setup event monitor framework */ 3703 3704 fem_init(); 3705 3706 /* Initialize the dummy stray file system type. */ 3707 3708 vfssw[0].vsw_vfsops = vfs_strayops; 3709 3710 /* Initialize the dummy EIO file system. */ 3711 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops); 3712 if (error != 0) { 3713 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template"); 3714 /* Shouldn't happen, but not bad enough to panic */ 3715 } 3716 3717 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL); 3718 3719 /* 3720 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup 3721 * on this vfs can immediately notice it's invalid. 3722 */ 3723 EIO_vfs.vfs_flag |= VFS_UNMOUNTED; 3724 3725 /* 3726 * Call the init routines of non-loadable filesystems only. 3727 * Filesystems which are loaded as separate modules will be 3728 * initialized by the module loading code instead. 
3729 */ 3730 3731 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3732 RLOCK_VFSSW(); 3733 if (vswp->vsw_init != NULL) 3734 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name); 3735 RUNLOCK_VFSSW(); 3736 } 3737 3738 vopstats_startup(); 3739 setup_vopstats(&EIO_vfs); 3740 } 3741 3742 /* 3743 * Increments the vfs reference count by one atomically. 3744 */ 3745 void 3746 vfs_hold(vfs_t *vfsp) 3747 { 3748 atomic_add_32(&vfsp->vfs_count, 1); 3749 ASSERT(vfsp->vfs_count != 0); 3750 } 3751 3752 /* 3753 * Decrements the vfs reference count by one atomically. When 3754 * vfs reference count becomes zero, it calls the file system 3755 * specific vfs_freevfs() to free up the resources. 3756 */ 3757 void 3758 vfs_rele(vfs_t *vfsp) 3759 { 3760 ASSERT(vfsp->vfs_count != 0); 3761 if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) { 3762 VFS_FREEVFS(vfsp); 3763 if (vfsp->vfs_zone) 3764 zone_rele(vfsp->vfs_zone); 3765 vfs_freemnttab(vfsp); 3766 sema_destroy(&vfsp->vfs_reflock); 3767 kmem_free(vfsp, sizeof (*vfsp)); 3768 } 3769 } 3770 3771 /* 3772 * Generic operations vector support. 3773 * 3774 * This is used to build operations vectors for both the vfs and vnode. 3775 * It's normally called only when a file system is loaded. 3776 * 3777 * There are many possible algorithms for this, including the following: 3778 * 3779 * (1) scan the list of known operations; for each, see if the file system 3780 * includes an entry for it, and fill it in as appropriate. 3781 * 3782 * (2) set up defaults for all known operations. scan the list of ops 3783 * supplied by the file system; for each which is both supplied and 3784 * known, fill it in. 3785 * 3786 * (3) sort the lists of known ops & supplied ops; scan the list, filling 3787 * in entries as we go. 3788 * 3789 * we choose (1) for simplicity, and because performance isn't critical here. 3790 * note that (2) could be sped up using a precomputed hash table on known ops. 
3791 * (3) could be faster than either, but only if the lists were very large or 3792 * supplied in sorted order. 3793 * 3794 */ 3795 3796 int 3797 fs_build_vector(void *vector, int *unused_ops, 3798 const fs_operation_trans_def_t *translation, 3799 const fs_operation_def_t *operations) 3800 { 3801 int i, num_trans, num_ops, used; 3802 3803 /* Count the number of translations and the number of supplied */ 3804 /* operations. */ 3805 3806 { 3807 const fs_operation_trans_def_t *p; 3808 3809 for (num_trans = 0, p = translation; 3810 p->name != NULL; 3811 num_trans++, p++) 3812 ; 3813 } 3814 3815 { 3816 const fs_operation_def_t *p; 3817 3818 for (num_ops = 0, p = operations; 3819 p->name != NULL; 3820 num_ops++, p++) 3821 ; 3822 } 3823 3824 /* Walk through each operation known to our caller. There will be */ 3825 /* one entry in the supplied "translation table" for each. */ 3826 3827 used = 0; 3828 3829 for (i = 0; i < num_trans; i++) { 3830 int j, found; 3831 char *curname; 3832 fs_generic_func_p result; 3833 fs_generic_func_p *location; 3834 3835 curname = translation[i].name; 3836 3837 /* Look for a matching operation in the list supplied by the */ 3838 /* file system. */ 3839 3840 found = 0; 3841 3842 for (j = 0; j < num_ops; j++) { 3843 if (strcmp(operations[j].name, curname) == 0) { 3844 used++; 3845 found = 1; 3846 break; 3847 } 3848 } 3849 3850 /* If the file system is using a "placeholder" for default */ 3851 /* or error functions, grab the appropriate function out of */ 3852 /* the translation table. If the file system didn't supply */ 3853 /* this operation at all, use the default function. 
*/ 3854 3855 if (found) { 3856 result = operations[j].func; 3857 if (result == fs_default) { 3858 result = translation[i].defaultFunc; 3859 } else if (result == fs_error) { 3860 result = translation[i].errorFunc; 3861 } else if (result == NULL) { 3862 /* Null values are PROHIBITED */ 3863 return (EINVAL); 3864 } 3865 } else { 3866 result = translation[i].defaultFunc; 3867 } 3868 3869 /* Now store the function into the operations vector. */ 3870 3871 location = (fs_generic_func_p *) 3872 (((char *)vector) + translation[i].offset); 3873 3874 *location = result; 3875 } 3876 3877 *unused_ops = num_ops - used; 3878 3879 return (0); 3880 } 3881 3882 /* Placeholder functions, should never be called. */ 3883 3884 int 3885 fs_error(void) 3886 { 3887 cmn_err(CE_PANIC, "fs_error called"); 3888 return (0); 3889 } 3890 3891 int 3892 fs_default(void) 3893 { 3894 cmn_err(CE_PANIC, "fs_default called"); 3895 return (0); 3896 } 3897 3898 #ifdef __sparc 3899 3900 /* 3901 * Part of the implementation of booting off a mirrored root 3902 * involves a change of dev_t for the root device. To 3903 * accomplish this, first remove the existing hash table 3904 * entry for the root device, convert to the new dev_t, 3905 * then re-insert in the hash table at the head of the list. 
3906 */ 3907 void 3908 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype) 3909 { 3910 vfs_list_lock(); 3911 3912 vfs_hash_remove(vfsp); 3913 3914 vfsp->vfs_dev = ndev; 3915 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype); 3916 3917 vfs_hash_add(vfsp, 1); 3918 3919 vfs_list_unlock(); 3920 } 3921 3922 #else /* x86 NEWBOOT */ 3923 3924 int 3925 rootconf() 3926 { 3927 int error; 3928 struct vfssw *vsw; 3929 extern void pm_init(); 3930 char *fstyp; 3931 3932 fstyp = getrootfs(); 3933 3934 if (error = clboot_rootconf()) 3935 return (error); 3936 3937 if (modload("fs", fstyp) == -1) 3938 cmn_err(CE_PANIC, "Cannot _init %s module\n", fstyp); 3939 3940 RLOCK_VFSSW(); 3941 vsw = vfs_getvfsswbyname(fstyp); 3942 RUNLOCK_VFSSW(); 3943 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0); 3944 VFS_HOLD(rootvfs); 3945 3946 /* always mount readonly first */ 3947 rootvfs->vfs_flag |= VFS_RDONLY; 3948 3949 pm_init(); 3950 3951 if (netboot) 3952 (void) strplumb(); 3953 3954 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT); 3955 vfs_unrefvfssw(vsw); 3956 rootdev = rootvfs->vfs_dev; 3957 3958 if (error) 3959 cmn_err(CE_PANIC, "cannot mount root path %s", svm_bootpath); 3960 return (error); 3961 } 3962 3963 /* 3964 * XXX this is called by nfs only and should probably be removed 3965 * If booted with ASKNAME, prompt on the console for a filesystem 3966 * name and return it. 3967 */ 3968 void 3969 getfsname(char *askfor, char *name, size_t namelen) 3970 { 3971 if (boothowto & RB_ASKNAME) { 3972 printf("%s name: ", askfor); 3973 console_gets(name, namelen); 3974 } 3975 } 3976 3977 /* 3978 * If server_path exists, then we are booting a diskless 3979 * client. Otherwise, we default to ufs. Zfs should perhaps be 3980 * another property. 
3981 */ 3982 static char * 3983 getrootfs(void) 3984 { 3985 extern char *strplumb_get_netdev_path(void); 3986 char *propstr = NULL; 3987 3988 /* check fstype property; it should be nfsdyn for diskless */ 3989 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 3990 DDI_PROP_DONTPASS, "fstype", &propstr) 3991 == DDI_SUCCESS) { 3992 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME); 3993 ddi_prop_free(propstr); 3994 } 3995 3996 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) 3997 return (rootfs.bo_fstype); 3998 3999 ++netboot; 4000 /* check if path to network interface is specified in bootpath */ 4001 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4002 DDI_PROP_DONTPASS, "bootpath", &propstr) 4003 == DDI_SUCCESS) { 4004 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME); 4005 ddi_prop_free(propstr); 4006 } else { 4007 /* attempt to determine netdev_path via boot_mac address */ 4008 netdev_path = strplumb_get_netdev_path(); 4009 if (netdev_path == NULL) 4010 cmn_err(CE_PANIC, 4011 "Cannot find boot network interface\n"); 4012 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME); 4013 } 4014 return ("nfs"); 4015 } 4016 #endif 4017