1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 
38 */ 39 40 41 #pragma ident "%Z%%M% %I% %E% SMI" 42 43 #include <sys/types.h> 44 #include <sys/t_lock.h> 45 #include <sys/param.h> 46 #include <sys/errno.h> 47 #include <sys/user.h> 48 #include <sys/fstyp.h> 49 #include <sys/kmem.h> 50 #include <sys/systm.h> 51 #include <sys/proc.h> 52 #include <sys/mount.h> 53 #include <sys/vfs.h> 54 #include <sys/fem.h> 55 #include <sys/mntent.h> 56 #include <sys/stat.h> 57 #include <sys/statvfs.h> 58 #include <sys/statfs.h> 59 #include <sys/cred.h> 60 #include <sys/vnode.h> 61 #include <sys/rwstlock.h> 62 #include <sys/dnlc.h> 63 #include <sys/file.h> 64 #include <sys/time.h> 65 #include <sys/atomic.h> 66 #include <sys/cmn_err.h> 67 #include <sys/buf.h> 68 #include <sys/swap.h> 69 #include <sys/debug.h> 70 #include <sys/vnode.h> 71 #include <sys/modctl.h> 72 #include <sys/ddi.h> 73 #include <sys/pathname.h> 74 #include <sys/bootconf.h> 75 #include <sys/dumphdr.h> 76 #include <sys/dc_ki.h> 77 #include <sys/poll.h> 78 #include <sys/sunddi.h> 79 #include <sys/sysmacros.h> 80 #include <sys/zone.h> 81 #include <sys/policy.h> 82 #include <sys/ctfs.h> 83 #include <sys/objfs.h> 84 #include <sys/console.h> 85 #include <sys/reboot.h> 86 87 #include <vm/page.h> 88 89 #include <fs/fs_subr.h> 90 91 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int); 92 static void vfs_setmntopt_nolock(mntopts_t *, const char *, 93 const char *, int, int); 94 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **); 95 static void vfs_freemnttab(struct vfs *); 96 static void vfs_freeopt(mntopt_t *); 97 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *); 98 static void vfs_swapopttbl(mntopts_t *, mntopts_t *); 99 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int); 100 static void vfs_createopttbl_extend(mntopts_t *, const char *, 101 const mntopts_t *); 102 static char **vfs_copycancelopt_extend(char **const, int); 103 static void vfs_freecancelopt(char **); 104 static char *getrootfs(void); 105 
static int getmacpath(dev_info_t *, void *); 106 107 struct ipmnt { 108 struct ipmnt *mip_next; 109 dev_t mip_dev; 110 struct vfs *mip_vfsp; 111 }; 112 113 static kmutex_t vfs_miplist_mutex; 114 static struct ipmnt *vfs_miplist = NULL; 115 static struct ipmnt *vfs_miplist_end = NULL; 116 117 /* 118 * VFS global data. 119 */ 120 vnode_t *rootdir; /* pointer to root inode vnode. */ 121 vnode_t *devicesdir; /* pointer to inode of devices root */ 122 123 char *server_rootpath; /* root path for diskless clients */ 124 char *server_hostname; /* hostname of diskless server */ 125 126 static struct vfs root; 127 static struct vfs devices; 128 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */ 129 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */ 130 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */ 131 /* must be power of 2! */ 132 timespec_t vfs_mnttab_ctime; /* mnttab created time */ 133 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */ 134 char *vfs_dummyfstype = "\0"; 135 struct pollhead vfs_pollhd; /* for mnttab pollers */ 136 137 /* 138 * Table for generic options recognized in the VFS layer and acted 139 * on at this level before parsing file system specific options. 140 * The nosuid option is stronger than any of the devices and setuid 141 * options, so those are canceled when nosuid is seen. 142 * 143 * All options which are added here need to be added to the 144 * list of standard options in usr/src/cmd/fs.d/fslib.c as well. 
145 */ 146 /* 147 * VFS Mount options table 148 */ 149 static char *ro_cancel[] = { MNTOPT_RW, NULL }; 150 static char *rw_cancel[] = { MNTOPT_RO, NULL }; 151 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL }; 152 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES, 153 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL }; 154 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL }; 155 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL }; 156 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL }; 157 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL }; 158 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL }; 159 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL }; 160 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL }; 161 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL }; 162 163 static const mntopt_t mntopts[] = { 164 /* 165 * option name cancel options default arg flags 166 */ 167 { MNTOPT_REMOUNT, NULL, NULL, 168 MO_NODISPLAY, (void *)0 }, 169 { MNTOPT_RO, ro_cancel, NULL, 0, 170 (void *)0 }, 171 { MNTOPT_RW, rw_cancel, NULL, 0, 172 (void *)0 }, 173 { MNTOPT_SUID, suid_cancel, NULL, 0, 174 (void *)0 }, 175 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0, 176 (void *)0 }, 177 { MNTOPT_DEVICES, devices_cancel, NULL, 0, 178 (void *)0 }, 179 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0, 180 (void *)0 }, 181 { MNTOPT_SETUID, setuid_cancel, NULL, 0, 182 (void *)0 }, 183 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0, 184 (void *)0 }, 185 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0, 186 (void *)0 }, 187 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0, 188 (void *)0 }, 189 { MNTOPT_EXEC, exec_cancel, NULL, 0, 190 (void *)0 }, 191 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0, 192 (void *)0 }, 193 }; 194 195 const mntopts_t vfs_mntopts = { 196 sizeof (mntopts) / sizeof (mntopt_t), 197 (mntopt_t *)&mntopts[0] 198 }; 199 200 /* 201 * File system operation dispatch functions. 
202 */ 203 204 int 205 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 206 { 207 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr); 208 } 209 210 int 211 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr) 212 { 213 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr); 214 } 215 216 int 217 fsop_root(vfs_t *vfsp, vnode_t **vpp) 218 { 219 refstr_t *mntpt; 220 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp); 221 /* 222 * Make sure this root has a path. With lofs, it is possible to have 223 * a NULL mountpoint. 224 */ 225 if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { 226 mntpt = vfs_getmntpoint(vfsp); 227 vn_setpath_str(*vpp, refstr_value(mntpt), 228 strlen(refstr_value(mntpt))); 229 refstr_rele(mntpt); 230 } 231 232 return (ret); 233 } 234 235 int 236 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp) 237 { 238 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp); 239 } 240 241 int 242 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr) 243 { 244 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr); 245 } 246 247 int 248 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 249 { 250 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp); 251 } 252 253 int 254 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason) 255 { 256 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason); 257 } 258 259 void 260 fsop_freefs(vfs_t *vfsp) 261 { 262 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp); 263 } 264 265 int 266 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate) 267 { 268 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate)); 269 } 270 271 int 272 fsop_sync_by_kind(int fstype, short flag, cred_t *cr) 273 { 274 ASSERT((fstype >= 0) && (fstype < nfstype)); 275 276 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype])) 277 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr); 278 else 279 return (ENOTSUP); 280 } 281 282 /* 283 * File system initialization. vfs_setfsops() must be called from a file 284 * system's init routine. 
285 */ 286 287 static int 288 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual, 289 int *unused_ops) 290 { 291 static const fs_operation_trans_def_t vfs_ops_table[] = { 292 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount), 293 fs_nosys, fs_nosys, 294 295 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount), 296 fs_nosys, fs_nosys, 297 298 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root), 299 fs_nosys, fs_nosys, 300 301 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs), 302 fs_nosys, fs_nosys, 303 304 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync), 305 (fs_generic_func_p) fs_sync, 306 (fs_generic_func_p) fs_sync, /* No errors allowed */ 307 308 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget), 309 fs_nosys, fs_nosys, 310 311 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot), 312 fs_nosys, fs_nosys, 313 314 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs), 315 (fs_generic_func_p)fs_freevfs, 316 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */ 317 318 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate), 319 (fs_generic_func_p)fs_nosys, 320 (fs_generic_func_p)fs_nosys, 321 322 NULL, 0, NULL, NULL 323 }; 324 325 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template)); 326 } 327 328 int 329 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual) 330 { 331 int error; 332 int unused_ops; 333 334 /* Verify that fstype refers to a loaded fs (and not fsid 0). */ 335 336 if ((fstype <= 0) || (fstype >= nfstype)) 337 return (EINVAL); 338 339 if (!ALLOCATED_VFSSW(&vfssw[fstype])) 340 return (EINVAL); 341 342 /* Set up the operations vector. 
*/ 343 344 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops); 345 346 if (error != 0) 347 return (error); 348 349 vfssw[fstype].vsw_flag |= VSW_INSTALLED; 350 351 if (actual != NULL) 352 *actual = &vfssw[fstype].vsw_vfsops; 353 354 #if DEBUG 355 if (unused_ops != 0) 356 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied " 357 "but not used", vfssw[fstype].vsw_name, unused_ops); 358 #endif 359 360 return (0); 361 } 362 363 int 364 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual) 365 { 366 int error; 367 int unused_ops; 368 369 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP); 370 371 error = fs_copyfsops(template, *actual, &unused_ops); 372 if (error != 0) { 373 kmem_free(*actual, sizeof (vfsops_t)); 374 *actual = NULL; 375 return (error); 376 } 377 378 return (0); 379 } 380 381 /* 382 * Free a vfsops structure created as a result of vfs_makefsops(). 383 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use 384 * vfs_freevfsops_by_type(). 385 */ 386 void 387 vfs_freevfsops(vfsops_t *vfsops) 388 { 389 kmem_free(vfsops, sizeof (vfsops_t)); 390 } 391 392 /* 393 * Since the vfsops structure is part of the vfssw table and wasn't 394 * really allocated, we're not really freeing anything. We keep 395 * the name for consistency with vfs_freevfsops(). We do, however, 396 * need to take care of a little bookkeeping. 397 * NOTE: For a vfsops structure created by vfs_setfsops(), use 398 * vfs_freevfsops_by_type(). 399 */ 400 int 401 vfs_freevfsops_by_type(int fstype) 402 { 403 404 /* Verify that fstype refers to a loaded fs (and not fsid 0). 
*/ 405 if ((fstype <= 0) || (fstype >= nfstype)) 406 return (EINVAL); 407 408 WLOCK_VFSSW(); 409 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) { 410 WUNLOCK_VFSSW(); 411 return (EINVAL); 412 } 413 414 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED; 415 WUNLOCK_VFSSW(); 416 417 return (0); 418 } 419 420 /* Support routines used to reference vfs_op */ 421 422 /* Set the operations vector for a vfs */ 423 void 424 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops) 425 { 426 vfsops_t *op; 427 428 ASSERT(vfsp != NULL); 429 ASSERT(vfsops != NULL); 430 431 op = vfsp->vfs_op; 432 membar_consumer(); 433 if (vfsp->vfs_femhead == NULL && 434 casptr(&vfsp->vfs_op, op, vfsops) == op) { 435 return; 436 } 437 fsem_setvfsops(vfsp, vfsops); 438 } 439 440 /* Retrieve the operations vector for a vfs */ 441 vfsops_t * 442 vfs_getops(vfs_t *vfsp) 443 { 444 vfsops_t *op; 445 446 ASSERT(vfsp != NULL); 447 448 op = vfsp->vfs_op; 449 membar_consumer(); 450 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) { 451 return (op); 452 } else { 453 return (fsem_getvfsops(vfsp)); 454 } 455 } 456 457 /* 458 * Returns non-zero (1) if the vfsops matches that of the vfs. 459 * Returns zero (0) if not. 460 */ 461 int 462 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops) 463 { 464 return (vfs_getops(vfsp) == vfsops); 465 } 466 467 /* 468 * Returns non-zero (1) if the file system has installed a non-default, 469 * non-error vfs_sync routine. Returns zero (0) otherwise. 470 */ 471 int 472 vfs_can_sync(vfs_t *vfsp) 473 { 474 /* vfs_sync() routine is not the default/error function */ 475 return (vfs_getops(vfsp)->vfs_sync != fs_sync); 476 } 477 478 /* 479 * Initialize a vfs structure. 
480 */ 481 void 482 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data) 483 { 484 vfsp->vfs_count = 0; 485 vfsp->vfs_next = vfsp; 486 vfsp->vfs_prev = vfsp; 487 vfsp->vfs_zone_next = vfsp; 488 vfsp->vfs_zone_prev = vfsp; 489 vfsp->vfs_flag = 0; 490 vfsp->vfs_data = (data); 491 vfsp->vfs_resource = NULL; 492 vfsp->vfs_mntpt = NULL; 493 vfsp->vfs_mntopts.mo_count = 0; 494 vfsp->vfs_mntopts.mo_list = NULL; 495 vfsp->vfs_femhead = NULL; 496 vfsp->vfs_zone = NULL; 497 vfs_setops((vfsp), (op)); 498 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL); 499 } 500 501 502 /* 503 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs, 504 * fstatvfs, and sysfs moved to common/syscall. 505 */ 506 507 /* 508 * Update every mounted file system. We call the vfs_sync operation of 509 * each file system type, passing it a NULL vfsp to indicate that all 510 * mounted file systems of that type should be updated. 511 */ 512 void 513 vfs_sync(int flag) 514 { 515 struct vfssw *vswp; 516 RLOCK_VFSSW(); 517 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 518 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 519 vfs_refvfssw(vswp); 520 RUNLOCK_VFSSW(); 521 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag, 522 CRED()); 523 vfs_unrefvfssw(vswp); 524 RLOCK_VFSSW(); 525 } 526 } 527 RUNLOCK_VFSSW(); 528 } 529 530 void 531 sync(void) 532 { 533 vfs_sync(0); 534 } 535 536 /* 537 * External routines. 538 */ 539 540 krwlock_t vfssw_lock; /* lock accesses to vfssw */ 541 542 /* 543 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(), 544 * but otherwise should be accessed only via vfs_list_lock() and 545 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list. 546 */ 547 static krwlock_t vfslist; 548 549 /* 550 * Mount devfs on /devices. 
This is done right after root is mounted
 * to provide device access support for the system.
 *
 * Failure to load devfs, to find /devices, to mount it, or to obtain its
 * root vnode panics the system.  Failure to take either of the final
 * locks only logs a note and leaves the mount off the vfs list.
 */
static void
vfs_mountdevices(void)
{
	struct vfssw *devfs_sw;
	struct vnode *mntvp;
	/* fake mounta for devfs_mount(): all null/zero except MS_SYSSPACE */
	struct mounta fake_args = {
		NULL,
		NULL,
		MS_SYSSPACE,
		NULL,
		NULL,
		0,
		NULL,
		0
	};

	/*
	 * _init devfs module to fill in the vfssw
	 */
	if (modload("fs", "devfs") == -1)
		cmn_err(CE_PANIC, "Cannot _init devfs module\n");

	/*
	 * Hold vfs.  The vfssw read lock stays held until the mount
	 * below has completed.
	 */
	RLOCK_VFSSW();
	devfs_sw = vfs_getvfsswbyname("devfs");
	VFS_INIT(&devices, &devfs_sw->vsw_vfsops, NULL);
	VFS_HOLD(&devices);

	/*
	 * Locate mount point
	 */
	if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mntvp) != 0)
		cmn_err(CE_PANIC, "Cannot find /devices\n");

	/*
	 * Perform the mount of /devices
	 */
	if (VFS_MOUNT(&devices, mntvp, &fake_args, CRED()) != 0)
		cmn_err(CE_PANIC, "Cannot mount /devices\n");

	RUNLOCK_VFSSW();

	/*
	 * Set appropriate members and add to vfs list for mnttab display
	 */
	vfs_setresource(&devices, "/devices");
	vfs_setmntpoint(&devices, "/devices");

	/*
	 * Hold the root of /devices so it won't go away
	 */
	if (VFS_ROOT(&devices, &devicesdir) != 0)
		cmn_err(CE_PANIC, "vfs_mountdevices: not devices root");
	VN_HOLD(devicesdir);

	if (vfs_lock(&devices) != 0) {
		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
		return;
	}

	if (vn_vfswlock(mntvp) == 0) {
		/* Both locks held: link the mount in, then unwind. */
		vfs_add(mntvp, &devices, 0);
		vn_vfsunlock(mntvp);
		vfs_unlock(&devices);
	} else {
		vfs_unlock(&devices);
		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
	}
}

/*
 * Mount required filesystem.  This is done right after root is mounted.
628 */ 629 static void 630 vfs_mountfs(char *module, char *spec, char *path) 631 { 632 struct vnode *mvp; 633 struct mounta mounta; 634 vfs_t *vfsp; 635 636 mounta.flags = MS_SYSSPACE | MS_DATA; 637 mounta.fstype = module; 638 mounta.spec = spec; 639 mounta.dir = path; 640 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) { 641 cmn_err(CE_WARN, "Cannot find %s\n", path); 642 return; 643 } 644 if (domount(NULL, &mounta, mvp, CRED(), &vfsp)) 645 cmn_err(CE_WARN, "Cannot mount %s\n", path); 646 else 647 VFS_RELE(vfsp); 648 VN_RELE(mvp); 649 } 650 651 /* 652 * vfs_mountroot is called by main() to mount the root filesystem. 653 */ 654 void 655 vfs_mountroot(void) 656 { 657 struct vnode *rvp = NULL; 658 char *path; 659 size_t plen; 660 661 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL); 662 rw_init(&vfslist, NULL, RW_DEFAULT, NULL); 663 664 /* 665 * Alloc the vfs hash bucket array and locks 666 */ 667 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP); 668 669 /* 670 * Call machine-dependent routine "rootconf" to choose a root 671 * file system type. 672 */ 673 if (rootconf()) 674 cmn_err(CE_PANIC, "vfs_mountroot: cannot mount root"); 675 /* 676 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir 677 * to point to it. These are used by lookuppn() so that it 678 * knows where to start from ('/' or '.'). 679 */ 680 vfs_setmntpoint(rootvfs, "/"); 681 if (VFS_ROOT(rootvfs, &rootdir)) 682 cmn_err(CE_PANIC, "vfs_mountroot: no root vnode"); 683 u.u_cdir = rootdir; 684 VN_HOLD(u.u_cdir); 685 u.u_rdir = NULL; 686 687 /* 688 * Setup the global zone's rootvp, now that it exists. 689 */ 690 global_zone->zone_rootvp = rootdir; 691 VN_HOLD(global_zone->zone_rootvp); 692 693 /* 694 * Notify the module code that it can begin using the 695 * root filesystem instead of the boot program's services. 
696 */ 697 modrootloaded = 1; 698 /* 699 * Set up mnttab information for root 700 */ 701 vfs_setresource(rootvfs, rootfs.bo_name); 702 703 /* 704 * Notify cluster software that the root filesystem is available. 705 */ 706 clboot_mountroot(); 707 708 /* 709 * Mount /devices, /system/contract, /etc/mnttab, /etc/svc/volatile, 710 * /system/object, and /proc. 711 */ 712 vfs_mountdevices(); 713 714 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT); 715 vfs_mountfs("proc", "/proc", "/proc"); 716 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); 717 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); 718 vfs_mountfs("objfs", "objfs", OBJFS_ROOT); 719 720 #ifdef __sparc 721 /* 722 * This bit of magic can go away when we convert sparc to 723 * the new boot architecture based on ramdisk. 724 * 725 * Booting off a mirrored root volume: 726 * At this point, we have booted and mounted root on a 727 * single component of the mirror. Complete the boot 728 * by configuring SVM and converting the root to the 729 * dev_t of the mirrored root device. This dev_t conversion 730 * only works because the underlying device doesn't change. 731 */ 732 if (root_is_svm) { 733 if (svm_rootconf()) { 734 cmn_err(CE_PANIC, "vfs_mountroot: cannot remount root"); 735 } 736 737 /* 738 * mnttab should reflect the new root device 739 */ 740 vfs_lock_wait(rootvfs); 741 vfs_setresource(rootvfs, rootfs.bo_name); 742 vfs_unlock(rootvfs); 743 } 744 #endif /* __sparc */ 745 746 /* 747 * Look up the root device via devfs so that a dv_node is 748 * created for it. The vnode is never VN_RELE()ed. 749 * We allocate more than MAXPATHLEN so that the 750 * buffer passed to i_ddi_prompath_to_devfspath() is 751 * exactly MAXPATHLEN (the function expects a buffer 752 * of that length). 
753 */ 754 plen = strlen("/devices"); 755 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); 756 (void) strcpy(path, "/devices"); 757 758 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) 759 != DDI_SUCCESS || 760 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { 761 762 /* NUL terminate in case "path" has garbage */ 763 path[plen + MAXPATHLEN - 1] = '\0'; 764 #ifdef DEBUG 765 cmn_err(CE_WARN, "!Cannot lookup root device: %s", path); 766 #endif 767 } 768 kmem_free(path, plen + MAXPATHLEN); 769 } 770 771 /* 772 * Common mount code. Called from the system call entry point, from autofs, 773 * and from pxfs. 774 * 775 * Takes the effective file system type, mount arguments, the mount point 776 * vnode, flags specifying whether the mount is a remount and whether it 777 * should be entered into the vfs list, and credentials. Fills in its vfspp 778 * parameter with the mounted file system instance's vfs. 779 * 780 * Note that the effective file system type is specified as a string. It may 781 * be null, in which case it's determined from the mount arguments, and may 782 * differ from the type specified in the mount arguments; this is a hook to 783 * allow interposition when instantiating file system instances. 784 * 785 * The caller is responsible for releasing its own hold on the mount point 786 * vp (this routine does its own hold when necessary). 787 * Also note that for remounts, the mount point vp should be the vnode for 788 * the root of the file system rather than the vnode that the file system 789 * is mounted on top of. 
790 */ 791 int 792 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, 793 struct vfs **vfspp) 794 { 795 struct vfssw *vswp; 796 vfsops_t *vfsops; 797 struct vfs *vfsp; 798 struct vnode *bvp; 799 dev_t bdev = 0; 800 mntopts_t mnt_mntopts; 801 int error = 0; 802 int copyout_error = 0; 803 int ovflags; 804 char *opts = uap->optptr; 805 char *inargs = opts; 806 int optlen = uap->optlen; 807 int remount; 808 int rdonly; 809 int nbmand = 0; 810 int delmip = 0; 811 int addmip = 0; 812 int splice = ((uap->flags & MS_NOSPLICE) == 0); 813 int fromspace = (uap->flags & MS_SYSSPACE) ? 814 UIO_SYSSPACE : UIO_USERSPACE; 815 char *resource = NULL, *mountpt = NULL; 816 refstr_t *oldresource, *oldmntpt; 817 struct pathname pn, rpn; 818 819 /* 820 * The v_flag value for the mount point vp is permanently set 821 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine 822 * for mount point locking. 823 */ 824 mutex_enter(&vp->v_lock); 825 vp->v_flag |= VVFSLOCK; 826 mutex_exit(&vp->v_lock); 827 828 mnt_mntopts.mo_count = 0; 829 /* 830 * Find the ops vector to use to invoke the file system-specific mount 831 * method. If the fsname argument is non-NULL, use it directly. 832 * Otherwise, dig the file system type information out of the mount 833 * arguments. 834 * 835 * A side effect is to hold the vfssw entry. 836 * 837 * Mount arguments can be specified in several ways, which are 838 * distinguished by flag bit settings. The preferred way is to set 839 * MS_OPTIONSTR, indicating an 8 argument mount with the file system 840 * type supplied as a character string and the last two arguments 841 * being a pointer to a character buffer and the size of the buffer. 842 * On entry, the buffer holds a null terminated list of options; on 843 * return, the string is the list of options the file system 844 * recognized. If MS_DATA is set arguments five and six point to a 845 * block of binary data which the file system interprets. 
846 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA 847 * consistently with these conventions. To handle them, we check to 848 * see whether the pointer to the file system name has a numeric value 849 * less than 256. If so, we treat it as an index. 850 */ 851 if (fsname != NULL) { 852 if ((vswp = vfs_getvfssw(fsname)) == NULL) { 853 return (EINVAL); 854 } 855 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) { 856 size_t n; 857 uint_t fstype; 858 char name[FSTYPSZ]; 859 860 if ((fstype = (uintptr_t)uap->fstype) < 256) { 861 RLOCK_VFSSW(); 862 if (fstype == 0 || fstype >= nfstype || 863 !ALLOCATED_VFSSW(&vfssw[fstype])) { 864 RUNLOCK_VFSSW(); 865 return (EINVAL); 866 } 867 (void) strcpy(name, vfssw[fstype].vsw_name); 868 RUNLOCK_VFSSW(); 869 if ((vswp = vfs_getvfssw(name)) == NULL) 870 return (EINVAL); 871 } else { 872 /* 873 * Handle either kernel or user address space. 874 */ 875 if (uap->flags & MS_SYSSPACE) { 876 error = copystr(uap->fstype, name, 877 FSTYPSZ, &n); 878 } else { 879 error = copyinstr(uap->fstype, name, 880 FSTYPSZ, &n); 881 } 882 if (error) { 883 if (error == ENAMETOOLONG) 884 return (EINVAL); 885 return (error); 886 } 887 if ((vswp = vfs_getvfssw(name)) == NULL) 888 return (EINVAL); 889 } 890 } else { 891 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL) 892 return (EINVAL); 893 } 894 if (!VFS_INSTALLED(vswp)) 895 return (EINVAL); 896 vfsops = &vswp->vsw_vfsops; 897 898 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts); 899 /* 900 * Fetch mount options and parse them for generic vfs options 901 */ 902 if (uap->flags & MS_OPTIONSTR) { 903 /* 904 * Limit the buffer size 905 */ 906 if (optlen < 0 || optlen > MAX_MNTOPT_STR) { 907 error = EINVAL; 908 goto errout; 909 } 910 if ((uap->flags & MS_SYSSPACE) == 0) { 911 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 912 inargs[0] = '\0'; 913 if (optlen) { 914 error = copyinstr(opts, inargs, (size_t)optlen, 915 NULL); 916 if (error) { 917 goto errout; 918 } 
919 } 920 } 921 vfs_parsemntopts(&mnt_mntopts, inargs, 0); 922 } 923 /* 924 * Flag bits override the options string. 925 */ 926 if (uap->flags & MS_REMOUNT) 927 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0); 928 if (uap->flags & MS_RDONLY) 929 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0); 930 if (uap->flags & MS_NOSUID) 931 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 932 933 /* 934 * Check if this is a remount; must be set in the option string and 935 * the file system must support a remount option. 936 */ 937 if (remount = vfs_optionisset_nolock(&mnt_mntopts, 938 MNTOPT_REMOUNT, NULL)) { 939 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) { 940 error = ENOTSUP; 941 goto errout; 942 } 943 uap->flags |= MS_REMOUNT; 944 } 945 946 /* 947 * uap->flags and vfs_optionisset() should agree. 948 */ 949 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) { 950 uap->flags |= MS_RDONLY; 951 } 952 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) { 953 uap->flags |= MS_NOSUID; 954 } 955 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL); 956 ASSERT(splice || !remount); 957 /* 958 * If we are splicing the fs into the namespace, 959 * perform mount point checks. 960 * 961 * We want to resolve the path for the mount point to eliminate 962 * '.' and ".." and symlinks in mount points; we can't do the 963 * same for the resource string, since it would turn 964 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do 965 * this before grabbing vn_vfswlock(), because otherwise we 966 * would deadlock with lookuppn(). 967 */ 968 if (splice) { 969 ASSERT(vp->v_count > 0); 970 971 /* 972 * Pick up mount point and device from appropriate space. 973 */ 974 if (pn_get(uap->spec, fromspace, &pn) == 0) { 975 resource = kmem_alloc(pn.pn_pathlen + 1, 976 KM_SLEEP); 977 (void) strcpy(resource, pn.pn_path); 978 pn_free(&pn); 979 } 980 /* 981 * Do a lookupname prior to taking the 982 * writelock. 
Mark this as completed if 983 * successful for later cleanup and addition to 984 * the mount in progress table. 985 */ 986 if ((uap->flags & MS_GLOBAL) == 0 && 987 lookupname(uap->spec, fromspace, 988 FOLLOW, NULL, &bvp) == 0) { 989 addmip = 1; 990 } 991 992 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) { 993 pathname_t *pnp; 994 995 if (*pn.pn_path != '/') { 996 error = EINVAL; 997 pn_free(&pn); 998 goto errout; 999 } 1000 pn_alloc(&rpn); 1001 /* 1002 * Kludge to prevent autofs from deadlocking with 1003 * itself when it calls domount(). 1004 * 1005 * If autofs is calling, it is because it is doing 1006 * (autofs) mounts in the process of an NFS mount. A 1007 * lookuppn() here would cause us to block waiting for 1008 * said NFS mount to complete, which can't since this 1009 * is the thread that was supposed to doing it. 1010 */ 1011 if (fromspace == UIO_USERSPACE) { 1012 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL, 1013 NULL)) == 0) { 1014 pnp = &rpn; 1015 } else { 1016 /* 1017 * The file disappeared or otherwise 1018 * became inaccessible since we opened 1019 * it; might as well fail the mount 1020 * since the mount point is no longer 1021 * accessible. 1022 */ 1023 pn_free(&rpn); 1024 pn_free(&pn); 1025 goto errout; 1026 } 1027 } else { 1028 pnp = &pn; 1029 } 1030 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP); 1031 (void) strcpy(mountpt, pnp->pn_path); 1032 1033 /* 1034 * If the addition of the zone's rootpath 1035 * would push us over a total path length 1036 * of MAXPATHLEN, we fail the mount with 1037 * ENAMETOOLONG, which is what we would have 1038 * gotten if we were trying to perform the same 1039 * mount in the global zone. 1040 * 1041 * strlen() doesn't count the trailing 1042 * '\0', but zone_rootpathlen counts both a 1043 * trailing '/' and the terminating '\0'. 
1044 */ 1045 if ((curproc->p_zone->zone_rootpathlen - 1 + 1046 strlen(mountpt)) > MAXPATHLEN || 1047 (resource != NULL && 1048 (curproc->p_zone->zone_rootpathlen - 1 + 1049 strlen(resource)) > MAXPATHLEN)) { 1050 error = ENAMETOOLONG; 1051 } 1052 1053 pn_free(&rpn); 1054 pn_free(&pn); 1055 } 1056 1057 if (error) 1058 goto errout; 1059 1060 /* 1061 * Prevent path name resolution from proceeding past 1062 * the mount point. 1063 */ 1064 if (vn_vfswlock(vp) != 0) { 1065 error = EBUSY; 1066 goto errout; 1067 } 1068 1069 /* 1070 * Verify that it's legitimate to establish a mount on 1071 * the prospective mount point. 1072 */ 1073 if (vn_mountedvfs(vp) != NULL) { 1074 /* 1075 * The mount point lock was obtained after some 1076 * other thread raced through and established a mount. 1077 */ 1078 vn_vfsunlock(vp); 1079 error = EBUSY; 1080 goto errout; 1081 } 1082 if (vp->v_flag & VNOMOUNT) { 1083 vn_vfsunlock(vp); 1084 error = EINVAL; 1085 goto errout; 1086 } 1087 } 1088 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) { 1089 uap->dataptr = NULL; 1090 uap->datalen = 0; 1091 } 1092 1093 /* 1094 * If this is a remount, we don't want to create a new VFS. 1095 * Instead, we pass the existing one with a remount flag. 1096 */ 1097 if (remount) { 1098 /* 1099 * Confirm that the mount point is the root vnode of the 1100 * file system that is being remounted. 1101 * This can happen if the user specifies a different 1102 * mount point directory pathname in the (re)mount command. 1103 * 1104 * Code below can only be reached if splice is true, so it's 1105 * safe to do vn_vfsunlock() here. 1106 */ 1107 if ((vp->v_flag & VROOT) == 0) { 1108 vn_vfsunlock(vp); 1109 error = ENOENT; 1110 goto errout; 1111 } 1112 /* 1113 * Disallow making file systems read-only unless file system 1114 * explicitly allows it in its vfssw. Ignore other flags. 
1115 */ 1116 if (rdonly && vn_is_readonly(vp) == 0 && 1117 (vswp->vsw_flag & VSW_CANRWRO) == 0) { 1118 vn_vfsunlock(vp); 1119 error = EINVAL; 1120 goto errout; 1121 } 1122 /* 1123 * Changing the NBMAND setting on remounts is permitted 1124 * but logged since it can lead to unexpected behavior. 1125 * We also counsel against using it for / and /usr. 1126 */ 1127 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) || 1128 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) { 1129 cmn_err(CE_WARN, "domount: nbmand turned %s via " 1130 "remounting %s", nbmand ? "on" : "off", 1131 refstr_value(vp->v_vfsp->vfs_mntpt)); 1132 } 1133 vfsp = vp->v_vfsp; 1134 ovflags = vfsp->vfs_flag; 1135 vfsp->vfs_flag |= VFS_REMOUNT; 1136 vfsp->vfs_flag &= ~VFS_RDONLY; 1137 } else { 1138 vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP); 1139 VFS_INIT(vfsp, vfsops, NULL); 1140 } 1141 1142 VFS_HOLD(vfsp); 1143 1144 /* 1145 * The vfs_reflock is not used anymore the code below explicitly 1146 * holds it preventing others accesing it directly. 1147 */ 1148 if ((sema_tryp(&vfsp->vfs_reflock) == 0) && 1149 !(vfsp->vfs_flag & VFS_REMOUNT)) 1150 cmn_err(CE_WARN, 1151 "mount type %s couldn't get vfs_reflock\n", vswp->vsw_name); 1152 1153 /* 1154 * Lock the vfs. If this is a remount we want to avoid spurious umount 1155 * failures that happen as a side-effect of fsflush() and other mount 1156 * and unmount operations that might be going on simultaneously and 1157 * may have locked the vfs currently. To not return EBUSY immediately 1158 * here we use vfs_lock_wait() instead vfs_lock() for the remount case. 1159 */ 1160 if (!remount) { 1161 if (error = vfs_lock(vfsp)) { 1162 vfsp->vfs_flag = ovflags; 1163 if (splice) 1164 vn_vfsunlock(vp); 1165 kmem_free(vfsp, sizeof (struct vfs)); 1166 goto errout; 1167 } 1168 } else { 1169 vfs_lock_wait(vfsp); 1170 } 1171 1172 /* 1173 * Add device to mount in progress table, global mounts require special 1174 * handling. 
It is possible that we have already done the lookupname 1175 * on a spliced, non-global fs. If so, we don't want to do it again 1176 * since we cannot do a lookupname after taking the 1177 * wlock above. This case is for a non-spliced, non-global filesystem. 1178 */ 1179 if (!addmip) { 1180 if ((uap->flags & MS_GLOBAL) == 0 && 1181 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) { 1182 addmip = 1; 1183 } 1184 } 1185 1186 if (addmip) { 1187 bdev = bvp->v_rdev; 1188 VN_RELE(bvp); 1189 vfs_addmip(bdev, vfsp); 1190 addmip = 0; 1191 delmip = 1; 1192 } 1193 /* 1194 * Invalidate cached entry for the mount point. 1195 */ 1196 if (splice) 1197 dnlc_purge_vp(vp); 1198 1199 /* 1200 * If have an option string but the filesystem doesn't supply a 1201 * prototype options table, create a table with the global 1202 * options and sufficient room to accept all the options in the 1203 * string. Then parse the passed in option string 1204 * accepting all the options in the string. This gives us an 1205 * option table with all the proper cancel properties for the 1206 * global options. 1207 * 1208 * Filesystems that supply a prototype options table are handled 1209 * earlier in this function. 1210 */ 1211 if (uap->flags & MS_OPTIONSTR) { 1212 if (!(vswp->vsw_flag & VSW_HASPROTO)) { 1213 mntopts_t tmp_mntopts; 1214 1215 tmp_mntopts.mo_count = 0; 1216 vfs_createopttbl_extend(&tmp_mntopts, inargs, 1217 &mnt_mntopts); 1218 vfs_parsemntopts(&tmp_mntopts, inargs, 1); 1219 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts); 1220 vfs_freeopttbl(&tmp_mntopts); 1221 } 1222 } 1223 1224 /* 1225 * Serialize with zone creations. 1226 */ 1227 mount_in_progress(); 1228 /* 1229 * Instantiate (or reinstantiate) the file system. If appropriate, 1230 * splice it into the file system name space. 
1231 * 1232 * We want VFS_MOUNT() to be able to override the vfs_resource 1233 * string if necessary (ie, mntfs), and also for a remount to 1234 * change the same (necessary when remounting '/' during boot). 1235 * So we set up vfs_mntpt and vfs_resource to what we think they 1236 * should be, then hand off control to VFS_MOUNT() which can 1237 * override this. 1238 * 1239 * For safety's sake, when changing vfs_resource or vfs_mntpt of 1240 * a vfs which is on the vfs list (i.e. during a remount), we must 1241 * never set those fields to NULL. Several bits of code make 1242 * assumptions that the fields are always valid. 1243 */ 1244 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1245 if (remount) { 1246 if ((oldresource = vfsp->vfs_resource) != NULL) 1247 refstr_hold(oldresource); 1248 if ((oldmntpt = vfsp->vfs_mntpt) != NULL) 1249 refstr_hold(oldmntpt); 1250 } 1251 vfs_setresource(vfsp, resource); 1252 vfs_setmntpoint(vfsp, mountpt); 1253 1254 error = VFS_MOUNT(vfsp, vp, uap, credp); 1255 1256 if (uap->flags & MS_RDONLY) 1257 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1258 if (uap->flags & MS_NOSUID) 1259 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0); 1260 if (uap->flags & MS_GLOBAL) 1261 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0); 1262 1263 if (error) { 1264 if (remount) { 1265 /* put back pre-remount options */ 1266 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1267 vfs_setmntpoint(vfsp, refstr_value(oldmntpt)); 1268 if (oldmntpt) 1269 refstr_rele(oldmntpt); 1270 vfs_setresource(vfsp, refstr_value(oldresource)); 1271 if (oldresource) 1272 refstr_rele(oldresource); 1273 vfsp->vfs_flag = ovflags; 1274 vfs_unlock(vfsp); 1275 VFS_RELE(vfsp); 1276 } else { 1277 vfs_unlock(vfsp); 1278 vfs_freemnttab(vfsp); 1279 kmem_free(vfsp, sizeof (struct vfs)); 1280 } 1281 } else { 1282 /* 1283 * Set the mount time to now 1284 */ 1285 vfsp->vfs_mtime = ddi_get_time(); 1286 if (remount) { 1287 vfsp->vfs_flag &= ~VFS_REMOUNT; 1288 if (oldresource) 1289 refstr_rele(oldresource); 
1290 if (oldmntpt) 1291 refstr_rele(oldmntpt); 1292 } else if (splice) { 1293 /* 1294 * Link vfsp into the name space at the mount 1295 * point. Vfs_add() is responsible for 1296 * holding the mount point which will be 1297 * released when vfs_remove() is called. 1298 */ 1299 vfs_add(vp, vfsp, uap->flags); 1300 } else { 1301 /* 1302 * Hold the reference to file system which is 1303 * not linked into the name space. 1304 */ 1305 vfsp->vfs_zone = NULL; 1306 VFS_HOLD(vfsp); 1307 vfsp->vfs_vnodecovered = NULL; 1308 } 1309 /* 1310 * Set flags for global options encountered 1311 */ 1312 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 1313 vfsp->vfs_flag |= VFS_RDONLY; 1314 else 1315 vfsp->vfs_flag &= ~VFS_RDONLY; 1316 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 1317 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES); 1318 } else { 1319 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 1320 vfsp->vfs_flag |= VFS_NODEVICES; 1321 else 1322 vfsp->vfs_flag &= ~VFS_NODEVICES; 1323 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 1324 vfsp->vfs_flag |= VFS_NOSETUID; 1325 else 1326 vfsp->vfs_flag &= ~VFS_NOSETUID; 1327 } 1328 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 1329 vfsp->vfs_flag |= VFS_NBMAND; 1330 else 1331 vfsp->vfs_flag &= ~VFS_NBMAND; 1332 1333 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 1334 vfsp->vfs_flag |= VFS_XATTR; 1335 else 1336 vfsp->vfs_flag &= ~VFS_XATTR; 1337 1338 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 1339 vfsp->vfs_flag |= VFS_NOEXEC; 1340 else 1341 vfsp->vfs_flag &= ~VFS_NOEXEC; 1342 1343 /* 1344 * Now construct the output option string of options 1345 * we recognized. 
1346 */ 1347 if (uap->flags & MS_OPTIONSTR) { 1348 vfs_list_read_lock(); 1349 copyout_error = vfs_buildoptionstr( 1350 &vfsp->vfs_mntopts, inargs, optlen); 1351 vfs_list_unlock(); 1352 if (copyout_error == 0 && 1353 (uap->flags & MS_SYSSPACE) == 0) { 1354 copyout_error = copyoutstr(inargs, opts, 1355 optlen, NULL); 1356 } 1357 } 1358 vfs_unlock(vfsp); 1359 } 1360 mount_completed(); 1361 if (splice) 1362 vn_vfsunlock(vp); 1363 1364 /* 1365 * Return vfsp to caller. 1366 */ 1367 if ((error == 0) && (copyout_error == 0)) { 1368 *vfspp = vfsp; 1369 } 1370 errout: 1371 vfs_freeopttbl(&mnt_mntopts); 1372 if (resource != NULL) 1373 kmem_free(resource, strlen(resource) + 1); 1374 if (mountpt != NULL) 1375 kmem_free(mountpt, strlen(mountpt) + 1); 1376 /* 1377 * It is possible we errored prior to adding to mount in progress 1378 * table. Must free vnode we acquired with successful lookupname. 1379 */ 1380 if (addmip) 1381 VN_RELE(bvp); 1382 if (delmip) 1383 vfs_delmip(vfsp); 1384 ASSERT(vswp != NULL); 1385 vfs_unrefvfssw(vswp); 1386 if (inargs != opts) 1387 kmem_free(inargs, MAX_MNTOPT_STR); 1388 if (copyout_error) { 1389 VFS_RELE(vfsp); 1390 error = copyout_error; 1391 } 1392 return (error); 1393 } 1394 1395 static void 1396 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath) 1397 { 1398 size_t len; 1399 refstr_t *ref; 1400 zone_t *zone = curproc->p_zone; 1401 char *sp; 1402 int have_list_lock = 0; 1403 1404 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp)); 1405 1406 /* 1407 * New path must be less than MAXPATHLEN because mntfs 1408 * will only display up to MAXPATHLEN bytes. This is currently 1409 * safe, because domount() uses pn_get(), and other callers 1410 * similarly cap the size to fewer than MAXPATHLEN bytes. 
1411 */ 1412 1413 ASSERT(strlen(newpath) < MAXPATHLEN); 1414 1415 /* mntfs requires consistency while vfs list lock is held */ 1416 1417 if (VFS_ON_LIST(vfsp)) { 1418 have_list_lock = 1; 1419 vfs_list_lock(); 1420 } 1421 1422 if (*refp != NULL) 1423 refstr_rele(*refp); 1424 1425 /* Do we need to modify the path? */ 1426 1427 if (zone == global_zone || *newpath != '/') { 1428 ref = refstr_alloc(newpath); 1429 goto out; 1430 } 1431 1432 /* 1433 * Truncate the trailing '/' in the zoneroot, and merge 1434 * in the zone's rootpath with the "newpath" (resource 1435 * or mountpoint) passed in. 1436 * 1437 * The size of the required buffer is thus the size of 1438 * the buffer required for the passed-in newpath 1439 * (strlen(newpath) + 1), plus the size of the buffer 1440 * required to hold zone_rootpath (zone_rootpathlen) 1441 * minus one for one of the now-superfluous NUL 1442 * terminations, minus one for the trailing '/'. 1443 * 1444 * That gives us: 1445 * 1446 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1 1447 * 1448 * Which is what we have below. 1449 */ 1450 1451 len = strlen(newpath) + zone->zone_rootpathlen - 1; 1452 sp = kmem_alloc(len, KM_SLEEP); 1453 1454 /* 1455 * Copy everything including the trailing slash, which 1456 * we then overwrite with the NUL character. 1457 */ 1458 1459 (void) strcpy(sp, zone->zone_rootpath); 1460 sp[zone->zone_rootpathlen - 2] = '\0'; 1461 (void) strcat(sp, newpath); 1462 1463 ref = refstr_alloc(sp); 1464 kmem_free(sp, len); 1465 out: 1466 *refp = ref; 1467 1468 if (have_list_lock) { 1469 vfs_mnttab_modtimeupd(); 1470 vfs_list_unlock(); 1471 } 1472 } 1473 1474 /* 1475 * Record a mounted resource name in a vfs structure. 1476 * If vfsp is already mounted, caller must hold the vfs lock. 
1477 */ 1478 void 1479 vfs_setresource(struct vfs *vfsp, const char *resource) 1480 { 1481 if (resource == NULL || resource[0] == '\0') 1482 resource = VFS_NORESOURCE; 1483 vfs_setpath(vfsp, &vfsp->vfs_resource, resource); 1484 } 1485 1486 /* 1487 * Record a mount point name in a vfs structure. 1488 * If vfsp is already mounted, caller must hold the vfs lock. 1489 */ 1490 void 1491 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt) 1492 { 1493 if (mntpt == NULL || mntpt[0] == '\0') 1494 mntpt = VFS_NOMNTPT; 1495 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt); 1496 } 1497 1498 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */ 1499 1500 refstr_t * 1501 vfs_getresource(const struct vfs *vfsp) 1502 { 1503 refstr_t *resource; 1504 1505 vfs_list_read_lock(); 1506 resource = vfsp->vfs_resource; 1507 refstr_hold(resource); 1508 vfs_list_unlock(); 1509 1510 return (resource); 1511 } 1512 1513 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */ 1514 1515 refstr_t * 1516 vfs_getmntpoint(const struct vfs *vfsp) 1517 { 1518 refstr_t *mntpt; 1519 1520 vfs_list_read_lock(); 1521 mntpt = vfsp->vfs_mntpt; 1522 refstr_hold(mntpt); 1523 vfs_list_unlock(); 1524 1525 return (mntpt); 1526 } 1527 1528 /* 1529 * Create an empty options table with enough empty slots to hold all 1530 * The options in the options string passed as an argument. 1531 * Potentially prepend another options table. 1532 * 1533 * Note: caller is responsible for locking the vfs list, if needed, 1534 * to protect mops. 
1535 */ 1536 static void 1537 vfs_createopttbl_extend(mntopts_t *mops, const char *opts, 1538 const mntopts_t *mtmpl) 1539 { 1540 const char *s = opts; 1541 uint_t count; 1542 1543 if (opts == NULL || *opts == '\0') { 1544 count = 0; 1545 } else { 1546 count = 1; 1547 1548 /* 1549 * Count number of options in the string 1550 */ 1551 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) { 1552 count++; 1553 s++; 1554 } 1555 } 1556 vfs_copyopttbl_extend(mtmpl, mops, count); 1557 } 1558 1559 /* 1560 * Create an empty options table with enough empty slots to hold all 1561 * The options in the options string passed as an argument. 1562 * 1563 * This function is *not* for general use by filesystems. 1564 * 1565 * Note: caller is responsible for locking the vfs list, if needed, 1566 * to protect mops. 1567 */ 1568 void 1569 vfs_createopttbl(mntopts_t *mops, const char *opts) 1570 { 1571 vfs_createopttbl_extend(mops, opts, NULL); 1572 } 1573 1574 1575 /* 1576 * Swap two mount options tables 1577 */ 1578 static void 1579 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2) 1580 { 1581 uint_t tmpcnt; 1582 mntopt_t *tmplist; 1583 1584 tmpcnt = optbl2->mo_count; 1585 tmplist = optbl2->mo_list; 1586 optbl2->mo_count = optbl1->mo_count; 1587 optbl2->mo_list = optbl1->mo_list; 1588 optbl1->mo_count = tmpcnt; 1589 optbl1->mo_list = tmplist; 1590 } 1591 1592 static void 1593 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2) 1594 { 1595 vfs_list_lock(); 1596 vfs_swapopttbl_nolock(optbl1, optbl2); 1597 vfs_mnttab_modtimeupd(); 1598 vfs_list_unlock(); 1599 } 1600 1601 static char ** 1602 vfs_copycancelopt_extend(char **const moc, int extend) 1603 { 1604 int i = 0; 1605 int j; 1606 char **result; 1607 1608 if (moc != NULL) { 1609 for (; moc[i] != NULL; i++) 1610 /* count number of options to cancel */; 1611 } 1612 1613 if (i + extend == 0) 1614 return (NULL); 1615 1616 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP); 1617 1618 for (j = 0; j < i; j++) { 
1619 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP); 1620 (void) strcpy(result[j], moc[j]); 1621 } 1622 for (; j <= i + extend; j++) 1623 result[j] = NULL; 1624 1625 return (result); 1626 } 1627 1628 static void 1629 vfs_copyopt(const mntopt_t *s, mntopt_t *d) 1630 { 1631 char *sp, *dp; 1632 1633 d->mo_flags = s->mo_flags; 1634 d->mo_data = s->mo_data; 1635 sp = s->mo_name; 1636 if (sp != NULL) { 1637 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1638 (void) strcpy(dp, sp); 1639 d->mo_name = dp; 1640 } else { 1641 d->mo_name = NULL; /* should never happen */ 1642 } 1643 1644 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0); 1645 1646 sp = s->mo_arg; 1647 if (sp != NULL) { 1648 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1649 (void) strcpy(dp, sp); 1650 d->mo_arg = dp; 1651 } else { 1652 d->mo_arg = NULL; 1653 } 1654 } 1655 1656 /* 1657 * Copy a mount options table, possibly allocating some spare 1658 * slots at the end. It is permissible to copy_extend the NULL table. 1659 */ 1660 static void 1661 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra) 1662 { 1663 uint_t i, count; 1664 mntopt_t *motbl; 1665 1666 /* 1667 * Clear out any existing stuff in the options table being initialized 1668 */ 1669 vfs_freeopttbl(dmo); 1670 count = (smo == NULL) ? 0 : smo->mo_count; 1671 if ((count + extra) == 0) /* nothing to do */ 1672 return; 1673 dmo->mo_count = count + extra; 1674 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP); 1675 dmo->mo_list = motbl; 1676 for (i = 0; i < count; i++) { 1677 vfs_copyopt(&smo->mo_list[i], &motbl[i]); 1678 } 1679 for (i = count; i < count + extra; i++) { 1680 motbl[i].mo_flags = MO_EMPTY; 1681 } 1682 } 1683 1684 /* 1685 * Copy a mount options table. 1686 * 1687 * This function is *not* for general use by filesystems. 1688 * 1689 * Note: caller is responsible for locking the vfs list, if needed, 1690 * to protect smo and dmo. 
1691 */ 1692 void 1693 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo) 1694 { 1695 vfs_copyopttbl_extend(smo, dmo, 0); 1696 } 1697 1698 static char ** 1699 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2) 1700 { 1701 int c1 = 0; 1702 int c2 = 0; 1703 char **result; 1704 char **sp1, **sp2, **dp; 1705 1706 /* 1707 * First we count both lists of cancel options. 1708 * If either is NULL or has no elements, we return a copy of 1709 * the other. 1710 */ 1711 if (mop1->mo_cancel != NULL) { 1712 for (; mop1->mo_cancel[c1] != NULL; c1++) 1713 /* count cancel options in mop1 */; 1714 } 1715 1716 if (c1 == 0) 1717 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0)); 1718 1719 if (mop2->mo_cancel != NULL) { 1720 for (; mop2->mo_cancel[c2] != NULL; c2++) 1721 /* count cancel options in mop2 */; 1722 } 1723 1724 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2); 1725 1726 if (c2 == 0) 1727 return (result); 1728 1729 /* 1730 * When we get here, we've got two sets of cancel options; 1731 * we need to merge the two sets. We know that the result 1732 * array has "c1+c2+1" entries and in the end we might shrink 1733 * it. 1734 * Result now has a copy of the c1 entries from mop1; we'll 1735 * now lookup all the entries of mop2 in mop1 and copy it if 1736 * it is unique. 1737 * This operation is O(n^2) but it's only called once per 1738 * filesystem per duplicate option. This is a situation 1739 * which doesn't arise with the filesystems in ON and 1740 * n is generally 1. 1741 */ 1742 1743 dp = &result[c1]; 1744 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) { 1745 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) { 1746 if (strcmp(*sp1, *sp2) == 0) 1747 break; 1748 } 1749 if (*sp1 == NULL) { 1750 /* 1751 * Option *sp2 not found in mop1, so copy it. 1752 * The calls to vfs_copycancelopt_extend() 1753 * guarantee that there's enough room. 
1754 */ 1755 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP); 1756 (void) strcpy(*dp++, *sp2); 1757 } 1758 } 1759 if (dp != &result[c1+c2]) { 1760 size_t bytes = (dp - result + 1) * sizeof (char *); 1761 char **nres = kmem_alloc(bytes, KM_SLEEP); 1762 1763 bcopy(result, nres, bytes); 1764 kmem_free(result, (c1 + c2 + 1) * sizeof (char *)); 1765 result = nres; 1766 } 1767 return (result); 1768 } 1769 1770 /* 1771 * Merge two mount option tables (outer and inner) into one. This is very 1772 * similar to "merging" global variables and automatic variables in C. 1773 * 1774 * This isn't (and doesn't have to be) fast. 1775 * 1776 * This function is *not* for general use by filesystems. 1777 * 1778 * Note: caller is responsible for locking the vfs list, if needed, 1779 * to protect omo, imo & dmo. 1780 */ 1781 void 1782 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo) 1783 { 1784 uint_t i, count; 1785 mntopt_t *mop, *motbl; 1786 uint_t freeidx; 1787 1788 /* 1789 * First determine how much space we need to allocate. 
1790 */ 1791 count = omo->mo_count; 1792 for (i = 0; i < imo->mo_count; i++) { 1793 if (imo->mo_list[i].mo_flags & MO_EMPTY) 1794 continue; 1795 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL) 1796 count++; 1797 } 1798 ASSERT(count >= omo->mo_count && 1799 count <= omo->mo_count + imo->mo_count); 1800 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP); 1801 for (i = 0; i < omo->mo_count; i++) 1802 vfs_copyopt(&omo->mo_list[i], &motbl[i]); 1803 freeidx = omo->mo_count; 1804 for (i = 0; i < imo->mo_count; i++) { 1805 if (imo->mo_list[i].mo_flags & MO_EMPTY) 1806 continue; 1807 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) { 1808 char **newcanp; 1809 uint_t index = mop - omo->mo_list; 1810 1811 newcanp = vfs_mergecancelopts(mop, &motbl[index]); 1812 1813 vfs_freeopt(&motbl[index]); 1814 vfs_copyopt(&imo->mo_list[i], &motbl[index]); 1815 1816 vfs_freecancelopt(motbl[index].mo_cancel); 1817 motbl[index].mo_cancel = newcanp; 1818 } else { 1819 /* 1820 * If it's a new option, just copy it over to the first 1821 * free location. 1822 */ 1823 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]); 1824 } 1825 } 1826 dmo->mo_count = count; 1827 dmo->mo_list = motbl; 1828 } 1829 1830 /* 1831 * Functions to set and clear mount options in a mount options table. 1832 */ 1833 1834 /* 1835 * Clear a mount option, if it exists. 1836 * 1837 * The update_mnttab arg indicates whether mops is part of a vfs that is on 1838 * the vfs list. 
1839 */ 1840 static void 1841 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab) 1842 { 1843 struct mntopt *mop; 1844 uint_t i, count; 1845 1846 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 1847 1848 count = mops->mo_count; 1849 for (i = 0; i < count; i++) { 1850 mop = &mops->mo_list[i]; 1851 1852 if (mop->mo_flags & MO_EMPTY) 1853 continue; 1854 if (strcmp(opt, mop->mo_name)) 1855 continue; 1856 mop->mo_flags &= ~MO_SET; 1857 if (mop->mo_arg != NULL) { 1858 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 1859 } 1860 mop->mo_arg = NULL; 1861 if (update_mnttab) 1862 vfs_mnttab_modtimeupd(); 1863 break; 1864 } 1865 } 1866 1867 void 1868 vfs_clearmntopt(struct vfs *vfsp, const char *opt) 1869 { 1870 int gotlock = 0; 1871 1872 if (VFS_ON_LIST(vfsp)) { 1873 gotlock = 1; 1874 vfs_list_lock(); 1875 } 1876 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock); 1877 if (gotlock) 1878 vfs_list_unlock(); 1879 } 1880 1881 1882 /* 1883 * Set a mount option on. If it's not found in the table, it's silently 1884 * ignored. If the option has MO_IGNORE set, it is still set unless the 1885 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag 1886 * bits can be used to toggle the MO_NODISPLAY bit for the option. 1887 * If the VFS_CREATEOPT flag bit is set then the first option slot with 1888 * MO_EMPTY set is created as the option passed in. 1889 * 1890 * The update_mnttab arg indicates whether mops is part of a vfs that is on 1891 * the vfs list. 
1892 */ 1893 static void 1894 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt, 1895 const char *arg, int flags, int update_mnttab) 1896 { 1897 mntopt_t *mop; 1898 uint_t i, count; 1899 char *sp; 1900 1901 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 1902 1903 if (flags & VFS_CREATEOPT) { 1904 if (vfs_hasopt(mops, opt) != NULL) { 1905 flags &= ~VFS_CREATEOPT; 1906 } 1907 } 1908 count = mops->mo_count; 1909 for (i = 0; i < count; i++) { 1910 mop = &mops->mo_list[i]; 1911 1912 if (mop->mo_flags & MO_EMPTY) { 1913 if ((flags & VFS_CREATEOPT) == 0) 1914 continue; 1915 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP); 1916 (void) strcpy(sp, opt); 1917 mop->mo_name = sp; 1918 if (arg != NULL) 1919 mop->mo_flags = MO_HASVALUE; 1920 else 1921 mop->mo_flags = 0; 1922 } else if (strcmp(opt, mop->mo_name)) { 1923 continue; 1924 } 1925 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT)) 1926 break; 1927 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) { 1928 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP); 1929 (void) strcpy(sp, arg); 1930 } else { 1931 sp = NULL; 1932 } 1933 if (mop->mo_arg != NULL) 1934 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 1935 mop->mo_arg = sp; 1936 if (flags & VFS_DISPLAY) 1937 mop->mo_flags &= ~MO_NODISPLAY; 1938 if (flags & VFS_NODISPLAY) 1939 mop->mo_flags |= MO_NODISPLAY; 1940 mop->mo_flags |= MO_SET; 1941 if (mop->mo_cancel != NULL) { 1942 char **cp; 1943 1944 for (cp = mop->mo_cancel; *cp != NULL; cp++) 1945 vfs_clearmntopt_nolock(mops, *cp, 0); 1946 } 1947 if (update_mnttab) 1948 vfs_mnttab_modtimeupd(); 1949 break; 1950 } 1951 } 1952 1953 void 1954 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags) 1955 { 1956 int gotlock = 0; 1957 1958 if (VFS_ON_LIST(vfsp)) { 1959 gotlock = 1; 1960 vfs_list_lock(); 1961 } 1962 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock); 1963 if (gotlock) 1964 vfs_list_unlock(); 1965 } 1966 1967 1968 /* 1969 * Add a "tag" option to a mounted file system's 
options list. 1970 * 1971 * Note: caller is responsible for locking the vfs list, if needed, 1972 * to protect mops. 1973 */ 1974 static mntopt_t * 1975 vfs_addtag(mntopts_t *mops, const char *tag) 1976 { 1977 uint_t count; 1978 mntopt_t *mop, *motbl; 1979 1980 count = mops->mo_count + 1; 1981 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP); 1982 if (mops->mo_count) { 1983 size_t len = (count - 1) * sizeof (mntopt_t); 1984 1985 bcopy(mops->mo_list, motbl, len); 1986 kmem_free(mops->mo_list, len); 1987 } 1988 mops->mo_count = count; 1989 mops->mo_list = motbl; 1990 mop = &motbl[count - 1]; 1991 mop->mo_flags = MO_TAG; 1992 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP); 1993 (void) strcpy(mop->mo_name, tag); 1994 return (mop); 1995 } 1996 1997 /* 1998 * Allow users to set arbitrary "tags" in a vfs's mount options. 1999 * Broader use within the kernel is discouraged. 2000 */ 2001 int 2002 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2003 cred_t *cr) 2004 { 2005 vfs_t *vfsp; 2006 mntopts_t *mops; 2007 mntopt_t *mop; 2008 int found = 0; 2009 dev_t dev = makedevice(major, minor); 2010 int err = 0; 2011 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 2012 2013 /* 2014 * Find the desired mounted file system 2015 */ 2016 vfs_list_lock(); 2017 vfsp = rootvfs; 2018 do { 2019 if (vfsp->vfs_dev == dev && 2020 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2021 found = 1; 2022 break; 2023 } 2024 vfsp = vfsp->vfs_next; 2025 } while (vfsp != rootvfs); 2026 2027 if (!found) { 2028 err = EINVAL; 2029 goto out; 2030 } 2031 err = secpolicy_fs_config(cr, vfsp); 2032 if (err != 0) 2033 goto out; 2034 2035 mops = &vfsp->vfs_mntopts; 2036 /* 2037 * Add tag if it doesn't already exist 2038 */ 2039 if ((mop = vfs_hasopt(mops, tag)) == NULL) { 2040 int len; 2041 2042 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR); 2043 len = strlen(buf); 2044 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) { 2045 err = ENAMETOOLONG; 2046 goto out; 2047 
} 2048 mop = vfs_addtag(mops, tag); 2049 } 2050 if ((mop->mo_flags & MO_TAG) == 0) { 2051 err = EINVAL; 2052 goto out; 2053 } 2054 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1); 2055 out: 2056 vfs_list_unlock(); 2057 kmem_free(buf, MAX_MNTOPT_STR); 2058 return (err); 2059 } 2060 2061 /* 2062 * Allow users to remove arbitrary "tags" in a vfs's mount options. 2063 * Broader use within the kernel is discouraged. 2064 */ 2065 int 2066 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2067 cred_t *cr) 2068 { 2069 vfs_t *vfsp; 2070 mntopt_t *mop; 2071 int found = 0; 2072 dev_t dev = makedevice(major, minor); 2073 int err = 0; 2074 2075 /* 2076 * Find the desired mounted file system 2077 */ 2078 vfs_list_lock(); 2079 vfsp = rootvfs; 2080 do { 2081 if (vfsp->vfs_dev == dev && 2082 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2083 found = 1; 2084 break; 2085 } 2086 vfsp = vfsp->vfs_next; 2087 } while (vfsp != rootvfs); 2088 2089 if (!found) { 2090 err = EINVAL; 2091 goto out; 2092 } 2093 err = secpolicy_fs_config(cr, vfsp); 2094 if (err != 0) 2095 goto out; 2096 2097 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) { 2098 err = EINVAL; 2099 goto out; 2100 } 2101 if ((mop->mo_flags & MO_TAG) == 0) { 2102 err = EINVAL; 2103 goto out; 2104 } 2105 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1); 2106 out: 2107 vfs_list_unlock(); 2108 return (err); 2109 } 2110 2111 /* 2112 * Function to parse an option string and fill in a mount options table. 2113 * Unknown options are silently ignored. The input option string is modified 2114 * by replacing separators with nulls. If the create flag is set, options 2115 * not found in the table are just added on the fly. The table must have 2116 * an option slot marked MO_EMPTY to add an option on the fly. 2117 * 2118 * This function is *not* for general use by filesystems. 2119 * 2120 * Note: caller is responsible for locking the vfs list, if needed, 2121 * to protect mops.. 
2122 */ 2123 void 2124 vfs_parsemntopts(mntopts_t *mops, char *osp, int create) 2125 { 2126 char *s = osp, *p, *nextop, *valp, *cp, *ep; 2127 int setflg = VFS_NOFORCEOPT; 2128 2129 if (osp == NULL) 2130 return; 2131 while (*s != '\0') { 2132 p = strchr(s, ','); /* find next option */ 2133 if (p == NULL) { 2134 cp = NULL; 2135 p = s + strlen(s); 2136 } else { 2137 cp = p; /* save location of comma */ 2138 *p++ = '\0'; /* mark end and point to next option */ 2139 } 2140 nextop = p; 2141 p = strchr(s, '='); /* look for value */ 2142 if (p == NULL) { 2143 valp = NULL; /* no value supplied */ 2144 } else { 2145 ep = p; /* save location of equals */ 2146 *p++ = '\0'; /* end option and point to value */ 2147 valp = p; 2148 } 2149 /* 2150 * set option into options table 2151 */ 2152 if (create) 2153 setflg |= VFS_CREATEOPT; 2154 vfs_setmntopt_nolock(mops, s, valp, setflg, 0); 2155 if (cp != NULL) 2156 *cp = ','; /* restore the comma */ 2157 if (valp != NULL) 2158 *ep = '='; /* restore the equals */ 2159 s = nextop; 2160 } 2161 } 2162 2163 /* 2164 * Function to inquire if an option exists in a mount options table. 2165 * Returns a pointer to the option if it exists, else NULL. 2166 * 2167 * This function is *not* for general use by filesystems. 2168 * 2169 * Note: caller is responsible for locking the vfs list, if needed, 2170 * to protect mops. 2171 */ 2172 struct mntopt * 2173 vfs_hasopt(const mntopts_t *mops, const char *opt) 2174 { 2175 struct mntopt *mop; 2176 uint_t i, count; 2177 2178 count = mops->mo_count; 2179 for (i = 0; i < count; i++) { 2180 mop = &mops->mo_list[i]; 2181 2182 if (mop->mo_flags & MO_EMPTY) 2183 continue; 2184 if (strcmp(opt, mop->mo_name) == 0) 2185 return (mop); 2186 } 2187 return (NULL); 2188 } 2189 2190 /* 2191 * Function to inquire if an option is set in a mount options table. 2192 * Returns non-zero if set and fills in the arg pointer with a pointer to 2193 * the argument string or NULL if there is no argument string. 
2194 */ 2195 static int 2196 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp) 2197 { 2198 struct mntopt *mop; 2199 uint_t i, count; 2200 2201 count = mops->mo_count; 2202 for (i = 0; i < count; i++) { 2203 mop = &mops->mo_list[i]; 2204 2205 if (mop->mo_flags & MO_EMPTY) 2206 continue; 2207 if (strcmp(opt, mop->mo_name)) 2208 continue; 2209 if ((mop->mo_flags & MO_SET) == 0) 2210 return (0); 2211 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0) 2212 *argp = mop->mo_arg; 2213 return (1); 2214 } 2215 return (0); 2216 } 2217 2218 2219 int 2220 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp) 2221 { 2222 int ret; 2223 2224 vfs_list_read_lock(); 2225 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp); 2226 vfs_list_unlock(); 2227 return (ret); 2228 } 2229 2230 2231 /* 2232 * Construct a comma separated string of the options set in the given 2233 * mount table, return the string in the given buffer. Return non-zero if 2234 * the buffer would overflow. 2235 * 2236 * This function is *not* for general use by filesystems. 2237 * 2238 * Note: caller is responsible for locking the vfs list, if needed, 2239 * to protect mp. 
2240 */ 2241 int 2242 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len) 2243 { 2244 char *cp; 2245 uint_t i; 2246 2247 buf[0] = '\0'; 2248 cp = buf; 2249 for (i = 0; i < mp->mo_count; i++) { 2250 struct mntopt *mop; 2251 2252 mop = &mp->mo_list[i]; 2253 if (mop->mo_flags & MO_SET) { 2254 int optlen, comma = 0; 2255 2256 if (buf[0] != '\0') 2257 comma = 1; 2258 optlen = strlen(mop->mo_name); 2259 if (strlen(buf) + comma + optlen + 1 > len) 2260 goto err; 2261 if (comma) 2262 *cp++ = ','; 2263 (void) strcpy(cp, mop->mo_name); 2264 cp += optlen; 2265 /* 2266 * Append option value if there is one 2267 */ 2268 if (mop->mo_arg != NULL) { 2269 int arglen; 2270 2271 arglen = strlen(mop->mo_arg); 2272 if (strlen(buf) + arglen + 2 > len) 2273 goto err; 2274 *cp++ = '='; 2275 (void) strcpy(cp, mop->mo_arg); 2276 cp += arglen; 2277 } 2278 } 2279 } 2280 return (0); 2281 err: 2282 return (EOVERFLOW); 2283 } 2284 2285 static void 2286 vfs_freecancelopt(char **moc) 2287 { 2288 if (moc != NULL) { 2289 int ccnt = 0; 2290 char **cp; 2291 2292 for (cp = moc; *cp != NULL; cp++) { 2293 kmem_free(*cp, strlen(*cp) + 1); 2294 ccnt++; 2295 } 2296 kmem_free(moc, (ccnt + 1) * sizeof (char *)); 2297 } 2298 } 2299 2300 static void 2301 vfs_freeopt(mntopt_t *mop) 2302 { 2303 if (mop->mo_name != NULL) 2304 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1); 2305 2306 vfs_freecancelopt(mop->mo_cancel); 2307 2308 if (mop->mo_arg != NULL) 2309 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2310 } 2311 2312 /* 2313 * Free a mount options table 2314 * 2315 * This function is *not* for general use by filesystems. 2316 * 2317 * Note: caller is responsible for locking the vfs list, if needed, 2318 * to protect mp. 
2319 */ 2320 void 2321 vfs_freeopttbl(mntopts_t *mp) 2322 { 2323 uint_t i, count; 2324 2325 count = mp->mo_count; 2326 for (i = 0; i < count; i++) { 2327 vfs_freeopt(&mp->mo_list[i]); 2328 } 2329 if (count) { 2330 kmem_free(mp->mo_list, sizeof (mntopt_t) * count); 2331 mp->mo_count = 0; 2332 mp->mo_list = NULL; 2333 } 2334 } 2335 2336 /* 2337 * Free any mnttab information recorded in the vfs struct. 2338 * The vfs must not be on the vfs list. 2339 */ 2340 static void 2341 vfs_freemnttab(struct vfs *vfsp) 2342 { 2343 ASSERT(!VFS_ON_LIST(vfsp)); 2344 2345 /* 2346 * Free device and mount point information 2347 */ 2348 if (vfsp->vfs_mntpt != NULL) { 2349 refstr_rele(vfsp->vfs_mntpt); 2350 vfsp->vfs_mntpt = NULL; 2351 } 2352 if (vfsp->vfs_resource != NULL) { 2353 refstr_rele(vfsp->vfs_resource); 2354 vfsp->vfs_resource = NULL; 2355 } 2356 /* 2357 * Now free mount options information 2358 */ 2359 vfs_freeopttbl(&vfsp->vfs_mntopts); 2360 } 2361 2362 /* 2363 * Return the last mnttab modification time 2364 */ 2365 void 2366 vfs_mnttab_modtime(timespec_t *ts) 2367 { 2368 ASSERT(RW_LOCK_HELD(&vfslist)); 2369 *ts = vfs_mnttab_mtime; 2370 } 2371 2372 /* 2373 * See if mnttab is changed 2374 */ 2375 void 2376 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp) 2377 { 2378 int changed; 2379 2380 *phpp = (struct pollhead *)NULL; 2381 2382 /* 2383 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime. 2384 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe 2385 * to not grab the vfs list lock because tv_sec is monotonically 2386 * increasing. 
2387 */ 2388 2389 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) || 2390 (old->tv_sec != vfs_mnttab_mtime.tv_sec); 2391 if (!changed) { 2392 *phpp = &vfs_pollhd; 2393 } 2394 } 2395 2396 /* 2397 * Update the mnttab modification time and wake up any waiters for 2398 * mnttab changes 2399 */ 2400 void 2401 vfs_mnttab_modtimeupd() 2402 { 2403 hrtime_t oldhrt, newhrt; 2404 2405 ASSERT(RW_WRITE_HELD(&vfslist)); 2406 oldhrt = ts2hrt(&vfs_mnttab_mtime); 2407 gethrestime(&vfs_mnttab_mtime); 2408 newhrt = ts2hrt(&vfs_mnttab_mtime); 2409 if (oldhrt == (hrtime_t)0) 2410 vfs_mnttab_ctime = vfs_mnttab_mtime; 2411 /* 2412 * Attempt to provide unique mtime (like uniqtime but not). 2413 */ 2414 if (newhrt == oldhrt) { 2415 newhrt++; 2416 hrt2ts(newhrt, &vfs_mnttab_mtime); 2417 } 2418 pollwakeup(&vfs_pollhd, (short)POLLRDBAND); 2419 } 2420 2421 int 2422 dounmount(struct vfs *vfsp, int flag, cred_t *cr) 2423 { 2424 vnode_t *coveredvp; 2425 int error; 2426 2427 /* 2428 * Get covered vnode. This will be NULL if the vfs is not linked 2429 * into the file system name space (i.e., domount() with MNT_NOSPICE). 2430 */ 2431 coveredvp = vfsp->vfs_vnodecovered; 2432 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp)); 2433 2434 /* 2435 * Purge all dnlc entries for this vfs. 2436 */ 2437 (void) dnlc_purge_vfsp(vfsp, 0); 2438 2439 /* For forcible umount, skip VFS_SYNC() since it may hang */ 2440 if ((flag & MS_FORCE) == 0) 2441 (void) VFS_SYNC(vfsp, 0, cr); 2442 2443 /* 2444 * Lock the vfs to maintain fs status quo during unmount. This 2445 * has to be done after the sync because ufs_update tries to acquire 2446 * the vfs_reflock. 
2447 */ 2448 vfs_lock_wait(vfsp); 2449 2450 if (error = VFS_UNMOUNT(vfsp, flag, cr)) { 2451 vfs_unlock(vfsp); 2452 if (coveredvp != NULL) 2453 vn_vfsunlock(coveredvp); 2454 } else if (coveredvp != NULL) { 2455 /* 2456 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered) 2457 * when it frees vfsp so we do a VN_HOLD() so we can 2458 * continue to use coveredvp afterwards. 2459 */ 2460 VN_HOLD(coveredvp); 2461 vfs_remove(vfsp); 2462 vn_vfsunlock(coveredvp); 2463 VN_RELE(coveredvp); 2464 } else { 2465 /* 2466 * Release the reference to vfs that is not linked 2467 * into the name space. 2468 */ 2469 vfs_unlock(vfsp); 2470 VFS_RELE(vfsp); 2471 } 2472 return (error); 2473 } 2474 2475 2476 /* 2477 * Vfs_unmountall() is called by uadmin() to unmount all 2478 * mounted file systems (except the root file system) during shutdown. 2479 * It follows the existing locking protocol when traversing the vfs list 2480 * to sync and unmount vfses. Even though there should be no 2481 * other thread running while the system is shutting down, it is prudent 2482 * to still follow the locking protocol. 2483 */ 2484 void 2485 vfs_unmountall(void) 2486 { 2487 struct vfs *vfsp; 2488 struct vfs *prev_vfsp = NULL; 2489 int error; 2490 2491 /* 2492 * Toss all dnlc entries now so that the per-vfs sync 2493 * and unmount operations don't have to slog through 2494 * a bunch of uninteresting vnodes over and over again. 2495 */ 2496 dnlc_purge(); 2497 2498 vfs_list_lock(); 2499 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) { 2500 prev_vfsp = vfsp->vfs_prev; 2501 2502 if (vfs_lock(vfsp) != 0) 2503 continue; 2504 error = vn_vfswlock(vfsp->vfs_vnodecovered); 2505 vfs_unlock(vfsp); 2506 if (error) 2507 continue; 2508 2509 vfs_list_unlock(); 2510 2511 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED()); 2512 (void) dounmount(vfsp, 0, CRED()); 2513 2514 /* 2515 * Since we dropped the vfslist lock above we must 2516 * verify that next_vfsp still exists, else start over. 
2517 */ 2518 vfs_list_lock(); 2519 for (vfsp = rootvfs->vfs_prev; 2520 vfsp != rootvfs; vfsp = vfsp->vfs_prev) 2521 if (vfsp == prev_vfsp) 2522 break; 2523 if (vfsp == rootvfs && prev_vfsp != rootvfs) 2524 prev_vfsp = rootvfs->vfs_prev; 2525 } 2526 vfs_list_unlock(); 2527 } 2528 2529 /* 2530 * Called to add an entry to the end of the vfs mount in progress list 2531 */ 2532 void 2533 vfs_addmip(dev_t dev, struct vfs *vfsp) 2534 { 2535 struct ipmnt *mipp; 2536 2537 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP); 2538 mipp->mip_next = NULL; 2539 mipp->mip_dev = dev; 2540 mipp->mip_vfsp = vfsp; 2541 mutex_enter(&vfs_miplist_mutex); 2542 if (vfs_miplist_end != NULL) 2543 vfs_miplist_end->mip_next = mipp; 2544 else 2545 vfs_miplist = mipp; 2546 vfs_miplist_end = mipp; 2547 mutex_exit(&vfs_miplist_mutex); 2548 } 2549 2550 /* 2551 * Called to remove an entry from the mount in progress list 2552 * Either because the mount completed or it failed. 2553 */ 2554 void 2555 vfs_delmip(struct vfs *vfsp) 2556 { 2557 struct ipmnt *mipp, *mipprev; 2558 2559 mutex_enter(&vfs_miplist_mutex); 2560 mipprev = NULL; 2561 for (mipp = vfs_miplist; 2562 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) { 2563 mipprev = mipp; 2564 } 2565 if (mipp == NULL) 2566 return; /* shouldn't happen */ 2567 if (mipp == vfs_miplist_end) 2568 vfs_miplist_end = mipprev; 2569 if (mipprev == NULL) 2570 vfs_miplist = mipp->mip_next; 2571 else 2572 mipprev->mip_next = mipp->mip_next; 2573 mutex_exit(&vfs_miplist_mutex); 2574 kmem_free(mipp, sizeof (struct ipmnt)); 2575 } 2576 2577 /* 2578 * vfs_add is called by a specific filesystem's mount routine to add 2579 * the new vfs into the vfs list/hash and to cover the mounted-on vnode. 2580 * The vfs should already have been locked by the caller. 2581 * 2582 * coveredvp is NULL if this is the root. 
2583 */ 2584 void 2585 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag) 2586 { 2587 int newflag; 2588 2589 ASSERT(vfs_lock_held(vfsp)); 2590 VFS_HOLD(vfsp); 2591 newflag = vfsp->vfs_flag; 2592 if (mflag & MS_RDONLY) 2593 newflag |= VFS_RDONLY; 2594 else 2595 newflag &= ~VFS_RDONLY; 2596 if (mflag & MS_NOSUID) 2597 newflag |= (VFS_NOSETUID|VFS_NODEVICES); 2598 else 2599 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES); 2600 if (mflag & MS_NOMNTTAB) 2601 newflag |= VFS_NOMNTTAB; 2602 else 2603 newflag &= ~VFS_NOMNTTAB; 2604 2605 if (coveredvp != NULL) { 2606 ASSERT(vn_vfswlock_held(coveredvp)); 2607 coveredvp->v_vfsmountedhere = vfsp; 2608 VN_HOLD(coveredvp); 2609 } 2610 vfsp->vfs_vnodecovered = coveredvp; 2611 vfsp->vfs_flag = newflag; 2612 2613 vfs_list_add(vfsp); 2614 } 2615 2616 /* 2617 * Remove a vfs from the vfs list, null out the pointer from the 2618 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer 2619 * from the vfs to the covered vnode (vfs_vnodecovered). Release the 2620 * reference to the vfs and to the covered vnode. 2621 * 2622 * Called from dounmount after it's confirmed with the file system 2623 * that the unmount is legal. 2624 */ 2625 void 2626 vfs_remove(struct vfs *vfsp) 2627 { 2628 vnode_t *vp; 2629 2630 ASSERT(vfs_lock_held(vfsp)); 2631 2632 /* 2633 * Can't unmount root. Should never happen because fs will 2634 * be busy. 2635 */ 2636 if (vfsp == rootvfs) 2637 cmn_err(CE_PANIC, "vfs_remove: unmounting root"); 2638 2639 vfs_list_remove(vfsp); 2640 2641 /* 2642 * Unhook from the file system name space. 2643 */ 2644 vp = vfsp->vfs_vnodecovered; 2645 ASSERT(vn_vfswlock_held(vp)); 2646 vp->v_vfsmountedhere = NULL; 2647 vfsp->vfs_vnodecovered = NULL; 2648 VN_RELE(vp); 2649 2650 /* 2651 * Release lock and wakeup anybody waiting. 2652 */ 2653 vfs_unlock(vfsp); 2654 VFS_RELE(vfsp); 2655 } 2656 2657 /* 2658 * Lock a filesystem to prevent access to it while mounting, 2659 * unmounting and syncing. 
Return EBUSY immediately if lock 2660 * can't be acquired. 2661 */ 2662 int 2663 vfs_lock(vfs_t *vfsp) 2664 { 2665 vn_vfslocks_entry_t *vpvfsentry; 2666 2667 vpvfsentry = vn_vfslocks_getlock(vfsp); 2668 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) 2669 return (0); 2670 2671 vn_vfslocks_rele(vpvfsentry); 2672 return (EBUSY); 2673 } 2674 2675 int 2676 vfs_rlock(vfs_t *vfsp) 2677 { 2678 vn_vfslocks_entry_t *vpvfsentry; 2679 2680 vpvfsentry = vn_vfslocks_getlock(vfsp); 2681 2682 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) 2683 return (0); 2684 2685 vn_vfslocks_rele(vpvfsentry); 2686 return (EBUSY); 2687 } 2688 2689 void 2690 vfs_lock_wait(vfs_t *vfsp) 2691 { 2692 vn_vfslocks_entry_t *vpvfsentry; 2693 2694 vpvfsentry = vn_vfslocks_getlock(vfsp); 2695 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER); 2696 } 2697 2698 void 2699 vfs_rlock_wait(vfs_t *vfsp) 2700 { 2701 vn_vfslocks_entry_t *vpvfsentry; 2702 2703 vpvfsentry = vn_vfslocks_getlock(vfsp); 2704 rwst_enter(&vpvfsentry->ve_lock, RW_READER); 2705 } 2706 2707 /* 2708 * Unlock a locked filesystem. 2709 */ 2710 void 2711 vfs_unlock(vfs_t *vfsp) 2712 { 2713 vn_vfslocks_entry_t *vpvfsentry; 2714 2715 /* 2716 * vfs_unlock will mimic sema_v behaviour to fix 4748018. 2717 * And these changes should remain for the patch changes as it is. 2718 */ 2719 if (panicstr) 2720 return; 2721 2722 /* 2723 * ve_refcount needs to be dropped twice here. 2724 * 1. To release refernce after a call to vfs_locks_getlock() 2725 * 2. To release the reference from the locking routines like 2726 * vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,. 2727 */ 2728 2729 vpvfsentry = vn_vfslocks_getlock(vfsp); 2730 vn_vfslocks_rele(vpvfsentry); 2731 2732 rwst_exit(&vpvfsentry->ve_lock); 2733 vn_vfslocks_rele(vpvfsentry); 2734 } 2735 2736 /* 2737 * Utility routine that allows a filesystem to construct its 2738 * fsid in "the usual way" - by munging some underlying dev_t and 2739 * the filesystem type number into the 64-bit fsid. 
Note that 2740 * this implicitly relies on dev_t persistence to make filesystem 2741 * id's persistent. 2742 * 2743 * There's nothing to prevent an individual fs from constructing its 2744 * fsid in a different way, and indeed they should. 2745 * 2746 * Since we want fsids to be 32-bit quantities (so that they can be 2747 * exported identically by either 32-bit or 64-bit APIs, as well as 2748 * the fact that fsid's are "known" to NFS), we compress the device 2749 * number given down to 32-bits, and panic if that isn't possible. 2750 */ 2751 void 2752 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val) 2753 { 2754 if (!cmpldev((dev32_t *)&fsi->val[0], dev)) 2755 panic("device number too big for fsid!"); 2756 fsi->val[1] = val; 2757 } 2758 2759 int 2760 vfs_lock_held(vfs_t *vfsp) 2761 { 2762 int held; 2763 vn_vfslocks_entry_t *vpvfsentry; 2764 2765 /* 2766 * vfs_lock_held will mimic sema_held behaviour 2767 * if panicstr is set. And these changes should remain 2768 * for the patch changes as it is. 2769 */ 2770 if (panicstr) 2771 return (1); 2772 2773 vpvfsentry = vn_vfslocks_getlock(vfsp); 2774 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); 2775 2776 vn_vfslocks_rele(vpvfsentry); 2777 return (held); 2778 } 2779 2780 struct _kthread * 2781 vfs_lock_owner(vfs_t *vfsp) 2782 { 2783 struct _kthread *owner; 2784 vn_vfslocks_entry_t *vpvfsentry; 2785 2786 /* 2787 * vfs_wlock_held will mimic sema_held behaviour 2788 * if panicstr is set. And these changes should remain 2789 * for the patch changes as it is. 2790 */ 2791 if (panicstr) 2792 return (NULL); 2793 2794 vpvfsentry = vn_vfslocks_getlock(vfsp); 2795 owner = rwst_owner(&vpvfsentry->ve_lock); 2796 2797 vn_vfslocks_rele(vpvfsentry); 2798 return (owner); 2799 } 2800 2801 /* 2802 * vfs list locking. 2803 * 2804 * Rather than manipulate the vfslist lock directly, we abstract into lock 2805 * and unlock routines to allow the locking implementation to be changed for 2806 * clustering. 
2807 * 2808 * Whenever the vfs list is modified through its hash links, the overall list 2809 * lock must be obtained before locking the relevant hash bucket. But to see 2810 * whether a given vfs is on the list, it suffices to obtain the lock for the 2811 * hash bucket without getting the overall list lock. (See getvfs() below.) 2812 */ 2813 2814 void 2815 vfs_list_lock() 2816 { 2817 rw_enter(&vfslist, RW_WRITER); 2818 } 2819 2820 void 2821 vfs_list_read_lock() 2822 { 2823 rw_enter(&vfslist, RW_READER); 2824 } 2825 2826 void 2827 vfs_list_unlock() 2828 { 2829 rw_exit(&vfslist); 2830 } 2831 2832 /* 2833 * Low level worker routines for adding entries to and removing entries from 2834 * the vfs list. 2835 */ 2836 2837 static void 2838 vfs_hash_add(struct vfs *vfsp, int insert_at_head) 2839 { 2840 int vhno; 2841 struct vfs **hp; 2842 dev_t dev; 2843 2844 ASSERT(RW_WRITE_HELD(&vfslist)); 2845 2846 dev = expldev(vfsp->vfs_fsid.val[0]); 2847 vhno = VFSHASH(getmajor(dev), getminor(dev)); 2848 2849 mutex_enter(&rvfs_list[vhno].rvfs_lock); 2850 2851 /* 2852 * Link into the hash table, inserting it at the end, so that LOFS 2853 * with the same fsid as UFS (or other) file systems will not hide the 2854 * UFS. 2855 */ 2856 if (insert_at_head) { 2857 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head; 2858 rvfs_list[vhno].rvfs_head = vfsp; 2859 } else { 2860 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL; 2861 hp = &(*hp)->vfs_hash) 2862 continue; 2863 /* 2864 * hp now contains the address of the pointer to update 2865 * to effect the insertion. 
2866 */ 2867 vfsp->vfs_hash = NULL; 2868 *hp = vfsp; 2869 } 2870 2871 rvfs_list[vhno].rvfs_len++; 2872 mutex_exit(&rvfs_list[vhno].rvfs_lock); 2873 } 2874 2875 2876 static void 2877 vfs_hash_remove(struct vfs *vfsp) 2878 { 2879 int vhno; 2880 struct vfs *tvfsp; 2881 dev_t dev; 2882 2883 ASSERT(RW_WRITE_HELD(&vfslist)); 2884 2885 dev = expldev(vfsp->vfs_fsid.val[0]); 2886 vhno = VFSHASH(getmajor(dev), getminor(dev)); 2887 2888 mutex_enter(&rvfs_list[vhno].rvfs_lock); 2889 2890 /* 2891 * Remove from hash. 2892 */ 2893 if (rvfs_list[vhno].rvfs_head == vfsp) { 2894 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash; 2895 rvfs_list[vhno].rvfs_len--; 2896 goto foundit; 2897 } 2898 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL; 2899 tvfsp = tvfsp->vfs_hash) { 2900 if (tvfsp->vfs_hash == vfsp) { 2901 tvfsp->vfs_hash = vfsp->vfs_hash; 2902 rvfs_list[vhno].rvfs_len--; 2903 goto foundit; 2904 } 2905 } 2906 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash"); 2907 2908 foundit: 2909 2910 mutex_exit(&rvfs_list[vhno].rvfs_lock); 2911 } 2912 2913 2914 void 2915 vfs_list_add(struct vfs *vfsp) 2916 { 2917 zone_t *zone; 2918 2919 /* 2920 * The zone that owns the mount is the one that performed the mount. 2921 * Note that this isn't necessarily the same as the zone mounted into. 2922 * The corresponding zone_rele() will be done when the vfs_t is 2923 * being free'd. 2924 */ 2925 vfsp->vfs_zone = curproc->p_zone; 2926 zone_hold(vfsp->vfs_zone); 2927 2928 /* 2929 * Find the zone mounted into, and put this mount on its vfs list. 2930 */ 2931 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 2932 ASSERT(zone != NULL); 2933 /* 2934 * Special casing for the root vfs. This structure is allocated 2935 * statically and hooked onto rootvfs at link time. During the 2936 * vfs_mountroot call at system startup time, the root file system's 2937 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct 2938 * as argument. 
The code below must detect and handle this special 2939 * case. The only apparent justification for this special casing is 2940 * to ensure that the root file system appears at the head of the 2941 * list. 2942 * 2943 * XXX: I'm assuming that it's ok to do normal list locking when 2944 * adding the entry for the root file system (this used to be 2945 * done with no locks held). 2946 */ 2947 vfs_list_lock(); 2948 /* 2949 * Link into the vfs list proper. 2950 */ 2951 if (vfsp == &root) { 2952 /* 2953 * Assert: This vfs is already on the list as its first entry. 2954 * Thus, there's nothing to do. 2955 */ 2956 ASSERT(rootvfs == vfsp); 2957 /* 2958 * Add it to the head of the global zone's vfslist. 2959 */ 2960 ASSERT(zone == global_zone); 2961 ASSERT(zone->zone_vfslist == NULL); 2962 zone->zone_vfslist = vfsp; 2963 } else { 2964 /* 2965 * Link to end of list using vfs_prev (as rootvfs is now a 2966 * doubly linked circular list) so list is in mount order for 2967 * mnttab use. 2968 */ 2969 rootvfs->vfs_prev->vfs_next = vfsp; 2970 vfsp->vfs_prev = rootvfs->vfs_prev; 2971 rootvfs->vfs_prev = vfsp; 2972 vfsp->vfs_next = rootvfs; 2973 2974 /* 2975 * Do it again for the zone-private list (which may be NULL). 2976 */ 2977 if (zone->zone_vfslist == NULL) { 2978 ASSERT(zone != global_zone); 2979 zone->zone_vfslist = vfsp; 2980 } else { 2981 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp; 2982 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev; 2983 zone->zone_vfslist->vfs_zone_prev = vfsp; 2984 vfsp->vfs_zone_next = zone->zone_vfslist; 2985 } 2986 } 2987 2988 /* 2989 * Link into the hash table, inserting it at the end, so that LOFS 2990 * with the same fsid as UFS (or other) file systems will not hide 2991 * the UFS. 
2992 */ 2993 vfs_hash_add(vfsp, 0); 2994 2995 /* 2996 * update the mnttab modification time 2997 */ 2998 vfs_mnttab_modtimeupd(); 2999 vfs_list_unlock(); 3000 zone_rele(zone); 3001 } 3002 3003 void 3004 vfs_list_remove(struct vfs *vfsp) 3005 { 3006 zone_t *zone; 3007 3008 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3009 ASSERT(zone != NULL); 3010 /* 3011 * Callers are responsible for preventing attempts to unmount the 3012 * root. 3013 */ 3014 ASSERT(vfsp != rootvfs); 3015 3016 vfs_list_lock(); 3017 3018 /* 3019 * Remove from hash. 3020 */ 3021 vfs_hash_remove(vfsp); 3022 3023 /* 3024 * Remove from vfs list. 3025 */ 3026 vfsp->vfs_prev->vfs_next = vfsp->vfs_next; 3027 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev; 3028 vfsp->vfs_next = vfsp->vfs_prev = NULL; 3029 3030 /* 3031 * Remove from zone-specific vfs list. 3032 */ 3033 if (zone->zone_vfslist == vfsp) 3034 zone->zone_vfslist = vfsp->vfs_zone_next; 3035 3036 if (vfsp->vfs_zone_next == vfsp) { 3037 ASSERT(vfsp->vfs_zone_prev == vfsp); 3038 ASSERT(zone->zone_vfslist == vfsp); 3039 zone->zone_vfslist = NULL; 3040 } 3041 3042 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next; 3043 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev; 3044 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL; 3045 3046 /* 3047 * update the mnttab modification time 3048 */ 3049 vfs_mnttab_modtimeupd(); 3050 vfs_list_unlock(); 3051 zone_rele(zone); 3052 } 3053 3054 struct vfs * 3055 getvfs(fsid_t *fsid) 3056 { 3057 struct vfs *vfsp; 3058 int val0 = fsid->val[0]; 3059 int val1 = fsid->val[1]; 3060 dev_t dev = expldev(val0); 3061 int vhno = VFSHASH(getmajor(dev), getminor(dev)); 3062 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock; 3063 3064 mutex_enter(hmp); 3065 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) { 3066 if (vfsp->vfs_fsid.val[0] == val0 && 3067 vfsp->vfs_fsid.val[1] == val1) { 3068 VFS_HOLD(vfsp); 3069 mutex_exit(hmp); 3070 return (vfsp); 3071 } 3072 } 3073 mutex_exit(hmp); 3074 return 
(NULL); 3075 } 3076 3077 /* 3078 * Search the vfs mount in progress list for a specified device/vfs entry. 3079 * Returns 0 if the first entry in the list that the device matches has the 3080 * given vfs pointer as well. If the device matches but a different vfs 3081 * pointer is encountered in the list before the given vfs pointer then 3082 * a 1 is returned. 3083 */ 3084 3085 int 3086 vfs_devmounting(dev_t dev, struct vfs *vfsp) 3087 { 3088 int retval = 0; 3089 struct ipmnt *mipp; 3090 3091 mutex_enter(&vfs_miplist_mutex); 3092 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) { 3093 if (mipp->mip_dev == dev) { 3094 if (mipp->mip_vfsp != vfsp) 3095 retval = 1; 3096 break; 3097 } 3098 } 3099 mutex_exit(&vfs_miplist_mutex); 3100 return (retval); 3101 } 3102 3103 /* 3104 * Search the vfs list for a specified device. Returns 1, if entry is found 3105 * or 0 if no suitable entry is found. 3106 */ 3107 3108 int 3109 vfs_devismounted(dev_t dev) 3110 { 3111 struct vfs *vfsp; 3112 int found; 3113 3114 vfs_list_read_lock(); 3115 vfsp = rootvfs; 3116 found = 0; 3117 do { 3118 if (vfsp->vfs_dev == dev) { 3119 found = 1; 3120 break; 3121 } 3122 vfsp = vfsp->vfs_next; 3123 } while (vfsp != rootvfs); 3124 3125 vfs_list_unlock(); 3126 return (found); 3127 } 3128 3129 /* 3130 * Search the vfs list for a specified device. Returns a pointer to it 3131 * or NULL if no suitable entry is found. The caller of this routine 3132 * is responsible for releasing the returned vfs pointer. 3133 */ 3134 struct vfs * 3135 vfs_dev2vfsp(dev_t dev) 3136 { 3137 struct vfs *vfsp; 3138 int found; 3139 3140 vfs_list_read_lock(); 3141 vfsp = rootvfs; 3142 found = 0; 3143 do { 3144 /* 3145 * The following could be made more efficient by making 3146 * the entire loop use vfs_zone_next if the call is from 3147 * a zone. The only callers, however, ustat(2) and 3148 * umount2(2), don't seem to justify the added 3149 * complexity at present. 
3150 */ 3151 if (vfsp->vfs_dev == dev && 3152 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt), 3153 curproc->p_zone)) { 3154 VFS_HOLD(vfsp); 3155 found = 1; 3156 break; 3157 } 3158 vfsp = vfsp->vfs_next; 3159 } while (vfsp != rootvfs); 3160 vfs_list_unlock(); 3161 return (found ? vfsp: NULL); 3162 } 3163 3164 /* 3165 * Search the vfs list for a specified mntpoint. Returns a pointer to it 3166 * or NULL if no suitable entry is found. The caller of this routine 3167 * is responsible for releasing the returned vfs pointer. 3168 * 3169 * Note that if multiple mntpoints match, the last one matching is 3170 * returned in an attempt to return the "top" mount when overlay 3171 * mounts are covering the same mount point. This is accomplished by starting 3172 * at the end of the list and working our way backwards, stopping at the first 3173 * matching mount. 3174 */ 3175 struct vfs * 3176 vfs_mntpoint2vfsp(const char *mp) 3177 { 3178 struct vfs *vfsp; 3179 struct vfs *retvfsp = NULL; 3180 zone_t *zone = curproc->p_zone; 3181 struct vfs *list; 3182 3183 vfs_list_read_lock(); 3184 if (getzoneid() == GLOBAL_ZONEID) { 3185 /* 3186 * The global zone may see filesystems in any zone. 3187 */ 3188 vfsp = rootvfs->vfs_prev; 3189 do { 3190 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) { 3191 retvfsp = vfsp; 3192 break; 3193 } 3194 vfsp = vfsp->vfs_prev; 3195 } while (vfsp != rootvfs->vfs_prev); 3196 } else if ((list = zone->zone_vfslist) != NULL) { 3197 const char *mntpt; 3198 3199 vfsp = list->vfs_zone_prev; 3200 do { 3201 mntpt = refstr_value(vfsp->vfs_mntpt); 3202 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); 3203 if (strcmp(mntpt, mp) == 0) { 3204 retvfsp = vfsp; 3205 break; 3206 } 3207 vfsp = vfsp->vfs_zone_prev; 3208 } while (vfsp != list->vfs_zone_prev); 3209 } 3210 if (retvfsp) 3211 VFS_HOLD(retvfsp); 3212 vfs_list_unlock(); 3213 return (retvfsp); 3214 } 3215 3216 /* 3217 * Search the vfs list for a specified vfsops. 3218 * if vfs entry is found then return 1, else 0. 
3219 */ 3220 int 3221 vfs_opsinuse(vfsops_t *ops) 3222 { 3223 struct vfs *vfsp; 3224 int found; 3225 3226 vfs_list_read_lock(); 3227 vfsp = rootvfs; 3228 found = 0; 3229 do { 3230 if (vfs_getops(vfsp) == ops) { 3231 found = 1; 3232 break; 3233 } 3234 vfsp = vfsp->vfs_next; 3235 } while (vfsp != rootvfs); 3236 vfs_list_unlock(); 3237 return (found); 3238 } 3239 3240 /* 3241 * Allocate an entry in vfssw for a file system type 3242 */ 3243 struct vfssw * 3244 allocate_vfssw(char *type) 3245 { 3246 struct vfssw *vswp; 3247 3248 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) { 3249 /* 3250 * The vfssw table uses the empty string to identify an 3251 * available entry; we cannot add any type which has 3252 * a leading NUL. The string length is limited to 3253 * the size of the st_fstype array in struct stat. 3254 */ 3255 return (NULL); 3256 } 3257 3258 ASSERT(VFSSW_WRITE_LOCKED()); 3259 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) 3260 if (!ALLOCATED_VFSSW(vswp)) { 3261 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP); 3262 (void) strcpy(vswp->vsw_name, type); 3263 ASSERT(vswp->vsw_count == 0); 3264 vswp->vsw_count = 1; 3265 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL); 3266 return (vswp); 3267 } 3268 return (NULL); 3269 } 3270 3271 /* 3272 * Impose additional layer of translation between vfstype names 3273 * and module names in the filesystem. 3274 */ 3275 static char * 3276 vfs_to_modname(char *vfstype) 3277 { 3278 if (strcmp(vfstype, "proc") == 0) { 3279 vfstype = "procfs"; 3280 } else if (strcmp(vfstype, "fd") == 0) { 3281 vfstype = "fdfs"; 3282 } else if (strncmp(vfstype, "nfs", 3) == 0) { 3283 vfstype = "nfs"; 3284 } 3285 3286 return (vfstype); 3287 } 3288 3289 /* 3290 * Find a vfssw entry given a file system type name. 3291 * Try to autoload the filesystem if it's not found. 3292 * If it's installed, return the vfssw locked to prevent unloading. 
3293 */ 3294 struct vfssw * 3295 vfs_getvfssw(char *type) 3296 { 3297 struct vfssw *vswp; 3298 char *modname; 3299 3300 RLOCK_VFSSW(); 3301 vswp = vfs_getvfsswbyname(type); 3302 modname = vfs_to_modname(type); 3303 3304 if (rootdir == NULL) { 3305 /* 3306 * If we haven't yet loaded the root file system, then our 3307 * _init won't be called until later. Allocate vfssw entry, 3308 * because mod_installfs won't be called. 3309 */ 3310 if (vswp == NULL) { 3311 RUNLOCK_VFSSW(); 3312 WLOCK_VFSSW(); 3313 if ((vswp = vfs_getvfsswbyname(type)) == NULL) { 3314 if ((vswp = allocate_vfssw(type)) == NULL) { 3315 WUNLOCK_VFSSW(); 3316 return (NULL); 3317 } 3318 } 3319 WUNLOCK_VFSSW(); 3320 RLOCK_VFSSW(); 3321 } 3322 if (!VFS_INSTALLED(vswp)) { 3323 RUNLOCK_VFSSW(); 3324 (void) modloadonly("fs", modname); 3325 } else 3326 RUNLOCK_VFSSW(); 3327 return (vswp); 3328 } 3329 3330 /* 3331 * Try to load the filesystem. Before calling modload(), we drop 3332 * our lock on the VFS switch table, and pick it up after the 3333 * module is loaded. However, there is a potential race: the 3334 * module could be unloaded after the call to modload() completes 3335 * but before we pick up the lock and drive on. Therefore, 3336 * we keep reloading the module until we've loaded the module 3337 * _and_ we have the lock on the VFS switch table. 3338 */ 3339 while (vswp == NULL || !VFS_INSTALLED(vswp)) { 3340 RUNLOCK_VFSSW(); 3341 if (modload("fs", modname) == -1) 3342 return (NULL); 3343 RLOCK_VFSSW(); 3344 if (vswp == NULL) 3345 if ((vswp = vfs_getvfsswbyname(type)) == NULL) 3346 break; 3347 } 3348 RUNLOCK_VFSSW(); 3349 3350 return (vswp); 3351 } 3352 3353 /* 3354 * Find a vfssw entry given a file system type name. 
3355 */ 3356 struct vfssw * 3357 vfs_getvfsswbyname(char *type) 3358 { 3359 struct vfssw *vswp; 3360 3361 ASSERT(VFSSW_LOCKED()); 3362 if (type == NULL || *type == '\0') 3363 return (NULL); 3364 3365 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3366 if (strcmp(type, vswp->vsw_name) == 0) { 3367 vfs_refvfssw(vswp); 3368 return (vswp); 3369 } 3370 } 3371 3372 return (NULL); 3373 } 3374 3375 /* 3376 * Find a vfssw entry given a set of vfsops. 3377 */ 3378 struct vfssw * 3379 vfs_getvfsswbyvfsops(vfsops_t *vfsops) 3380 { 3381 struct vfssw *vswp; 3382 3383 RLOCK_VFSSW(); 3384 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3385 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) { 3386 vfs_refvfssw(vswp); 3387 RUNLOCK_VFSSW(); 3388 return (vswp); 3389 } 3390 } 3391 RUNLOCK_VFSSW(); 3392 3393 return (NULL); 3394 } 3395 3396 /* 3397 * Reference a vfssw entry. 3398 */ 3399 void 3400 vfs_refvfssw(struct vfssw *vswp) 3401 { 3402 3403 mutex_enter(&vswp->vsw_lock); 3404 vswp->vsw_count++; 3405 mutex_exit(&vswp->vsw_lock); 3406 } 3407 3408 /* 3409 * Unreference a vfssw entry. 3410 */ 3411 void 3412 vfs_unrefvfssw(struct vfssw *vswp) 3413 { 3414 3415 mutex_enter(&vswp->vsw_lock); 3416 vswp->vsw_count--; 3417 mutex_exit(&vswp->vsw_lock); 3418 } 3419 3420 int sync_timeout = 30; /* timeout for syncing a page during panic */ 3421 int sync_timeleft; /* portion of sync_timeout remaining */ 3422 3423 static int sync_retries = 20; /* number of retries when not making progress */ 3424 static int sync_triesleft; /* portion of sync_retries remaining */ 3425 3426 static pgcnt_t old_pgcnt, new_pgcnt; 3427 static int new_bufcnt, old_bufcnt; 3428 3429 /* 3430 * Sync all of the mounted filesystems, and then wait for the actual i/o to 3431 * complete. We wait by counting the number of dirty pages and buffers, 3432 * pushing them out using bio_busy() and page_busy(), and then counting again. 
3433 * This routine is used during both the uadmin A_SHUTDOWN code as well as 3434 * the SYNC phase of the panic code (see comments in panic.c). It should only 3435 * be used after some higher-level mechanism has quiesced the system so that 3436 * new writes are not being initiated while we are waiting for completion. 3437 * 3438 * To ensure finite running time, our algorithm uses two timeout mechanisms: 3439 * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and 3440 * sync_triesleft (a progress counter used by the vfs_syncall() loop below). 3441 * Together these ensure that syncing completes if our i/o paths are stuck. 3442 * The counters are declared above so they can be found easily in the debugger. 3443 * 3444 * The sync_timeleft counter is reset by bio_busy() and page_busy() using the 3445 * vfs_syncprogress() subroutine whenever we make progress through the lists of 3446 * pages and buffers. It is decremented and expired by the deadman() cyclic. 3447 * When vfs_syncall() decides it is done, we disable the deadman() counter by 3448 * setting sync_timeleft to zero. This timer guards against vfs_syncall() 3449 * deadlocking or hanging inside of a broken filesystem or driver routine. 3450 * 3451 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make 3452 * sync_retries consecutive calls to bio_busy() and page_busy() without 3453 * decreasing either the number of dirty buffers or dirty pages below the 3454 * lowest count we have seen so far, we give up and return from vfs_syncall(). 3455 * 3456 * Each loop iteration ends with a call to delay() one second to allow time for 3457 * i/o completion and to permit the user time to read our progress messages. 
3458 */ 3459 void 3460 vfs_syncall(void) 3461 { 3462 if (rootdir == NULL && !modrootloaded) 3463 return; /* panic during boot - no filesystems yet */ 3464 3465 printf("syncing file systems..."); 3466 vfs_syncprogress(); 3467 sync(); 3468 3469 vfs_syncprogress(); 3470 sync_triesleft = sync_retries; 3471 3472 old_bufcnt = new_bufcnt = INT_MAX; 3473 old_pgcnt = new_pgcnt = ULONG_MAX; 3474 3475 while (sync_triesleft > 0) { 3476 old_bufcnt = MIN(old_bufcnt, new_bufcnt); 3477 old_pgcnt = MIN(old_pgcnt, new_pgcnt); 3478 3479 new_bufcnt = bio_busy(B_TRUE); 3480 new_pgcnt = page_busy(B_TRUE); 3481 vfs_syncprogress(); 3482 3483 if (new_bufcnt == 0 && new_pgcnt == 0) 3484 break; 3485 3486 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt) 3487 sync_triesleft = sync_retries; 3488 else 3489 sync_triesleft--; 3490 3491 if (new_bufcnt) 3492 printf(" [%d]", new_bufcnt); 3493 if (new_pgcnt) 3494 printf(" %lu", new_pgcnt); 3495 3496 delay(hz); 3497 } 3498 3499 if (new_bufcnt != 0 || new_pgcnt != 0) 3500 printf(" done (not all i/o completed)\n"); 3501 else 3502 printf(" done\n"); 3503 3504 sync_timeleft = 0; 3505 delay(hz); 3506 } 3507 3508 /* 3509 * If we are in the middle of the sync phase of panic, reset sync_timeleft to 3510 * sync_timeout to indicate that we are making progress and the deadman() 3511 * omnipresent cyclic should not yet time us out. Note that it is safe to 3512 * store to sync_timeleft here since the deadman() is firing at high-level 3513 * on top of us. If we are racing with the deadman(), either the deadman() 3514 * will decrement the old value and then we will reset it, or we will 3515 * reset it and then the deadman() will immediately decrement it. In either 3516 * case, correct behavior results. 3517 */ 3518 void 3519 vfs_syncprogress(void) 3520 { 3521 if (panicstr) 3522 sync_timeleft = sync_timeout; 3523 } 3524 3525 /* 3526 * Map VFS flags to statvfs flags. These shouldn't really be separate 3527 * flags at all. 
3528 */ 3529 uint_t 3530 vf_to_stf(uint_t vf) 3531 { 3532 uint_t stf = 0; 3533 3534 if (vf & VFS_RDONLY) 3535 stf |= ST_RDONLY; 3536 if (vf & VFS_NOSETUID) 3537 stf |= ST_NOSUID; 3538 if (vf & VFS_NOTRUNC) 3539 stf |= ST_NOTRUNC; 3540 3541 return (stf); 3542 } 3543 3544 /* 3545 * Use old-style function prototype for vfsstray() so 3546 * that we can use it anywhere in the vfsops structure. 3547 */ 3548 int vfsstray(); 3549 3550 /* 3551 * Entries for (illegal) fstype 0. 3552 */ 3553 /* ARGSUSED */ 3554 int 3555 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr) 3556 { 3557 cmn_err(CE_PANIC, "stray vfs operation"); 3558 return (0); 3559 } 3560 3561 vfsops_t vfs_strayops = { 3562 vfsstray, 3563 vfsstray, 3564 vfsstray, 3565 vfsstray, 3566 vfsstray_sync, 3567 vfsstray, 3568 vfsstray, 3569 vfsstray 3570 }; 3571 3572 /* 3573 * Entries for (illegal) fstype 0. 3574 */ 3575 int 3576 vfsstray(void) 3577 { 3578 cmn_err(CE_PANIC, "stray vfs operation"); 3579 return (0); 3580 } 3581 3582 /* 3583 * Support for dealing with forced UFS unmount and its interaction with 3584 * LOFS. Could be used by any filesystem. 3585 * See bug 1203132. 3586 */ 3587 int 3588 vfs_EIO(void) 3589 { 3590 return (EIO); 3591 } 3592 3593 /* 3594 * We've gotta define the op for sync separately, since the compiler gets 3595 * confused if we mix and match ANSI and normal style prototypes when 3596 * a "short" argument is present and spits out a warning. 
3597 */ 3598 /*ARGSUSED*/ 3599 int 3600 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr) 3601 { 3602 return (EIO); 3603 } 3604 3605 vfs_t EIO_vfs; 3606 vfsops_t *EIO_vfsops; 3607 3608 /* 3609 * Called from startup() to initialize all loaded vfs's 3610 */ 3611 void 3612 vfsinit(void) 3613 { 3614 struct vfssw *vswp; 3615 int error; 3616 3617 static const fs_operation_def_t EIO_vfsops_template[] = { 3618 VFSNAME_MOUNT, vfs_EIO, 3619 VFSNAME_UNMOUNT, vfs_EIO, 3620 VFSNAME_ROOT, vfs_EIO, 3621 VFSNAME_STATVFS, vfs_EIO, 3622 VFSNAME_SYNC, (fs_generic_func_p) vfs_EIO_sync, 3623 VFSNAME_VGET, vfs_EIO, 3624 VFSNAME_MOUNTROOT, vfs_EIO, 3625 VFSNAME_FREEVFS, vfs_EIO, 3626 VFSNAME_VNSTATE, vfs_EIO, 3627 NULL, NULL 3628 }; 3629 3630 3631 /* Initialize the vnode cache (file systems may use it during init). */ 3632 3633 vn_create_cache(); 3634 3635 /* Setup event monitor framework */ 3636 3637 fem_init(); 3638 3639 /* Initialize the dummy stray file system type. */ 3640 3641 vfssw[0].vsw_vfsops = vfs_strayops; 3642 3643 /* Initialize the dummy EIO file system. */ 3644 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops); 3645 if (error != 0) { 3646 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template"); 3647 /* Shouldn't happen, but not bad enough to panic */ 3648 } 3649 3650 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL); 3651 3652 /* 3653 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup 3654 * on this vfs can immediately notice it's invalid. 3655 */ 3656 EIO_vfs.vfs_flag |= VFS_UNMOUNTED; 3657 3658 /* 3659 * Call the init routines of non-loadable filesystems only. 3660 * Filesystems which are loaded as separate modules will be 3661 * initialized by the module loading code instead. 
3662 */ 3663 3664 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3665 RLOCK_VFSSW(); 3666 if (vswp->vsw_init != NULL) 3667 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name); 3668 RUNLOCK_VFSSW(); 3669 } 3670 } 3671 3672 /* 3673 * Increments the vfs reference count by one atomically. 3674 */ 3675 void 3676 vfs_hold(vfs_t *vfsp) 3677 { 3678 atomic_add_32(&vfsp->vfs_count, 1); 3679 ASSERT(vfsp->vfs_count != 0); 3680 } 3681 3682 /* 3683 * Decrements the vfs reference count by one atomically. When 3684 * vfs reference count becomes zero, it calls the file system 3685 * specific vfs_freevfs() to free up the resources. 3686 */ 3687 void 3688 vfs_rele(vfs_t *vfsp) 3689 { 3690 ASSERT(vfsp->vfs_count != 0); 3691 if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) { 3692 VFS_FREEVFS(vfsp); 3693 if (vfsp->vfs_zone) 3694 zone_rele(vfsp->vfs_zone); 3695 vfs_freemnttab(vfsp); 3696 sema_destroy(&vfsp->vfs_reflock); 3697 kmem_free(vfsp, sizeof (*vfsp)); 3698 } 3699 } 3700 3701 /* 3702 * Generic operations vector support. 3703 * 3704 * This is used to build operations vectors for both the vfs and vnode. 3705 * It's normally called only when a file system is loaded. 3706 * 3707 * There are many possible algorithms for this, including the following: 3708 * 3709 * (1) scan the list of known operations; for each, see if the file system 3710 * includes an entry for it, and fill it in as appropriate. 3711 * 3712 * (2) set up defaults for all known operations. scan the list of ops 3713 * supplied by the file system; for each which is both supplied and 3714 * known, fill it in. 3715 * 3716 * (3) sort the lists of known ops & supplied ops; scan the list, filling 3717 * in entries as we go. 3718 * 3719 * we choose (1) for simplicity, and because performance isn't critical here. 3720 * note that (2) could be sped up using a precomputed hash table on known ops. 3721 * (3) could be faster than either, but only if the lists were very large or 3722 * supplied in sorted order. 
3723 * 3724 */ 3725 3726 int 3727 fs_build_vector(void *vector, int *unused_ops, 3728 const fs_operation_trans_def_t *translation, 3729 const fs_operation_def_t *operations) 3730 { 3731 int i, num_trans, num_ops, used; 3732 3733 /* Count the number of translations and the number of supplied */ 3734 /* operations. */ 3735 3736 { 3737 const fs_operation_trans_def_t *p; 3738 3739 for (num_trans = 0, p = translation; 3740 p->name != NULL; 3741 num_trans++, p++) 3742 ; 3743 } 3744 3745 { 3746 const fs_operation_def_t *p; 3747 3748 for (num_ops = 0, p = operations; 3749 p->name != NULL; 3750 num_ops++, p++) 3751 ; 3752 } 3753 3754 /* Walk through each operation known to our caller. There will be */ 3755 /* one entry in the supplied "translation table" for each. */ 3756 3757 used = 0; 3758 3759 for (i = 0; i < num_trans; i++) { 3760 int j, found; 3761 char *curname; 3762 fs_generic_func_p result; 3763 fs_generic_func_p *location; 3764 3765 curname = translation[i].name; 3766 3767 /* Look for a matching operation in the list supplied by the */ 3768 /* file system. */ 3769 3770 found = 0; 3771 3772 for (j = 0; j < num_ops; j++) { 3773 if (strcmp(operations[j].name, curname) == 0) { 3774 used++; 3775 found = 1; 3776 break; 3777 } 3778 } 3779 3780 /* If the file system is using a "placeholder" for default */ 3781 /* or error functions, grab the appropriate function out of */ 3782 /* the translation table. If the file system didn't supply */ 3783 /* this operation at all, use the default function. */ 3784 3785 if (found) { 3786 result = operations[j].func; 3787 if (result == fs_default) { 3788 result = translation[i].defaultFunc; 3789 } else if (result == fs_error) { 3790 result = translation[i].errorFunc; 3791 } else if (result == NULL) { 3792 /* Null values are PROHIBITED */ 3793 return (EINVAL); 3794 } 3795 } else { 3796 result = translation[i].defaultFunc; 3797 } 3798 3799 /* Now store the function into the operations vector. 
*/ 3800 3801 location = (fs_generic_func_p *) 3802 (((char *)vector) + translation[i].offset); 3803 3804 *location = result; 3805 } 3806 3807 *unused_ops = num_ops - used; 3808 3809 return (0); 3810 } 3811 3812 /* Placeholder functions, should never be called. */ 3813 3814 int 3815 fs_error(void) 3816 { 3817 cmn_err(CE_PANIC, "fs_error called"); 3818 return (0); 3819 } 3820 3821 int 3822 fs_default(void) 3823 { 3824 cmn_err(CE_PANIC, "fs_default called"); 3825 return (0); 3826 } 3827 3828 #ifdef __sparc 3829 3830 /* 3831 * Part of the implementation of booting off a mirrored root 3832 * involves a change of dev_t for the root device. To 3833 * accomplish this, first remove the existing hash table 3834 * entry for the root device, convert to the new dev_t, 3835 * then re-insert in the hash table at the head of the list. 3836 */ 3837 void 3838 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype) 3839 { 3840 vfs_list_lock(); 3841 3842 vfs_hash_remove(vfsp); 3843 3844 vfsp->vfs_dev = ndev; 3845 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype); 3846 3847 vfs_hash_add(vfsp, 1); 3848 3849 vfs_list_unlock(); 3850 } 3851 3852 #else /* x86 NEWBOOT */ 3853 3854 int 3855 rootconf() 3856 { 3857 int error; 3858 struct vfssw *vsw; 3859 extern void pm_init(); 3860 char *fstyp; 3861 3862 fstyp = getrootfs(); 3863 3864 if (error = clboot_rootconf()) 3865 return (error); 3866 3867 if (modload("fs", fstyp) == -1) 3868 cmn_err(CE_PANIC, "Cannot _init %s module\n", fstyp); 3869 3870 RLOCK_VFSSW(); 3871 vsw = vfs_getvfsswbyname(fstyp); 3872 RUNLOCK_VFSSW(); 3873 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0); 3874 VFS_HOLD(rootvfs); 3875 3876 /* always mount readonly first */ 3877 rootvfs->vfs_flag |= VFS_RDONLY; 3878 3879 pm_init(); 3880 3881 if (netboot) 3882 (void) strplumb(); 3883 3884 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT); 3885 vfs_unrefvfssw(vsw); 3886 rootdev = rootvfs->vfs_dev; 3887 3888 if (error) 3889 cmn_err(CE_PANIC, "cannot mount root path %s", svm_bootpath); 3890 return (error); 
3891 } 3892 3893 /* 3894 * XXX this is called by nfs only and should probably be removed 3895 * If booted with ASKNAME, prompt on the console for a filesystem 3896 * name and return it. 3897 */ 3898 void 3899 getfsname(char *askfor, char *name, size_t namelen) 3900 { 3901 if (boothowto & RB_ASKNAME) { 3902 printf("%s name: ", askfor); 3903 console_gets(name, namelen); 3904 } 3905 } 3906 3907 /* 3908 * If server_path exists, then we are booting a diskless 3909 * client. Otherwise, we default to ufs. Zfs should perhaps be 3910 * another property. 3911 */ 3912 static char * 3913 getrootfs(void) 3914 { 3915 extern char *strplumb_get_netdev_path(void); 3916 char *propstr = NULL; 3917 3918 /* check fstype property; it should be nfsdyn for diskless */ 3919 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 3920 DDI_PROP_DONTPASS, "fstype", &propstr) 3921 == DDI_SUCCESS) { 3922 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME); 3923 ddi_prop_free(propstr); 3924 } 3925 3926 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) 3927 return (rootfs.bo_fstype); 3928 3929 ++netboot; 3930 /* check if path to network interface is specified in bootpath */ 3931 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 3932 DDI_PROP_DONTPASS, "bootpath", &propstr) 3933 == DDI_SUCCESS) { 3934 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME); 3935 ddi_prop_free(propstr); 3936 } else { 3937 /* attempt to determine netdev_path via boot_mac address */ 3938 netdev_path = strplumb_get_netdev_path(); 3939 if (netdev_path == NULL) 3940 cmn_err(CE_PANIC, 3941 "Cannot find boot network interface\n"); 3942 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME); 3943 } 3944 return ("nfs"); 3945 } 3946 #endif 3947