1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 41 #pragma ident "%Z%%M% %I% %E% SMI" 42 43 #include <sys/types.h> 44 #include <sys/t_lock.h> 45 #include <sys/param.h> 46 #include <sys/errno.h> 47 #include <sys/user.h> 48 #include <sys/fstyp.h> 49 #include <sys/kmem.h> 50 #include <sys/systm.h> 51 #include <sys/proc.h> 52 #include <sys/mount.h> 53 #include <sys/vfs.h> 54 #include <sys/fem.h> 55 #include <sys/mntent.h> 56 #include <sys/stat.h> 57 #include <sys/statvfs.h> 58 #include <sys/statfs.h> 59 #include <sys/cred.h> 60 #include <sys/vnode.h> 61 #include <sys/rwstlock.h> 62 #include <sys/dnlc.h> 63 #include <sys/file.h> 64 #include <sys/time.h> 65 #include <sys/atomic.h> 66 #include <sys/cmn_err.h> 67 #include <sys/buf.h> 68 #include <sys/swap.h> 69 #include <sys/debug.h> 70 #include <sys/vnode.h> 71 #include <sys/modctl.h> 72 #include <sys/ddi.h> 73 #include <sys/pathname.h> 74 #include <sys/bootconf.h> 75 #include <sys/dumphdr.h> 76 #include <sys/dc_ki.h> 77 #include <sys/poll.h> 78 #include <sys/sunddi.h> 79 #include <sys/sysmacros.h> 80 #include <sys/zone.h> 81 #include <sys/policy.h> 82 #include <sys/ctfs.h> 83 #include <sys/objfs.h> 84 #include <sys/console.h> 85 #include <sys/reboot.h> 86 87 #include <vm/page.h> 88 89 #include <fs/fs_subr.h> 90 91 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int); 92 static void vfs_setmntopt_nolock(mntopts_t *, const char *, 93 const char *, int, int); 94 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **); 95 static void vfs_freemnttab(struct vfs *); 96 static void vfs_freeopt(mntopt_t *); 97 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *); 98 static void vfs_swapopttbl(mntopts_t *, mntopts_t *); 99 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int); 100 static void vfs_createopttbl_extend(mntopts_t *, const char *, 101 const mntopts_t *); 102 static char **vfs_copycancelopt_extend(char **const, int); 103 static void vfs_freecancelopt(char **); 104 static char *getrootfs(void); 105 static int getmacpath(dev_info_t *, void *); 106 107 struct ipmnt { 108 struct ipmnt *mip_next; 109 dev_t mip_dev; 110 struct vfs *mip_vfsp; 111 }; 112 113 static kmutex_t vfs_miplist_mutex; 114 static struct ipmnt *vfs_miplist = NULL; 115 static struct ipmnt *vfs_miplist_end = NULL; 116 117 /* 118 * VFS global data. 119 */ 120 vnode_t *rootdir; /* pointer to root inode vnode. */ 121 vnode_t *devicesdir; /* pointer to inode of devices root */ 122 123 char *server_rootpath; /* root path for diskless clients */ 124 char *server_hostname; /* hostname of diskless server */ 125 126 static struct vfs root; 127 static struct vfs devices; 128 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */ 129 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */ 130 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */ 131 /* must be power of 2! */ 132 timespec_t vfs_mnttab_ctime; /* mnttab created time */ 133 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */ 134 char *vfs_dummyfstype = "\0"; 135 struct pollhead vfs_pollhd; /* for mnttab pollers */ 136 137 /* 138 * Table for generic options recognized in the VFS layer and acted 139 * on at this level before parsing file system specific options. 140 * The nosuid option is stronger than any of the devices and setuid 141 * options, so those are canceled when nosuid is seen. 142 * 143 * All options which are added here need to be added to the 144 * list of standard options in usr/src/cmd/fs.d/fslib.c as well. 145 */ 146 /* 147 * VFS Mount options table 148 */ 149 static char *ro_cancel[] = { MNTOPT_RW, NULL }; 150 static char *rw_cancel[] = { MNTOPT_RO, NULL }; 151 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL }; 152 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES, 153 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL }; 154 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL }; 155 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL }; 156 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL }; 157 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL }; 158 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL }; 159 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL }; 160 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL }; 161 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL }; 162 163 static const mntopt_t mntopts[] = { 164 /* 165 * option name cancel options default arg flags 166 */ 167 { MNTOPT_REMOUNT, NULL, NULL, 168 MO_NODISPLAY, (void *)0 }, 169 { MNTOPT_RO, ro_cancel, NULL, 0, 170 (void *)0 }, 171 { MNTOPT_RW, rw_cancel, NULL, 0, 172 (void *)0 }, 173 { MNTOPT_SUID, suid_cancel, NULL, 0, 174 (void *)0 }, 175 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0, 176 (void *)0 }, 177 { MNTOPT_DEVICES, devices_cancel, NULL, 0, 178 (void *)0 }, 179 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0, 180 (void *)0 }, 181 { MNTOPT_SETUID, setuid_cancel, NULL, 0, 182 (void *)0 }, 183 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0, 184 (void *)0 }, 185 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0, 186 (void *)0 }, 187 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0, 188 (void *)0 }, 189 { MNTOPT_EXEC, exec_cancel, NULL, 0, 190 (void *)0 }, 191 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0, 192 (void *)0 }, 193 }; 194 195 const mntopts_t vfs_mntopts = { 196 sizeof (mntopts) / sizeof (mntopt_t), 197 (mntopt_t *)&mntopts[0] 198 }; 199 200 /* 201 * File system operation dispatch functions. 202 */ 203 204 int 205 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 206 { 207 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr); 208 } 209 210 int 211 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr) 212 { 213 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr); 214 } 215 216 int 217 fsop_root(vfs_t *vfsp, vnode_t **vpp) 218 { 219 refstr_t *mntpt; 220 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp); 221 /* 222 * Make sure this root has a path. With lofs, it is possible to have 223 * a NULL mountpoint. 224 */ 225 if (vfs_vnode_path && ret == 0 && vfsp->vfs_mntpt != NULL && 226 vn_path(*vpp) == NULL) { 227 mntpt = vfs_getmntpoint(vfsp); 228 vn_setpath_str(*vpp, refstr_value(mntpt), 229 strlen(refstr_value(mntpt))); 230 refstr_rele(mntpt); 231 } 232 233 return (ret); 234 } 235 236 int 237 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp) 238 { 239 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp); 240 } 241 242 int 243 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr) 244 { 245 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr); 246 } 247 248 int 249 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 250 { 251 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp); 252 } 253 254 int 255 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason) 256 { 257 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason); 258 } 259 260 void 261 fsop_freefs(vfs_t *vfsp) 262 { 263 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp); 264 } 265 266 int 267 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate) 268 { 269 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate)); 270 } 271 272 int 273 fsop_sync_by_kind(int fstype, short flag, cred_t *cr) 274 { 275 ASSERT((fstype >= 0) && (fstype < nfstype)); 276 277 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype])) 278 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr); 279 else 280 return (ENOTSUP); 281 } 282 283 /* 284 * File system initialization. vfs_setfsops() must be called from a file 285 * system's init routine. 286 */ 287 288 static int 289 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual, 290 int *unused_ops) 291 { 292 static const fs_operation_trans_def_t vfs_ops_table[] = { 293 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount), 294 fs_nosys, fs_nosys, 295 296 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount), 297 fs_nosys, fs_nosys, 298 299 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root), 300 fs_nosys, fs_nosys, 301 302 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs), 303 fs_nosys, fs_nosys, 304 305 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync), 306 (fs_generic_func_p) fs_sync, 307 (fs_generic_func_p) fs_sync, /* No errors allowed */ 308 309 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget), 310 fs_nosys, fs_nosys, 311 312 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot), 313 fs_nosys, fs_nosys, 314 315 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs), 316 (fs_generic_func_p)fs_freevfs, 317 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */ 318 319 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate), 320 (fs_generic_func_p)fs_nosys, 321 (fs_generic_func_p)fs_nosys, 322 323 NULL, 0, NULL, NULL 324 }; 325 326 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template)); 327 } 328 329 int 330 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual) 331 { 332 int error; 333 int unused_ops; 334 335 /* Verify that fstype refers to a loaded fs (and not fsid 0). */ 336 337 if ((fstype <= 0) || (fstype >= nfstype)) 338 return (EINVAL); 339 340 if (!ALLOCATED_VFSSW(&vfssw[fstype])) 341 return (EINVAL); 342 343 /* Set up the operations vector. */ 344 345 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops); 346 347 if (error != 0) 348 return (error); 349 350 vfssw[fstype].vsw_flag |= VSW_INSTALLED; 351 352 if (actual != NULL) 353 *actual = &vfssw[fstype].vsw_vfsops; 354 355 #if DEBUG 356 if (unused_ops != 0) 357 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied " 358 "but not used", vfssw[fstype].vsw_name, unused_ops); 359 #endif 360 361 return (0); 362 } 363 364 int 365 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual) 366 { 367 int error; 368 int unused_ops; 369 370 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP); 371 372 error = fs_copyfsops(template, *actual, &unused_ops); 373 if (error != 0) { 374 kmem_free(*actual, sizeof (vfsops_t)); 375 *actual = NULL; 376 return (error); 377 } 378 379 return (0); 380 } 381 382 /* 383 * Free a vfsops structure created as a result of vfs_makefsops(). 384 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use 385 * vfs_freevfsops_by_type(). 386 */ 387 void 388 vfs_freevfsops(vfsops_t *vfsops) 389 { 390 kmem_free(vfsops, sizeof (vfsops_t)); 391 } 392 393 /* 394 * Since the vfsops structure is part of the vfssw table and wasn't 395 * really allocated, we're not really freeing anything. We keep 396 * the name for consistency with vfs_freevfsops(). We do, however, 397 * need to take care of a little bookkeeping. 398 * NOTE: For a vfsops structure created by vfs_setfsops(), use 399 * vfs_freevfsops_by_type(). 400 */ 401 int 402 vfs_freevfsops_by_type(int fstype) 403 { 404 405 /* Verify that fstype refers to a loaded fs (and not fsid 0). */ 406 if ((fstype <= 0) || (fstype >= nfstype)) 407 return (EINVAL); 408 409 WLOCK_VFSSW(); 410 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) { 411 WUNLOCK_VFSSW(); 412 return (EINVAL); 413 } 414 415 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED; 416 WUNLOCK_VFSSW(); 417 418 return (0); 419 } 420 421 /* Support routines used to reference vfs_op */ 422 423 /* Set the operations vector for a vfs */ 424 void 425 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops) 426 { 427 vfsops_t *op; 428 429 ASSERT(vfsp != NULL); 430 ASSERT(vfsops != NULL); 431 432 op = vfsp->vfs_op; 433 membar_consumer(); 434 if (vfsp->vfs_femhead == NULL && 435 casptr(&vfsp->vfs_op, op, vfsops) == op) { 436 return; 437 } 438 fsem_setvfsops(vfsp, vfsops); 439 } 440 441 /* Retrieve the operations vector for a vfs */ 442 vfsops_t * 443 vfs_getops(vfs_t *vfsp) 444 { 445 vfsops_t *op; 446 447 ASSERT(vfsp != NULL); 448 449 op = vfsp->vfs_op; 450 membar_consumer(); 451 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) { 452 return (op); 453 } else { 454 return (fsem_getvfsops(vfsp)); 455 } 456 } 457 458 /* 459 * Returns non-zero (1) if the vfsops matches that of the vfs. 460 * Returns zero (0) if not. 461 */ 462 int 463 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops) 464 { 465 return (vfs_getops(vfsp) == vfsops); 466 } 467 468 /* 469 * Returns non-zero (1) if the file system has installed a non-default, 470 * non-error vfs_sync routine. Returns zero (0) otherwise. 471 */ 472 int 473 vfs_can_sync(vfs_t *vfsp) 474 { 475 /* vfs_sync() routine is not the default/error function */ 476 return (vfs_getops(vfsp)->vfs_sync != fs_sync); 477 } 478 479 /* 480 * Initialize a vfs structure. 481 */ 482 void 483 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data) 484 { 485 vfsp->vfs_count = 0; 486 vfsp->vfs_next = vfsp; 487 vfsp->vfs_prev = vfsp; 488 vfsp->vfs_zone_next = vfsp; 489 vfsp->vfs_zone_prev = vfsp; 490 vfsp->vfs_flag = 0; 491 vfsp->vfs_data = (data); 492 vfsp->vfs_resource = NULL; 493 vfsp->vfs_mntpt = NULL; 494 vfsp->vfs_mntopts.mo_count = 0; 495 vfsp->vfs_mntopts.mo_list = NULL; 496 vfsp->vfs_femhead = NULL; 497 vfsp->vfs_zone = NULL; 498 vfs_setops((vfsp), (op)); 499 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL); 500 } 501 502 503 /* 504 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs, 505 * fstatvfs, and sysfs moved to common/syscall. 506 */ 507 508 /* 509 * Update every mounted file system. We call the vfs_sync operation of 510 * each file system type, passing it a NULL vfsp to indicate that all 511 * mounted file systems of that type should be updated. 512 */ 513 void 514 vfs_sync(int flag) 515 { 516 struct vfssw *vswp; 517 RLOCK_VFSSW(); 518 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 519 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 520 vfs_refvfssw(vswp); 521 RUNLOCK_VFSSW(); 522 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag, 523 CRED()); 524 vfs_unrefvfssw(vswp); 525 RLOCK_VFSSW(); 526 } 527 } 528 RUNLOCK_VFSSW(); 529 } 530 531 void 532 sync(void) 533 { 534 vfs_sync(0); 535 } 536 537 /* 538 * External routines. 539 */ 540 541 krwlock_t vfssw_lock; /* lock accesses to vfssw */ 542 543 /* 544 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(), 545 * but otherwise should be accessed only via vfs_list_lock() and 546 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list. 547 */ 548 static krwlock_t vfslist; 549 550 /* 551 * Mount devfs on /devices. This is done right after root is mounted 552 * to provide device access support for the system 553 */ 554 static void 555 vfs_mountdevices(void) 556 { 557 struct vfssw *vsw; 558 struct vnode *mvp; 559 struct mounta mounta = { /* fake mounta for devfs_mount() */ 560 NULL, 561 NULL, 562 MS_SYSSPACE, 563 NULL, 564 NULL, 565 0, 566 NULL, 567 0 568 }; 569 570 /* 571 * _init devfs module to fill in the vfssw 572 */ 573 if (modload("fs", "devfs") == -1) 574 cmn_err(CE_PANIC, "Cannot _init devfs module\n"); 575 576 /* 577 * Hold vfs 578 */ 579 RLOCK_VFSSW(); 580 vsw = vfs_getvfsswbyname("devfs"); 581 VFS_INIT(&devices, &vsw->vsw_vfsops, NULL); 582 VFS_HOLD(&devices); 583 584 /* 585 * Locate mount point 586 */ 587 if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 588 cmn_err(CE_PANIC, "Cannot find /devices\n"); 589 590 /* 591 * Perform the mount of /devices 592 */ 593 if (VFS_MOUNT(&devices, mvp, &mounta, CRED())) 594 cmn_err(CE_PANIC, "Cannot mount /devices\n"); 595 596 RUNLOCK_VFSSW(); 597 598 /* 599 * Set appropriate members and add to vfs list for mnttab display 600 */ 601 vfs_setresource(&devices, "/devices"); 602 vfs_setmntpoint(&devices, "/devices"); 603 604 /* 605 * Hold the root of /devices so it won't go away 606 */ 607 if (VFS_ROOT(&devices, &devicesdir)) 608 cmn_err(CE_PANIC, "vfs_mountdevices: not devices root"); 609 VN_HOLD(devicesdir); 610 611 if (vfs_lock(&devices) != 0) { 612 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices"); 613 return; 614 } 615 616 if (vn_vfswlock(mvp) != 0) { 617 vfs_unlock(&devices); 618 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices"); 619 return; 620 } 621 622 vfs_add(mvp, &devices, 0); 623 vn_vfsunlock(mvp); 624 vfs_unlock(&devices); 625 } 626 627 /* 628 * Mount required filesystem. This is done right after root is mounted. 629 */ 630 static void 631 vfs_mountfs(char *module, char *spec, char *path) 632 { 633 struct vnode *mvp; 634 struct mounta mounta; 635 vfs_t *vfsp; 636 637 mounta.flags = MS_SYSSPACE | MS_DATA; 638 mounta.fstype = module; 639 mounta.spec = spec; 640 mounta.dir = path; 641 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) { 642 cmn_err(CE_WARN, "Cannot find %s\n", path); 643 return; 644 } 645 if (domount(NULL, &mounta, mvp, CRED(), &vfsp)) 646 cmn_err(CE_WARN, "Cannot mount %s\n", path); 647 else 648 VFS_RELE(vfsp); 649 VN_RELE(mvp); 650 } 651 652 /* 653 * vfs_mountroot is called by main() to mount the root filesystem. 654 */ 655 void 656 vfs_mountroot(void) 657 { 658 struct vnode *rvp = NULL; 659 char *path; 660 size_t plen; 661 662 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL); 663 rw_init(&vfslist, NULL, RW_DEFAULT, NULL); 664 665 /* 666 * Alloc the vfs hash bucket array and locks 667 */ 668 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP); 669 670 /* 671 * Call machine-dependent routine "rootconf" to choose a root 672 * file system type. 673 */ 674 if (rootconf()) 675 cmn_err(CE_PANIC, "vfs_mountroot: cannot mount root"); 676 /* 677 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir 678 * to point to it. These are used by lookuppn() so that it 679 * knows where to start from ('/' or '.'). 680 */ 681 vfs_setmntpoint(rootvfs, "/"); 682 if (VFS_ROOT(rootvfs, &rootdir)) 683 cmn_err(CE_PANIC, "vfs_mountroot: no root vnode"); 684 u.u_cdir = rootdir; 685 VN_HOLD(u.u_cdir); 686 u.u_rdir = NULL; 687 688 /* 689 * Setup the global zone's rootvp, now that it exists. 690 */ 691 global_zone->zone_rootvp = rootdir; 692 VN_HOLD(global_zone->zone_rootvp); 693 694 /* 695 * Notify the module code that it can begin using the 696 * root filesystem instead of the boot program's services. 697 */ 698 modrootloaded = 1; 699 /* 700 * Set up mnttab information for root 701 */ 702 vfs_setresource(rootvfs, rootfs.bo_name); 703 704 /* 705 * Notify cluster software that the root filesystem is available. 706 */ 707 clboot_mountroot(); 708 709 /* 710 * Mount /devices, /system/contract, /etc/mnttab, /etc/svc/volatile, 711 * /system/object, and /proc. 712 */ 713 vfs_mountdevices(); 714 715 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT); 716 vfs_mountfs("proc", "/proc", "/proc"); 717 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); 718 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); 719 vfs_mountfs("objfs", "objfs", OBJFS_ROOT); 720 721 #ifdef __sparc 722 /* 723 * This bit of magic can go away when we convert sparc to 724 * the new boot architecture based on ramdisk. 725 * 726 * Booting off a mirrored root volume: 727 * At this point, we have booted and mounted root on a 728 * single component of the mirror. Complete the boot 729 * by configuring SVM and converting the root to the 730 * dev_t of the mirrored root device. This dev_t conversion 731 * only works because the underlying device doesn't change. 732 */ 733 if (root_is_svm) { 734 if (svm_rootconf()) { 735 cmn_err(CE_PANIC, "vfs_mountroot: cannot remount root"); 736 } 737 738 /* 739 * mnttab should reflect the new root device 740 */ 741 vfs_lock_wait(rootvfs); 742 vfs_setresource(rootvfs, rootfs.bo_name); 743 vfs_unlock(rootvfs); 744 } 745 #endif /* __sparc */ 746 747 /* 748 * Look up the root device via devfs so that a dv_node is 749 * created for it. The vnode is never VN_RELE()ed. 750 * We allocate more than MAXPATHLEN so that the 751 * buffer passed to i_ddi_prompath_to_devfspath() is 752 * exactly MAXPATHLEN (the function expects a buffer 753 * of that length). 754 */ 755 plen = strlen("/devices"); 756 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); 757 (void) strcpy(path, "/devices"); 758 759 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) 760 != DDI_SUCCESS || 761 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { 762 763 /* NUL terminate in case "path" has garbage */ 764 path[plen + MAXPATHLEN - 1] = '\0'; 765 #ifdef DEBUG 766 cmn_err(CE_WARN, "!Cannot lookup root device: %s", path); 767 #endif 768 } 769 kmem_free(path, plen + MAXPATHLEN); 770 } 771 772 /* 773 * Common mount code. Called from the system call entry point, from autofs, 774 * and from pxfs. 775 * 776 * Takes the effective file system type, mount arguments, the mount point 777 * vnode, flags specifying whether the mount is a remount and whether it 778 * should be entered into the vfs list, and credentials. Fills in its vfspp 779 * parameter with the mounted file system instance's vfs. 780 * 781 * Note that the effective file system type is specified as a string. It may 782 * be null, in which case it's determined from the mount arguments, and may 783 * differ from the type specified in the mount arguments; this is a hook to 784 * allow interposition when instantiating file system instances. 785 * 786 * The caller is responsible for releasing its own hold on the mount point 787 * vp (this routine does its own hold when necessary). 788 * Also note that for remounts, the mount point vp should be the vnode for 789 * the root of the file system rather than the vnode that the file system 790 * is mounted on top of. 791 */ 792 int 793 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, 794 struct vfs **vfspp) 795 { 796 struct vfssw *vswp; 797 vfsops_t *vfsops; 798 struct vfs *vfsp; 799 struct vnode *bvp; 800 dev_t bdev = 0; 801 mntopts_t mnt_mntopts; 802 int error = 0; 803 int copyout_error = 0; 804 int ovflags; 805 char *opts = uap->optptr; 806 char *inargs = opts; 807 int optlen = uap->optlen; 808 int remount; 809 int rdonly; 810 int nbmand = 0; 811 int delmip = 0; 812 int addmip = 0; 813 int splice = ((uap->flags & MS_NOSPLICE) == 0); 814 int fromspace = (uap->flags & MS_SYSSPACE) ? 815 UIO_SYSSPACE : UIO_USERSPACE; 816 char *resource = NULL, *mountpt = NULL; 817 refstr_t *oldresource, *oldmntpt; 818 struct pathname pn, rpn; 819 820 /* 821 * The v_flag value for the mount point vp is permanently set 822 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine 823 * for mount point locking. 824 */ 825 mutex_enter(&vp->v_lock); 826 vp->v_flag |= VVFSLOCK; 827 mutex_exit(&vp->v_lock); 828 829 mnt_mntopts.mo_count = 0; 830 /* 831 * Find the ops vector to use to invoke the file system-specific mount 832 * method. If the fsname argument is non-NULL, use it directly. 833 * Otherwise, dig the file system type information out of the mount 834 * arguments. 835 * 836 * A side effect is to hold the vfssw entry. 837 * 838 * Mount arguments can be specified in several ways, which are 839 * distinguished by flag bit settings. The preferred way is to set 840 * MS_OPTIONSTR, indicating an 8 argument mount with the file system 841 * type supplied as a character string and the last two arguments 842 * being a pointer to a character buffer and the size of the buffer. 843 * On entry, the buffer holds a null terminated list of options; on 844 * return, the string is the list of options the file system 845 * recognized. If MS_DATA is set arguments five and six point to a 846 * block of binary data which the file system interprets. 847 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA 848 * consistently with these conventions. To handle them, we check to 849 * see whether the pointer to the file system name has a numeric value 850 * less than 256. If so, we treat it as an index. 851 */ 852 if (fsname != NULL) { 853 if ((vswp = vfs_getvfssw(fsname)) == NULL) { 854 return (EINVAL); 855 } 856 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) { 857 size_t n; 858 uint_t fstype; 859 char name[FSTYPSZ]; 860 861 if ((fstype = (uintptr_t)uap->fstype) < 256) { 862 RLOCK_VFSSW(); 863 if (fstype == 0 || fstype >= nfstype || 864 !ALLOCATED_VFSSW(&vfssw[fstype])) { 865 RUNLOCK_VFSSW(); 866 return (EINVAL); 867 } 868 (void) strcpy(name, vfssw[fstype].vsw_name); 869 RUNLOCK_VFSSW(); 870 if ((vswp = vfs_getvfssw(name)) == NULL) 871 return (EINVAL); 872 } else { 873 /* 874 * Handle either kernel or user address space. 875 */ 876 if (uap->flags & MS_SYSSPACE) { 877 error = copystr(uap->fstype, name, 878 FSTYPSZ, &n); 879 } else { 880 error = copyinstr(uap->fstype, name, 881 FSTYPSZ, &n); 882 } 883 if (error) { 884 if (error == ENAMETOOLONG) 885 return (EINVAL); 886 return (error); 887 } 888 if ((vswp = vfs_getvfssw(name)) == NULL) 889 return (EINVAL); 890 } 891 } else { 892 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL) 893 return (EINVAL); 894 } 895 if (!VFS_INSTALLED(vswp)) 896 return (EINVAL); 897 vfsops = &vswp->vsw_vfsops; 898 899 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts); 900 /* 901 * Fetch mount options and parse them for generic vfs options 902 */ 903 if (uap->flags & MS_OPTIONSTR) { 904 /* 905 * Limit the buffer size 906 */ 907 if (optlen < 0 || optlen > MAX_MNTOPT_STR) { 908 error = EINVAL; 909 goto errout; 910 } 911 if ((uap->flags & MS_SYSSPACE) == 0) { 912 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 913 inargs[0] = '\0'; 914 if (optlen) { 915 error = copyinstr(opts, inargs, (size_t)optlen, 916 NULL); 917 if (error) { 918 goto errout; 919 } 920 } 921 } 922 vfs_parsemntopts(&mnt_mntopts, inargs, 0); 923 } 924 /* 925 * Flag bits override the options string. 926 */ 927 if (uap->flags & MS_REMOUNT) 928 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0); 929 if (uap->flags & MS_RDONLY) 930 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0); 931 if (uap->flags & MS_NOSUID) 932 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 933 934 /* 935 * Check if this is a remount; must be set in the option string and 936 * the file system must support a remount option. 937 */ 938 if (remount = vfs_optionisset_nolock(&mnt_mntopts, 939 MNTOPT_REMOUNT, NULL)) { 940 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) { 941 error = ENOTSUP; 942 goto errout; 943 } 944 uap->flags |= MS_REMOUNT; 945 } 946 947 /* 948 * uap->flags and vfs_optionisset() should agree. 949 */ 950 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) { 951 uap->flags |= MS_RDONLY; 952 } 953 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) { 954 uap->flags |= MS_NOSUID; 955 } 956 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL); 957 ASSERT(splice || !remount); 958 /* 959 * If we are splicing the fs into the namespace, 960 * perform mount point checks. 961 * 962 * We want to resolve the path for the mount point to eliminate 963 * '.' and ".." and symlinks in mount points; we can't do the 964 * same for the resource string, since it would turn 965 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do 966 * this before grabbing vn_vfswlock(), because otherwise we 967 * would deadlock with lookuppn(). 968 */ 969 if (splice) { 970 ASSERT(vp->v_count > 0); 971 972 /* 973 * Pick up mount point and device from appropriate space. 974 */ 975 if (pn_get(uap->spec, fromspace, &pn) == 0) { 976 resource = kmem_alloc(pn.pn_pathlen + 1, 977 KM_SLEEP); 978 (void) strcpy(resource, pn.pn_path); 979 pn_free(&pn); 980 } 981 /* 982 * Do a lookupname prior to taking the 983 * writelock. Mark this as completed if 984 * successful for later cleanup and addition to 985 * the mount in progress table. 986 */ 987 if ((uap->flags & MS_GLOBAL) == 0 && 988 lookupname(uap->spec, fromspace, 989 FOLLOW, NULL, &bvp) == 0) { 990 addmip = 1; 991 } 992 993 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) { 994 pathname_t *pnp; 995 996 if (*pn.pn_path != '/') { 997 error = EINVAL; 998 pn_free(&pn); 999 goto errout; 1000 } 1001 pn_alloc(&rpn); 1002 /* 1003 * Kludge to prevent autofs from deadlocking with 1004 * itself when it calls domount(). 1005 * 1006 * If autofs is calling, it is because it is doing 1007 * (autofs) mounts in the process of an NFS mount. A 1008 * lookuppn() here would cause us to block waiting for 1009 * said NFS mount to complete, which can't since this 1010 * is the thread that was supposed to doing it. 1011 */ 1012 if (fromspace == UIO_USERSPACE) { 1013 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL, 1014 NULL)) == 0) { 1015 pnp = &rpn; 1016 } else { 1017 /* 1018 * The file disappeared or otherwise 1019 * became inaccessible since we opened 1020 * it; might as well fail the mount 1021 * since the mount point is no longer 1022 * accessible. 1023 */ 1024 pn_free(&rpn); 1025 pn_free(&pn); 1026 goto errout; 1027 } 1028 } else { 1029 pnp = &pn; 1030 } 1031 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP); 1032 (void) strcpy(mountpt, pnp->pn_path); 1033 1034 /* 1035 * If the addition of the zone's rootpath 1036 * would push us over a total path length 1037 * of MAXPATHLEN, we fail the mount with 1038 * ENAMETOOLONG, which is what we would have 1039 * gotten if we were trying to perform the same 1040 * mount in the global zone. 1041 * 1042 * strlen() doesn't count the trailing 1043 * '\0', but zone_rootpathlen counts both a 1044 * trailing '/' and the terminating '\0'. 1045 */ 1046 if ((curproc->p_zone->zone_rootpathlen - 1 + 1047 strlen(mountpt)) > MAXPATHLEN || 1048 (resource != NULL && 1049 (curproc->p_zone->zone_rootpathlen - 1 + 1050 strlen(resource)) > MAXPATHLEN)) { 1051 error = ENAMETOOLONG; 1052 } 1053 1054 pn_free(&rpn); 1055 pn_free(&pn); 1056 } 1057 1058 if (error) 1059 goto errout; 1060 1061 /* 1062 * Prevent path name resolution from proceeding past 1063 * the mount point. 1064 */ 1065 if (vn_vfswlock(vp) != 0) { 1066 error = EBUSY; 1067 goto errout; 1068 } 1069 1070 /* 1071 * Verify that it's legitimate to establish a mount on 1072 * the prospective mount point. 1073 */ 1074 if (vn_mountedvfs(vp) != NULL) { 1075 /* 1076 * The mount point lock was obtained after some 1077 * other thread raced through and established a mount. 1078 */ 1079 vn_vfsunlock(vp); 1080 error = EBUSY; 1081 goto errout; 1082 } 1083 if (vp->v_flag & VNOMOUNT) { 1084 vn_vfsunlock(vp); 1085 error = EINVAL; 1086 goto errout; 1087 } 1088 } 1089 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) { 1090 uap->dataptr = NULL; 1091 uap->datalen = 0; 1092 } 1093 1094 /* 1095 * If this is a remount, we don't want to create a new VFS. 1096 * Instead, we pass the existing one with a remount flag. 1097 */ 1098 if (remount) { 1099 /* 1100 * Confirm that the mount point is the root vnode of the 1101 * file system that is being remounted. 1102 * This can happen if the user specifies a different 1103 * mount point directory pathname in the (re)mount command. 1104 * 1105 * Code below can only be reached if splice is true, so it's 1106 * safe to do vn_vfsunlock() here. 1107 */ 1108 if ((vp->v_flag & VROOT) == 0) { 1109 vn_vfsunlock(vp); 1110 error = ENOENT; 1111 goto errout; 1112 } 1113 /* 1114 * Disallow making file systems read-only unless file system 1115 * explicitly allows it in its vfssw. Ignore other flags. 1116 */ 1117 if (rdonly && vn_is_readonly(vp) == 0 && 1118 (vswp->vsw_flag & VSW_CANRWRO) == 0) { 1119 vn_vfsunlock(vp); 1120 error = EINVAL; 1121 goto errout; 1122 } 1123 /* 1124 * Changing the NBMAND setting on remounts is permitted 1125 * but logged since it can lead to unexpected behavior. 1126 * We also counsel against using it for / and /usr. 1127 */ 1128 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) || 1129 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) { 1130 cmn_err(CE_WARN, "domount: nbmand turned %s via " 1131 "remounting %s", nbmand ? "on" : "off", 1132 refstr_value(vp->v_vfsp->vfs_mntpt)); 1133 } 1134 vfsp = vp->v_vfsp; 1135 ovflags = vfsp->vfs_flag; 1136 vfsp->vfs_flag |= VFS_REMOUNT; 1137 vfsp->vfs_flag &= ~VFS_RDONLY; 1138 } else { 1139 vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP); 1140 VFS_INIT(vfsp, vfsops, NULL); 1141 } 1142 1143 VFS_HOLD(vfsp); 1144 1145 /* 1146 * The vfs_reflock is not used anymore the code below explicitly 1147 * holds it preventing others accesing it directly. 1148 */ 1149 if ((sema_tryp(&vfsp->vfs_reflock) == 0) && 1150 !(vfsp->vfs_flag & VFS_REMOUNT)) 1151 cmn_err(CE_WARN, 1152 "mount type %s couldn't get vfs_reflock\n", vswp->vsw_name); 1153 1154 /* 1155 * Lock the vfs. If this is a remount we want to avoid spurious umount 1156 * failures that happen as a side-effect of fsflush() and other mount 1157 * and unmount operations that might be going on simultaneously and 1158 * may have locked the vfs currently. To not return EBUSY immediately 1159 * here we use vfs_lock_wait() instead vfs_lock() for the remount case. 1160 */ 1161 if (!remount) { 1162 if (error = vfs_lock(vfsp)) { 1163 vfsp->vfs_flag = ovflags; 1164 if (splice) 1165 vn_vfsunlock(vp); 1166 kmem_free(vfsp, sizeof (struct vfs)); 1167 goto errout; 1168 } 1169 } else { 1170 vfs_lock_wait(vfsp); 1171 } 1172 1173 /* 1174 * Add device to mount in progress table, global mounts require special 1175 * handling. It is possible that we have already done the lookupname 1176 * on a spliced, non-global fs. If so, we don't want to do it again 1177 * since we cannot do a lookupname after taking the 1178 * wlock above. This case is for a non-spliced, non-global filesystem. 1179 */ 1180 if (!addmip) { 1181 if ((uap->flags & MS_GLOBAL) == 0 && 1182 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) { 1183 addmip = 1; 1184 } 1185 } 1186 1187 if (addmip) { 1188 bdev = bvp->v_rdev; 1189 VN_RELE(bvp); 1190 vfs_addmip(bdev, vfsp); 1191 addmip = 0; 1192 delmip = 1; 1193 } 1194 /* 1195 * Invalidate cached entry for the mount point. 1196 */ 1197 if (splice) 1198 dnlc_purge_vp(vp); 1199 1200 /* 1201 * If have an option string but the filesystem doesn't supply a 1202 * prototype options table, create a table with the global 1203 * options and sufficient room to accept all the options in the 1204 * string. Then parse the passed in option string 1205 * accepting all the options in the string. This gives us an 1206 * option table with all the proper cancel properties for the 1207 * global options. 1208 * 1209 * Filesystems that supply a prototype options table are handled 1210 * earlier in this function. 1211 */ 1212 if (uap->flags & MS_OPTIONSTR) { 1213 if (!(vswp->vsw_flag & VSW_HASPROTO)) { 1214 mntopts_t tmp_mntopts; 1215 1216 tmp_mntopts.mo_count = 0; 1217 vfs_createopttbl_extend(&tmp_mntopts, inargs, 1218 &mnt_mntopts); 1219 vfs_parsemntopts(&tmp_mntopts, inargs, 1); 1220 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts); 1221 vfs_freeopttbl(&tmp_mntopts); 1222 } 1223 } 1224 1225 /* 1226 * Serialize with zone creations. 1227 */ 1228 mount_in_progress(); 1229 /* 1230 * Instantiate (or reinstantiate) the file system. If appropriate, 1231 * splice it into the file system name space. 1232 * 1233 * We want VFS_MOUNT() to be able to override the vfs_resource 1234 * string if necessary (ie, mntfs), and also for a remount to 1235 * change the same (necessary when remounting '/' during boot). 1236 * So we set up vfs_mntpt and vfs_resource to what we think they 1237 * should be, then hand off control to VFS_MOUNT() which can 1238 * override this. 1239 * 1240 * For safety's sake, when changing vfs_resource or vfs_mntpt of 1241 * a vfs which is on the vfs list (i.e. during a remount), we must 1242 * never set those fields to NULL. Several bits of code make 1243 * assumptions that the fields are always valid. 1244 */ 1245 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1246 if (remount) { 1247 if ((oldresource = vfsp->vfs_resource) != NULL) 1248 refstr_hold(oldresource); 1249 if ((oldmntpt = vfsp->vfs_mntpt) != NULL) 1250 refstr_hold(oldmntpt); 1251 } 1252 vfs_setresource(vfsp, resource); 1253 vfs_setmntpoint(vfsp, mountpt); 1254 1255 error = VFS_MOUNT(vfsp, vp, uap, credp); 1256 1257 if (uap->flags & MS_RDONLY) 1258 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1259 if (uap->flags & MS_NOSUID) 1260 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0); 1261 if (uap->flags & MS_GLOBAL) 1262 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0); 1263 1264 if (error) { 1265 if (remount) { 1266 /* put back pre-remount options */ 1267 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1268 vfs_setmntpoint(vfsp, refstr_value(oldmntpt)); 1269 if (oldmntpt) 1270 refstr_rele(oldmntpt); 1271 vfs_setresource(vfsp, refstr_value(oldresource)); 1272 if (oldresource) 1273 refstr_rele(oldresource); 1274 vfsp->vfs_flag = ovflags; 1275 vfs_unlock(vfsp); 1276 VFS_RELE(vfsp); 1277 } else { 1278 vfs_unlock(vfsp); 1279 vfs_freemnttab(vfsp); 1280 kmem_free(vfsp, sizeof (struct vfs)); 1281 } 1282 } else { 1283 /* 1284 * Set the mount time to now 1285 */ 1286 vfsp->vfs_mtime = ddi_get_time(); 1287 if (remount) { 1288 vfsp->vfs_flag &= ~VFS_REMOUNT; 1289 if (oldresource) 1290 refstr_rele(oldresource); 1291 if (oldmntpt) 1292 refstr_rele(oldmntpt); 1293 } else if (splice) { 1294 /* 1295 * Link vfsp into the name space at the mount 1296 * point. Vfs_add() is responsible for 1297 * holding the mount point which will be 1298 * released when vfs_remove() is called. 1299 */ 1300 vfs_add(vp, vfsp, uap->flags); 1301 } else { 1302 /* 1303 * Hold the reference to file system which is 1304 * not linked into the name space. 1305 */ 1306 vfsp->vfs_zone = NULL; 1307 VFS_HOLD(vfsp); 1308 vfsp->vfs_vnodecovered = NULL; 1309 } 1310 /* 1311 * Set flags for global options encountered 1312 */ 1313 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 1314 vfsp->vfs_flag |= VFS_RDONLY; 1315 else 1316 vfsp->vfs_flag &= ~VFS_RDONLY; 1317 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 1318 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES); 1319 } else { 1320 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 1321 vfsp->vfs_flag |= VFS_NODEVICES; 1322 else 1323 vfsp->vfs_flag &= ~VFS_NODEVICES; 1324 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 1325 vfsp->vfs_flag |= VFS_NOSETUID; 1326 else 1327 vfsp->vfs_flag &= ~VFS_NOSETUID; 1328 } 1329 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 1330 vfsp->vfs_flag |= VFS_NBMAND; 1331 else 1332 vfsp->vfs_flag &= ~VFS_NBMAND; 1333 1334 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 1335 vfsp->vfs_flag |= VFS_XATTR; 1336 else 1337 vfsp->vfs_flag &= ~VFS_XATTR; 1338 1339 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 1340 vfsp->vfs_flag |= VFS_NOEXEC; 1341 else 1342 vfsp->vfs_flag &= ~VFS_NOEXEC; 1343 1344 /* 1345 * Now construct the output option string of options 1346 * we recognized. 1347 */ 1348 if (uap->flags & MS_OPTIONSTR) { 1349 vfs_list_read_lock(); 1350 copyout_error = vfs_buildoptionstr( 1351 &vfsp->vfs_mntopts, inargs, optlen); 1352 vfs_list_unlock(); 1353 if (copyout_error == 0 && 1354 (uap->flags & MS_SYSSPACE) == 0) { 1355 copyout_error = copyoutstr(inargs, opts, 1356 optlen, NULL); 1357 } 1358 } 1359 vfs_unlock(vfsp); 1360 } 1361 mount_completed(); 1362 if (splice) 1363 vn_vfsunlock(vp); 1364 1365 /* 1366 * Return vfsp to caller. 1367 */ 1368 if ((error == 0) && (copyout_error == 0)) { 1369 *vfspp = vfsp; 1370 } 1371 errout: 1372 vfs_freeopttbl(&mnt_mntopts); 1373 if (resource != NULL) 1374 kmem_free(resource, strlen(resource) + 1); 1375 if (mountpt != NULL) 1376 kmem_free(mountpt, strlen(mountpt) + 1); 1377 /* 1378 * It is possible we errored prior to adding to mount in progress 1379 * table. Must free vnode we acquired with successful lookupname. 1380 */ 1381 if (addmip) 1382 VN_RELE(bvp); 1383 if (delmip) 1384 vfs_delmip(vfsp); 1385 ASSERT(vswp != NULL); 1386 vfs_unrefvfssw(vswp); 1387 if (inargs != opts) 1388 kmem_free(inargs, MAX_MNTOPT_STR); 1389 if (copyout_error) { 1390 VFS_RELE(vfsp); 1391 error = copyout_error; 1392 } 1393 return (error); 1394 } 1395 1396 static void 1397 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath) 1398 { 1399 size_t len; 1400 refstr_t *ref; 1401 zone_t *zone = curproc->p_zone; 1402 char *sp; 1403 int have_list_lock = 0; 1404 1405 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp)); 1406 1407 /* 1408 * New path must be less than MAXPATHLEN because mntfs 1409 * will only display up to MAXPATHLEN bytes. This is currently 1410 * safe, because domount() uses pn_get(), and other callers 1411 * similarly cap the size to fewer than MAXPATHLEN bytes. 1412 */ 1413 1414 ASSERT(strlen(newpath) < MAXPATHLEN); 1415 1416 /* mntfs requires consistency while vfs list lock is held */ 1417 1418 if (VFS_ON_LIST(vfsp)) { 1419 have_list_lock = 1; 1420 vfs_list_lock(); 1421 } 1422 1423 if (*refp != NULL) 1424 refstr_rele(*refp); 1425 1426 /* Do we need to modify the path? */ 1427 1428 if (zone == global_zone || *newpath != '/') { 1429 ref = refstr_alloc(newpath); 1430 goto out; 1431 } 1432 1433 /* 1434 * Truncate the trailing '/' in the zoneroot, and merge 1435 * in the zone's rootpath with the "newpath" (resource 1436 * or mountpoint) passed in. 1437 * 1438 * The size of the required buffer is thus the size of 1439 * the buffer required for the passed-in newpath 1440 * (strlen(newpath) + 1), plus the size of the buffer 1441 * required to hold zone_rootpath (zone_rootpathlen) 1442 * minus one for one of the now-superfluous NUL 1443 * terminations, minus one for the trailing '/'. 1444 * 1445 * That gives us: 1446 * 1447 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1 1448 * 1449 * Which is what we have below. 1450 */ 1451 1452 len = strlen(newpath) + zone->zone_rootpathlen - 1; 1453 sp = kmem_alloc(len, KM_SLEEP); 1454 1455 /* 1456 * Copy everything including the trailing slash, which 1457 * we then overwrite with the NUL character. 1458 */ 1459 1460 (void) strcpy(sp, zone->zone_rootpath); 1461 sp[zone->zone_rootpathlen - 2] = '\0'; 1462 (void) strcat(sp, newpath); 1463 1464 ref = refstr_alloc(sp); 1465 kmem_free(sp, len); 1466 out: 1467 *refp = ref; 1468 1469 if (have_list_lock) { 1470 vfs_mnttab_modtimeupd(); 1471 vfs_list_unlock(); 1472 } 1473 } 1474 1475 /* 1476 * Record a mounted resource name in a vfs structure. 1477 * If vfsp is already mounted, caller must hold the vfs lock. 1478 */ 1479 void 1480 vfs_setresource(struct vfs *vfsp, const char *resource) 1481 { 1482 if (resource == NULL || resource[0] == '\0') 1483 resource = VFS_NORESOURCE; 1484 vfs_setpath(vfsp, &vfsp->vfs_resource, resource); 1485 } 1486 1487 /* 1488 * Record a mount point name in a vfs structure. 1489 * If vfsp is already mounted, caller must hold the vfs lock. 1490 */ 1491 void 1492 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt) 1493 { 1494 if (mntpt == NULL || mntpt[0] == '\0') 1495 mntpt = VFS_NOMNTPT; 1496 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt); 1497 } 1498 1499 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */ 1500 1501 refstr_t * 1502 vfs_getresource(const struct vfs *vfsp) 1503 { 1504 refstr_t *resource; 1505 1506 vfs_list_read_lock(); 1507 resource = vfsp->vfs_resource; 1508 refstr_hold(resource); 1509 vfs_list_unlock(); 1510 1511 return (resource); 1512 } 1513 1514 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */ 1515 1516 refstr_t * 1517 vfs_getmntpoint(const struct vfs *vfsp) 1518 { 1519 refstr_t *mntpt; 1520 1521 vfs_list_read_lock(); 1522 mntpt = vfsp->vfs_mntpt; 1523 refstr_hold(mntpt); 1524 vfs_list_unlock(); 1525 1526 return (mntpt); 1527 } 1528 1529 /* 1530 * Create an empty options table with enough empty slots to hold all 1531 * The options in the options string passed as an argument. 1532 * Potentially prepend another options table. 1533 * 1534 * Note: caller is responsible for locking the vfs list, if needed, 1535 * to protect mops. 1536 */ 1537 static void 1538 vfs_createopttbl_extend(mntopts_t *mops, const char *opts, 1539 const mntopts_t *mtmpl) 1540 { 1541 const char *s = opts; 1542 uint_t count; 1543 1544 if (opts == NULL || *opts == '\0') { 1545 count = 0; 1546 } else { 1547 count = 1; 1548 1549 /* 1550 * Count number of options in the string 1551 */ 1552 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) { 1553 count++; 1554 s++; 1555 } 1556 } 1557 vfs_copyopttbl_extend(mtmpl, mops, count); 1558 } 1559 1560 /* 1561 * Create an empty options table with enough empty slots to hold all 1562 * The options in the options string passed as an argument. 1563 * 1564 * This function is *not* for general use by filesystems. 1565 * 1566 * Note: caller is responsible for locking the vfs list, if needed, 1567 * to protect mops. 1568 */ 1569 void 1570 vfs_createopttbl(mntopts_t *mops, const char *opts) 1571 { 1572 vfs_createopttbl_extend(mops, opts, NULL); 1573 } 1574 1575 1576 /* 1577 * Swap two mount options tables 1578 */ 1579 static void 1580 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2) 1581 { 1582 uint_t tmpcnt; 1583 mntopt_t *tmplist; 1584 1585 tmpcnt = optbl2->mo_count; 1586 tmplist = optbl2->mo_list; 1587 optbl2->mo_count = optbl1->mo_count; 1588 optbl2->mo_list = optbl1->mo_list; 1589 optbl1->mo_count = tmpcnt; 1590 optbl1->mo_list = tmplist; 1591 } 1592 1593 static void 1594 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2) 1595 { 1596 vfs_list_lock(); 1597 vfs_swapopttbl_nolock(optbl1, optbl2); 1598 vfs_mnttab_modtimeupd(); 1599 vfs_list_unlock(); 1600 } 1601 1602 static char ** 1603 vfs_copycancelopt_extend(char **const moc, int extend) 1604 { 1605 int i = 0; 1606 int j; 1607 char **result; 1608 1609 if (moc != NULL) { 1610 for (; moc[i] != NULL; i++) 1611 /* count number of options to cancel */; 1612 } 1613 1614 if (i + extend == 0) 1615 return (NULL); 1616 1617 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP); 1618 1619 for (j = 0; j < i; j++) { 1620 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP); 1621 (void) strcpy(result[j], moc[j]); 1622 } 1623 for (; j <= i + extend; j++) 1624 result[j] = NULL; 1625 1626 return (result); 1627 } 1628 1629 static void 1630 vfs_copyopt(const mntopt_t *s, mntopt_t *d) 1631 { 1632 char *sp, *dp; 1633 1634 d->mo_flags = s->mo_flags; 1635 d->mo_data = s->mo_data; 1636 sp = s->mo_name; 1637 if (sp != NULL) { 1638 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1639 (void) strcpy(dp, sp); 1640 d->mo_name = dp; 1641 } else { 1642 d->mo_name = NULL; /* should never happen */ 1643 } 1644 1645 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0); 1646 1647 sp = s->mo_arg; 1648 if (sp != NULL) { 1649 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1650 (void) strcpy(dp, sp); 1651 d->mo_arg = dp; 1652 } else { 1653 d->mo_arg = NULL; 1654 } 1655 } 1656 1657 /* 1658 * Copy a mount options table, possibly allocating some spare 1659 * slots at the end. It is permissible to copy_extend the NULL table. 1660 */ 1661 static void 1662 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra) 1663 { 1664 uint_t i, count; 1665 mntopt_t *motbl; 1666 1667 /* 1668 * Clear out any existing stuff in the options table being initialized 1669 */ 1670 vfs_freeopttbl(dmo); 1671 count = (smo == NULL) ? 0 : smo->mo_count; 1672 if ((count + extra) == 0) /* nothing to do */ 1673 return; 1674 dmo->mo_count = count + extra; 1675 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP); 1676 dmo->mo_list = motbl; 1677 for (i = 0; i < count; i++) { 1678 vfs_copyopt(&smo->mo_list[i], &motbl[i]); 1679 } 1680 for (i = count; i < count + extra; i++) { 1681 motbl[i].mo_flags = MO_EMPTY; 1682 } 1683 } 1684 1685 /* 1686 * Copy a mount options table. 1687 * 1688 * This function is *not* for general use by filesystems. 1689 * 1690 * Note: caller is responsible for locking the vfs list, if needed, 1691 * to protect smo and dmo. 1692 */ 1693 void 1694 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo) 1695 { 1696 vfs_copyopttbl_extend(smo, dmo, 0); 1697 } 1698 1699 static char ** 1700 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2) 1701 { 1702 int c1 = 0; 1703 int c2 = 0; 1704 char **result; 1705 char **sp1, **sp2, **dp; 1706 1707 /* 1708 * First we count both lists of cancel options. 1709 * If either is NULL or has no elements, we return a copy of 1710 * the other. 1711 */ 1712 if (mop1->mo_cancel != NULL) { 1713 for (; mop1->mo_cancel[c1] != NULL; c1++) 1714 /* count cancel options in mop1 */; 1715 } 1716 1717 if (c1 == 0) 1718 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0)); 1719 1720 if (mop2->mo_cancel != NULL) { 1721 for (; mop2->mo_cancel[c2] != NULL; c2++) 1722 /* count cancel options in mop2 */; 1723 } 1724 1725 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2); 1726 1727 if (c2 == 0) 1728 return (result); 1729 1730 /* 1731 * When we get here, we've got two sets of cancel options; 1732 * we need to merge the two sets. We know that the result 1733 * array has "c1+c2+1" entries and in the end we might shrink 1734 * it. 1735 * Result now has a copy of the c1 entries from mop1; we'll 1736 * now lookup all the entries of mop2 in mop1 and copy it if 1737 * it is unique. 1738 * This operation is O(n^2) but it's only called once per 1739 * filesystem per duplicate option. This is a situation 1740 * which doesn't arise with the filesystems in ON and 1741 * n is generally 1. 1742 */ 1743 1744 dp = &result[c1]; 1745 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) { 1746 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) { 1747 if (strcmp(*sp1, *sp2) == 0) 1748 break; 1749 } 1750 if (*sp1 == NULL) { 1751 /* 1752 * Option *sp2 not found in mop1, so copy it. 1753 * The calls to vfs_copycancelopt_extend() 1754 * guarantee that there's enough room. 1755 */ 1756 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP); 1757 (void) strcpy(*dp++, *sp2); 1758 } 1759 } 1760 if (dp != &result[c1+c2]) { 1761 size_t bytes = (dp - result + 1) * sizeof (char *); 1762 char **nres = kmem_alloc(bytes, KM_SLEEP); 1763 1764 bcopy(result, nres, bytes); 1765 kmem_free(result, (c1 + c2 + 1) * sizeof (char *)); 1766 result = nres; 1767 } 1768 return (result); 1769 } 1770 1771 /* 1772 * Merge two mount option tables (outer and inner) into one. This is very 1773 * similar to "merging" global variables and automatic variables in C. 1774 * 1775 * This isn't (and doesn't have to be) fast. 1776 * 1777 * This function is *not* for general use by filesystems. 1778 * 1779 * Note: caller is responsible for locking the vfs list, if needed, 1780 * to protect omo, imo & dmo. 1781 */ 1782 void 1783 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo) 1784 { 1785 uint_t i, count; 1786 mntopt_t *mop, *motbl; 1787 uint_t freeidx; 1788 1789 /* 1790 * First determine how much space we need to allocate. 1791 */ 1792 count = omo->mo_count; 1793 for (i = 0; i < imo->mo_count; i++) { 1794 if (imo->mo_list[i].mo_flags & MO_EMPTY) 1795 continue; 1796 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL) 1797 count++; 1798 } 1799 ASSERT(count >= omo->mo_count && 1800 count <= omo->mo_count + imo->mo_count); 1801 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP); 1802 for (i = 0; i < omo->mo_count; i++) 1803 vfs_copyopt(&omo->mo_list[i], &motbl[i]); 1804 freeidx = omo->mo_count; 1805 for (i = 0; i < imo->mo_count; i++) { 1806 if (imo->mo_list[i].mo_flags & MO_EMPTY) 1807 continue; 1808 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) { 1809 char **newcanp; 1810 uint_t index = mop - omo->mo_list; 1811 1812 newcanp = vfs_mergecancelopts(mop, &motbl[index]); 1813 1814 vfs_freeopt(&motbl[index]); 1815 vfs_copyopt(&imo->mo_list[i], &motbl[index]); 1816 1817 vfs_freecancelopt(motbl[index].mo_cancel); 1818 motbl[index].mo_cancel = newcanp; 1819 } else { 1820 /* 1821 * If it's a new option, just copy it over to the first 1822 * free location. 1823 */ 1824 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]); 1825 } 1826 } 1827 dmo->mo_count = count; 1828 dmo->mo_list = motbl; 1829 } 1830 1831 /* 1832 * Functions to set and clear mount options in a mount options table. 1833 */ 1834 1835 /* 1836 * Clear a mount option, if it exists. 1837 * 1838 * The update_mnttab arg indicates whether mops is part of a vfs that is on 1839 * the vfs list. 1840 */ 1841 static void 1842 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab) 1843 { 1844 struct mntopt *mop; 1845 uint_t i, count; 1846 1847 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 1848 1849 count = mops->mo_count; 1850 for (i = 0; i < count; i++) { 1851 mop = &mops->mo_list[i]; 1852 1853 if (mop->mo_flags & MO_EMPTY) 1854 continue; 1855 if (strcmp(opt, mop->mo_name)) 1856 continue; 1857 mop->mo_flags &= ~MO_SET; 1858 if (mop->mo_arg != NULL) { 1859 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 1860 } 1861 mop->mo_arg = NULL; 1862 if (update_mnttab) 1863 vfs_mnttab_modtimeupd(); 1864 break; 1865 } 1866 } 1867 1868 void 1869 vfs_clearmntopt(struct vfs *vfsp, const char *opt) 1870 { 1871 int gotlock = 0; 1872 1873 if (VFS_ON_LIST(vfsp)) { 1874 gotlock = 1; 1875 vfs_list_lock(); 1876 } 1877 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock); 1878 if (gotlock) 1879 vfs_list_unlock(); 1880 } 1881 1882 1883 /* 1884 * Set a mount option on. If it's not found in the table, it's silently 1885 * ignored. If the option has MO_IGNORE set, it is still set unless the 1886 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag 1887 * bits can be used to toggle the MO_NODISPLAY bit for the option. 1888 * If the VFS_CREATEOPT flag bit is set then the first option slot with 1889 * MO_EMPTY set is created as the option passed in. 1890 * 1891 * The update_mnttab arg indicates whether mops is part of a vfs that is on 1892 * the vfs list. 1893 */ 1894 static void 1895 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt, 1896 const char *arg, int flags, int update_mnttab) 1897 { 1898 mntopt_t *mop; 1899 uint_t i, count; 1900 char *sp; 1901 1902 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 1903 1904 if (flags & VFS_CREATEOPT) { 1905 if (vfs_hasopt(mops, opt) != NULL) { 1906 flags &= ~VFS_CREATEOPT; 1907 } 1908 } 1909 count = mops->mo_count; 1910 for (i = 0; i < count; i++) { 1911 mop = &mops->mo_list[i]; 1912 1913 if (mop->mo_flags & MO_EMPTY) { 1914 if ((flags & VFS_CREATEOPT) == 0) 1915 continue; 1916 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP); 1917 (void) strcpy(sp, opt); 1918 mop->mo_name = sp; 1919 if (arg != NULL) 1920 mop->mo_flags = MO_HASVALUE; 1921 else 1922 mop->mo_flags = 0; 1923 } else if (strcmp(opt, mop->mo_name)) { 1924 continue; 1925 } 1926 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT)) 1927 break; 1928 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) { 1929 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP); 1930 (void) strcpy(sp, arg); 1931 } else { 1932 sp = NULL; 1933 } 1934 if (mop->mo_arg != NULL) 1935 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 1936 mop->mo_arg = sp; 1937 if (flags & VFS_DISPLAY) 1938 mop->mo_flags &= ~MO_NODISPLAY; 1939 if (flags & VFS_NODISPLAY) 1940 mop->mo_flags |= MO_NODISPLAY; 1941 mop->mo_flags |= MO_SET; 1942 if (mop->mo_cancel != NULL) { 1943 char **cp; 1944 1945 for (cp = mop->mo_cancel; *cp != NULL; cp++) 1946 vfs_clearmntopt_nolock(mops, *cp, 0); 1947 } 1948 if (update_mnttab) 1949 vfs_mnttab_modtimeupd(); 1950 break; 1951 } 1952 } 1953 1954 void 1955 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags) 1956 { 1957 int gotlock = 0; 1958 1959 if (VFS_ON_LIST(vfsp)) { 1960 gotlock = 1; 1961 vfs_list_lock(); 1962 } 1963 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock); 1964 if (gotlock) 1965 vfs_list_unlock(); 1966 } 1967 1968 1969 /* 1970 * Add a "tag" option to a mounted file system's options list. 1971 * 1972 * Note: caller is responsible for locking the vfs list, if needed, 1973 * to protect mops. 1974 */ 1975 static mntopt_t * 1976 vfs_addtag(mntopts_t *mops, const char *tag) 1977 { 1978 uint_t count; 1979 mntopt_t *mop, *motbl; 1980 1981 count = mops->mo_count + 1; 1982 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP); 1983 if (mops->mo_count) { 1984 size_t len = (count - 1) * sizeof (mntopt_t); 1985 1986 bcopy(mops->mo_list, motbl, len); 1987 kmem_free(mops->mo_list, len); 1988 } 1989 mops->mo_count = count; 1990 mops->mo_list = motbl; 1991 mop = &motbl[count - 1]; 1992 mop->mo_flags = MO_TAG; 1993 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP); 1994 (void) strcpy(mop->mo_name, tag); 1995 return (mop); 1996 } 1997 1998 /* 1999 * Allow users to set arbitrary "tags" in a vfs's mount options. 2000 * Broader use within the kernel is discouraged. 2001 */ 2002 int 2003 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2004 cred_t *cr) 2005 { 2006 vfs_t *vfsp; 2007 mntopts_t *mops; 2008 mntopt_t *mop; 2009 int found = 0; 2010 dev_t dev = makedevice(major, minor); 2011 int err = 0; 2012 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 2013 2014 /* 2015 * Find the desired mounted file system 2016 */ 2017 vfs_list_lock(); 2018 vfsp = rootvfs; 2019 do { 2020 if (vfsp->vfs_dev == dev && 2021 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2022 found = 1; 2023 break; 2024 } 2025 vfsp = vfsp->vfs_next; 2026 } while (vfsp != rootvfs); 2027 2028 if (!found) { 2029 err = EINVAL; 2030 goto out; 2031 } 2032 err = secpolicy_fs_config(cr, vfsp); 2033 if (err != 0) 2034 goto out; 2035 2036 mops = &vfsp->vfs_mntopts; 2037 /* 2038 * Add tag if it doesn't already exist 2039 */ 2040 if ((mop = vfs_hasopt(mops, tag)) == NULL) { 2041 int len; 2042 2043 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR); 2044 len = strlen(buf); 2045 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) { 2046 err = ENAMETOOLONG; 2047 goto out; 2048 } 2049 mop = vfs_addtag(mops, tag); 2050 } 2051 if ((mop->mo_flags & MO_TAG) == 0) { 2052 err = EINVAL; 2053 goto out; 2054 } 2055 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1); 2056 out: 2057 vfs_list_unlock(); 2058 kmem_free(buf, MAX_MNTOPT_STR); 2059 return (err); 2060 } 2061 2062 /* 2063 * Allow users to remove arbitrary "tags" in a vfs's mount options. 2064 * Broader use within the kernel is discouraged. 2065 */ 2066 int 2067 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2068 cred_t *cr) 2069 { 2070 vfs_t *vfsp; 2071 mntopt_t *mop; 2072 int found = 0; 2073 dev_t dev = makedevice(major, minor); 2074 int err = 0; 2075 2076 /* 2077 * Find the desired mounted file system 2078 */ 2079 vfs_list_lock(); 2080 vfsp = rootvfs; 2081 do { 2082 if (vfsp->vfs_dev == dev && 2083 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2084 found = 1; 2085 break; 2086 } 2087 vfsp = vfsp->vfs_next; 2088 } while (vfsp != rootvfs); 2089 2090 if (!found) { 2091 err = EINVAL; 2092 goto out; 2093 } 2094 err = secpolicy_fs_config(cr, vfsp); 2095 if (err != 0) 2096 goto out; 2097 2098 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) { 2099 err = EINVAL; 2100 goto out; 2101 } 2102 if ((mop->mo_flags & MO_TAG) == 0) { 2103 err = EINVAL; 2104 goto out; 2105 } 2106 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1); 2107 out: 2108 vfs_list_unlock(); 2109 return (err); 2110 } 2111 2112 /* 2113 * Function to parse an option string and fill in a mount options table. 2114 * Unknown options are silently ignored. The input option string is modified 2115 * by replacing separators with nulls. If the create flag is set, options 2116 * not found in the table are just added on the fly. The table must have 2117 * an option slot marked MO_EMPTY to add an option on the fly. 2118 * 2119 * This function is *not* for general use by filesystems. 2120 * 2121 * Note: caller is responsible for locking the vfs list, if needed, 2122 * to protect mops.. 2123 */ 2124 void 2125 vfs_parsemntopts(mntopts_t *mops, char *osp, int create) 2126 { 2127 char *s = osp, *p, *nextop, *valp, *cp, *ep; 2128 int setflg = VFS_NOFORCEOPT; 2129 2130 if (osp == NULL) 2131 return; 2132 while (*s != '\0') { 2133 p = strchr(s, ','); /* find next option */ 2134 if (p == NULL) { 2135 cp = NULL; 2136 p = s + strlen(s); 2137 } else { 2138 cp = p; /* save location of comma */ 2139 *p++ = '\0'; /* mark end and point to next option */ 2140 } 2141 nextop = p; 2142 p = strchr(s, '='); /* look for value */ 2143 if (p == NULL) { 2144 valp = NULL; /* no value supplied */ 2145 } else { 2146 ep = p; /* save location of equals */ 2147 *p++ = '\0'; /* end option and point to value */ 2148 valp = p; 2149 } 2150 /* 2151 * set option into options table 2152 */ 2153 if (create) 2154 setflg |= VFS_CREATEOPT; 2155 vfs_setmntopt_nolock(mops, s, valp, setflg, 0); 2156 if (cp != NULL) 2157 *cp = ','; /* restore the comma */ 2158 if (valp != NULL) 2159 *ep = '='; /* restore the equals */ 2160 s = nextop; 2161 } 2162 } 2163 2164 /* 2165 * Function to inquire if an option exists in a mount options table. 2166 * Returns a pointer to the option if it exists, else NULL. 2167 * 2168 * This function is *not* for general use by filesystems. 2169 * 2170 * Note: caller is responsible for locking the vfs list, if needed, 2171 * to protect mops. 2172 */ 2173 struct mntopt * 2174 vfs_hasopt(const mntopts_t *mops, const char *opt) 2175 { 2176 struct mntopt *mop; 2177 uint_t i, count; 2178 2179 count = mops->mo_count; 2180 for (i = 0; i < count; i++) { 2181 mop = &mops->mo_list[i]; 2182 2183 if (mop->mo_flags & MO_EMPTY) 2184 continue; 2185 if (strcmp(opt, mop->mo_name) == 0) 2186 return (mop); 2187 } 2188 return (NULL); 2189 } 2190 2191 /* 2192 * Function to inquire if an option is set in a mount options table. 2193 * Returns non-zero if set and fills in the arg pointer with a pointer to 2194 * the argument string or NULL if there is no argument string. 2195 */ 2196 static int 2197 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp) 2198 { 2199 struct mntopt *mop; 2200 uint_t i, count; 2201 2202 count = mops->mo_count; 2203 for (i = 0; i < count; i++) { 2204 mop = &mops->mo_list[i]; 2205 2206 if (mop->mo_flags & MO_EMPTY) 2207 continue; 2208 if (strcmp(opt, mop->mo_name)) 2209 continue; 2210 if ((mop->mo_flags & MO_SET) == 0) 2211 return (0); 2212 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0) 2213 *argp = mop->mo_arg; 2214 return (1); 2215 } 2216 return (0); 2217 } 2218 2219 2220 int 2221 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp) 2222 { 2223 int ret; 2224 2225 vfs_list_read_lock(); 2226 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp); 2227 vfs_list_unlock(); 2228 return (ret); 2229 } 2230 2231 2232 /* 2233 * Construct a comma separated string of the options set in the given 2234 * mount table, return the string in the given buffer. Return non-zero if 2235 * the buffer would overflow. 2236 * 2237 * This function is *not* for general use by filesystems. 2238 * 2239 * Note: caller is responsible for locking the vfs list, if needed, 2240 * to protect mp. 2241 */ 2242 int 2243 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len) 2244 { 2245 char *cp; 2246 uint_t i; 2247 2248 buf[0] = '\0'; 2249 cp = buf; 2250 for (i = 0; i < mp->mo_count; i++) { 2251 struct mntopt *mop; 2252 2253 mop = &mp->mo_list[i]; 2254 if (mop->mo_flags & MO_SET) { 2255 int optlen, comma = 0; 2256 2257 if (buf[0] != '\0') 2258 comma = 1; 2259 optlen = strlen(mop->mo_name); 2260 if (strlen(buf) + comma + optlen + 1 > len) 2261 goto err; 2262 if (comma) 2263 *cp++ = ','; 2264 (void) strcpy(cp, mop->mo_name); 2265 cp += optlen; 2266 /* 2267 * Append option value if there is one 2268 */ 2269 if (mop->mo_arg != NULL) { 2270 int arglen; 2271 2272 arglen = strlen(mop->mo_arg); 2273 if (strlen(buf) + arglen + 2 > len) 2274 goto err; 2275 *cp++ = '='; 2276 (void) strcpy(cp, mop->mo_arg); 2277 cp += arglen; 2278 } 2279 } 2280 } 2281 return (0); 2282 err: 2283 return (EOVERFLOW); 2284 } 2285 2286 static void 2287 vfs_freecancelopt(char **moc) 2288 { 2289 if (moc != NULL) { 2290 int ccnt = 0; 2291 char **cp; 2292 2293 for (cp = moc; *cp != NULL; cp++) { 2294 kmem_free(*cp, strlen(*cp) + 1); 2295 ccnt++; 2296 } 2297 kmem_free(moc, (ccnt + 1) * sizeof (char *)); 2298 } 2299 } 2300 2301 static void 2302 vfs_freeopt(mntopt_t *mop) 2303 { 2304 if (mop->mo_name != NULL) 2305 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1); 2306 2307 vfs_freecancelopt(mop->mo_cancel); 2308 2309 if (mop->mo_arg != NULL) 2310 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2311 } 2312 2313 /* 2314 * Free a mount options table 2315 * 2316 * This function is *not* for general use by filesystems. 2317 * 2318 * Note: caller is responsible for locking the vfs list, if needed, 2319 * to protect mp. 2320 */ 2321 void 2322 vfs_freeopttbl(mntopts_t *mp) 2323 { 2324 uint_t i, count; 2325 2326 count = mp->mo_count; 2327 for (i = 0; i < count; i++) { 2328 vfs_freeopt(&mp->mo_list[i]); 2329 } 2330 if (count) { 2331 kmem_free(mp->mo_list, sizeof (mntopt_t) * count); 2332 mp->mo_count = 0; 2333 mp->mo_list = NULL; 2334 } 2335 } 2336 2337 /* 2338 * Free any mnttab information recorded in the vfs struct. 2339 * The vfs must not be on the vfs list. 2340 */ 2341 static void 2342 vfs_freemnttab(struct vfs *vfsp) 2343 { 2344 ASSERT(!VFS_ON_LIST(vfsp)); 2345 2346 /* 2347 * Free device and mount point information 2348 */ 2349 if (vfsp->vfs_mntpt != NULL) { 2350 refstr_rele(vfsp->vfs_mntpt); 2351 vfsp->vfs_mntpt = NULL; 2352 } 2353 if (vfsp->vfs_resource != NULL) { 2354 refstr_rele(vfsp->vfs_resource); 2355 vfsp->vfs_resource = NULL; 2356 } 2357 /* 2358 * Now free mount options information 2359 */ 2360 vfs_freeopttbl(&vfsp->vfs_mntopts); 2361 } 2362 2363 /* 2364 * Return the last mnttab modification time 2365 */ 2366 void 2367 vfs_mnttab_modtime(timespec_t *ts) 2368 { 2369 ASSERT(RW_LOCK_HELD(&vfslist)); 2370 *ts = vfs_mnttab_mtime; 2371 } 2372 2373 /* 2374 * See if mnttab is changed 2375 */ 2376 void 2377 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp) 2378 { 2379 int changed; 2380 2381 *phpp = (struct pollhead *)NULL; 2382 2383 /* 2384 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime. 2385 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe 2386 * to not grab the vfs list lock because tv_sec is monotonically 2387 * increasing. 2388 */ 2389 2390 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) || 2391 (old->tv_sec != vfs_mnttab_mtime.tv_sec); 2392 if (!changed) { 2393 *phpp = &vfs_pollhd; 2394 } 2395 } 2396 2397 /* 2398 * Update the mnttab modification time and wake up any waiters for 2399 * mnttab changes 2400 */ 2401 void 2402 vfs_mnttab_modtimeupd() 2403 { 2404 hrtime_t oldhrt, newhrt; 2405 2406 ASSERT(RW_WRITE_HELD(&vfslist)); 2407 oldhrt = ts2hrt(&vfs_mnttab_mtime); 2408 gethrestime(&vfs_mnttab_mtime); 2409 newhrt = ts2hrt(&vfs_mnttab_mtime); 2410 if (oldhrt == (hrtime_t)0) 2411 vfs_mnttab_ctime = vfs_mnttab_mtime; 2412 /* 2413 * Attempt to provide unique mtime (like uniqtime but not). 2414 */ 2415 if (newhrt == oldhrt) { 2416 newhrt++; 2417 hrt2ts(newhrt, &vfs_mnttab_mtime); 2418 } 2419 pollwakeup(&vfs_pollhd, (short)POLLRDBAND); 2420 } 2421 2422 int 2423 dounmount(struct vfs *vfsp, int flag, cred_t *cr) 2424 { 2425 vnode_t *coveredvp; 2426 int error; 2427 2428 /* 2429 * Get covered vnode. This will be NULL if the vfs is not linked 2430 * into the file system name space (i.e., domount() with MNT_NOSPICE). 2431 */ 2432 coveredvp = vfsp->vfs_vnodecovered; 2433 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp)); 2434 2435 /* 2436 * Purge all dnlc entries for this vfs. 2437 */ 2438 (void) dnlc_purge_vfsp(vfsp, 0); 2439 2440 /* For forcible umount, skip VFS_SYNC() since it may hang */ 2441 if ((flag & MS_FORCE) == 0) 2442 (void) VFS_SYNC(vfsp, 0, cr); 2443 2444 /* 2445 * Lock the vfs to maintain fs status quo during unmount. This 2446 * has to be done after the sync because ufs_update tries to acquire 2447 * the vfs_reflock. 2448 */ 2449 vfs_lock_wait(vfsp); 2450 2451 if (error = VFS_UNMOUNT(vfsp, flag, cr)) { 2452 vfs_unlock(vfsp); 2453 if (coveredvp != NULL) 2454 vn_vfsunlock(coveredvp); 2455 } else if (coveredvp != NULL) { 2456 /* 2457 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered) 2458 * when it frees vfsp so we do a VN_HOLD() so we can 2459 * continue to use coveredvp afterwards. 2460 */ 2461 VN_HOLD(coveredvp); 2462 vfs_remove(vfsp); 2463 vn_vfsunlock(coveredvp); 2464 VN_RELE(coveredvp); 2465 } else { 2466 /* 2467 * Release the reference to vfs that is not linked 2468 * into the name space. 2469 */ 2470 vfs_unlock(vfsp); 2471 VFS_RELE(vfsp); 2472 } 2473 return (error); 2474 } 2475 2476 2477 /* 2478 * Vfs_unmountall() is called by uadmin() to unmount all 2479 * mounted file systems (except the root file system) during shutdown. 2480 * It follows the existing locking protocol when traversing the vfs list 2481 * to sync and unmount vfses. Even though there should be no 2482 * other thread running while the system is shutting down, it is prudent 2483 * to still follow the locking protocol. 2484 */ 2485 void 2486 vfs_unmountall(void) 2487 { 2488 struct vfs *vfsp; 2489 struct vfs *prev_vfsp = NULL; 2490 int error; 2491 2492 /* 2493 * Toss all dnlc entries now so that the per-vfs sync 2494 * and unmount operations don't have to slog through 2495 * a bunch of uninteresting vnodes over and over again. 2496 */ 2497 dnlc_purge(); 2498 2499 vfs_list_lock(); 2500 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) { 2501 prev_vfsp = vfsp->vfs_prev; 2502 2503 if (vfs_lock(vfsp) != 0) 2504 continue; 2505 error = vn_vfswlock(vfsp->vfs_vnodecovered); 2506 vfs_unlock(vfsp); 2507 if (error) 2508 continue; 2509 2510 vfs_list_unlock(); 2511 2512 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED()); 2513 (void) dounmount(vfsp, 0, CRED()); 2514 2515 /* 2516 * Since we dropped the vfslist lock above we must 2517 * verify that next_vfsp still exists, else start over. 2518 */ 2519 vfs_list_lock(); 2520 for (vfsp = rootvfs->vfs_prev; 2521 vfsp != rootvfs; vfsp = vfsp->vfs_prev) 2522 if (vfsp == prev_vfsp) 2523 break; 2524 if (vfsp == rootvfs && prev_vfsp != rootvfs) 2525 prev_vfsp = rootvfs->vfs_prev; 2526 } 2527 vfs_list_unlock(); 2528 } 2529 2530 /* 2531 * Called to add an entry to the end of the vfs mount in progress list 2532 */ 2533 void 2534 vfs_addmip(dev_t dev, struct vfs *vfsp) 2535 { 2536 struct ipmnt *mipp; 2537 2538 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP); 2539 mipp->mip_next = NULL; 2540 mipp->mip_dev = dev; 2541 mipp->mip_vfsp = vfsp; 2542 mutex_enter(&vfs_miplist_mutex); 2543 if (vfs_miplist_end != NULL) 2544 vfs_miplist_end->mip_next = mipp; 2545 else 2546 vfs_miplist = mipp; 2547 vfs_miplist_end = mipp; 2548 mutex_exit(&vfs_miplist_mutex); 2549 } 2550 2551 /* 2552 * Called to remove an entry from the mount in progress list 2553 * Either because the mount completed or it failed. 2554 */ 2555 void 2556 vfs_delmip(struct vfs *vfsp) 2557 { 2558 struct ipmnt *mipp, *mipprev; 2559 2560 mutex_enter(&vfs_miplist_mutex); 2561 mipprev = NULL; 2562 for (mipp = vfs_miplist; 2563 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) { 2564 mipprev = mipp; 2565 } 2566 if (mipp == NULL) 2567 return; /* shouldn't happen */ 2568 if (mipp == vfs_miplist_end) 2569 vfs_miplist_end = mipprev; 2570 if (mipprev == NULL) 2571 vfs_miplist = mipp->mip_next; 2572 else 2573 mipprev->mip_next = mipp->mip_next; 2574 mutex_exit(&vfs_miplist_mutex); 2575 kmem_free(mipp, sizeof (struct ipmnt)); 2576 } 2577 2578 /* 2579 * vfs_add is called by a specific filesystem's mount routine to add 2580 * the new vfs into the vfs list/hash and to cover the mounted-on vnode. 2581 * The vfs should already have been locked by the caller. 2582 * 2583 * coveredvp is NULL if this is the root. 2584 */ 2585 void 2586 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag) 2587 { 2588 int newflag; 2589 2590 ASSERT(vfs_lock_held(vfsp)); 2591 VFS_HOLD(vfsp); 2592 newflag = vfsp->vfs_flag; 2593 if (mflag & MS_RDONLY) 2594 newflag |= VFS_RDONLY; 2595 else 2596 newflag &= ~VFS_RDONLY; 2597 if (mflag & MS_NOSUID) 2598 newflag |= (VFS_NOSETUID|VFS_NODEVICES); 2599 else 2600 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES); 2601 if (mflag & MS_NOMNTTAB) 2602 newflag |= VFS_NOMNTTAB; 2603 else 2604 newflag &= ~VFS_NOMNTTAB; 2605 2606 if (coveredvp != NULL) { 2607 ASSERT(vn_vfswlock_held(coveredvp)); 2608 coveredvp->v_vfsmountedhere = vfsp; 2609 VN_HOLD(coveredvp); 2610 } 2611 vfsp->vfs_vnodecovered = coveredvp; 2612 vfsp->vfs_flag = newflag; 2613 2614 vfs_list_add(vfsp); 2615 } 2616 2617 /* 2618 * Remove a vfs from the vfs list, null out the pointer from the 2619 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer 2620 * from the vfs to the covered vnode (vfs_vnodecovered). Release the 2621 * reference to the vfs and to the covered vnode. 2622 * 2623 * Called from dounmount after it's confirmed with the file system 2624 * that the unmount is legal. 2625 */ 2626 void 2627 vfs_remove(struct vfs *vfsp) 2628 { 2629 vnode_t *vp; 2630 2631 ASSERT(vfs_lock_held(vfsp)); 2632 2633 /* 2634 * Can't unmount root. Should never happen because fs will 2635 * be busy. 2636 */ 2637 if (vfsp == rootvfs) 2638 cmn_err(CE_PANIC, "vfs_remove: unmounting root"); 2639 2640 vfs_list_remove(vfsp); 2641 2642 /* 2643 * Unhook from the file system name space. 2644 */ 2645 vp = vfsp->vfs_vnodecovered; 2646 ASSERT(vn_vfswlock_held(vp)); 2647 vp->v_vfsmountedhere = NULL; 2648 vfsp->vfs_vnodecovered = NULL; 2649 VN_RELE(vp); 2650 2651 /* 2652 * Release lock and wakeup anybody waiting. 2653 */ 2654 vfs_unlock(vfsp); 2655 VFS_RELE(vfsp); 2656 } 2657 2658 /* 2659 * Lock a filesystem to prevent access to it while mounting, 2660 * unmounting and syncing. Return EBUSY immediately if lock 2661 * can't be acquired. 2662 */ 2663 int 2664 vfs_lock(vfs_t *vfsp) 2665 { 2666 vn_vfslocks_entry_t *vpvfsentry; 2667 2668 vpvfsentry = vn_vfslocks_getlock(vfsp); 2669 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) 2670 return (0); 2671 2672 vn_vfslocks_rele(vpvfsentry); 2673 return (EBUSY); 2674 } 2675 2676 int 2677 vfs_rlock(vfs_t *vfsp) 2678 { 2679 vn_vfslocks_entry_t *vpvfsentry; 2680 2681 vpvfsentry = vn_vfslocks_getlock(vfsp); 2682 2683 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) 2684 return (0); 2685 2686 vn_vfslocks_rele(vpvfsentry); 2687 return (EBUSY); 2688 } 2689 2690 void 2691 vfs_lock_wait(vfs_t *vfsp) 2692 { 2693 vn_vfslocks_entry_t *vpvfsentry; 2694 2695 vpvfsentry = vn_vfslocks_getlock(vfsp); 2696 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER); 2697 } 2698 2699 void 2700 vfs_rlock_wait(vfs_t *vfsp) 2701 { 2702 vn_vfslocks_entry_t *vpvfsentry; 2703 2704 vpvfsentry = vn_vfslocks_getlock(vfsp); 2705 rwst_enter(&vpvfsentry->ve_lock, RW_READER); 2706 } 2707 2708 /* 2709 * Unlock a locked filesystem. 2710 */ 2711 void 2712 vfs_unlock(vfs_t *vfsp) 2713 { 2714 vn_vfslocks_entry_t *vpvfsentry; 2715 2716 /* 2717 * vfs_unlock will mimic sema_v behaviour to fix 4748018. 2718 * And these changes should remain for the patch changes as it is. 2719 */ 2720 if (panicstr) 2721 return; 2722 2723 /* 2724 * ve_refcount needs to be dropped twice here. 2725 * 1. To release refernce after a call to vfs_locks_getlock() 2726 * 2. To release the reference from the locking routines like 2727 * vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,. 2728 */ 2729 2730 vpvfsentry = vn_vfslocks_getlock(vfsp); 2731 vn_vfslocks_rele(vpvfsentry); 2732 2733 rwst_exit(&vpvfsentry->ve_lock); 2734 vn_vfslocks_rele(vpvfsentry); 2735 } 2736 2737 /* 2738 * Utility routine that allows a filesystem to construct its 2739 * fsid in "the usual way" - by munging some underlying dev_t and 2740 * the filesystem type number into the 64-bit fsid. Note that 2741 * this implicitly relies on dev_t persistence to make filesystem 2742 * id's persistent. 2743 * 2744 * There's nothing to prevent an individual fs from constructing its 2745 * fsid in a different way, and indeed they should. 2746 * 2747 * Since we want fsids to be 32-bit quantities (so that they can be 2748 * exported identically by either 32-bit or 64-bit APIs, as well as 2749 * the fact that fsid's are "known" to NFS), we compress the device 2750 * number given down to 32-bits, and panic if that isn't possible. 2751 */ 2752 void 2753 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val) 2754 { 2755 if (!cmpldev((dev32_t *)&fsi->val[0], dev)) 2756 panic("device number too big for fsid!"); 2757 fsi->val[1] = val; 2758 } 2759 2760 int 2761 vfs_lock_held(vfs_t *vfsp) 2762 { 2763 int held; 2764 vn_vfslocks_entry_t *vpvfsentry; 2765 2766 /* 2767 * vfs_lock_held will mimic sema_held behaviour 2768 * if panicstr is set. And these changes should remain 2769 * for the patch changes as it is. 2770 */ 2771 if (panicstr) 2772 return (1); 2773 2774 vpvfsentry = vn_vfslocks_getlock(vfsp); 2775 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); 2776 2777 vn_vfslocks_rele(vpvfsentry); 2778 return (held); 2779 } 2780 2781 struct _kthread * 2782 vfs_lock_owner(vfs_t *vfsp) 2783 { 2784 struct _kthread *owner; 2785 vn_vfslocks_entry_t *vpvfsentry; 2786 2787 /* 2788 * vfs_wlock_held will mimic sema_held behaviour 2789 * if panicstr is set. And these changes should remain 2790 * for the patch changes as it is. 2791 */ 2792 if (panicstr) 2793 return (NULL); 2794 2795 vpvfsentry = vn_vfslocks_getlock(vfsp); 2796 owner = rwst_owner(&vpvfsentry->ve_lock); 2797 2798 vn_vfslocks_rele(vpvfsentry); 2799 return (owner); 2800 } 2801 2802 /* 2803 * vfs list locking. 2804 * 2805 * Rather than manipulate the vfslist lock directly, we abstract into lock 2806 * and unlock routines to allow the locking implementation to be changed for 2807 * clustering. 2808 * 2809 * Whenever the vfs list is modified through its hash links, the overall list 2810 * lock must be obtained before locking the relevant hash bucket. But to see 2811 * whether a given vfs is on the list, it suffices to obtain the lock for the 2812 * hash bucket without getting the overall list lock. (See getvfs() below.) 2813 */ 2814 2815 void 2816 vfs_list_lock() 2817 { 2818 rw_enter(&vfslist, RW_WRITER); 2819 } 2820 2821 void 2822 vfs_list_read_lock() 2823 { 2824 rw_enter(&vfslist, RW_READER); 2825 } 2826 2827 void 2828 vfs_list_unlock() 2829 { 2830 rw_exit(&vfslist); 2831 } 2832 2833 /* 2834 * Low level worker routines for adding entries to and removing entries from 2835 * the vfs list. 2836 */ 2837 2838 static void 2839 vfs_hash_add(struct vfs *vfsp, int insert_at_head) 2840 { 2841 int vhno; 2842 struct vfs **hp; 2843 dev_t dev; 2844 2845 ASSERT(RW_WRITE_HELD(&vfslist)); 2846 2847 dev = expldev(vfsp->vfs_fsid.val[0]); 2848 vhno = VFSHASH(getmajor(dev), getminor(dev)); 2849 2850 mutex_enter(&rvfs_list[vhno].rvfs_lock); 2851 2852 /* 2853 * Link into the hash table, inserting it at the end, so that LOFS 2854 * with the same fsid as UFS (or other) file systems will not hide the 2855 * UFS. 2856 */ 2857 if (insert_at_head) { 2858 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head; 2859 rvfs_list[vhno].rvfs_head = vfsp; 2860 } else { 2861 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL; 2862 hp = &(*hp)->vfs_hash) 2863 continue; 2864 /* 2865 * hp now contains the address of the pointer to update 2866 * to effect the insertion. 2867 */ 2868 vfsp->vfs_hash = NULL; 2869 *hp = vfsp; 2870 } 2871 2872 rvfs_list[vhno].rvfs_len++; 2873 mutex_exit(&rvfs_list[vhno].rvfs_lock); 2874 } 2875 2876 2877 static void 2878 vfs_hash_remove(struct vfs *vfsp) 2879 { 2880 int vhno; 2881 struct vfs *tvfsp; 2882 dev_t dev; 2883 2884 ASSERT(RW_WRITE_HELD(&vfslist)); 2885 2886 dev = expldev(vfsp->vfs_fsid.val[0]); 2887 vhno = VFSHASH(getmajor(dev), getminor(dev)); 2888 2889 mutex_enter(&rvfs_list[vhno].rvfs_lock); 2890 2891 /* 2892 * Remove from hash. 2893 */ 2894 if (rvfs_list[vhno].rvfs_head == vfsp) { 2895 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash; 2896 rvfs_list[vhno].rvfs_len--; 2897 goto foundit; 2898 } 2899 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL; 2900 tvfsp = tvfsp->vfs_hash) { 2901 if (tvfsp->vfs_hash == vfsp) { 2902 tvfsp->vfs_hash = vfsp->vfs_hash; 2903 rvfs_list[vhno].rvfs_len--; 2904 goto foundit; 2905 } 2906 } 2907 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash"); 2908 2909 foundit: 2910 2911 mutex_exit(&rvfs_list[vhno].rvfs_lock); 2912 } 2913 2914 2915 void 2916 vfs_list_add(struct vfs *vfsp) 2917 { 2918 zone_t *zone; 2919 2920 /* 2921 * The zone that owns the mount is the one that performed the mount. 2922 * Note that this isn't necessarily the same as the zone mounted into. 2923 * The corresponding zone_rele() will be done when the vfs_t is 2924 * being free'd. 2925 */ 2926 vfsp->vfs_zone = curproc->p_zone; 2927 zone_hold(vfsp->vfs_zone); 2928 2929 /* 2930 * Find the zone mounted into, and put this mount on its vfs list. 2931 */ 2932 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 2933 ASSERT(zone != NULL); 2934 /* 2935 * Special casing for the root vfs. This structure is allocated 2936 * statically and hooked onto rootvfs at link time. During the 2937 * vfs_mountroot call at system startup time, the root file system's 2938 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct 2939 * as argument. The code below must detect and handle this special 2940 * case. The only apparent justification for this special casing is 2941 * to ensure that the root file system appears at the head of the 2942 * list. 2943 * 2944 * XXX: I'm assuming that it's ok to do normal list locking when 2945 * adding the entry for the root file system (this used to be 2946 * done with no locks held). 2947 */ 2948 vfs_list_lock(); 2949 /* 2950 * Link into the vfs list proper. 2951 */ 2952 if (vfsp == &root) { 2953 /* 2954 * Assert: This vfs is already on the list as its first entry. 2955 * Thus, there's nothing to do. 2956 */ 2957 ASSERT(rootvfs == vfsp); 2958 /* 2959 * Add it to the head of the global zone's vfslist. 2960 */ 2961 ASSERT(zone == global_zone); 2962 ASSERT(zone->zone_vfslist == NULL); 2963 zone->zone_vfslist = vfsp; 2964 } else { 2965 /* 2966 * Link to end of list using vfs_prev (as rootvfs is now a 2967 * doubly linked circular list) so list is in mount order for 2968 * mnttab use. 2969 */ 2970 rootvfs->vfs_prev->vfs_next = vfsp; 2971 vfsp->vfs_prev = rootvfs->vfs_prev; 2972 rootvfs->vfs_prev = vfsp; 2973 vfsp->vfs_next = rootvfs; 2974 2975 /* 2976 * Do it again for the zone-private list (which may be NULL). 2977 */ 2978 if (zone->zone_vfslist == NULL) { 2979 ASSERT(zone != global_zone); 2980 zone->zone_vfslist = vfsp; 2981 } else { 2982 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp; 2983 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev; 2984 zone->zone_vfslist->vfs_zone_prev = vfsp; 2985 vfsp->vfs_zone_next = zone->zone_vfslist; 2986 } 2987 } 2988 2989 /* 2990 * Link into the hash table, inserting it at the end, so that LOFS 2991 * with the same fsid as UFS (or other) file systems will not hide 2992 * the UFS. 2993 */ 2994 vfs_hash_add(vfsp, 0); 2995 2996 /* 2997 * update the mnttab modification time 2998 */ 2999 vfs_mnttab_modtimeupd(); 3000 vfs_list_unlock(); 3001 zone_rele(zone); 3002 } 3003 3004 void 3005 vfs_list_remove(struct vfs *vfsp) 3006 { 3007 zone_t *zone; 3008 3009 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3010 ASSERT(zone != NULL); 3011 /* 3012 * Callers are responsible for preventing attempts to unmount the 3013 * root. 3014 */ 3015 ASSERT(vfsp != rootvfs); 3016 3017 vfs_list_lock(); 3018 3019 /* 3020 * Remove from hash. 3021 */ 3022 vfs_hash_remove(vfsp); 3023 3024 /* 3025 * Remove from vfs list. 3026 */ 3027 vfsp->vfs_prev->vfs_next = vfsp->vfs_next; 3028 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev; 3029 vfsp->vfs_next = vfsp->vfs_prev = NULL; 3030 3031 /* 3032 * Remove from zone-specific vfs list. 3033 */ 3034 if (zone->zone_vfslist == vfsp) 3035 zone->zone_vfslist = vfsp->vfs_zone_next; 3036 3037 if (vfsp->vfs_zone_next == vfsp) { 3038 ASSERT(vfsp->vfs_zone_prev == vfsp); 3039 ASSERT(zone->zone_vfslist == vfsp); 3040 zone->zone_vfslist = NULL; 3041 } 3042 3043 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next; 3044 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev; 3045 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL; 3046 3047 /* 3048 * update the mnttab modification time 3049 */ 3050 vfs_mnttab_modtimeupd(); 3051 vfs_list_unlock(); 3052 zone_rele(zone); 3053 } 3054 3055 struct vfs * 3056 getvfs(fsid_t *fsid) 3057 { 3058 struct vfs *vfsp; 3059 int val0 = fsid->val[0]; 3060 int val1 = fsid->val[1]; 3061 dev_t dev = expldev(val0); 3062 int vhno = VFSHASH(getmajor(dev), getminor(dev)); 3063 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock; 3064 3065 mutex_enter(hmp); 3066 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) { 3067 if (vfsp->vfs_fsid.val[0] == val0 && 3068 vfsp->vfs_fsid.val[1] == val1) { 3069 VFS_HOLD(vfsp); 3070 mutex_exit(hmp); 3071 return (vfsp); 3072 } 3073 } 3074 mutex_exit(hmp); 3075 return (NULL); 3076 } 3077 3078 /* 3079 * Search the vfs mount in progress list for a specified device/vfs entry. 3080 * Returns 0 if the first entry in the list that the device matches has the 3081 * given vfs pointer as well. If the device matches but a different vfs 3082 * pointer is encountered in the list before the given vfs pointer then 3083 * a 1 is returned. 3084 */ 3085 3086 int 3087 vfs_devmounting(dev_t dev, struct vfs *vfsp) 3088 { 3089 int retval = 0; 3090 struct ipmnt *mipp; 3091 3092 mutex_enter(&vfs_miplist_mutex); 3093 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) { 3094 if (mipp->mip_dev == dev) { 3095 if (mipp->mip_vfsp != vfsp) 3096 retval = 1; 3097 break; 3098 } 3099 } 3100 mutex_exit(&vfs_miplist_mutex); 3101 return (retval); 3102 } 3103 3104 /* 3105 * Search the vfs list for a specified device. Returns 1, if entry is found 3106 * or 0 if no suitable entry is found. 3107 */ 3108 3109 int 3110 vfs_devismounted(dev_t dev) 3111 { 3112 struct vfs *vfsp; 3113 int found; 3114 3115 vfs_list_read_lock(); 3116 vfsp = rootvfs; 3117 found = 0; 3118 do { 3119 if (vfsp->vfs_dev == dev) { 3120 found = 1; 3121 break; 3122 } 3123 vfsp = vfsp->vfs_next; 3124 } while (vfsp != rootvfs); 3125 3126 vfs_list_unlock(); 3127 return (found); 3128 } 3129 3130 /* 3131 * Search the vfs list for a specified device. Returns a pointer to it 3132 * or NULL if no suitable entry is found. The caller of this routine 3133 * is responsible for releasing the returned vfs pointer. 3134 */ 3135 struct vfs * 3136 vfs_dev2vfsp(dev_t dev) 3137 { 3138 struct vfs *vfsp; 3139 int found; 3140 3141 vfs_list_read_lock(); 3142 vfsp = rootvfs; 3143 found = 0; 3144 do { 3145 /* 3146 * The following could be made more efficient by making 3147 * the entire loop use vfs_zone_next if the call is from 3148 * a zone. The only callers, however, ustat(2) and 3149 * umount2(2), don't seem to justify the added 3150 * complexity at present. 3151 */ 3152 if (vfsp->vfs_dev == dev && 3153 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt), 3154 curproc->p_zone)) { 3155 VFS_HOLD(vfsp); 3156 found = 1; 3157 break; 3158 } 3159 vfsp = vfsp->vfs_next; 3160 } while (vfsp != rootvfs); 3161 vfs_list_unlock(); 3162 return (found ? vfsp: NULL); 3163 } 3164 3165 /* 3166 * Search the vfs list for a specified mntpoint. Returns a pointer to it 3167 * or NULL if no suitable entry is found. The caller of this routine 3168 * is responsible for releasing the returned vfs pointer. 3169 * 3170 * Note that if multiple mntpoints match, the last one matching is 3171 * returned in an attempt to return the "top" mount when overlay 3172 * mounts are covering the same mount point. This is accomplished by starting 3173 * at the end of the list and working our way backwards, stopping at the first 3174 * matching mount. 3175 */ 3176 struct vfs * 3177 vfs_mntpoint2vfsp(const char *mp) 3178 { 3179 struct vfs *vfsp; 3180 struct vfs *retvfsp = NULL; 3181 zone_t *zone = curproc->p_zone; 3182 struct vfs *list; 3183 3184 vfs_list_read_lock(); 3185 if (getzoneid() == GLOBAL_ZONEID) { 3186 /* 3187 * The global zone may see filesystems in any zone. 3188 */ 3189 vfsp = rootvfs->vfs_prev; 3190 do { 3191 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) { 3192 retvfsp = vfsp; 3193 break; 3194 } 3195 vfsp = vfsp->vfs_prev; 3196 } while (vfsp != rootvfs->vfs_prev); 3197 } else if ((list = zone->zone_vfslist) != NULL) { 3198 const char *mntpt; 3199 3200 vfsp = list->vfs_zone_prev; 3201 do { 3202 mntpt = refstr_value(vfsp->vfs_mntpt); 3203 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); 3204 if (strcmp(mntpt, mp) == 0) { 3205 retvfsp = vfsp; 3206 break; 3207 } 3208 vfsp = vfsp->vfs_zone_prev; 3209 } while (vfsp != list->vfs_zone_prev); 3210 } 3211 if (retvfsp) 3212 VFS_HOLD(retvfsp); 3213 vfs_list_unlock(); 3214 return (retvfsp); 3215 } 3216 3217 /* 3218 * Search the vfs list for a specified vfsops. 3219 * if vfs entry is found then return 1, else 0. 3220 */ 3221 int 3222 vfs_opsinuse(vfsops_t *ops) 3223 { 3224 struct vfs *vfsp; 3225 int found; 3226 3227 vfs_list_read_lock(); 3228 vfsp = rootvfs; 3229 found = 0; 3230 do { 3231 if (vfs_getops(vfsp) == ops) { 3232 found = 1; 3233 break; 3234 } 3235 vfsp = vfsp->vfs_next; 3236 } while (vfsp != rootvfs); 3237 vfs_list_unlock(); 3238 return (found); 3239 } 3240 3241 /* 3242 * Allocate an entry in vfssw for a file system type 3243 */ 3244 struct vfssw * 3245 allocate_vfssw(char *type) 3246 { 3247 struct vfssw *vswp; 3248 3249 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) { 3250 /* 3251 * The vfssw table uses the empty string to identify an 3252 * available entry; we cannot add any type which has 3253 * a leading NUL. The string length is limited to 3254 * the size of the st_fstype array in struct stat. 3255 */ 3256 return (NULL); 3257 } 3258 3259 ASSERT(VFSSW_WRITE_LOCKED()); 3260 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) 3261 if (!ALLOCATED_VFSSW(vswp)) { 3262 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP); 3263 (void) strcpy(vswp->vsw_name, type); 3264 ASSERT(vswp->vsw_count == 0); 3265 vswp->vsw_count = 1; 3266 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL); 3267 return (vswp); 3268 } 3269 return (NULL); 3270 } 3271 3272 /* 3273 * Impose additional layer of translation between vfstype names 3274 * and module names in the filesystem. 3275 */ 3276 static char * 3277 vfs_to_modname(char *vfstype) 3278 { 3279 if (strcmp(vfstype, "proc") == 0) { 3280 vfstype = "procfs"; 3281 } else if (strcmp(vfstype, "fd") == 0) { 3282 vfstype = "fdfs"; 3283 } else if (strncmp(vfstype, "nfs", 3) == 0) { 3284 vfstype = "nfs"; 3285 } 3286 3287 return (vfstype); 3288 } 3289 3290 /* 3291 * Find a vfssw entry given a file system type name. 3292 * Try to autoload the filesystem if it's not found. 3293 * If it's installed, return the vfssw locked to prevent unloading. 3294 */ 3295 struct vfssw * 3296 vfs_getvfssw(char *type) 3297 { 3298 struct vfssw *vswp; 3299 char *modname; 3300 3301 RLOCK_VFSSW(); 3302 vswp = vfs_getvfsswbyname(type); 3303 modname = vfs_to_modname(type); 3304 3305 if (rootdir == NULL) { 3306 /* 3307 * If we haven't yet loaded the root file system, then our 3308 * _init won't be called until later. Allocate vfssw entry, 3309 * because mod_installfs won't be called. 3310 */ 3311 if (vswp == NULL) { 3312 RUNLOCK_VFSSW(); 3313 WLOCK_VFSSW(); 3314 if ((vswp = vfs_getvfsswbyname(type)) == NULL) { 3315 if ((vswp = allocate_vfssw(type)) == NULL) { 3316 WUNLOCK_VFSSW(); 3317 return (NULL); 3318 } 3319 } 3320 WUNLOCK_VFSSW(); 3321 RLOCK_VFSSW(); 3322 } 3323 if (!VFS_INSTALLED(vswp)) { 3324 RUNLOCK_VFSSW(); 3325 (void) modloadonly("fs", modname); 3326 } else 3327 RUNLOCK_VFSSW(); 3328 return (vswp); 3329 } 3330 3331 /* 3332 * Try to load the filesystem. Before calling modload(), we drop 3333 * our lock on the VFS switch table, and pick it up after the 3334 * module is loaded. However, there is a potential race: the 3335 * module could be unloaded after the call to modload() completes 3336 * but before we pick up the lock and drive on. Therefore, 3337 * we keep reloading the module until we've loaded the module 3338 * _and_ we have the lock on the VFS switch table. 3339 */ 3340 while (vswp == NULL || !VFS_INSTALLED(vswp)) { 3341 RUNLOCK_VFSSW(); 3342 if (modload("fs", modname) == -1) 3343 return (NULL); 3344 RLOCK_VFSSW(); 3345 if (vswp == NULL) 3346 if ((vswp = vfs_getvfsswbyname(type)) == NULL) 3347 break; 3348 } 3349 RUNLOCK_VFSSW(); 3350 3351 return (vswp); 3352 } 3353 3354 /* 3355 * Find a vfssw entry given a file system type name. 3356 */ 3357 struct vfssw * 3358 vfs_getvfsswbyname(char *type) 3359 { 3360 struct vfssw *vswp; 3361 3362 ASSERT(VFSSW_LOCKED()); 3363 if (type == NULL || *type == '\0') 3364 return (NULL); 3365 3366 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3367 if (strcmp(type, vswp->vsw_name) == 0) { 3368 vfs_refvfssw(vswp); 3369 return (vswp); 3370 } 3371 } 3372 3373 return (NULL); 3374 } 3375 3376 /* 3377 * Find a vfssw entry given a set of vfsops. 3378 */ 3379 struct vfssw * 3380 vfs_getvfsswbyvfsops(vfsops_t *vfsops) 3381 { 3382 struct vfssw *vswp; 3383 3384 RLOCK_VFSSW(); 3385 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3386 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) { 3387 vfs_refvfssw(vswp); 3388 RUNLOCK_VFSSW(); 3389 return (vswp); 3390 } 3391 } 3392 RUNLOCK_VFSSW(); 3393 3394 return (NULL); 3395 } 3396 3397 /* 3398 * Reference a vfssw entry. 3399 */ 3400 void 3401 vfs_refvfssw(struct vfssw *vswp) 3402 { 3403 3404 mutex_enter(&vswp->vsw_lock); 3405 vswp->vsw_count++; 3406 mutex_exit(&vswp->vsw_lock); 3407 } 3408 3409 /* 3410 * Unreference a vfssw entry. 3411 */ 3412 void 3413 vfs_unrefvfssw(struct vfssw *vswp) 3414 { 3415 3416 mutex_enter(&vswp->vsw_lock); 3417 vswp->vsw_count--; 3418 mutex_exit(&vswp->vsw_lock); 3419 } 3420 3421 int sync_timeout = 30; /* timeout for syncing a page during panic */ 3422 int sync_timeleft; /* portion of sync_timeout remaining */ 3423 3424 static int sync_retries = 20; /* number of retries when not making progress */ 3425 static int sync_triesleft; /* portion of sync_retries remaining */ 3426 3427 static pgcnt_t old_pgcnt, new_pgcnt; 3428 static int new_bufcnt, old_bufcnt; 3429 3430 /* 3431 * Sync all of the mounted filesystems, and then wait for the actual i/o to 3432 * complete. We wait by counting the number of dirty pages and buffers, 3433 * pushing them out using bio_busy() and page_busy(), and then counting again. 3434 * This routine is used during both the uadmin A_SHUTDOWN code as well as 3435 * the SYNC phase of the panic code (see comments in panic.c). It should only 3436 * be used after some higher-level mechanism has quiesced the system so that 3437 * new writes are not being initiated while we are waiting for completion. 3438 * 3439 * To ensure finite running time, our algorithm uses two timeout mechanisms: 3440 * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and 3441 * sync_triesleft (a progress counter used by the vfs_syncall() loop below). 3442 * Together these ensure that syncing completes if our i/o paths are stuck. 3443 * The counters are declared above so they can be found easily in the debugger. 3444 * 3445 * The sync_timeleft counter is reset by bio_busy() and page_busy() using the 3446 * vfs_syncprogress() subroutine whenever we make progress through the lists of 3447 * pages and buffers. It is decremented and expired by the deadman() cyclic. 3448 * When vfs_syncall() decides it is done, we disable the deadman() counter by 3449 * setting sync_timeleft to zero. This timer guards against vfs_syncall() 3450 * deadlocking or hanging inside of a broken filesystem or driver routine. 3451 * 3452 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make 3453 * sync_retries consecutive calls to bio_busy() and page_busy() without 3454 * decreasing either the number of dirty buffers or dirty pages below the 3455 * lowest count we have seen so far, we give up and return from vfs_syncall(). 3456 * 3457 * Each loop iteration ends with a call to delay() one second to allow time for 3458 * i/o completion and to permit the user time to read our progress messages. 3459 */ 3460 void 3461 vfs_syncall(void) 3462 { 3463 if (rootdir == NULL && !modrootloaded) 3464 return; /* panic during boot - no filesystems yet */ 3465 3466 printf("syncing file systems..."); 3467 vfs_syncprogress(); 3468 sync(); 3469 3470 vfs_syncprogress(); 3471 sync_triesleft = sync_retries; 3472 3473 old_bufcnt = new_bufcnt = INT_MAX; 3474 old_pgcnt = new_pgcnt = ULONG_MAX; 3475 3476 while (sync_triesleft > 0) { 3477 old_bufcnt = MIN(old_bufcnt, new_bufcnt); 3478 old_pgcnt = MIN(old_pgcnt, new_pgcnt); 3479 3480 new_bufcnt = bio_busy(B_TRUE); 3481 new_pgcnt = page_busy(B_TRUE); 3482 vfs_syncprogress(); 3483 3484 if (new_bufcnt == 0 && new_pgcnt == 0) 3485 break; 3486 3487 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt) 3488 sync_triesleft = sync_retries; 3489 else 3490 sync_triesleft--; 3491 3492 if (new_bufcnt) 3493 printf(" [%d]", new_bufcnt); 3494 if (new_pgcnt) 3495 printf(" %lu", new_pgcnt); 3496 3497 delay(hz); 3498 } 3499 3500 if (new_bufcnt != 0 || new_pgcnt != 0) 3501 printf(" done (not all i/o completed)\n"); 3502 else 3503 printf(" done\n"); 3504 3505 sync_timeleft = 0; 3506 delay(hz); 3507 } 3508 3509 /* 3510 * If we are in the middle of the sync phase of panic, reset sync_timeleft to 3511 * sync_timeout to indicate that we are making progress and the deadman() 3512 * omnipresent cyclic should not yet time us out. Note that it is safe to 3513 * store to sync_timeleft here since the deadman() is firing at high-level 3514 * on top of us. If we are racing with the deadman(), either the deadman() 3515 * will decrement the old value and then we will reset it, or we will 3516 * reset it and then the deadman() will immediately decrement it. In either 3517 * case, correct behavior results. 3518 */ 3519 void 3520 vfs_syncprogress(void) 3521 { 3522 if (panicstr) 3523 sync_timeleft = sync_timeout; 3524 } 3525 3526 /* 3527 * Map VFS flags to statvfs flags. These shouldn't really be separate 3528 * flags at all. 3529 */ 3530 uint_t 3531 vf_to_stf(uint_t vf) 3532 { 3533 uint_t stf = 0; 3534 3535 if (vf & VFS_RDONLY) 3536 stf |= ST_RDONLY; 3537 if (vf & VFS_NOSETUID) 3538 stf |= ST_NOSUID; 3539 if (vf & VFS_NOTRUNC) 3540 stf |= ST_NOTRUNC; 3541 3542 return (stf); 3543 } 3544 3545 /* 3546 * Use old-style function prototype for vfsstray() so 3547 * that we can use it anywhere in the vfsops structure. 3548 */ 3549 int vfsstray(); 3550 3551 /* 3552 * Entries for (illegal) fstype 0. 3553 */ 3554 /* ARGSUSED */ 3555 int 3556 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr) 3557 { 3558 cmn_err(CE_PANIC, "stray vfs operation"); 3559 return (0); 3560 } 3561 3562 vfsops_t vfs_strayops = { 3563 vfsstray, 3564 vfsstray, 3565 vfsstray, 3566 vfsstray, 3567 vfsstray_sync, 3568 vfsstray, 3569 vfsstray, 3570 vfsstray 3571 }; 3572 3573 /* 3574 * Entries for (illegal) fstype 0. 3575 */ 3576 int 3577 vfsstray(void) 3578 { 3579 cmn_err(CE_PANIC, "stray vfs operation"); 3580 return (0); 3581 } 3582 3583 /* 3584 * Support for dealing with forced UFS unmount and its interaction with 3585 * LOFS. Could be used by any filesystem. 3586 * See bug 1203132. 3587 */ 3588 int 3589 vfs_EIO(void) 3590 { 3591 return (EIO); 3592 } 3593 3594 /* 3595 * We've gotta define the op for sync separately, since the compiler gets 3596 * confused if we mix and match ANSI and normal style prototypes when 3597 * a "short" argument is present and spits out a warning. 3598 */ 3599 /*ARGSUSED*/ 3600 int 3601 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr) 3602 { 3603 return (EIO); 3604 } 3605 3606 vfs_t EIO_vfs; 3607 vfsops_t *EIO_vfsops; 3608 3609 /* 3610 * Called from startup() to initialize all loaded vfs's 3611 */ 3612 void 3613 vfsinit(void) 3614 { 3615 struct vfssw *vswp; 3616 int error; 3617 3618 static const fs_operation_def_t EIO_vfsops_template[] = { 3619 VFSNAME_MOUNT, vfs_EIO, 3620 VFSNAME_UNMOUNT, vfs_EIO, 3621 VFSNAME_ROOT, vfs_EIO, 3622 VFSNAME_STATVFS, vfs_EIO, 3623 VFSNAME_SYNC, (fs_generic_func_p) vfs_EIO_sync, 3624 VFSNAME_VGET, vfs_EIO, 3625 VFSNAME_MOUNTROOT, vfs_EIO, 3626 VFSNAME_FREEVFS, vfs_EIO, 3627 VFSNAME_VNSTATE, vfs_EIO, 3628 NULL, NULL 3629 }; 3630 3631 3632 /* Initialize the vnode cache (file systems may use it during init). */ 3633 3634 vn_create_cache(); 3635 3636 /* Setup event monitor framework */ 3637 3638 fem_init(); 3639 3640 /* Initialize the dummy stray file system type. */ 3641 3642 vfssw[0].vsw_vfsops = vfs_strayops; 3643 3644 /* Initialize the dummy EIO file system. */ 3645 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops); 3646 if (error != 0) { 3647 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template"); 3648 /* Shouldn't happen, but not bad enough to panic */ 3649 } 3650 3651 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL); 3652 3653 /* 3654 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup 3655 * on this vfs can immediately notice it's invalid. 3656 */ 3657 EIO_vfs.vfs_flag |= VFS_UNMOUNTED; 3658 3659 /* 3660 * Call the init routines of non-loadable filesystems only. 3661 * Filesystems which are loaded as separate modules will be 3662 * initialized by the module loading code instead. 3663 */ 3664 3665 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3666 RLOCK_VFSSW(); 3667 if (vswp->vsw_init != NULL) 3668 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name); 3669 RUNLOCK_VFSSW(); 3670 } 3671 } 3672 3673 /* 3674 * Increments the vfs reference count by one atomically. 3675 */ 3676 void 3677 vfs_hold(vfs_t *vfsp) 3678 { 3679 atomic_add_32(&vfsp->vfs_count, 1); 3680 ASSERT(vfsp->vfs_count != 0); 3681 } 3682 3683 /* 3684 * Decrements the vfs reference count by one atomically. When 3685 * vfs reference count becomes zero, it calls the file system 3686 * specific vfs_freevfs() to free up the resources. 3687 */ 3688 void 3689 vfs_rele(vfs_t *vfsp) 3690 { 3691 ASSERT(vfsp->vfs_count != 0); 3692 if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) { 3693 VFS_FREEVFS(vfsp); 3694 if (vfsp->vfs_zone) 3695 zone_rele(vfsp->vfs_zone); 3696 vfs_freemnttab(vfsp); 3697 sema_destroy(&vfsp->vfs_reflock); 3698 kmem_free(vfsp, sizeof (*vfsp)); 3699 } 3700 } 3701 3702 /* 3703 * Generic operations vector support. 3704 * 3705 * This is used to build operations vectors for both the vfs and vnode. 3706 * It's normally called only when a file system is loaded. 3707 * 3708 * There are many possible algorithms for this, including the following: 3709 * 3710 * (1) scan the list of known operations; for each, see if the file system 3711 * includes an entry for it, and fill it in as appropriate. 3712 * 3713 * (2) set up defaults for all known operations. scan the list of ops 3714 * supplied by the file system; for each which is both supplied and 3715 * known, fill it in. 3716 * 3717 * (3) sort the lists of known ops & supplied ops; scan the list, filling 3718 * in entries as we go. 3719 * 3720 * we choose (1) for simplicity, and because performance isn't critical here. 3721 * note that (2) could be sped up using a precomputed hash table on known ops. 3722 * (3) could be faster than either, but only if the lists were very large or 3723 * supplied in sorted order. 3724 * 3725 */ 3726 3727 int 3728 fs_build_vector(void *vector, int *unused_ops, 3729 const fs_operation_trans_def_t *translation, 3730 const fs_operation_def_t *operations) 3731 { 3732 int i, num_trans, num_ops, used; 3733 3734 /* Count the number of translations and the number of supplied */ 3735 /* operations. */ 3736 3737 { 3738 const fs_operation_trans_def_t *p; 3739 3740 for (num_trans = 0, p = translation; 3741 p->name != NULL; 3742 num_trans++, p++) 3743 ; 3744 } 3745 3746 { 3747 const fs_operation_def_t *p; 3748 3749 for (num_ops = 0, p = operations; 3750 p->name != NULL; 3751 num_ops++, p++) 3752 ; 3753 } 3754 3755 /* Walk through each operation known to our caller. There will be */ 3756 /* one entry in the supplied "translation table" for each. */ 3757 3758 used = 0; 3759 3760 for (i = 0; i < num_trans; i++) { 3761 int j, found; 3762 char *curname; 3763 fs_generic_func_p result; 3764 fs_generic_func_p *location; 3765 3766 curname = translation[i].name; 3767 3768 /* Look for a matching operation in the list supplied by the */ 3769 /* file system. */ 3770 3771 found = 0; 3772 3773 for (j = 0; j < num_ops; j++) { 3774 if (strcmp(operations[j].name, curname) == 0) { 3775 used++; 3776 found = 1; 3777 break; 3778 } 3779 } 3780 3781 /* If the file system is using a "placeholder" for default */ 3782 /* or error functions, grab the appropriate function out of */ 3783 /* the translation table. If the file system didn't supply */ 3784 /* this operation at all, use the default function. */ 3785 3786 if (found) { 3787 result = operations[j].func; 3788 if (result == fs_default) { 3789 result = translation[i].defaultFunc; 3790 } else if (result == fs_error) { 3791 result = translation[i].errorFunc; 3792 } else if (result == NULL) { 3793 /* Null values are PROHIBITED */ 3794 return (EINVAL); 3795 } 3796 } else { 3797 result = translation[i].defaultFunc; 3798 } 3799 3800 /* Now store the function into the operations vector. */ 3801 3802 location = (fs_generic_func_p *) 3803 (((char *)vector) + translation[i].offset); 3804 3805 *location = result; 3806 } 3807 3808 *unused_ops = num_ops - used; 3809 3810 return (0); 3811 } 3812 3813 /* Placeholder functions, should never be called. */ 3814 3815 int 3816 fs_error(void) 3817 { 3818 cmn_err(CE_PANIC, "fs_error called"); 3819 return (0); 3820 } 3821 3822 int 3823 fs_default(void) 3824 { 3825 cmn_err(CE_PANIC, "fs_default called"); 3826 return (0); 3827 } 3828 3829 #ifdef __sparc 3830 3831 /* 3832 * Part of the implementation of booting off a mirrored root 3833 * involves a change of dev_t for the root device. To 3834 * accomplish this, first remove the existing hash table 3835 * entry for the root device, convert to the new dev_t, 3836 * then re-insert in the hash table at the head of the list. 3837 */ 3838 void 3839 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype) 3840 { 3841 vfs_list_lock(); 3842 3843 vfs_hash_remove(vfsp); 3844 3845 vfsp->vfs_dev = ndev; 3846 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype); 3847 3848 vfs_hash_add(vfsp, 1); 3849 3850 vfs_list_unlock(); 3851 } 3852 3853 #else /* x86 NEWBOOT */ 3854 3855 int 3856 rootconf() 3857 { 3858 int error; 3859 struct vfssw *vsw; 3860 extern void pm_init(); 3861 char *fstyp; 3862 3863 fstyp = getrootfs(); 3864 3865 if (error = clboot_rootconf()) 3866 return (error); 3867 3868 if (modload("fs", fstyp) == -1) 3869 cmn_err(CE_PANIC, "Cannot _init %s module\n", fstyp); 3870 3871 RLOCK_VFSSW(); 3872 vsw = vfs_getvfsswbyname(fstyp); 3873 RUNLOCK_VFSSW(); 3874 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0); 3875 VFS_HOLD(rootvfs); 3876 3877 /* always mount readonly first */ 3878 rootvfs->vfs_flag |= VFS_RDONLY; 3879 3880 pm_init(); 3881 3882 if (netboot) 3883 (void) strplumb(); 3884 3885 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT); 3886 vfs_unrefvfssw(vsw); 3887 rootdev = rootvfs->vfs_dev; 3888 3889 if (error) 3890 cmn_err(CE_PANIC, "cannot mount root path %s", svm_bootpath); 3891 return (error); 3892 } 3893 3894 /* 3895 * XXX this is called by nfs only and should probably be removed 3896 * If booted with ASKNAME, prompt on the console for a filesystem 3897 * name and return it. 3898 */ 3899 void 3900 getfsname(char *askfor, char *name, size_t namelen) 3901 { 3902 if (boothowto & RB_ASKNAME) { 3903 printf("%s name: ", askfor); 3904 console_gets(name, namelen); 3905 } 3906 } 3907 3908 /* 3909 * If server_path exists, then we are booting a diskless 3910 * client. Otherwise, we default to ufs. Zfs should perhaps be 3911 * another property. 3912 */ 3913 static char * 3914 getrootfs(void) 3915 { 3916 extern char *strplumb_get_netdev_path(void); 3917 char *propstr = NULL; 3918 3919 /* check fstype property; it should be nfsdyn for diskless */ 3920 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 3921 DDI_PROP_DONTPASS, "fstype", &propstr) 3922 == DDI_SUCCESS) { 3923 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME); 3924 ddi_prop_free(propstr); 3925 } 3926 3927 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) 3928 return (rootfs.bo_fstype); 3929 3930 ++netboot; 3931 /* check if path to network interface is specified in bootpath */ 3932 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 3933 DDI_PROP_DONTPASS, "bootpath", &propstr) 3934 == DDI_SUCCESS) { 3935 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME); 3936 ddi_prop_free(propstr); 3937 } else { 3938 /* attempt to determine netdev_path via boot_mac address */ 3939 netdev_path = strplumb_get_netdev_path(); 3940 if (netdev_path == NULL) 3941 cmn_err(CE_PANIC, 3942 "Cannot find boot network interface\n"); 3943 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME); 3944 } 3945 return ("nfs"); 3946 } 3947 #endif 3948