1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 #include <sys/types.h> 43 #include <sys/t_lock.h> 44 #include <sys/param.h> 45 #include <sys/errno.h> 46 #include <sys/user.h> 47 #include <sys/fstyp.h> 48 #include <sys/kmem.h> 49 #include <sys/systm.h> 50 #include <sys/proc.h> 51 #include <sys/mount.h> 52 #include <sys/vfs.h> 53 #include <sys/vfs_opreg.h> 54 #include <sys/fem.h> 55 #include <sys/mntent.h> 56 #include <sys/stat.h> 57 #include <sys/statvfs.h> 58 #include <sys/statfs.h> 59 #include <sys/cred.h> 60 #include <sys/vnode.h> 61 #include <sys/rwstlock.h> 62 #include <sys/dnlc.h> 63 #include <sys/file.h> 64 #include <sys/time.h> 65 #include <sys/atomic.h> 66 #include <sys/cmn_err.h> 67 #include <sys/buf.h> 68 #include <sys/swap.h> 69 #include <sys/debug.h> 70 #include <sys/vnode.h> 71 #include <sys/modctl.h> 72 #include <sys/ddi.h> 73 #include <sys/pathname.h> 74 #include <sys/bootconf.h> 75 #include <sys/dumphdr.h> 76 #include <sys/dc_ki.h> 77 #include <sys/poll.h> 78 #include <sys/sunddi.h> 79 #include <sys/sysmacros.h> 80 #include <sys/zone.h> 81 #include <sys/policy.h> 82 #include <sys/ctfs.h> 83 #include <sys/objfs.h> 84 #include <sys/console.h> 85 #include <sys/reboot.h> 86 #include <sys/attr.h> 87 88 #include <vm/page.h> 89 90 #include <fs/fs_subr.h> 91 92 /* Private interfaces to create vopstats-related data structures */ 93 extern void initialize_vopstats(vopstats_t *); 94 extern vopstats_t *get_fstype_vopstats(struct vfs *, struct vfssw *); 95 extern vsk_anchor_t *get_vskstat_anchor(struct vfs *); 96 97 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int); 98 static void vfs_setmntopt_nolock(mntopts_t *, const char *, 99 const char *, int, int); 100 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **); 101 static void vfs_freemnttab(struct vfs *); 102 static void vfs_freeopt(mntopt_t *); 103 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *); 104 static void vfs_swapopttbl(mntopts_t *, mntopts_t *); 105 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int); 106 static void vfs_createopttbl_extend(mntopts_t *, const char *, 107 const mntopts_t *); 108 static char **vfs_copycancelopt_extend(char **const, int); 109 static void vfs_freecancelopt(char **); 110 static void getrootfs(char **, char **); 111 static int getmacpath(dev_info_t *, void *); 112 static void vfs_mnttabvp_setup(void); 113 114 struct ipmnt { 115 struct ipmnt *mip_next; 116 dev_t mip_dev; 117 struct vfs *mip_vfsp; 118 }; 119 120 static kmutex_t vfs_miplist_mutex; 121 static struct ipmnt *vfs_miplist = NULL; 122 static struct ipmnt *vfs_miplist_end = NULL; 123 124 static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */ 125 126 /* 127 * VFS global data. 128 */ 129 vnode_t *rootdir; /* pointer to root inode vnode. */ 130 vnode_t *devicesdir; /* pointer to inode of devices root */ 131 vnode_t *devdir; /* pointer to inode of dev root */ 132 133 char *server_rootpath; /* root path for diskless clients */ 134 char *server_hostname; /* hostname of diskless server */ 135 136 static struct vfs root; 137 static struct vfs devices; 138 static struct vfs dev; 139 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */ 140 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */ 141 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */ 142 /* must be power of 2! */ 143 timespec_t vfs_mnttab_ctime; /* mnttab created time */ 144 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */ 145 char *vfs_dummyfstype = "\0"; 146 struct pollhead vfs_pollhd; /* for mnttab pollers */ 147 struct vnode *vfs_mntdummyvp; /* to fake mnttab read/write for file events */ 148 int mntfstype; /* will be set once mnt fs is mounted */ 149 150 /* 151 * Table for generic options recognized in the VFS layer and acted 152 * on at this level before parsing file system specific options. 153 * The nosuid option is stronger than any of the devices and setuid 154 * options, so those are canceled when nosuid is seen. 155 * 156 * All options which are added here need to be added to the 157 * list of standard options in usr/src/cmd/fs.d/fslib.c as well. 158 */ 159 /* 160 * VFS Mount options table 161 */ 162 static char *ro_cancel[] = { MNTOPT_RW, NULL }; 163 static char *rw_cancel[] = { MNTOPT_RO, NULL }; 164 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL }; 165 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES, 166 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL }; 167 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL }; 168 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL }; 169 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL }; 170 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL }; 171 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL }; 172 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL }; 173 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL }; 174 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL }; 175 176 static const mntopt_t mntopts[] = { 177 /* 178 * option name cancel options default arg flags 179 */ 180 { MNTOPT_REMOUNT, NULL, NULL, 181 MO_NODISPLAY, (void *)0 }, 182 { MNTOPT_RO, ro_cancel, NULL, 0, 183 (void *)0 }, 184 { MNTOPT_RW, rw_cancel, NULL, 0, 185 (void *)0 }, 186 { MNTOPT_SUID, suid_cancel, NULL, 0, 187 (void *)0 }, 188 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0, 189 (void *)0 }, 190 { MNTOPT_DEVICES, devices_cancel, NULL, 0, 191 (void *)0 }, 192 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0, 193 (void *)0 }, 194 { MNTOPT_SETUID, setuid_cancel, NULL, 0, 195 (void *)0 }, 196 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0, 197 (void *)0 }, 198 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0, 199 (void *)0 }, 200 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0, 201 (void *)0 }, 202 { MNTOPT_EXEC, exec_cancel, NULL, 0, 203 (void *)0 }, 204 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0, 205 (void *)0 }, 206 }; 207 208 const mntopts_t vfs_mntopts = { 209 sizeof (mntopts) / sizeof (mntopt_t), 210 (mntopt_t *)&mntopts[0] 211 }; 212 213 /* 214 * File system operation dispatch functions. 215 */ 216 217 int 218 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 219 { 220 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr); 221 } 222 223 int 224 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr) 225 { 226 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr); 227 } 228 229 int 230 fsop_root(vfs_t *vfsp, vnode_t **vpp) 231 { 232 refstr_t *mntpt; 233 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp); 234 /* 235 * Make sure this root has a path. With lofs, it is possible to have 236 * a NULL mountpoint. 237 */ 238 if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { 239 mntpt = vfs_getmntpoint(vfsp); 240 vn_setpath_str(*vpp, refstr_value(mntpt), 241 strlen(refstr_value(mntpt))); 242 refstr_rele(mntpt); 243 } 244 245 return (ret); 246 } 247 248 int 249 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp) 250 { 251 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp); 252 } 253 254 int 255 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr) 256 { 257 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr); 258 } 259 260 int 261 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 262 { 263 /* 264 * In order to handle system attribute fids in a manner 265 * transparent to the underlying fs, we embed the fid for 266 * the sysattr parent object in the sysattr fid and tack on 267 * some extra bytes that only the sysattr layer knows about. 268 * 269 * This guarantees that sysattr fids are larger than other fids 270 * for this vfs. If the vfs supports sysattrs (implied 271 * by VFSFT_XVATTR support), we cannot have a size collision 272 * with XATTR_FIDSZ. 273 */ 274 if (vfs_has_feature(vfsp, VFSFT_XVATTR) && 275 fidp->fid_len == XATTR_FIDSZ) 276 return (xattr_dir_vget(vfsp, vpp, fidp)); 277 278 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp); 279 } 280 281 int 282 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason) 283 { 284 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason); 285 } 286 287 void 288 fsop_freefs(vfs_t *vfsp) 289 { 290 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp); 291 } 292 293 int 294 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate) 295 { 296 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate)); 297 } 298 299 int 300 fsop_sync_by_kind(int fstype, short flag, cred_t *cr) 301 { 302 ASSERT((fstype >= 0) && (fstype < nfstype)); 303 304 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype])) 305 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr); 306 else 307 return (ENOTSUP); 308 } 309 310 /* 311 * File system initialization. vfs_setfsops() must be called from a file 312 * system's init routine. 313 */ 314 315 static int 316 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual, 317 int *unused_ops) 318 { 319 static const fs_operation_trans_def_t vfs_ops_table[] = { 320 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount), 321 fs_nosys, fs_nosys, 322 323 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount), 324 fs_nosys, fs_nosys, 325 326 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root), 327 fs_nosys, fs_nosys, 328 329 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs), 330 fs_nosys, fs_nosys, 331 332 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync), 333 (fs_generic_func_p) fs_sync, 334 (fs_generic_func_p) fs_sync, /* No errors allowed */ 335 336 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget), 337 fs_nosys, fs_nosys, 338 339 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot), 340 fs_nosys, fs_nosys, 341 342 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs), 343 (fs_generic_func_p)fs_freevfs, 344 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */ 345 346 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate), 347 (fs_generic_func_p)fs_nosys, 348 (fs_generic_func_p)fs_nosys, 349 350 NULL, 0, NULL, NULL 351 }; 352 353 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template)); 354 } 355 356 int 357 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual) 358 { 359 int error; 360 int unused_ops; 361 362 /* 363 * Verify that fstype refers to a valid fs. Note that 364 * 0 is valid since it's used to set "stray" ops. 365 */ 366 if ((fstype < 0) || (fstype >= nfstype)) 367 return (EINVAL); 368 369 if (!ALLOCATED_VFSSW(&vfssw[fstype])) 370 return (EINVAL); 371 372 /* Set up the operations vector. */ 373 374 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops); 375 376 if (error != 0) 377 return (error); 378 379 vfssw[fstype].vsw_flag |= VSW_INSTALLED; 380 381 if (actual != NULL) 382 *actual = &vfssw[fstype].vsw_vfsops; 383 384 #if DEBUG 385 if (unused_ops != 0) 386 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied " 387 "but not used", vfssw[fstype].vsw_name, unused_ops); 388 #endif 389 390 return (0); 391 } 392 393 int 394 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual) 395 { 396 int error; 397 int unused_ops; 398 399 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP); 400 401 error = fs_copyfsops(template, *actual, &unused_ops); 402 if (error != 0) { 403 kmem_free(*actual, sizeof (vfsops_t)); 404 *actual = NULL; 405 return (error); 406 } 407 408 return (0); 409 } 410 411 /* 412 * Free a vfsops structure created as a result of vfs_makefsops(). 413 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use 414 * vfs_freevfsops_by_type(). 415 */ 416 void 417 vfs_freevfsops(vfsops_t *vfsops) 418 { 419 kmem_free(vfsops, sizeof (vfsops_t)); 420 } 421 422 /* 423 * Since the vfsops structure is part of the vfssw table and wasn't 424 * really allocated, we're not really freeing anything. We keep 425 * the name for consistency with vfs_freevfsops(). We do, however, 426 * need to take care of a little bookkeeping. 427 * NOTE: For a vfsops structure created by vfs_setfsops(), use 428 * vfs_freevfsops_by_type(). 429 */ 430 int 431 vfs_freevfsops_by_type(int fstype) 432 { 433 434 /* Verify that fstype refers to a loaded fs (and not fsid 0). */ 435 if ((fstype <= 0) || (fstype >= nfstype)) 436 return (EINVAL); 437 438 WLOCK_VFSSW(); 439 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) { 440 WUNLOCK_VFSSW(); 441 return (EINVAL); 442 } 443 444 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED; 445 WUNLOCK_VFSSW(); 446 447 return (0); 448 } 449 450 /* Support routines used to reference vfs_op */ 451 452 /* Set the operations vector for a vfs */ 453 void 454 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops) 455 { 456 vfsops_t *op; 457 458 ASSERT(vfsp != NULL); 459 ASSERT(vfsops != NULL); 460 461 op = vfsp->vfs_op; 462 membar_consumer(); 463 if (vfsp->vfs_femhead == NULL && 464 casptr(&vfsp->vfs_op, op, vfsops) == op) { 465 return; 466 } 467 fsem_setvfsops(vfsp, vfsops); 468 } 469 470 /* Retrieve the operations vector for a vfs */ 471 vfsops_t * 472 vfs_getops(vfs_t *vfsp) 473 { 474 vfsops_t *op; 475 476 ASSERT(vfsp != NULL); 477 478 op = vfsp->vfs_op; 479 membar_consumer(); 480 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) { 481 return (op); 482 } else { 483 return (fsem_getvfsops(vfsp)); 484 } 485 } 486 487 /* 488 * Returns non-zero (1) if the vfsops matches that of the vfs. 489 * Returns zero (0) if not. 490 */ 491 int 492 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops) 493 { 494 return (vfs_getops(vfsp) == vfsops); 495 } 496 497 /* 498 * Returns non-zero (1) if the file system has installed a non-default, 499 * non-error vfs_sync routine. Returns zero (0) otherwise. 500 */ 501 int 502 vfs_can_sync(vfs_t *vfsp) 503 { 504 /* vfs_sync() routine is not the default/error function */ 505 return (vfs_getops(vfsp)->vfs_sync != fs_sync); 506 } 507 508 /* 509 * Initialize a vfs structure. 510 */ 511 void 512 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data) 513 { 514 /* Other initialization has been moved to vfs_alloc() */ 515 vfsp->vfs_count = 0; 516 vfsp->vfs_next = vfsp; 517 vfsp->vfs_prev = vfsp; 518 vfsp->vfs_zone_next = vfsp; 519 vfsp->vfs_zone_prev = vfsp; 520 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL); 521 vfsimpl_setup(vfsp); 522 vfsp->vfs_data = (data); 523 vfs_setops((vfsp), (op)); 524 } 525 526 /* 527 * Allocate and initialize the vfs implementation private data 528 * structure, vfs_impl_t. 529 */ 530 void 531 vfsimpl_setup(vfs_t *vfsp) 532 { 533 int i; 534 535 if (vfsp->vfs_implp != NULL) { 536 return; 537 } 538 539 vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP); 540 /* Note that these are #define'd in vfs.h */ 541 vfsp->vfs_vskap = NULL; 542 vfsp->vfs_fstypevsp = NULL; 543 544 /* Set size of counted array, then zero the array */ 545 vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1; 546 for (i = 1; i < VFS_FEATURE_MAXSZ; i++) { 547 vfsp->vfs_featureset[i] = 0; 548 } 549 } 550 551 /* 552 * Release the vfs_impl_t structure, if it exists. Some unbundled 553 * filesystems may not use the newer version of vfs and thus 554 * would not contain this implementation private data structure. 555 */ 556 void 557 vfsimpl_teardown(vfs_t *vfsp) 558 { 559 vfs_impl_t *vip = vfsp->vfs_implp; 560 561 if (vip == NULL) 562 return; 563 564 kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t)); 565 vfsp->vfs_implp = NULL; 566 } 567 568 /* 569 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs, 570 * fstatvfs, and sysfs moved to common/syscall. 571 */ 572 573 /* 574 * Update every mounted file system. We call the vfs_sync operation of 575 * each file system type, passing it a NULL vfsp to indicate that all 576 * mounted file systems of that type should be updated. 577 */ 578 void 579 vfs_sync(int flag) 580 { 581 struct vfssw *vswp; 582 RLOCK_VFSSW(); 583 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 584 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 585 vfs_refvfssw(vswp); 586 RUNLOCK_VFSSW(); 587 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag, 588 CRED()); 589 vfs_unrefvfssw(vswp); 590 RLOCK_VFSSW(); 591 } 592 } 593 RUNLOCK_VFSSW(); 594 } 595 596 void 597 sync(void) 598 { 599 vfs_sync(0); 600 } 601 602 /* 603 * External routines. 604 */ 605 606 krwlock_t vfssw_lock; /* lock accesses to vfssw */ 607 608 /* 609 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(), 610 * but otherwise should be accessed only via vfs_list_lock() and 611 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list. 612 */ 613 static krwlock_t vfslist; 614 615 /* 616 * Mount devfs on /devices. This is done right after root is mounted 617 * to provide device access support for the system 618 */ 619 static void 620 vfs_mountdevices(void) 621 { 622 struct vfssw *vsw; 623 struct vnode *mvp; 624 struct mounta mounta = { /* fake mounta for devfs_mount() */ 625 NULL, 626 NULL, 627 MS_SYSSPACE, 628 NULL, 629 NULL, 630 0, 631 NULL, 632 0 633 }; 634 635 /* 636 * _init devfs module to fill in the vfssw 637 */ 638 if (modload("fs", "devfs") == -1) 639 panic("Cannot _init devfs module"); 640 641 /* 642 * Hold vfs 643 */ 644 RLOCK_VFSSW(); 645 vsw = vfs_getvfsswbyname("devfs"); 646 VFS_INIT(&devices, &vsw->vsw_vfsops, NULL); 647 VFS_HOLD(&devices); 648 649 /* 650 * Locate mount point 651 */ 652 if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 653 panic("Cannot find /devices"); 654 655 /* 656 * Perform the mount of /devices 657 */ 658 if (VFS_MOUNT(&devices, mvp, &mounta, CRED())) 659 panic("Cannot mount /devices"); 660 661 RUNLOCK_VFSSW(); 662 663 /* 664 * Set appropriate members and add to vfs list for mnttab display 665 */ 666 vfs_setresource(&devices, "/devices"); 667 vfs_setmntpoint(&devices, "/devices"); 668 669 /* 670 * Hold the root of /devices so it won't go away 671 */ 672 if (VFS_ROOT(&devices, &devicesdir)) 673 panic("vfs_mountdevices: not devices root"); 674 675 if (vfs_lock(&devices) != 0) { 676 VN_RELE(devicesdir); 677 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices"); 678 return; 679 } 680 681 if (vn_vfswlock(mvp) != 0) { 682 vfs_unlock(&devices); 683 VN_RELE(devicesdir); 684 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices"); 685 return; 686 } 687 688 vfs_add(mvp, &devices, 0); 689 vn_vfsunlock(mvp); 690 vfs_unlock(&devices); 691 VN_RELE(devicesdir); 692 } 693 694 /* 695 * mount the first instance of /dev to root and remain mounted 696 */ 697 static void 698 vfs_mountdev1(void) 699 { 700 struct vfssw *vsw; 701 struct vnode *mvp; 702 struct mounta mounta = { /* fake mounta for sdev_mount() */ 703 NULL, 704 NULL, 705 MS_SYSSPACE | MS_OVERLAY, 706 NULL, 707 NULL, 708 0, 709 NULL, 710 0 711 }; 712 713 /* 714 * _init dev module to fill in the vfssw 715 */ 716 if (modload("fs", "dev") == -1) 717 cmn_err(CE_PANIC, "Cannot _init dev module\n"); 718 719 /* 720 * Hold vfs 721 */ 722 RLOCK_VFSSW(); 723 vsw = vfs_getvfsswbyname("dev"); 724 VFS_INIT(&dev, &vsw->vsw_vfsops, NULL); 725 VFS_HOLD(&dev); 726 727 /* 728 * Locate mount point 729 */ 730 if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 731 cmn_err(CE_PANIC, "Cannot find /dev\n"); 732 733 /* 734 * Perform the mount of /dev 735 */ 736 if (VFS_MOUNT(&dev, mvp, &mounta, CRED())) 737 cmn_err(CE_PANIC, "Cannot mount /dev 1\n"); 738 739 RUNLOCK_VFSSW(); 740 741 /* 742 * Set appropriate members and add to vfs list for mnttab display 743 */ 744 vfs_setresource(&dev, "/dev"); 745 vfs_setmntpoint(&dev, "/dev"); 746 747 /* 748 * Hold the root of /dev so it won't go away 749 */ 750 if (VFS_ROOT(&dev, &devdir)) 751 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root"); 752 753 if (vfs_lock(&dev) != 0) { 754 VN_RELE(devdir); 755 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev"); 756 return; 757 } 758 759 if (vn_vfswlock(mvp) != 0) { 760 vfs_unlock(&dev); 761 VN_RELE(devdir); 762 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev"); 763 return; 764 } 765 766 vfs_add(mvp, &dev, 0); 767 vn_vfsunlock(mvp); 768 vfs_unlock(&dev); 769 VN_RELE(devdir); 770 } 771 772 /* 773 * Mount required filesystem. This is done right after root is mounted. 774 */ 775 static void 776 vfs_mountfs(char *module, char *spec, char *path) 777 { 778 struct vnode *mvp; 779 struct mounta mounta; 780 vfs_t *vfsp; 781 782 mounta.flags = MS_SYSSPACE | MS_DATA; 783 mounta.fstype = module; 784 mounta.spec = spec; 785 mounta.dir = path; 786 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) { 787 cmn_err(CE_WARN, "Cannot find %s", path); 788 return; 789 } 790 if (domount(NULL, &mounta, mvp, CRED(), &vfsp)) 791 cmn_err(CE_WARN, "Cannot mount %s", path); 792 else 793 VFS_RELE(vfsp); 794 VN_RELE(mvp); 795 } 796 797 /* 798 * vfs_mountroot is called by main() to mount the root filesystem. 799 */ 800 void 801 vfs_mountroot(void) 802 { 803 struct vnode *rvp = NULL; 804 char *path; 805 size_t plen; 806 struct vfssw *vswp; 807 808 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL); 809 rw_init(&vfslist, NULL, RW_DEFAULT, NULL); 810 811 /* 812 * Alloc the vfs hash bucket array and locks 813 */ 814 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP); 815 816 /* 817 * Call machine-dependent routine "rootconf" to choose a root 818 * file system type. 819 */ 820 if (rootconf()) 821 panic("vfs_mountroot: cannot mount root"); 822 /* 823 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir 824 * to point to it. These are used by lookuppn() so that it 825 * knows where to start from ('/' or '.'). 826 */ 827 vfs_setmntpoint(rootvfs, "/"); 828 if (VFS_ROOT(rootvfs, &rootdir)) 829 panic("vfs_mountroot: no root vnode"); 830 PTOU(curproc)->u_cdir = rootdir; 831 VN_HOLD(PTOU(curproc)->u_cdir); 832 PTOU(curproc)->u_rdir = NULL; 833 834 /* 835 * Setup the global zone's rootvp, now that it exists. 836 */ 837 global_zone->zone_rootvp = rootdir; 838 VN_HOLD(global_zone->zone_rootvp); 839 840 /* 841 * Notify the module code that it can begin using the 842 * root filesystem instead of the boot program's services. 843 */ 844 modrootloaded = 1; 845 /* 846 * Set up mnttab information for root 847 */ 848 vfs_setresource(rootvfs, rootfs.bo_name); 849 850 /* 851 * Notify cluster software that the root filesystem is available. 852 */ 853 clboot_mountroot(); 854 855 /* Now that we're all done with the root FS, set up its vopstats */ 856 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) { 857 /* Set flag for statistics collection */ 858 if (vswp->vsw_flag & VSW_STATS) { 859 initialize_vopstats(&rootvfs->vfs_vopstats); 860 rootvfs->vfs_flag |= VFS_STATS; 861 rootvfs->vfs_fstypevsp = 862 get_fstype_vopstats(rootvfs, vswp); 863 rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs); 864 } 865 vfs_unrefvfssw(vswp); 866 } 867 868 /* 869 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab, 870 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc. 871 */ 872 vfs_mountdevices(); 873 vfs_mountdev1(); 874 875 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT); 876 vfs_mountfs("proc", "/proc", "/proc"); 877 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); 878 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); 879 vfs_mountfs("objfs", "objfs", OBJFS_ROOT); 880 881 if (getzoneid() == GLOBAL_ZONEID) { 882 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab"); 883 } 884 885 #ifdef __sparc 886 /* 887 * This bit of magic can go away when we convert sparc to 888 * the new boot architecture based on ramdisk. 889 * 890 * Booting off a mirrored root volume: 891 * At this point, we have booted and mounted root on a 892 * single component of the mirror. Complete the boot 893 * by configuring SVM and converting the root to the 894 * dev_t of the mirrored root device. This dev_t conversion 895 * only works because the underlying device doesn't change. 896 */ 897 if (root_is_svm) { 898 if (svm_rootconf()) { 899 panic("vfs_mountroot: cannot remount root"); 900 } 901 902 /* 903 * mnttab should reflect the new root device 904 */ 905 vfs_lock_wait(rootvfs); 906 vfs_setresource(rootvfs, rootfs.bo_name); 907 vfs_unlock(rootvfs); 908 } 909 #endif /* __sparc */ 910 911 /* 912 * Look up the root device via devfs so that a dv_node is 913 * created for it. The vnode is never VN_RELE()ed. 914 * We allocate more than MAXPATHLEN so that the 915 * buffer passed to i_ddi_prompath_to_devfspath() is 916 * exactly MAXPATHLEN (the function expects a buffer 917 * of that length). 918 */ 919 plen = strlen("/devices"); 920 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); 921 (void) strcpy(path, "/devices"); 922 923 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) 924 != DDI_SUCCESS || 925 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { 926 927 /* NUL terminate in case "path" has garbage */ 928 path[plen + MAXPATHLEN - 1] = '\0'; 929 #ifdef DEBUG 930 cmn_err(CE_WARN, "!Cannot lookup root device: %s", path); 931 #endif 932 } 933 kmem_free(path, plen + MAXPATHLEN); 934 vfs_mnttabvp_setup(); 935 } 936 937 /* 938 * If remount failed and we're in a zone we need to check for the zone 939 * root path and strip it before the call to vfs_setpath(). 940 * 941 * If strpath doesn't begin with the zone_rootpath the original 942 * strpath is returned unchanged. 943 */ 944 static const char * 945 stripzonepath(const char *strpath) 946 { 947 char *str1, *str2; 948 int i; 949 zone_t *zonep = curproc->p_zone; 950 951 if (zonep->zone_rootpath == NULL || strpath == NULL) { 952 return (NULL); 953 } 954 955 /* 956 * we check for the end of the string at one past the 957 * current position because the zone_rootpath always 958 * ends with "/" but we don't want to strip that off. 959 */ 960 str1 = zonep->zone_rootpath; 961 str2 = (char *)strpath; 962 ASSERT(str1[0] != '\0'); 963 for (i = 0; str1[i + 1] != '\0'; i++) { 964 if (str1[i] != str2[i]) 965 return ((char *)strpath); 966 } 967 return (&str2[i]); 968 } 969 970 /* 971 * Common mount code. Called from the system call entry point, from autofs, 972 * nfsv4 trigger mounts, and from pxfs. 973 * 974 * Takes the effective file system type, mount arguments, the mount point 975 * vnode, flags specifying whether the mount is a remount and whether it 976 * should be entered into the vfs list, and credentials. Fills in its vfspp 977 * parameter with the mounted file system instance's vfs. 978 * 979 * Note that the effective file system type is specified as a string. It may 980 * be null, in which case it's determined from the mount arguments, and may 981 * differ from the type specified in the mount arguments; this is a hook to 982 * allow interposition when instantiating file system instances. 983 * 984 * The caller is responsible for releasing its own hold on the mount point 985 * vp (this routine does its own hold when necessary). 986 * Also note that for remounts, the mount point vp should be the vnode for 987 * the root of the file system rather than the vnode that the file system 988 * is mounted on top of. 989 */ 990 int 991 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, 992 struct vfs **vfspp) 993 { 994 struct vfssw *vswp; 995 vfsops_t *vfsops; 996 struct vfs *vfsp; 997 struct vnode *bvp; 998 dev_t bdev = 0; 999 mntopts_t mnt_mntopts; 1000 int error = 0; 1001 int copyout_error = 0; 1002 int ovflags; 1003 char *opts = uap->optptr; 1004 char *inargs = opts; 1005 int optlen = uap->optlen; 1006 int remount; 1007 int rdonly; 1008 int nbmand = 0; 1009 int delmip = 0; 1010 int addmip = 0; 1011 int splice = ((uap->flags & MS_NOSPLICE) == 0); 1012 int fromspace = (uap->flags & MS_SYSSPACE) ? 1013 UIO_SYSSPACE : UIO_USERSPACE; 1014 char *resource = NULL, *mountpt = NULL; 1015 refstr_t *oldresource, *oldmntpt; 1016 struct pathname pn, rpn; 1017 vsk_anchor_t *vskap; 1018 1019 /* 1020 * The v_flag value for the mount point vp is permanently set 1021 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine 1022 * for mount point locking. 1023 */ 1024 mutex_enter(&vp->v_lock); 1025 vp->v_flag |= VVFSLOCK; 1026 mutex_exit(&vp->v_lock); 1027 1028 mnt_mntopts.mo_count = 0; 1029 /* 1030 * Find the ops vector to use to invoke the file system-specific mount 1031 * method. If the fsname argument is non-NULL, use it directly. 1032 * Otherwise, dig the file system type information out of the mount 1033 * arguments. 1034 * 1035 * A side effect is to hold the vfssw entry. 1036 * 1037 * Mount arguments can be specified in several ways, which are 1038 * distinguished by flag bit settings. The preferred way is to set 1039 * MS_OPTIONSTR, indicating an 8 argument mount with the file system 1040 * type supplied as a character string and the last two arguments 1041 * being a pointer to a character buffer and the size of the buffer. 1042 * On entry, the buffer holds a null terminated list of options; on 1043 * return, the string is the list of options the file system 1044 * recognized. If MS_DATA is set arguments five and six point to a 1045 * block of binary data which the file system interprets. 1046 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA 1047 * consistently with these conventions. To handle them, we check to 1048 * see whether the pointer to the file system name has a numeric value 1049 * less than 256. If so, we treat it as an index. 1050 */ 1051 if (fsname != NULL) { 1052 if ((vswp = vfs_getvfssw(fsname)) == NULL) { 1053 return (EINVAL); 1054 } 1055 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) { 1056 size_t n; 1057 uint_t fstype; 1058 char name[FSTYPSZ]; 1059 1060 if ((fstype = (uintptr_t)uap->fstype) < 256) { 1061 RLOCK_VFSSW(); 1062 if (fstype == 0 || fstype >= nfstype || 1063 !ALLOCATED_VFSSW(&vfssw[fstype])) { 1064 RUNLOCK_VFSSW(); 1065 return (EINVAL); 1066 } 1067 (void) strcpy(name, vfssw[fstype].vsw_name); 1068 RUNLOCK_VFSSW(); 1069 if ((vswp = vfs_getvfssw(name)) == NULL) 1070 return (EINVAL); 1071 } else { 1072 /* 1073 * Handle either kernel or user address space. 1074 */ 1075 if (uap->flags & MS_SYSSPACE) { 1076 error = copystr(uap->fstype, name, 1077 FSTYPSZ, &n); 1078 } else { 1079 error = copyinstr(uap->fstype, name, 1080 FSTYPSZ, &n); 1081 } 1082 if (error) { 1083 if (error == ENAMETOOLONG) 1084 return (EINVAL); 1085 return (error); 1086 } 1087 if ((vswp = vfs_getvfssw(name)) == NULL) 1088 return (EINVAL); 1089 } 1090 } else { 1091 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL) 1092 return (EINVAL); 1093 } 1094 if (!VFS_INSTALLED(vswp)) 1095 return (EINVAL); 1096 vfsops = &vswp->vsw_vfsops; 1097 1098 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts); 1099 /* 1100 * Fetch mount options and parse them for generic vfs options 1101 */ 1102 if (uap->flags & MS_OPTIONSTR) { 1103 /* 1104 * Limit the buffer size 1105 */ 1106 if (optlen < 0 || optlen > MAX_MNTOPT_STR) { 1107 error = EINVAL; 1108 goto errout; 1109 } 1110 if ((uap->flags & MS_SYSSPACE) == 0) { 1111 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 1112 inargs[0] = '\0'; 1113 if (optlen) { 1114 error = copyinstr(opts, inargs, (size_t)optlen, 1115 NULL); 1116 if (error) { 1117 goto errout; 1118 } 1119 } 1120 } 1121 vfs_parsemntopts(&mnt_mntopts, inargs, 0); 1122 } 1123 /* 1124 * Flag bits override the options string. 1125 */ 1126 if (uap->flags & MS_REMOUNT) 1127 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0); 1128 if (uap->flags & MS_RDONLY) 1129 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0); 1130 if (uap->flags & MS_NOSUID) 1131 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 1132 1133 /* 1134 * Check if this is a remount; must be set in the option string and 1135 * the file system must support a remount option. 1136 */ 1137 if (remount = vfs_optionisset_nolock(&mnt_mntopts, 1138 MNTOPT_REMOUNT, NULL)) { 1139 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) { 1140 error = ENOTSUP; 1141 goto errout; 1142 } 1143 uap->flags |= MS_REMOUNT; 1144 } 1145 1146 /* 1147 * uap->flags and vfs_optionisset() should agree. 1148 */ 1149 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) { 1150 uap->flags |= MS_RDONLY; 1151 } 1152 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) { 1153 uap->flags |= MS_NOSUID; 1154 } 1155 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL); 1156 ASSERT(splice || !remount); 1157 /* 1158 * If we are splicing the fs into the namespace, 1159 * perform mount point checks. 1160 * 1161 * We want to resolve the path for the mount point to eliminate 1162 * '.' and ".." and symlinks in mount points; we can't do the 1163 * same for the resource string, since it would turn 1164 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do 1165 * this before grabbing vn_vfswlock(), because otherwise we 1166 * would deadlock with lookuppn(). 1167 */ 1168 if (splice) { 1169 ASSERT(vp->v_count > 0); 1170 1171 /* 1172 * Pick up mount point and device from appropriate space. 1173 */ 1174 if (pn_get(uap->spec, fromspace, &pn) == 0) { 1175 resource = kmem_alloc(pn.pn_pathlen + 1, 1176 KM_SLEEP); 1177 (void) strcpy(resource, pn.pn_path); 1178 pn_free(&pn); 1179 } 1180 /* 1181 * Do a lookupname prior to taking the 1182 * writelock. Mark this as completed if 1183 * successful for later cleanup and addition to 1184 * the mount in progress table. 1185 */ 1186 if ((uap->flags & MS_GLOBAL) == 0 && 1187 lookupname(uap->spec, fromspace, 1188 FOLLOW, NULL, &bvp) == 0) { 1189 addmip = 1; 1190 } 1191 1192 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) { 1193 pathname_t *pnp; 1194 1195 if (*pn.pn_path != '/') { 1196 error = EINVAL; 1197 pn_free(&pn); 1198 goto errout; 1199 } 1200 pn_alloc(&rpn); 1201 /* 1202 * Kludge to prevent autofs from deadlocking with 1203 * itself when it calls domount(). 1204 * 1205 * If autofs is calling, it is because it is doing 1206 * (autofs) mounts in the process of an NFS mount. A 1207 * lookuppn() here would cause us to block waiting for 1208 * said NFS mount to complete, which can't since this 1209 * is the thread that was supposed to doing it. 1210 */ 1211 if (fromspace == UIO_USERSPACE) { 1212 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL, 1213 NULL)) == 0) { 1214 pnp = &rpn; 1215 } else { 1216 /* 1217 * The file disappeared or otherwise 1218 * became inaccessible since we opened 1219 * it; might as well fail the mount 1220 * since the mount point is no longer 1221 * accessible. 1222 */ 1223 pn_free(&rpn); 1224 pn_free(&pn); 1225 goto errout; 1226 } 1227 } else { 1228 pnp = &pn; 1229 } 1230 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP); 1231 (void) strcpy(mountpt, pnp->pn_path); 1232 1233 /* 1234 * If the addition of the zone's rootpath 1235 * would push us over a total path length 1236 * of MAXPATHLEN, we fail the mount with 1237 * ENAMETOOLONG, which is what we would have 1238 * gotten if we were trying to perform the same 1239 * mount in the global zone. 1240 * 1241 * strlen() doesn't count the trailing 1242 * '\0', but zone_rootpathlen counts both a 1243 * trailing '/' and the terminating '\0'. 1244 */ 1245 if ((curproc->p_zone->zone_rootpathlen - 1 + 1246 strlen(mountpt)) > MAXPATHLEN || 1247 (resource != NULL && 1248 (curproc->p_zone->zone_rootpathlen - 1 + 1249 strlen(resource)) > MAXPATHLEN)) { 1250 error = ENAMETOOLONG; 1251 } 1252 1253 pn_free(&rpn); 1254 pn_free(&pn); 1255 } 1256 1257 if (error) 1258 goto errout; 1259 1260 /* 1261 * Prevent path name resolution from proceeding past 1262 * the mount point. 1263 */ 1264 if (vn_vfswlock(vp) != 0) { 1265 error = EBUSY; 1266 goto errout; 1267 } 1268 1269 /* 1270 * Verify that it's legitimate to establish a mount on 1271 * the prospective mount point. 1272 */ 1273 if (vn_mountedvfs(vp) != NULL) { 1274 /* 1275 * The mount point lock was obtained after some 1276 * other thread raced through and established a mount. 1277 */ 1278 vn_vfsunlock(vp); 1279 error = EBUSY; 1280 goto errout; 1281 } 1282 if (vp->v_flag & VNOMOUNT) { 1283 vn_vfsunlock(vp); 1284 error = EINVAL; 1285 goto errout; 1286 } 1287 } 1288 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) { 1289 uap->dataptr = NULL; 1290 uap->datalen = 0; 1291 } 1292 1293 /* 1294 * If this is a remount, we don't want to create a new VFS. 1295 * Instead, we pass the existing one with a remount flag. 1296 */ 1297 if (remount) { 1298 /* 1299 * Confirm that the mount point is the root vnode of the 1300 * file system that is being remounted. 1301 * This can happen if the user specifies a different 1302 * mount point directory pathname in the (re)mount command. 1303 * 1304 * Code below can only be reached if splice is true, so it's 1305 * safe to do vn_vfsunlock() here. 1306 */ 1307 if ((vp->v_flag & VROOT) == 0) { 1308 vn_vfsunlock(vp); 1309 error = ENOENT; 1310 goto errout; 1311 } 1312 /* 1313 * Disallow making file systems read-only unless file system 1314 * explicitly allows it in its vfssw. Ignore other flags. 1315 */ 1316 if (rdonly && vn_is_readonly(vp) == 0 && 1317 (vswp->vsw_flag & VSW_CANRWRO) == 0) { 1318 vn_vfsunlock(vp); 1319 error = EINVAL; 1320 goto errout; 1321 } 1322 /* 1323 * Disallow changing the NBMAND disposition of the file 1324 * system on remounts. 1325 */ 1326 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) || 1327 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) { 1328 vn_vfsunlock(vp); 1329 error = EINVAL; 1330 goto errout; 1331 } 1332 vfsp = vp->v_vfsp; 1333 ovflags = vfsp->vfs_flag; 1334 vfsp->vfs_flag |= VFS_REMOUNT; 1335 vfsp->vfs_flag &= ~VFS_RDONLY; 1336 } else { 1337 vfsp = vfs_alloc(KM_SLEEP); 1338 VFS_INIT(vfsp, vfsops, NULL); 1339 } 1340 1341 VFS_HOLD(vfsp); 1342 1343 /* 1344 * The vfs_reflock is not used anymore the code below explicitly 1345 * holds it preventing others accesing it directly. 1346 */ 1347 if ((sema_tryp(&vfsp->vfs_reflock) == 0) && 1348 !(vfsp->vfs_flag & VFS_REMOUNT)) 1349 cmn_err(CE_WARN, 1350 "mount type %s couldn't get vfs_reflock", vswp->vsw_name); 1351 1352 /* 1353 * Lock the vfs. If this is a remount we want to avoid spurious umount 1354 * failures that happen as a side-effect of fsflush() and other mount 1355 * and unmount operations that might be going on simultaneously and 1356 * may have locked the vfs currently. To not return EBUSY immediately 1357 * here we use vfs_lock_wait() instead vfs_lock() for the remount case. 1358 */ 1359 if (!remount) { 1360 if (error = vfs_lock(vfsp)) { 1361 vfsp->vfs_flag = ovflags; 1362 if (splice) 1363 vn_vfsunlock(vp); 1364 vfs_free(vfsp); 1365 goto errout; 1366 } 1367 } else { 1368 vfs_lock_wait(vfsp); 1369 } 1370 1371 /* 1372 * Add device to mount in progress table, global mounts require special 1373 * handling. It is possible that we have already done the lookupname 1374 * on a spliced, non-global fs. If so, we don't want to do it again 1375 * since we cannot do a lookupname after taking the 1376 * wlock above. This case is for a non-spliced, non-global filesystem. 1377 */ 1378 if (!addmip) { 1379 if ((uap->flags & MS_GLOBAL) == 0 && 1380 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) { 1381 addmip = 1; 1382 } 1383 } 1384 1385 if (addmip) { 1386 bdev = bvp->v_rdev; 1387 VN_RELE(bvp); 1388 vfs_addmip(bdev, vfsp); 1389 addmip = 0; 1390 delmip = 1; 1391 } 1392 /* 1393 * Invalidate cached entry for the mount point. 1394 */ 1395 if (splice) 1396 dnlc_purge_vp(vp); 1397 1398 /* 1399 * If have an option string but the filesystem doesn't supply a 1400 * prototype options table, create a table with the global 1401 * options and sufficient room to accept all the options in the 1402 * string. Then parse the passed in option string 1403 * accepting all the options in the string. This gives us an 1404 * option table with all the proper cancel properties for the 1405 * global options. 1406 * 1407 * Filesystems that supply a prototype options table are handled 1408 * earlier in this function. 1409 */ 1410 if (uap->flags & MS_OPTIONSTR) { 1411 if (!(vswp->vsw_flag & VSW_HASPROTO)) { 1412 mntopts_t tmp_mntopts; 1413 1414 tmp_mntopts.mo_count = 0; 1415 vfs_createopttbl_extend(&tmp_mntopts, inargs, 1416 &mnt_mntopts); 1417 vfs_parsemntopts(&tmp_mntopts, inargs, 1); 1418 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts); 1419 vfs_freeopttbl(&tmp_mntopts); 1420 } 1421 } 1422 1423 /* 1424 * Serialize with zone creations. 1425 */ 1426 mount_in_progress(); 1427 /* 1428 * Instantiate (or reinstantiate) the file system. If appropriate, 1429 * splice it into the file system name space. 1430 * 1431 * We want VFS_MOUNT() to be able to override the vfs_resource 1432 * string if necessary (ie, mntfs), and also for a remount to 1433 * change the same (necessary when remounting '/' during boot). 1434 * So we set up vfs_mntpt and vfs_resource to what we think they 1435 * should be, then hand off control to VFS_MOUNT() which can 1436 * override this. 1437 * 1438 * For safety's sake, when changing vfs_resource or vfs_mntpt of 1439 * a vfs which is on the vfs list (i.e. during a remount), we must 1440 * never set those fields to NULL. Several bits of code make 1441 * assumptions that the fields are always valid. 1442 */ 1443 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1444 if (remount) { 1445 if ((oldresource = vfsp->vfs_resource) != NULL) 1446 refstr_hold(oldresource); 1447 if ((oldmntpt = vfsp->vfs_mntpt) != NULL) 1448 refstr_hold(oldmntpt); 1449 } 1450 vfs_setresource(vfsp, resource); 1451 vfs_setmntpoint(vfsp, mountpt); 1452 1453 /* 1454 * going to mount on this vnode, so notify. 1455 */ 1456 vnevent_mountedover(vp, NULL); 1457 error = VFS_MOUNT(vfsp, vp, uap, credp); 1458 1459 if (uap->flags & MS_RDONLY) 1460 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1461 if (uap->flags & MS_NOSUID) 1462 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0); 1463 if (uap->flags & MS_GLOBAL) 1464 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0); 1465 1466 if (error) { 1467 if (remount) { 1468 /* put back pre-remount options */ 1469 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1470 vfs_setmntpoint(vfsp, (stripzonepath( 1471 refstr_value(oldmntpt)))); 1472 if (oldmntpt) 1473 refstr_rele(oldmntpt); 1474 vfs_setresource(vfsp, (stripzonepath( 1475 refstr_value(oldresource)))); 1476 if (oldresource) 1477 refstr_rele(oldresource); 1478 vfsp->vfs_flag = ovflags; 1479 vfs_unlock(vfsp); 1480 VFS_RELE(vfsp); 1481 } else { 1482 vfs_unlock(vfsp); 1483 vfs_freemnttab(vfsp); 1484 vfs_free(vfsp); 1485 } 1486 } else { 1487 /* 1488 * Set the mount time to now 1489 */ 1490 vfsp->vfs_mtime = ddi_get_time(); 1491 if (remount) { 1492 vfsp->vfs_flag &= ~VFS_REMOUNT; 1493 if (oldresource) 1494 refstr_rele(oldresource); 1495 if (oldmntpt) 1496 refstr_rele(oldmntpt); 1497 } else if (splice) { 1498 /* 1499 * Link vfsp into the name space at the mount 1500 * point. Vfs_add() is responsible for 1501 * holding the mount point which will be 1502 * released when vfs_remove() is called. 1503 */ 1504 vfs_add(vp, vfsp, uap->flags); 1505 } else { 1506 /* 1507 * Hold the reference to file system which is 1508 * not linked into the name space. 1509 */ 1510 vfsp->vfs_zone = NULL; 1511 VFS_HOLD(vfsp); 1512 vfsp->vfs_vnodecovered = NULL; 1513 } 1514 /* 1515 * Set flags for global options encountered 1516 */ 1517 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 1518 vfsp->vfs_flag |= VFS_RDONLY; 1519 else 1520 vfsp->vfs_flag &= ~VFS_RDONLY; 1521 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 1522 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES); 1523 } else { 1524 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 1525 vfsp->vfs_flag |= VFS_NODEVICES; 1526 else 1527 vfsp->vfs_flag &= ~VFS_NODEVICES; 1528 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 1529 vfsp->vfs_flag |= VFS_NOSETUID; 1530 else 1531 vfsp->vfs_flag &= ~VFS_NOSETUID; 1532 } 1533 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 1534 vfsp->vfs_flag |= VFS_NBMAND; 1535 else 1536 vfsp->vfs_flag &= ~VFS_NBMAND; 1537 1538 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 1539 vfsp->vfs_flag |= VFS_XATTR; 1540 else 1541 vfsp->vfs_flag &= ~VFS_XATTR; 1542 1543 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 1544 vfsp->vfs_flag |= VFS_NOEXEC; 1545 else 1546 vfsp->vfs_flag &= ~VFS_NOEXEC; 1547 1548 /* 1549 * Now construct the output option string of options 1550 * we recognized. 1551 */ 1552 if (uap->flags & MS_OPTIONSTR) { 1553 vfs_list_read_lock(); 1554 copyout_error = vfs_buildoptionstr( 1555 &vfsp->vfs_mntopts, inargs, optlen); 1556 vfs_list_unlock(); 1557 if (copyout_error == 0 && 1558 (uap->flags & MS_SYSSPACE) == 0) { 1559 copyout_error = copyoutstr(inargs, opts, 1560 optlen, NULL); 1561 } 1562 } 1563 1564 /* 1565 * If this isn't a remount, set up the vopstats before 1566 * anyone can touch this. We only allow spliced file 1567 * systems (file systems which are in the namespace) to 1568 * have the VFS_STATS flag set. 1569 * NOTE: PxFS mounts the underlying file system with 1570 * MS_NOSPLICE set and copies those vfs_flags to its private 1571 * vfs structure. As a result, PxFS should never have 1572 * the VFS_STATS flag or else we might access the vfs 1573 * statistics-related fields prior to them being 1574 * properly initialized. 1575 */ 1576 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) { 1577 initialize_vopstats(&vfsp->vfs_vopstats); 1578 /* 1579 * We need to set vfs_vskap to NULL because there's 1580 * a chance it won't be set below. This is checked 1581 * in teardown_vopstats() so we can't have garbage. 1582 */ 1583 vfsp->vfs_vskap = NULL; 1584 vfsp->vfs_flag |= VFS_STATS; 1585 vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp); 1586 } 1587 1588 if (vswp->vsw_flag & VSW_XID) 1589 vfsp->vfs_flag |= VFS_XID; 1590 1591 vfs_unlock(vfsp); 1592 } 1593 mount_completed(); 1594 if (splice) 1595 vn_vfsunlock(vp); 1596 1597 if ((error == 0) && (copyout_error == 0)) { 1598 if (!remount) { 1599 /* 1600 * Don't call get_vskstat_anchor() while holding 1601 * locks since it allocates memory and calls 1602 * VFS_STATVFS(). For NFS, the latter can generate 1603 * an over-the-wire call. 1604 */ 1605 vskap = get_vskstat_anchor(vfsp); 1606 /* Only take the lock if we have something to do */ 1607 if (vskap != NULL) { 1608 vfs_lock_wait(vfsp); 1609 if (vfsp->vfs_flag & VFS_STATS) { 1610 vfsp->vfs_vskap = vskap; 1611 } 1612 vfs_unlock(vfsp); 1613 } 1614 } 1615 /* Return vfsp to caller. */ 1616 *vfspp = vfsp; 1617 } 1618 errout: 1619 vfs_freeopttbl(&mnt_mntopts); 1620 if (resource != NULL) 1621 kmem_free(resource, strlen(resource) + 1); 1622 if (mountpt != NULL) 1623 kmem_free(mountpt, strlen(mountpt) + 1); 1624 /* 1625 * It is possible we errored prior to adding to mount in progress 1626 * table. Must free vnode we acquired with successful lookupname. 1627 */ 1628 if (addmip) 1629 VN_RELE(bvp); 1630 if (delmip) 1631 vfs_delmip(vfsp); 1632 ASSERT(vswp != NULL); 1633 vfs_unrefvfssw(vswp); 1634 if (inargs != opts) 1635 kmem_free(inargs, MAX_MNTOPT_STR); 1636 if (copyout_error) { 1637 VFS_RELE(vfsp); 1638 error = copyout_error; 1639 } 1640 return (error); 1641 } 1642 1643 static void 1644 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath) 1645 { 1646 size_t len; 1647 refstr_t *ref; 1648 zone_t *zone = curproc->p_zone; 1649 char *sp; 1650 int have_list_lock = 0; 1651 1652 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp)); 1653 1654 /* 1655 * New path must be less than MAXPATHLEN because mntfs 1656 * will only display up to MAXPATHLEN bytes. This is currently 1657 * safe, because domount() uses pn_get(), and other callers 1658 * similarly cap the size to fewer than MAXPATHLEN bytes. 1659 */ 1660 1661 ASSERT(strlen(newpath) < MAXPATHLEN); 1662 1663 /* mntfs requires consistency while vfs list lock is held */ 1664 1665 if (VFS_ON_LIST(vfsp)) { 1666 have_list_lock = 1; 1667 vfs_list_lock(); 1668 } 1669 1670 if (*refp != NULL) 1671 refstr_rele(*refp); 1672 1673 /* Do we need to modify the path? */ 1674 1675 if (zone == global_zone || *newpath != '/') { 1676 ref = refstr_alloc(newpath); 1677 goto out; 1678 } 1679 1680 /* 1681 * Truncate the trailing '/' in the zoneroot, and merge 1682 * in the zone's rootpath with the "newpath" (resource 1683 * or mountpoint) passed in. 1684 * 1685 * The size of the required buffer is thus the size of 1686 * the buffer required for the passed-in newpath 1687 * (strlen(newpath) + 1), plus the size of the buffer 1688 * required to hold zone_rootpath (zone_rootpathlen) 1689 * minus one for one of the now-superfluous NUL 1690 * terminations, minus one for the trailing '/'. 1691 * 1692 * That gives us: 1693 * 1694 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1 1695 * 1696 * Which is what we have below. 1697 */ 1698 1699 len = strlen(newpath) + zone->zone_rootpathlen - 1; 1700 sp = kmem_alloc(len, KM_SLEEP); 1701 1702 /* 1703 * Copy everything including the trailing slash, which 1704 * we then overwrite with the NUL character. 1705 */ 1706 1707 (void) strcpy(sp, zone->zone_rootpath); 1708 sp[zone->zone_rootpathlen - 2] = '\0'; 1709 (void) strcat(sp, newpath); 1710 1711 ref = refstr_alloc(sp); 1712 kmem_free(sp, len); 1713 out: 1714 *refp = ref; 1715 1716 if (have_list_lock) { 1717 vfs_mnttab_modtimeupd(); 1718 vfs_list_unlock(); 1719 } 1720 } 1721 1722 /* 1723 * Record a mounted resource name in a vfs structure. 1724 * If vfsp is already mounted, caller must hold the vfs lock. 1725 */ 1726 void 1727 vfs_setresource(struct vfs *vfsp, const char *resource) 1728 { 1729 if (resource == NULL || resource[0] == '\0') 1730 resource = VFS_NORESOURCE; 1731 vfs_setpath(vfsp, &vfsp->vfs_resource, resource); 1732 } 1733 1734 /* 1735 * Record a mount point name in a vfs structure. 1736 * If vfsp is already mounted, caller must hold the vfs lock. 1737 */ 1738 void 1739 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt) 1740 { 1741 if (mntpt == NULL || mntpt[0] == '\0') 1742 mntpt = VFS_NOMNTPT; 1743 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt); 1744 } 1745 1746 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */ 1747 1748 refstr_t * 1749 vfs_getresource(const struct vfs *vfsp) 1750 { 1751 refstr_t *resource; 1752 1753 vfs_list_read_lock(); 1754 resource = vfsp->vfs_resource; 1755 refstr_hold(resource); 1756 vfs_list_unlock(); 1757 1758 return (resource); 1759 } 1760 1761 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */ 1762 1763 refstr_t * 1764 vfs_getmntpoint(const struct vfs *vfsp) 1765 { 1766 refstr_t *mntpt; 1767 1768 vfs_list_read_lock(); 1769 mntpt = vfsp->vfs_mntpt; 1770 refstr_hold(mntpt); 1771 vfs_list_unlock(); 1772 1773 return (mntpt); 1774 } 1775 1776 /* 1777 * Create an empty options table with enough empty slots to hold all 1778 * The options in the options string passed as an argument. 1779 * Potentially prepend another options table. 1780 * 1781 * Note: caller is responsible for locking the vfs list, if needed, 1782 * to protect mops. 1783 */ 1784 static void 1785 vfs_createopttbl_extend(mntopts_t *mops, const char *opts, 1786 const mntopts_t *mtmpl) 1787 { 1788 const char *s = opts; 1789 uint_t count; 1790 1791 if (opts == NULL || *opts == '\0') { 1792 count = 0; 1793 } else { 1794 count = 1; 1795 1796 /* 1797 * Count number of options in the string 1798 */ 1799 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) { 1800 count++; 1801 s++; 1802 } 1803 } 1804 vfs_copyopttbl_extend(mtmpl, mops, count); 1805 } 1806 1807 /* 1808 * Create an empty options table with enough empty slots to hold all 1809 * The options in the options string passed as an argument. 1810 * 1811 * This function is *not* for general use by filesystems. 1812 * 1813 * Note: caller is responsible for locking the vfs list, if needed, 1814 * to protect mops. 1815 */ 1816 void 1817 vfs_createopttbl(mntopts_t *mops, const char *opts) 1818 { 1819 vfs_createopttbl_extend(mops, opts, NULL); 1820 } 1821 1822 1823 /* 1824 * Swap two mount options tables 1825 */ 1826 static void 1827 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2) 1828 { 1829 uint_t tmpcnt; 1830 mntopt_t *tmplist; 1831 1832 tmpcnt = optbl2->mo_count; 1833 tmplist = optbl2->mo_list; 1834 optbl2->mo_count = optbl1->mo_count; 1835 optbl2->mo_list = optbl1->mo_list; 1836 optbl1->mo_count = tmpcnt; 1837 optbl1->mo_list = tmplist; 1838 } 1839 1840 static void 1841 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2) 1842 { 1843 vfs_list_lock(); 1844 vfs_swapopttbl_nolock(optbl1, optbl2); 1845 vfs_mnttab_modtimeupd(); 1846 vfs_list_unlock(); 1847 } 1848 1849 static char ** 1850 vfs_copycancelopt_extend(char **const moc, int extend) 1851 { 1852 int i = 0; 1853 int j; 1854 char **result; 1855 1856 if (moc != NULL) { 1857 for (; moc[i] != NULL; i++) 1858 /* count number of options to cancel */; 1859 } 1860 1861 if (i + extend == 0) 1862 return (NULL); 1863 1864 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP); 1865 1866 for (j = 0; j < i; j++) { 1867 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP); 1868 (void) strcpy(result[j], moc[j]); 1869 } 1870 for (; j <= i + extend; j++) 1871 result[j] = NULL; 1872 1873 return (result); 1874 } 1875 1876 static void 1877 vfs_copyopt(const mntopt_t *s, mntopt_t *d) 1878 { 1879 char *sp, *dp; 1880 1881 d->mo_flags = s->mo_flags; 1882 d->mo_data = s->mo_data; 1883 sp = s->mo_name; 1884 if (sp != NULL) { 1885 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1886 (void) strcpy(dp, sp); 1887 d->mo_name = dp; 1888 } else { 1889 d->mo_name = NULL; /* should never happen */ 1890 } 1891 1892 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0); 1893 1894 sp = s->mo_arg; 1895 if (sp != NULL) { 1896 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1897 (void) strcpy(dp, sp); 1898 d->mo_arg = dp; 1899 } else { 1900 d->mo_arg = NULL; 1901 } 1902 } 1903 1904 /* 1905 * Copy a mount options table, possibly allocating some spare 1906 * slots at the end. It is permissible to copy_extend the NULL table. 1907 */ 1908 static void 1909 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra) 1910 { 1911 uint_t i, count; 1912 mntopt_t *motbl; 1913 1914 /* 1915 * Clear out any existing stuff in the options table being initialized 1916 */ 1917 vfs_freeopttbl(dmo); 1918 count = (smo == NULL) ? 0 : smo->mo_count; 1919 if ((count + extra) == 0) /* nothing to do */ 1920 return; 1921 dmo->mo_count = count + extra; 1922 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP); 1923 dmo->mo_list = motbl; 1924 for (i = 0; i < count; i++) { 1925 vfs_copyopt(&smo->mo_list[i], &motbl[i]); 1926 } 1927 for (i = count; i < count + extra; i++) { 1928 motbl[i].mo_flags = MO_EMPTY; 1929 } 1930 } 1931 1932 /* 1933 * Copy a mount options table. 1934 * 1935 * This function is *not* for general use by filesystems. 1936 * 1937 * Note: caller is responsible for locking the vfs list, if needed, 1938 * to protect smo and dmo. 1939 */ 1940 void 1941 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo) 1942 { 1943 vfs_copyopttbl_extend(smo, dmo, 0); 1944 } 1945 1946 static char ** 1947 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2) 1948 { 1949 int c1 = 0; 1950 int c2 = 0; 1951 char **result; 1952 char **sp1, **sp2, **dp; 1953 1954 /* 1955 * First we count both lists of cancel options. 1956 * If either is NULL or has no elements, we return a copy of 1957 * the other. 1958 */ 1959 if (mop1->mo_cancel != NULL) { 1960 for (; mop1->mo_cancel[c1] != NULL; c1++) 1961 /* count cancel options in mop1 */; 1962 } 1963 1964 if (c1 == 0) 1965 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0)); 1966 1967 if (mop2->mo_cancel != NULL) { 1968 for (; mop2->mo_cancel[c2] != NULL; c2++) 1969 /* count cancel options in mop2 */; 1970 } 1971 1972 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2); 1973 1974 if (c2 == 0) 1975 return (result); 1976 1977 /* 1978 * When we get here, we've got two sets of cancel options; 1979 * we need to merge the two sets. We know that the result 1980 * array has "c1+c2+1" entries and in the end we might shrink 1981 * it. 1982 * Result now has a copy of the c1 entries from mop1; we'll 1983 * now lookup all the entries of mop2 in mop1 and copy it if 1984 * it is unique. 1985 * This operation is O(n^2) but it's only called once per 1986 * filesystem per duplicate option. This is a situation 1987 * which doesn't arise with the filesystems in ON and 1988 * n is generally 1. 1989 */ 1990 1991 dp = &result[c1]; 1992 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) { 1993 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) { 1994 if (strcmp(*sp1, *sp2) == 0) 1995 break; 1996 } 1997 if (*sp1 == NULL) { 1998 /* 1999 * Option *sp2 not found in mop1, so copy it. 2000 * The calls to vfs_copycancelopt_extend() 2001 * guarantee that there's enough room. 2002 */ 2003 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP); 2004 (void) strcpy(*dp++, *sp2); 2005 } 2006 } 2007 if (dp != &result[c1+c2]) { 2008 size_t bytes = (dp - result + 1) * sizeof (char *); 2009 char **nres = kmem_alloc(bytes, KM_SLEEP); 2010 2011 bcopy(result, nres, bytes); 2012 kmem_free(result, (c1 + c2 + 1) * sizeof (char *)); 2013 result = nres; 2014 } 2015 return (result); 2016 } 2017 2018 /* 2019 * Merge two mount option tables (outer and inner) into one. This is very 2020 * similar to "merging" global variables and automatic variables in C. 2021 * 2022 * This isn't (and doesn't have to be) fast. 2023 * 2024 * This function is *not* for general use by filesystems. 2025 * 2026 * Note: caller is responsible for locking the vfs list, if needed, 2027 * to protect omo, imo & dmo. 2028 */ 2029 void 2030 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo) 2031 { 2032 uint_t i, count; 2033 mntopt_t *mop, *motbl; 2034 uint_t freeidx; 2035 2036 /* 2037 * First determine how much space we need to allocate. 2038 */ 2039 count = omo->mo_count; 2040 for (i = 0; i < imo->mo_count; i++) { 2041 if (imo->mo_list[i].mo_flags & MO_EMPTY) 2042 continue; 2043 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL) 2044 count++; 2045 } 2046 ASSERT(count >= omo->mo_count && 2047 count <= omo->mo_count + imo->mo_count); 2048 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP); 2049 for (i = 0; i < omo->mo_count; i++) 2050 vfs_copyopt(&omo->mo_list[i], &motbl[i]); 2051 freeidx = omo->mo_count; 2052 for (i = 0; i < imo->mo_count; i++) { 2053 if (imo->mo_list[i].mo_flags & MO_EMPTY) 2054 continue; 2055 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) { 2056 char **newcanp; 2057 uint_t index = mop - omo->mo_list; 2058 2059 newcanp = vfs_mergecancelopts(mop, &motbl[index]); 2060 2061 vfs_freeopt(&motbl[index]); 2062 vfs_copyopt(&imo->mo_list[i], &motbl[index]); 2063 2064 vfs_freecancelopt(motbl[index].mo_cancel); 2065 motbl[index].mo_cancel = newcanp; 2066 } else { 2067 /* 2068 * If it's a new option, just copy it over to the first 2069 * free location. 2070 */ 2071 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]); 2072 } 2073 } 2074 dmo->mo_count = count; 2075 dmo->mo_list = motbl; 2076 } 2077 2078 /* 2079 * Functions to set and clear mount options in a mount options table. 2080 */ 2081 2082 /* 2083 * Clear a mount option, if it exists. 2084 * 2085 * The update_mnttab arg indicates whether mops is part of a vfs that is on 2086 * the vfs list. 2087 */ 2088 static void 2089 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab) 2090 { 2091 struct mntopt *mop; 2092 uint_t i, count; 2093 2094 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 2095 2096 count = mops->mo_count; 2097 for (i = 0; i < count; i++) { 2098 mop = &mops->mo_list[i]; 2099 2100 if (mop->mo_flags & MO_EMPTY) 2101 continue; 2102 if (strcmp(opt, mop->mo_name)) 2103 continue; 2104 mop->mo_flags &= ~MO_SET; 2105 if (mop->mo_arg != NULL) { 2106 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2107 } 2108 mop->mo_arg = NULL; 2109 if (update_mnttab) 2110 vfs_mnttab_modtimeupd(); 2111 break; 2112 } 2113 } 2114 2115 void 2116 vfs_clearmntopt(struct vfs *vfsp, const char *opt) 2117 { 2118 int gotlock = 0; 2119 2120 if (VFS_ON_LIST(vfsp)) { 2121 gotlock = 1; 2122 vfs_list_lock(); 2123 } 2124 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock); 2125 if (gotlock) 2126 vfs_list_unlock(); 2127 } 2128 2129 2130 /* 2131 * Set a mount option on. If it's not found in the table, it's silently 2132 * ignored. If the option has MO_IGNORE set, it is still set unless the 2133 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag 2134 * bits can be used to toggle the MO_NODISPLAY bit for the option. 2135 * If the VFS_CREATEOPT flag bit is set then the first option slot with 2136 * MO_EMPTY set is created as the option passed in. 2137 * 2138 * The update_mnttab arg indicates whether mops is part of a vfs that is on 2139 * the vfs list. 2140 */ 2141 static void 2142 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt, 2143 const char *arg, int flags, int update_mnttab) 2144 { 2145 mntopt_t *mop; 2146 uint_t i, count; 2147 char *sp; 2148 2149 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 2150 2151 if (flags & VFS_CREATEOPT) { 2152 if (vfs_hasopt(mops, opt) != NULL) { 2153 flags &= ~VFS_CREATEOPT; 2154 } 2155 } 2156 count = mops->mo_count; 2157 for (i = 0; i < count; i++) { 2158 mop = &mops->mo_list[i]; 2159 2160 if (mop->mo_flags & MO_EMPTY) { 2161 if ((flags & VFS_CREATEOPT) == 0) 2162 continue; 2163 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP); 2164 (void) strcpy(sp, opt); 2165 mop->mo_name = sp; 2166 if (arg != NULL) 2167 mop->mo_flags = MO_HASVALUE; 2168 else 2169 mop->mo_flags = 0; 2170 } else if (strcmp(opt, mop->mo_name)) { 2171 continue; 2172 } 2173 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT)) 2174 break; 2175 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) { 2176 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP); 2177 (void) strcpy(sp, arg); 2178 } else { 2179 sp = NULL; 2180 } 2181 if (mop->mo_arg != NULL) 2182 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2183 mop->mo_arg = sp; 2184 if (flags & VFS_DISPLAY) 2185 mop->mo_flags &= ~MO_NODISPLAY; 2186 if (flags & VFS_NODISPLAY) 2187 mop->mo_flags |= MO_NODISPLAY; 2188 mop->mo_flags |= MO_SET; 2189 if (mop->mo_cancel != NULL) { 2190 char **cp; 2191 2192 for (cp = mop->mo_cancel; *cp != NULL; cp++) 2193 vfs_clearmntopt_nolock(mops, *cp, 0); 2194 } 2195 if (update_mnttab) 2196 vfs_mnttab_modtimeupd(); 2197 break; 2198 } 2199 } 2200 2201 void 2202 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags) 2203 { 2204 int gotlock = 0; 2205 2206 if (VFS_ON_LIST(vfsp)) { 2207 gotlock = 1; 2208 vfs_list_lock(); 2209 } 2210 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock); 2211 if (gotlock) 2212 vfs_list_unlock(); 2213 } 2214 2215 2216 /* 2217 * Add a "tag" option to a mounted file system's options list. 2218 * 2219 * Note: caller is responsible for locking the vfs list, if needed, 2220 * to protect mops. 2221 */ 2222 static mntopt_t * 2223 vfs_addtag(mntopts_t *mops, const char *tag) 2224 { 2225 uint_t count; 2226 mntopt_t *mop, *motbl; 2227 2228 count = mops->mo_count + 1; 2229 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP); 2230 if (mops->mo_count) { 2231 size_t len = (count - 1) * sizeof (mntopt_t); 2232 2233 bcopy(mops->mo_list, motbl, len); 2234 kmem_free(mops->mo_list, len); 2235 } 2236 mops->mo_count = count; 2237 mops->mo_list = motbl; 2238 mop = &motbl[count - 1]; 2239 mop->mo_flags = MO_TAG; 2240 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP); 2241 (void) strcpy(mop->mo_name, tag); 2242 return (mop); 2243 } 2244 2245 /* 2246 * Allow users to set arbitrary "tags" in a vfs's mount options. 2247 * Broader use within the kernel is discouraged. 2248 */ 2249 int 2250 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2251 cred_t *cr) 2252 { 2253 vfs_t *vfsp; 2254 mntopts_t *mops; 2255 mntopt_t *mop; 2256 int found = 0; 2257 dev_t dev = makedevice(major, minor); 2258 int err = 0; 2259 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 2260 2261 /* 2262 * Find the desired mounted file system 2263 */ 2264 vfs_list_lock(); 2265 vfsp = rootvfs; 2266 do { 2267 if (vfsp->vfs_dev == dev && 2268 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2269 found = 1; 2270 break; 2271 } 2272 vfsp = vfsp->vfs_next; 2273 } while (vfsp != rootvfs); 2274 2275 if (!found) { 2276 err = EINVAL; 2277 goto out; 2278 } 2279 err = secpolicy_fs_config(cr, vfsp); 2280 if (err != 0) 2281 goto out; 2282 2283 mops = &vfsp->vfs_mntopts; 2284 /* 2285 * Add tag if it doesn't already exist 2286 */ 2287 if ((mop = vfs_hasopt(mops, tag)) == NULL) { 2288 int len; 2289 2290 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR); 2291 len = strlen(buf); 2292 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) { 2293 err = ENAMETOOLONG; 2294 goto out; 2295 } 2296 mop = vfs_addtag(mops, tag); 2297 } 2298 if ((mop->mo_flags & MO_TAG) == 0) { 2299 err = EINVAL; 2300 goto out; 2301 } 2302 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1); 2303 out: 2304 vfs_list_unlock(); 2305 kmem_free(buf, MAX_MNTOPT_STR); 2306 return (err); 2307 } 2308 2309 /* 2310 * Allow users to remove arbitrary "tags" in a vfs's mount options. 2311 * Broader use within the kernel is discouraged. 2312 */ 2313 int 2314 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2315 cred_t *cr) 2316 { 2317 vfs_t *vfsp; 2318 mntopt_t *mop; 2319 int found = 0; 2320 dev_t dev = makedevice(major, minor); 2321 int err = 0; 2322 2323 /* 2324 * Find the desired mounted file system 2325 */ 2326 vfs_list_lock(); 2327 vfsp = rootvfs; 2328 do { 2329 if (vfsp->vfs_dev == dev && 2330 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2331 found = 1; 2332 break; 2333 } 2334 vfsp = vfsp->vfs_next; 2335 } while (vfsp != rootvfs); 2336 2337 if (!found) { 2338 err = EINVAL; 2339 goto out; 2340 } 2341 err = secpolicy_fs_config(cr, vfsp); 2342 if (err != 0) 2343 goto out; 2344 2345 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) { 2346 err = EINVAL; 2347 goto out; 2348 } 2349 if ((mop->mo_flags & MO_TAG) == 0) { 2350 err = EINVAL; 2351 goto out; 2352 } 2353 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1); 2354 out: 2355 vfs_list_unlock(); 2356 return (err); 2357 } 2358 2359 /* 2360 * Function to parse an option string and fill in a mount options table. 2361 * Unknown options are silently ignored. The input option string is modified 2362 * by replacing separators with nulls. If the create flag is set, options 2363 * not found in the table are just added on the fly. The table must have 2364 * an option slot marked MO_EMPTY to add an option on the fly. 2365 * 2366 * This function is *not* for general use by filesystems. 2367 * 2368 * Note: caller is responsible for locking the vfs list, if needed, 2369 * to protect mops.. 2370 */ 2371 void 2372 vfs_parsemntopts(mntopts_t *mops, char *osp, int create) 2373 { 2374 char *s = osp, *p, *nextop, *valp, *cp, *ep; 2375 int setflg = VFS_NOFORCEOPT; 2376 2377 if (osp == NULL) 2378 return; 2379 while (*s != '\0') { 2380 p = strchr(s, ','); /* find next option */ 2381 if (p == NULL) { 2382 cp = NULL; 2383 p = s + strlen(s); 2384 } else { 2385 cp = p; /* save location of comma */ 2386 *p++ = '\0'; /* mark end and point to next option */ 2387 } 2388 nextop = p; 2389 p = strchr(s, '='); /* look for value */ 2390 if (p == NULL) { 2391 valp = NULL; /* no value supplied */ 2392 } else { 2393 ep = p; /* save location of equals */ 2394 *p++ = '\0'; /* end option and point to value */ 2395 valp = p; 2396 } 2397 /* 2398 * set option into options table 2399 */ 2400 if (create) 2401 setflg |= VFS_CREATEOPT; 2402 vfs_setmntopt_nolock(mops, s, valp, setflg, 0); 2403 if (cp != NULL) 2404 *cp = ','; /* restore the comma */ 2405 if (valp != NULL) 2406 *ep = '='; /* restore the equals */ 2407 s = nextop; 2408 } 2409 } 2410 2411 /* 2412 * Function to inquire if an option exists in a mount options table. 2413 * Returns a pointer to the option if it exists, else NULL. 2414 * 2415 * This function is *not* for general use by filesystems. 2416 * 2417 * Note: caller is responsible for locking the vfs list, if needed, 2418 * to protect mops. 2419 */ 2420 struct mntopt * 2421 vfs_hasopt(const mntopts_t *mops, const char *opt) 2422 { 2423 struct mntopt *mop; 2424 uint_t i, count; 2425 2426 count = mops->mo_count; 2427 for (i = 0; i < count; i++) { 2428 mop = &mops->mo_list[i]; 2429 2430 if (mop->mo_flags & MO_EMPTY) 2431 continue; 2432 if (strcmp(opt, mop->mo_name) == 0) 2433 return (mop); 2434 } 2435 return (NULL); 2436 } 2437 2438 /* 2439 * Function to inquire if an option is set in a mount options table. 2440 * Returns non-zero if set and fills in the arg pointer with a pointer to 2441 * the argument string or NULL if there is no argument string. 2442 */ 2443 static int 2444 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp) 2445 { 2446 struct mntopt *mop; 2447 uint_t i, count; 2448 2449 count = mops->mo_count; 2450 for (i = 0; i < count; i++) { 2451 mop = &mops->mo_list[i]; 2452 2453 if (mop->mo_flags & MO_EMPTY) 2454 continue; 2455 if (strcmp(opt, mop->mo_name)) 2456 continue; 2457 if ((mop->mo_flags & MO_SET) == 0) 2458 return (0); 2459 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0) 2460 *argp = mop->mo_arg; 2461 return (1); 2462 } 2463 return (0); 2464 } 2465 2466 2467 int 2468 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp) 2469 { 2470 int ret; 2471 2472 vfs_list_read_lock(); 2473 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp); 2474 vfs_list_unlock(); 2475 return (ret); 2476 } 2477 2478 2479 /* 2480 * Construct a comma separated string of the options set in the given 2481 * mount table, return the string in the given buffer. Return non-zero if 2482 * the buffer would overflow. 2483 * 2484 * This function is *not* for general use by filesystems. 2485 * 2486 * Note: caller is responsible for locking the vfs list, if needed, 2487 * to protect mp. 2488 */ 2489 int 2490 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len) 2491 { 2492 char *cp; 2493 uint_t i; 2494 2495 buf[0] = '\0'; 2496 cp = buf; 2497 for (i = 0; i < mp->mo_count; i++) { 2498 struct mntopt *mop; 2499 2500 mop = &mp->mo_list[i]; 2501 if (mop->mo_flags & MO_SET) { 2502 int optlen, comma = 0; 2503 2504 if (buf[0] != '\0') 2505 comma = 1; 2506 optlen = strlen(mop->mo_name); 2507 if (strlen(buf) + comma + optlen + 1 > len) 2508 goto err; 2509 if (comma) 2510 *cp++ = ','; 2511 (void) strcpy(cp, mop->mo_name); 2512 cp += optlen; 2513 /* 2514 * Append option value if there is one 2515 */ 2516 if (mop->mo_arg != NULL) { 2517 int arglen; 2518 2519 arglen = strlen(mop->mo_arg); 2520 if (strlen(buf) + arglen + 2 > len) 2521 goto err; 2522 *cp++ = '='; 2523 (void) strcpy(cp, mop->mo_arg); 2524 cp += arglen; 2525 } 2526 } 2527 } 2528 return (0); 2529 err: 2530 return (EOVERFLOW); 2531 } 2532 2533 static void 2534 vfs_freecancelopt(char **moc) 2535 { 2536 if (moc != NULL) { 2537 int ccnt = 0; 2538 char **cp; 2539 2540 for (cp = moc; *cp != NULL; cp++) { 2541 kmem_free(*cp, strlen(*cp) + 1); 2542 ccnt++; 2543 } 2544 kmem_free(moc, (ccnt + 1) * sizeof (char *)); 2545 } 2546 } 2547 2548 static void 2549 vfs_freeopt(mntopt_t *mop) 2550 { 2551 if (mop->mo_name != NULL) 2552 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1); 2553 2554 vfs_freecancelopt(mop->mo_cancel); 2555 2556 if (mop->mo_arg != NULL) 2557 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2558 } 2559 2560 /* 2561 * Free a mount options table 2562 * 2563 * This function is *not* for general use by filesystems. 2564 * 2565 * Note: caller is responsible for locking the vfs list, if needed, 2566 * to protect mp. 2567 */ 2568 void 2569 vfs_freeopttbl(mntopts_t *mp) 2570 { 2571 uint_t i, count; 2572 2573 count = mp->mo_count; 2574 for (i = 0; i < count; i++) { 2575 vfs_freeopt(&mp->mo_list[i]); 2576 } 2577 if (count) { 2578 kmem_free(mp->mo_list, sizeof (mntopt_t) * count); 2579 mp->mo_count = 0; 2580 mp->mo_list = NULL; 2581 } 2582 } 2583 2584 2585 /* ARGSUSED */ 2586 static int 2587 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, 2588 caller_context_t *ct) 2589 { 2590 return (0); 2591 } 2592 2593 /* ARGSUSED */ 2594 static int 2595 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, 2596 caller_context_t *ct) 2597 { 2598 return (0); 2599 } 2600 2601 /* 2602 * The dummy vnode is currently used only by file events notification 2603 * module which is just interested in the timestamps. 2604 */ 2605 /* ARGSUSED */ 2606 static int 2607 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2608 caller_context_t *ct) 2609 { 2610 bzero(vap, sizeof (vattr_t)); 2611 vap->va_type = VREG; 2612 vap->va_nlink = 1; 2613 vap->va_ctime = vfs_mnttab_ctime; 2614 /* 2615 * it is ok to just copy mtime as the time will be monotonically 2616 * increasing. 2617 */ 2618 vap->va_mtime = vfs_mnttab_mtime; 2619 vap->va_atime = vap->va_mtime; 2620 return (0); 2621 } 2622 2623 static void 2624 vfs_mnttabvp_setup(void) 2625 { 2626 vnode_t *tvp; 2627 vnodeops_t *vfs_mntdummyvnops; 2628 const fs_operation_def_t mnt_dummyvnodeops_template[] = { 2629 VOPNAME_READ, { .vop_read = vfs_mntdummyread }, 2630 VOPNAME_WRITE, { .vop_write = vfs_mntdummywrite }, 2631 VOPNAME_GETATTR, { .vop_getattr = vfs_mntdummygetattr }, 2632 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 2633 NULL, NULL 2634 }; 2635 2636 if (vn_make_ops("mnttab", mnt_dummyvnodeops_template, 2637 &vfs_mntdummyvnops) != 0) { 2638 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed"); 2639 /* Shouldn't happen, but not bad enough to panic */ 2640 return; 2641 } 2642 2643 /* 2644 * A global dummy vnode is allocated to represent mntfs files. 2645 * The mntfs file (/etc/mnttab) can be monitored for file events 2646 * and receive an event when mnttab changes. Dummy VOP calls 2647 * will be made on this vnode. The file events notification module 2648 * intercepts this vnode and delivers relevant events. 2649 */ 2650 tvp = vn_alloc(KM_SLEEP); 2651 tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE; 2652 vn_setops(tvp, vfs_mntdummyvnops); 2653 tvp->v_type = VREG; 2654 /* 2655 * The mnt dummy ops do not reference v_data. 2656 * No other module intercepting this vnode should either. 2657 * Just set it to point to itself. 2658 */ 2659 tvp->v_data = (caddr_t)tvp; 2660 tvp->v_vfsp = rootvfs; 2661 vfs_mntdummyvp = tvp; 2662 } 2663 2664 /* 2665 * performs fake read/write ops 2666 */ 2667 static void 2668 vfs_mnttab_rwop(int rw) 2669 { 2670 struct uio uio; 2671 struct iovec iov; 2672 char buf[1]; 2673 2674 if (vfs_mntdummyvp == NULL) 2675 return; 2676 2677 bzero(&uio, sizeof (uio)); 2678 bzero(&iov, sizeof (iov)); 2679 iov.iov_base = buf; 2680 iov.iov_len = 0; 2681 uio.uio_iov = &iov; 2682 uio.uio_iovcnt = 1; 2683 uio.uio_loffset = 0; 2684 uio.uio_segflg = UIO_SYSSPACE; 2685 uio.uio_resid = 0; 2686 if (rw) { 2687 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL); 2688 } else { 2689 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL); 2690 } 2691 } 2692 2693 /* 2694 * Generate a write operation. 2695 */ 2696 void 2697 vfs_mnttab_writeop(void) 2698 { 2699 vfs_mnttab_rwop(1); 2700 } 2701 2702 /* 2703 * Generate a read operation. 2704 */ 2705 void 2706 vfs_mnttab_readop(void) 2707 { 2708 vfs_mnttab_rwop(0); 2709 } 2710 2711 /* 2712 * Free any mnttab information recorded in the vfs struct. 2713 * The vfs must not be on the vfs list. 2714 */ 2715 static void 2716 vfs_freemnttab(struct vfs *vfsp) 2717 { 2718 ASSERT(!VFS_ON_LIST(vfsp)); 2719 2720 /* 2721 * Free device and mount point information 2722 */ 2723 if (vfsp->vfs_mntpt != NULL) { 2724 refstr_rele(vfsp->vfs_mntpt); 2725 vfsp->vfs_mntpt = NULL; 2726 } 2727 if (vfsp->vfs_resource != NULL) { 2728 refstr_rele(vfsp->vfs_resource); 2729 vfsp->vfs_resource = NULL; 2730 } 2731 /* 2732 * Now free mount options information 2733 */ 2734 vfs_freeopttbl(&vfsp->vfs_mntopts); 2735 } 2736 2737 /* 2738 * Return the last mnttab modification time 2739 */ 2740 void 2741 vfs_mnttab_modtime(timespec_t *ts) 2742 { 2743 ASSERT(RW_LOCK_HELD(&vfslist)); 2744 *ts = vfs_mnttab_mtime; 2745 } 2746 2747 /* 2748 * See if mnttab is changed 2749 */ 2750 void 2751 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp) 2752 { 2753 int changed; 2754 2755 *phpp = (struct pollhead *)NULL; 2756 2757 /* 2758 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime. 2759 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe 2760 * to not grab the vfs list lock because tv_sec is monotonically 2761 * increasing. 2762 */ 2763 2764 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) || 2765 (old->tv_sec != vfs_mnttab_mtime.tv_sec); 2766 if (!changed) { 2767 *phpp = &vfs_pollhd; 2768 } 2769 } 2770 2771 /* 2772 * Update the mnttab modification time and wake up any waiters for 2773 * mnttab changes 2774 */ 2775 void 2776 vfs_mnttab_modtimeupd() 2777 { 2778 hrtime_t oldhrt, newhrt; 2779 2780 ASSERT(RW_WRITE_HELD(&vfslist)); 2781 oldhrt = ts2hrt(&vfs_mnttab_mtime); 2782 gethrestime(&vfs_mnttab_mtime); 2783 newhrt = ts2hrt(&vfs_mnttab_mtime); 2784 if (oldhrt == (hrtime_t)0) 2785 vfs_mnttab_ctime = vfs_mnttab_mtime; 2786 /* 2787 * Attempt to provide unique mtime (like uniqtime but not). 2788 */ 2789 if (newhrt == oldhrt) { 2790 newhrt++; 2791 hrt2ts(newhrt, &vfs_mnttab_mtime); 2792 } 2793 pollwakeup(&vfs_pollhd, (short)POLLRDBAND); 2794 vfs_mnttab_writeop(); 2795 } 2796 2797 int 2798 dounmount(struct vfs *vfsp, int flag, cred_t *cr) 2799 { 2800 vnode_t *coveredvp; 2801 int error; 2802 extern void teardown_vopstats(vfs_t *); 2803 2804 /* 2805 * Get covered vnode. This will be NULL if the vfs is not linked 2806 * into the file system name space (i.e., domount() with MNT_NOSPICE). 2807 */ 2808 coveredvp = vfsp->vfs_vnodecovered; 2809 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp)); 2810 2811 /* 2812 * Purge all dnlc entries for this vfs. 2813 */ 2814 (void) dnlc_purge_vfsp(vfsp, 0); 2815 2816 /* For forcible umount, skip VFS_SYNC() since it may hang */ 2817 if ((flag & MS_FORCE) == 0) 2818 (void) VFS_SYNC(vfsp, 0, cr); 2819 2820 /* 2821 * Lock the vfs to maintain fs status quo during unmount. This 2822 * has to be done after the sync because ufs_update tries to acquire 2823 * the vfs_reflock. 2824 */ 2825 vfs_lock_wait(vfsp); 2826 2827 if (error = VFS_UNMOUNT(vfsp, flag, cr)) { 2828 vfs_unlock(vfsp); 2829 if (coveredvp != NULL) 2830 vn_vfsunlock(coveredvp); 2831 } else if (coveredvp != NULL) { 2832 teardown_vopstats(vfsp); 2833 /* 2834 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered) 2835 * when it frees vfsp so we do a VN_HOLD() so we can 2836 * continue to use coveredvp afterwards. 2837 */ 2838 VN_HOLD(coveredvp); 2839 vfs_remove(vfsp); 2840 vn_vfsunlock(coveredvp); 2841 VN_RELE(coveredvp); 2842 } else { 2843 teardown_vopstats(vfsp); 2844 /* 2845 * Release the reference to vfs that is not linked 2846 * into the name space. 2847 */ 2848 vfs_unlock(vfsp); 2849 VFS_RELE(vfsp); 2850 } 2851 return (error); 2852 } 2853 2854 2855 /* 2856 * Vfs_unmountall() is called by uadmin() to unmount all 2857 * mounted file systems (except the root file system) during shutdown. 2858 * It follows the existing locking protocol when traversing the vfs list 2859 * to sync and unmount vfses. Even though there should be no 2860 * other thread running while the system is shutting down, it is prudent 2861 * to still follow the locking protocol. 2862 */ 2863 void 2864 vfs_unmountall(void) 2865 { 2866 struct vfs *vfsp; 2867 struct vfs *prev_vfsp = NULL; 2868 int error; 2869 2870 /* 2871 * Toss all dnlc entries now so that the per-vfs sync 2872 * and unmount operations don't have to slog through 2873 * a bunch of uninteresting vnodes over and over again. 2874 */ 2875 dnlc_purge(); 2876 2877 vfs_list_lock(); 2878 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) { 2879 prev_vfsp = vfsp->vfs_prev; 2880 2881 if (vfs_lock(vfsp) != 0) 2882 continue; 2883 error = vn_vfswlock(vfsp->vfs_vnodecovered); 2884 vfs_unlock(vfsp); 2885 if (error) 2886 continue; 2887 2888 vfs_list_unlock(); 2889 2890 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED()); 2891 (void) dounmount(vfsp, 0, CRED()); 2892 2893 /* 2894 * Since we dropped the vfslist lock above we must 2895 * verify that next_vfsp still exists, else start over. 2896 */ 2897 vfs_list_lock(); 2898 for (vfsp = rootvfs->vfs_prev; 2899 vfsp != rootvfs; vfsp = vfsp->vfs_prev) 2900 if (vfsp == prev_vfsp) 2901 break; 2902 if (vfsp == rootvfs && prev_vfsp != rootvfs) 2903 prev_vfsp = rootvfs->vfs_prev; 2904 } 2905 vfs_list_unlock(); 2906 } 2907 2908 /* 2909 * Called to add an entry to the end of the vfs mount in progress list 2910 */ 2911 void 2912 vfs_addmip(dev_t dev, struct vfs *vfsp) 2913 { 2914 struct ipmnt *mipp; 2915 2916 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP); 2917 mipp->mip_next = NULL; 2918 mipp->mip_dev = dev; 2919 mipp->mip_vfsp = vfsp; 2920 mutex_enter(&vfs_miplist_mutex); 2921 if (vfs_miplist_end != NULL) 2922 vfs_miplist_end->mip_next = mipp; 2923 else 2924 vfs_miplist = mipp; 2925 vfs_miplist_end = mipp; 2926 mutex_exit(&vfs_miplist_mutex); 2927 } 2928 2929 /* 2930 * Called to remove an entry from the mount in progress list 2931 * Either because the mount completed or it failed. 2932 */ 2933 void 2934 vfs_delmip(struct vfs *vfsp) 2935 { 2936 struct ipmnt *mipp, *mipprev; 2937 2938 mutex_enter(&vfs_miplist_mutex); 2939 mipprev = NULL; 2940 for (mipp = vfs_miplist; 2941 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) { 2942 mipprev = mipp; 2943 } 2944 if (mipp == NULL) 2945 return; /* shouldn't happen */ 2946 if (mipp == vfs_miplist_end) 2947 vfs_miplist_end = mipprev; 2948 if (mipprev == NULL) 2949 vfs_miplist = mipp->mip_next; 2950 else 2951 mipprev->mip_next = mipp->mip_next; 2952 mutex_exit(&vfs_miplist_mutex); 2953 kmem_free(mipp, sizeof (struct ipmnt)); 2954 } 2955 2956 /* 2957 * vfs_add is called by a specific filesystem's mount routine to add 2958 * the new vfs into the vfs list/hash and to cover the mounted-on vnode. 2959 * The vfs should already have been locked by the caller. 2960 * 2961 * coveredvp is NULL if this is the root. 2962 */ 2963 void 2964 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag) 2965 { 2966 int newflag; 2967 2968 ASSERT(vfs_lock_held(vfsp)); 2969 VFS_HOLD(vfsp); 2970 newflag = vfsp->vfs_flag; 2971 if (mflag & MS_RDONLY) 2972 newflag |= VFS_RDONLY; 2973 else 2974 newflag &= ~VFS_RDONLY; 2975 if (mflag & MS_NOSUID) 2976 newflag |= (VFS_NOSETUID|VFS_NODEVICES); 2977 else 2978 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES); 2979 if (mflag & MS_NOMNTTAB) 2980 newflag |= VFS_NOMNTTAB; 2981 else 2982 newflag &= ~VFS_NOMNTTAB; 2983 2984 if (coveredvp != NULL) { 2985 ASSERT(vn_vfswlock_held(coveredvp)); 2986 coveredvp->v_vfsmountedhere = vfsp; 2987 VN_HOLD(coveredvp); 2988 } 2989 vfsp->vfs_vnodecovered = coveredvp; 2990 vfsp->vfs_flag = newflag; 2991 2992 vfs_list_add(vfsp); 2993 } 2994 2995 /* 2996 * Remove a vfs from the vfs list, null out the pointer from the 2997 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer 2998 * from the vfs to the covered vnode (vfs_vnodecovered). Release the 2999 * reference to the vfs and to the covered vnode. 3000 * 3001 * Called from dounmount after it's confirmed with the file system 3002 * that the unmount is legal. 3003 */ 3004 void 3005 vfs_remove(struct vfs *vfsp) 3006 { 3007 vnode_t *vp; 3008 3009 ASSERT(vfs_lock_held(vfsp)); 3010 3011 /* 3012 * Can't unmount root. Should never happen because fs will 3013 * be busy. 3014 */ 3015 if (vfsp == rootvfs) 3016 panic("vfs_remove: unmounting root"); 3017 3018 vfs_list_remove(vfsp); 3019 3020 /* 3021 * Unhook from the file system name space. 3022 */ 3023 vp = vfsp->vfs_vnodecovered; 3024 ASSERT(vn_vfswlock_held(vp)); 3025 vp->v_vfsmountedhere = NULL; 3026 vfsp->vfs_vnodecovered = NULL; 3027 VN_RELE(vp); 3028 3029 /* 3030 * Release lock and wakeup anybody waiting. 3031 */ 3032 vfs_unlock(vfsp); 3033 VFS_RELE(vfsp); 3034 } 3035 3036 /* 3037 * Lock a filesystem to prevent access to it while mounting, 3038 * unmounting and syncing. Return EBUSY immediately if lock 3039 * can't be acquired. 3040 */ 3041 int 3042 vfs_lock(vfs_t *vfsp) 3043 { 3044 vn_vfslocks_entry_t *vpvfsentry; 3045 3046 vpvfsentry = vn_vfslocks_getlock(vfsp); 3047 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) 3048 return (0); 3049 3050 vn_vfslocks_rele(vpvfsentry); 3051 return (EBUSY); 3052 } 3053 3054 int 3055 vfs_rlock(vfs_t *vfsp) 3056 { 3057 vn_vfslocks_entry_t *vpvfsentry; 3058 3059 vpvfsentry = vn_vfslocks_getlock(vfsp); 3060 3061 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) 3062 return (0); 3063 3064 vn_vfslocks_rele(vpvfsentry); 3065 return (EBUSY); 3066 } 3067 3068 void 3069 vfs_lock_wait(vfs_t *vfsp) 3070 { 3071 vn_vfslocks_entry_t *vpvfsentry; 3072 3073 vpvfsentry = vn_vfslocks_getlock(vfsp); 3074 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER); 3075 } 3076 3077 void 3078 vfs_rlock_wait(vfs_t *vfsp) 3079 { 3080 vn_vfslocks_entry_t *vpvfsentry; 3081 3082 vpvfsentry = vn_vfslocks_getlock(vfsp); 3083 rwst_enter(&vpvfsentry->ve_lock, RW_READER); 3084 } 3085 3086 /* 3087 * Unlock a locked filesystem. 3088 */ 3089 void 3090 vfs_unlock(vfs_t *vfsp) 3091 { 3092 vn_vfslocks_entry_t *vpvfsentry; 3093 3094 /* 3095 * vfs_unlock will mimic sema_v behaviour to fix 4748018. 3096 * And these changes should remain for the patch changes as it is. 3097 */ 3098 if (panicstr) 3099 return; 3100 3101 /* 3102 * ve_refcount needs to be dropped twice here. 3103 * 1. To release refernce after a call to vfs_locks_getlock() 3104 * 2. To release the reference from the locking routines like 3105 * vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,. 3106 */ 3107 3108 vpvfsentry = vn_vfslocks_getlock(vfsp); 3109 vn_vfslocks_rele(vpvfsentry); 3110 3111 rwst_exit(&vpvfsentry->ve_lock); 3112 vn_vfslocks_rele(vpvfsentry); 3113 } 3114 3115 /* 3116 * Utility routine that allows a filesystem to construct its 3117 * fsid in "the usual way" - by munging some underlying dev_t and 3118 * the filesystem type number into the 64-bit fsid. Note that 3119 * this implicitly relies on dev_t persistence to make filesystem 3120 * id's persistent. 3121 * 3122 * There's nothing to prevent an individual fs from constructing its 3123 * fsid in a different way, and indeed they should. 3124 * 3125 * Since we want fsids to be 32-bit quantities (so that they can be 3126 * exported identically by either 32-bit or 64-bit APIs, as well as 3127 * the fact that fsid's are "known" to NFS), we compress the device 3128 * number given down to 32-bits, and panic if that isn't possible. 3129 */ 3130 void 3131 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val) 3132 { 3133 if (!cmpldev((dev32_t *)&fsi->val[0], dev)) 3134 panic("device number too big for fsid!"); 3135 fsi->val[1] = val; 3136 } 3137 3138 int 3139 vfs_lock_held(vfs_t *vfsp) 3140 { 3141 int held; 3142 vn_vfslocks_entry_t *vpvfsentry; 3143 3144 /* 3145 * vfs_lock_held will mimic sema_held behaviour 3146 * if panicstr is set. And these changes should remain 3147 * for the patch changes as it is. 3148 */ 3149 if (panicstr) 3150 return (1); 3151 3152 vpvfsentry = vn_vfslocks_getlock(vfsp); 3153 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); 3154 3155 vn_vfslocks_rele(vpvfsentry); 3156 return (held); 3157 } 3158 3159 struct _kthread * 3160 vfs_lock_owner(vfs_t *vfsp) 3161 { 3162 struct _kthread *owner; 3163 vn_vfslocks_entry_t *vpvfsentry; 3164 3165 /* 3166 * vfs_wlock_held will mimic sema_held behaviour 3167 * if panicstr is set. And these changes should remain 3168 * for the patch changes as it is. 3169 */ 3170 if (panicstr) 3171 return (NULL); 3172 3173 vpvfsentry = vn_vfslocks_getlock(vfsp); 3174 owner = rwst_owner(&vpvfsentry->ve_lock); 3175 3176 vn_vfslocks_rele(vpvfsentry); 3177 return (owner); 3178 } 3179 3180 /* 3181 * vfs list locking. 3182 * 3183 * Rather than manipulate the vfslist lock directly, we abstract into lock 3184 * and unlock routines to allow the locking implementation to be changed for 3185 * clustering. 3186 * 3187 * Whenever the vfs list is modified through its hash links, the overall list 3188 * lock must be obtained before locking the relevant hash bucket. But to see 3189 * whether a given vfs is on the list, it suffices to obtain the lock for the 3190 * hash bucket without getting the overall list lock. (See getvfs() below.) 3191 */ 3192 3193 void 3194 vfs_list_lock() 3195 { 3196 rw_enter(&vfslist, RW_WRITER); 3197 } 3198 3199 void 3200 vfs_list_read_lock() 3201 { 3202 rw_enter(&vfslist, RW_READER); 3203 } 3204 3205 void 3206 vfs_list_unlock() 3207 { 3208 rw_exit(&vfslist); 3209 } 3210 3211 /* 3212 * Low level worker routines for adding entries to and removing entries from 3213 * the vfs list. 3214 */ 3215 3216 static void 3217 vfs_hash_add(struct vfs *vfsp, int insert_at_head) 3218 { 3219 int vhno; 3220 struct vfs **hp; 3221 dev_t dev; 3222 3223 ASSERT(RW_WRITE_HELD(&vfslist)); 3224 3225 dev = expldev(vfsp->vfs_fsid.val[0]); 3226 vhno = VFSHASH(getmajor(dev), getminor(dev)); 3227 3228 mutex_enter(&rvfs_list[vhno].rvfs_lock); 3229 3230 /* 3231 * Link into the hash table, inserting it at the end, so that LOFS 3232 * with the same fsid as UFS (or other) file systems will not hide the 3233 * UFS. 3234 */ 3235 if (insert_at_head) { 3236 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head; 3237 rvfs_list[vhno].rvfs_head = vfsp; 3238 } else { 3239 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL; 3240 hp = &(*hp)->vfs_hash) 3241 continue; 3242 /* 3243 * hp now contains the address of the pointer to update 3244 * to effect the insertion. 3245 */ 3246 vfsp->vfs_hash = NULL; 3247 *hp = vfsp; 3248 } 3249 3250 rvfs_list[vhno].rvfs_len++; 3251 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3252 } 3253 3254 3255 static void 3256 vfs_hash_remove(struct vfs *vfsp) 3257 { 3258 int vhno; 3259 struct vfs *tvfsp; 3260 dev_t dev; 3261 3262 ASSERT(RW_WRITE_HELD(&vfslist)); 3263 3264 dev = expldev(vfsp->vfs_fsid.val[0]); 3265 vhno = VFSHASH(getmajor(dev), getminor(dev)); 3266 3267 mutex_enter(&rvfs_list[vhno].rvfs_lock); 3268 3269 /* 3270 * Remove from hash. 3271 */ 3272 if (rvfs_list[vhno].rvfs_head == vfsp) { 3273 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash; 3274 rvfs_list[vhno].rvfs_len--; 3275 goto foundit; 3276 } 3277 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL; 3278 tvfsp = tvfsp->vfs_hash) { 3279 if (tvfsp->vfs_hash == vfsp) { 3280 tvfsp->vfs_hash = vfsp->vfs_hash; 3281 rvfs_list[vhno].rvfs_len--; 3282 goto foundit; 3283 } 3284 } 3285 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash"); 3286 3287 foundit: 3288 3289 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3290 } 3291 3292 3293 void 3294 vfs_list_add(struct vfs *vfsp) 3295 { 3296 zone_t *zone; 3297 3298 /* 3299 * The zone that owns the mount is the one that performed the mount. 3300 * Note that this isn't necessarily the same as the zone mounted into. 3301 * The corresponding zone_rele() will be done when the vfs_t is 3302 * being free'd. 3303 */ 3304 vfsp->vfs_zone = curproc->p_zone; 3305 zone_hold(vfsp->vfs_zone); 3306 3307 /* 3308 * Find the zone mounted into, and put this mount on its vfs list. 3309 */ 3310 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3311 ASSERT(zone != NULL); 3312 /* 3313 * Special casing for the root vfs. This structure is allocated 3314 * statically and hooked onto rootvfs at link time. During the 3315 * vfs_mountroot call at system startup time, the root file system's 3316 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct 3317 * as argument. The code below must detect and handle this special 3318 * case. The only apparent justification for this special casing is 3319 * to ensure that the root file system appears at the head of the 3320 * list. 3321 * 3322 * XXX: I'm assuming that it's ok to do normal list locking when 3323 * adding the entry for the root file system (this used to be 3324 * done with no locks held). 3325 */ 3326 vfs_list_lock(); 3327 /* 3328 * Link into the vfs list proper. 3329 */ 3330 if (vfsp == &root) { 3331 /* 3332 * Assert: This vfs is already on the list as its first entry. 3333 * Thus, there's nothing to do. 3334 */ 3335 ASSERT(rootvfs == vfsp); 3336 /* 3337 * Add it to the head of the global zone's vfslist. 3338 */ 3339 ASSERT(zone == global_zone); 3340 ASSERT(zone->zone_vfslist == NULL); 3341 zone->zone_vfslist = vfsp; 3342 } else { 3343 /* 3344 * Link to end of list using vfs_prev (as rootvfs is now a 3345 * doubly linked circular list) so list is in mount order for 3346 * mnttab use. 3347 */ 3348 rootvfs->vfs_prev->vfs_next = vfsp; 3349 vfsp->vfs_prev = rootvfs->vfs_prev; 3350 rootvfs->vfs_prev = vfsp; 3351 vfsp->vfs_next = rootvfs; 3352 3353 /* 3354 * Do it again for the zone-private list (which may be NULL). 3355 */ 3356 if (zone->zone_vfslist == NULL) { 3357 ASSERT(zone != global_zone); 3358 zone->zone_vfslist = vfsp; 3359 } else { 3360 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp; 3361 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev; 3362 zone->zone_vfslist->vfs_zone_prev = vfsp; 3363 vfsp->vfs_zone_next = zone->zone_vfslist; 3364 } 3365 } 3366 3367 /* 3368 * Link into the hash table, inserting it at the end, so that LOFS 3369 * with the same fsid as UFS (or other) file systems will not hide 3370 * the UFS. 3371 */ 3372 vfs_hash_add(vfsp, 0); 3373 3374 /* 3375 * update the mnttab modification time 3376 */ 3377 vfs_mnttab_modtimeupd(); 3378 vfs_list_unlock(); 3379 zone_rele(zone); 3380 } 3381 3382 void 3383 vfs_list_remove(struct vfs *vfsp) 3384 { 3385 zone_t *zone; 3386 3387 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3388 ASSERT(zone != NULL); 3389 /* 3390 * Callers are responsible for preventing attempts to unmount the 3391 * root. 3392 */ 3393 ASSERT(vfsp != rootvfs); 3394 3395 vfs_list_lock(); 3396 3397 /* 3398 * Remove from hash. 3399 */ 3400 vfs_hash_remove(vfsp); 3401 3402 /* 3403 * Remove from vfs list. 3404 */ 3405 vfsp->vfs_prev->vfs_next = vfsp->vfs_next; 3406 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev; 3407 vfsp->vfs_next = vfsp->vfs_prev = NULL; 3408 3409 /* 3410 * Remove from zone-specific vfs list. 3411 */ 3412 if (zone->zone_vfslist == vfsp) 3413 zone->zone_vfslist = vfsp->vfs_zone_next; 3414 3415 if (vfsp->vfs_zone_next == vfsp) { 3416 ASSERT(vfsp->vfs_zone_prev == vfsp); 3417 ASSERT(zone->zone_vfslist == vfsp); 3418 zone->zone_vfslist = NULL; 3419 } 3420 3421 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next; 3422 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev; 3423 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL; 3424 3425 /* 3426 * update the mnttab modification time 3427 */ 3428 vfs_mnttab_modtimeupd(); 3429 vfs_list_unlock(); 3430 zone_rele(zone); 3431 } 3432 3433 struct vfs * 3434 getvfs(fsid_t *fsid) 3435 { 3436 struct vfs *vfsp; 3437 int val0 = fsid->val[0]; 3438 int val1 = fsid->val[1]; 3439 dev_t dev = expldev(val0); 3440 int vhno = VFSHASH(getmajor(dev), getminor(dev)); 3441 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock; 3442 3443 mutex_enter(hmp); 3444 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) { 3445 if (vfsp->vfs_fsid.val[0] == val0 && 3446 vfsp->vfs_fsid.val[1] == val1) { 3447 VFS_HOLD(vfsp); 3448 mutex_exit(hmp); 3449 return (vfsp); 3450 } 3451 } 3452 mutex_exit(hmp); 3453 return (NULL); 3454 } 3455 3456 /* 3457 * Search the vfs mount in progress list for a specified device/vfs entry. 3458 * Returns 0 if the first entry in the list that the device matches has the 3459 * given vfs pointer as well. If the device matches but a different vfs 3460 * pointer is encountered in the list before the given vfs pointer then 3461 * a 1 is returned. 3462 */ 3463 3464 int 3465 vfs_devmounting(dev_t dev, struct vfs *vfsp) 3466 { 3467 int retval = 0; 3468 struct ipmnt *mipp; 3469 3470 mutex_enter(&vfs_miplist_mutex); 3471 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) { 3472 if (mipp->mip_dev == dev) { 3473 if (mipp->mip_vfsp != vfsp) 3474 retval = 1; 3475 break; 3476 } 3477 } 3478 mutex_exit(&vfs_miplist_mutex); 3479 return (retval); 3480 } 3481 3482 /* 3483 * Search the vfs list for a specified device. Returns 1, if entry is found 3484 * or 0 if no suitable entry is found. 3485 */ 3486 3487 int 3488 vfs_devismounted(dev_t dev) 3489 { 3490 struct vfs *vfsp; 3491 int found; 3492 3493 vfs_list_read_lock(); 3494 vfsp = rootvfs; 3495 found = 0; 3496 do { 3497 if (vfsp->vfs_dev == dev) { 3498 found = 1; 3499 break; 3500 } 3501 vfsp = vfsp->vfs_next; 3502 } while (vfsp != rootvfs); 3503 3504 vfs_list_unlock(); 3505 return (found); 3506 } 3507 3508 /* 3509 * Search the vfs list for a specified device. Returns a pointer to it 3510 * or NULL if no suitable entry is found. The caller of this routine 3511 * is responsible for releasing the returned vfs pointer. 3512 */ 3513 struct vfs * 3514 vfs_dev2vfsp(dev_t dev) 3515 { 3516 struct vfs *vfsp; 3517 int found; 3518 3519 vfs_list_read_lock(); 3520 vfsp = rootvfs; 3521 found = 0; 3522 do { 3523 /* 3524 * The following could be made more efficient by making 3525 * the entire loop use vfs_zone_next if the call is from 3526 * a zone. The only callers, however, ustat(2) and 3527 * umount2(2), don't seem to justify the added 3528 * complexity at present. 3529 */ 3530 if (vfsp->vfs_dev == dev && 3531 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt), 3532 curproc->p_zone)) { 3533 VFS_HOLD(vfsp); 3534 found = 1; 3535 break; 3536 } 3537 vfsp = vfsp->vfs_next; 3538 } while (vfsp != rootvfs); 3539 vfs_list_unlock(); 3540 return (found ? vfsp: NULL); 3541 } 3542 3543 /* 3544 * Search the vfs list for a specified mntpoint. Returns a pointer to it 3545 * or NULL if no suitable entry is found. The caller of this routine 3546 * is responsible for releasing the returned vfs pointer. 3547 * 3548 * Note that if multiple mntpoints match, the last one matching is 3549 * returned in an attempt to return the "top" mount when overlay 3550 * mounts are covering the same mount point. This is accomplished by starting 3551 * at the end of the list and working our way backwards, stopping at the first 3552 * matching mount. 3553 */ 3554 struct vfs * 3555 vfs_mntpoint2vfsp(const char *mp) 3556 { 3557 struct vfs *vfsp; 3558 struct vfs *retvfsp = NULL; 3559 zone_t *zone = curproc->p_zone; 3560 struct vfs *list; 3561 3562 vfs_list_read_lock(); 3563 if (getzoneid() == GLOBAL_ZONEID) { 3564 /* 3565 * The global zone may see filesystems in any zone. 3566 */ 3567 vfsp = rootvfs->vfs_prev; 3568 do { 3569 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) { 3570 retvfsp = vfsp; 3571 break; 3572 } 3573 vfsp = vfsp->vfs_prev; 3574 } while (vfsp != rootvfs->vfs_prev); 3575 } else if ((list = zone->zone_vfslist) != NULL) { 3576 const char *mntpt; 3577 3578 vfsp = list->vfs_zone_prev; 3579 do { 3580 mntpt = refstr_value(vfsp->vfs_mntpt); 3581 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); 3582 if (strcmp(mntpt, mp) == 0) { 3583 retvfsp = vfsp; 3584 break; 3585 } 3586 vfsp = vfsp->vfs_zone_prev; 3587 } while (vfsp != list->vfs_zone_prev); 3588 } 3589 if (retvfsp) 3590 VFS_HOLD(retvfsp); 3591 vfs_list_unlock(); 3592 return (retvfsp); 3593 } 3594 3595 /* 3596 * Search the vfs list for a specified vfsops. 3597 * if vfs entry is found then return 1, else 0. 3598 */ 3599 int 3600 vfs_opsinuse(vfsops_t *ops) 3601 { 3602 struct vfs *vfsp; 3603 int found; 3604 3605 vfs_list_read_lock(); 3606 vfsp = rootvfs; 3607 found = 0; 3608 do { 3609 if (vfs_getops(vfsp) == ops) { 3610 found = 1; 3611 break; 3612 } 3613 vfsp = vfsp->vfs_next; 3614 } while (vfsp != rootvfs); 3615 vfs_list_unlock(); 3616 return (found); 3617 } 3618 3619 /* 3620 * Allocate an entry in vfssw for a file system type 3621 */ 3622 struct vfssw * 3623 allocate_vfssw(char *type) 3624 { 3625 struct vfssw *vswp; 3626 3627 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) { 3628 /* 3629 * The vfssw table uses the empty string to identify an 3630 * available entry; we cannot add any type which has 3631 * a leading NUL. The string length is limited to 3632 * the size of the st_fstype array in struct stat. 3633 */ 3634 return (NULL); 3635 } 3636 3637 ASSERT(VFSSW_WRITE_LOCKED()); 3638 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) 3639 if (!ALLOCATED_VFSSW(vswp)) { 3640 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP); 3641 (void) strcpy(vswp->vsw_name, type); 3642 ASSERT(vswp->vsw_count == 0); 3643 vswp->vsw_count = 1; 3644 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL); 3645 return (vswp); 3646 } 3647 return (NULL); 3648 } 3649 3650 /* 3651 * Impose additional layer of translation between vfstype names 3652 * and module names in the filesystem. 3653 */ 3654 static char * 3655 vfs_to_modname(char *vfstype) 3656 { 3657 if (strcmp(vfstype, "proc") == 0) { 3658 vfstype = "procfs"; 3659 } else if (strcmp(vfstype, "fd") == 0) { 3660 vfstype = "fdfs"; 3661 } else if (strncmp(vfstype, "nfs", 3) == 0) { 3662 vfstype = "nfs"; 3663 } 3664 3665 return (vfstype); 3666 } 3667 3668 /* 3669 * Find a vfssw entry given a file system type name. 3670 * Try to autoload the filesystem if it's not found. 3671 * If it's installed, return the vfssw locked to prevent unloading. 3672 */ 3673 struct vfssw * 3674 vfs_getvfssw(char *type) 3675 { 3676 struct vfssw *vswp; 3677 char *modname; 3678 3679 RLOCK_VFSSW(); 3680 vswp = vfs_getvfsswbyname(type); 3681 modname = vfs_to_modname(type); 3682 3683 if (rootdir == NULL) { 3684 /* 3685 * If we haven't yet loaded the root file system, then our 3686 * _init won't be called until later. Allocate vfssw entry, 3687 * because mod_installfs won't be called. 3688 */ 3689 if (vswp == NULL) { 3690 RUNLOCK_VFSSW(); 3691 WLOCK_VFSSW(); 3692 if ((vswp = vfs_getvfsswbyname(type)) == NULL) { 3693 if ((vswp = allocate_vfssw(type)) == NULL) { 3694 WUNLOCK_VFSSW(); 3695 return (NULL); 3696 } 3697 } 3698 WUNLOCK_VFSSW(); 3699 RLOCK_VFSSW(); 3700 } 3701 if (!VFS_INSTALLED(vswp)) { 3702 RUNLOCK_VFSSW(); 3703 (void) modloadonly("fs", modname); 3704 } else 3705 RUNLOCK_VFSSW(); 3706 return (vswp); 3707 } 3708 3709 /* 3710 * Try to load the filesystem. Before calling modload(), we drop 3711 * our lock on the VFS switch table, and pick it up after the 3712 * module is loaded. However, there is a potential race: the 3713 * module could be unloaded after the call to modload() completes 3714 * but before we pick up the lock and drive on. Therefore, 3715 * we keep reloading the module until we've loaded the module 3716 * _and_ we have the lock on the VFS switch table. 3717 */ 3718 while (vswp == NULL || !VFS_INSTALLED(vswp)) { 3719 RUNLOCK_VFSSW(); 3720 if (modload("fs", modname) == -1) 3721 return (NULL); 3722 RLOCK_VFSSW(); 3723 if (vswp == NULL) 3724 if ((vswp = vfs_getvfsswbyname(type)) == NULL) 3725 break; 3726 } 3727 RUNLOCK_VFSSW(); 3728 3729 return (vswp); 3730 } 3731 3732 /* 3733 * Find a vfssw entry given a file system type name. 3734 */ 3735 struct vfssw * 3736 vfs_getvfsswbyname(char *type) 3737 { 3738 struct vfssw *vswp; 3739 3740 ASSERT(VFSSW_LOCKED()); 3741 if (type == NULL || *type == '\0') 3742 return (NULL); 3743 3744 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3745 if (strcmp(type, vswp->vsw_name) == 0) { 3746 vfs_refvfssw(vswp); 3747 return (vswp); 3748 } 3749 } 3750 3751 return (NULL); 3752 } 3753 3754 /* 3755 * Find a vfssw entry given a set of vfsops. 3756 */ 3757 struct vfssw * 3758 vfs_getvfsswbyvfsops(vfsops_t *vfsops) 3759 { 3760 struct vfssw *vswp; 3761 3762 RLOCK_VFSSW(); 3763 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3764 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) { 3765 vfs_refvfssw(vswp); 3766 RUNLOCK_VFSSW(); 3767 return (vswp); 3768 } 3769 } 3770 RUNLOCK_VFSSW(); 3771 3772 return (NULL); 3773 } 3774 3775 /* 3776 * Reference a vfssw entry. 3777 */ 3778 void 3779 vfs_refvfssw(struct vfssw *vswp) 3780 { 3781 3782 mutex_enter(&vswp->vsw_lock); 3783 vswp->vsw_count++; 3784 mutex_exit(&vswp->vsw_lock); 3785 } 3786 3787 /* 3788 * Unreference a vfssw entry. 3789 */ 3790 void 3791 vfs_unrefvfssw(struct vfssw *vswp) 3792 { 3793 3794 mutex_enter(&vswp->vsw_lock); 3795 vswp->vsw_count--; 3796 mutex_exit(&vswp->vsw_lock); 3797 } 3798 3799 int sync_timeout = 30; /* timeout for syncing a page during panic */ 3800 int sync_timeleft; /* portion of sync_timeout remaining */ 3801 3802 static int sync_retries = 20; /* number of retries when not making progress */ 3803 static int sync_triesleft; /* portion of sync_retries remaining */ 3804 3805 static pgcnt_t old_pgcnt, new_pgcnt; 3806 static int new_bufcnt, old_bufcnt; 3807 3808 /* 3809 * Sync all of the mounted filesystems, and then wait for the actual i/o to 3810 * complete. We wait by counting the number of dirty pages and buffers, 3811 * pushing them out using bio_busy() and page_busy(), and then counting again. 3812 * This routine is used during both the uadmin A_SHUTDOWN code as well as 3813 * the SYNC phase of the panic code (see comments in panic.c). It should only 3814 * be used after some higher-level mechanism has quiesced the system so that 3815 * new writes are not being initiated while we are waiting for completion. 3816 * 3817 * To ensure finite running time, our algorithm uses two timeout mechanisms: 3818 * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and 3819 * sync_triesleft (a progress counter used by the vfs_syncall() loop below). 3820 * Together these ensure that syncing completes if our i/o paths are stuck. 3821 * The counters are declared above so they can be found easily in the debugger. 3822 * 3823 * The sync_timeleft counter is reset by bio_busy() and page_busy() using the 3824 * vfs_syncprogress() subroutine whenever we make progress through the lists of 3825 * pages and buffers. It is decremented and expired by the deadman() cyclic. 3826 * When vfs_syncall() decides it is done, we disable the deadman() counter by 3827 * setting sync_timeleft to zero. This timer guards against vfs_syncall() 3828 * deadlocking or hanging inside of a broken filesystem or driver routine. 3829 * 3830 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make 3831 * sync_retries consecutive calls to bio_busy() and page_busy() without 3832 * decreasing either the number of dirty buffers or dirty pages below the 3833 * lowest count we have seen so far, we give up and return from vfs_syncall(). 3834 * 3835 * Each loop iteration ends with a call to delay() one second to allow time for 3836 * i/o completion and to permit the user time to read our progress messages. 3837 */ 3838 void 3839 vfs_syncall(void) 3840 { 3841 if (rootdir == NULL && !modrootloaded) 3842 return; /* panic during boot - no filesystems yet */ 3843 3844 printf("syncing file systems..."); 3845 vfs_syncprogress(); 3846 sync(); 3847 3848 vfs_syncprogress(); 3849 sync_triesleft = sync_retries; 3850 3851 old_bufcnt = new_bufcnt = INT_MAX; 3852 old_pgcnt = new_pgcnt = ULONG_MAX; 3853 3854 while (sync_triesleft > 0) { 3855 old_bufcnt = MIN(old_bufcnt, new_bufcnt); 3856 old_pgcnt = MIN(old_pgcnt, new_pgcnt); 3857 3858 new_bufcnt = bio_busy(B_TRUE); 3859 new_pgcnt = page_busy(B_TRUE); 3860 vfs_syncprogress(); 3861 3862 if (new_bufcnt == 0 && new_pgcnt == 0) 3863 break; 3864 3865 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt) 3866 sync_triesleft = sync_retries; 3867 else 3868 sync_triesleft--; 3869 3870 if (new_bufcnt) 3871 printf(" [%d]", new_bufcnt); 3872 if (new_pgcnt) 3873 printf(" %lu", new_pgcnt); 3874 3875 delay(hz); 3876 } 3877 3878 if (new_bufcnt != 0 || new_pgcnt != 0) 3879 printf(" done (not all i/o completed)\n"); 3880 else 3881 printf(" done\n"); 3882 3883 sync_timeleft = 0; 3884 delay(hz); 3885 } 3886 3887 /* 3888 * If we are in the middle of the sync phase of panic, reset sync_timeleft to 3889 * sync_timeout to indicate that we are making progress and the deadman() 3890 * omnipresent cyclic should not yet time us out. Note that it is safe to 3891 * store to sync_timeleft here since the deadman() is firing at high-level 3892 * on top of us. If we are racing with the deadman(), either the deadman() 3893 * will decrement the old value and then we will reset it, or we will 3894 * reset it and then the deadman() will immediately decrement it. In either 3895 * case, correct behavior results. 3896 */ 3897 void 3898 vfs_syncprogress(void) 3899 { 3900 if (panicstr) 3901 sync_timeleft = sync_timeout; 3902 } 3903 3904 /* 3905 * Map VFS flags to statvfs flags. These shouldn't really be separate 3906 * flags at all. 3907 */ 3908 uint_t 3909 vf_to_stf(uint_t vf) 3910 { 3911 uint_t stf = 0; 3912 3913 if (vf & VFS_RDONLY) 3914 stf |= ST_RDONLY; 3915 if (vf & VFS_NOSETUID) 3916 stf |= ST_NOSUID; 3917 if (vf & VFS_NOTRUNC) 3918 stf |= ST_NOTRUNC; 3919 3920 return (stf); 3921 } 3922 3923 /* 3924 * Entries for (illegal) fstype 0. 3925 */ 3926 /* ARGSUSED */ 3927 int 3928 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr) 3929 { 3930 cmn_err(CE_PANIC, "stray vfs operation"); 3931 return (0); 3932 } 3933 3934 /* 3935 * Entries for (illegal) fstype 0. 3936 */ 3937 int 3938 vfsstray(void) 3939 { 3940 cmn_err(CE_PANIC, "stray vfs operation"); 3941 return (0); 3942 } 3943 3944 /* 3945 * Support for dealing with forced UFS unmount and its interaction with 3946 * LOFS. Could be used by any filesystem. 3947 * See bug 1203132. 3948 */ 3949 int 3950 vfs_EIO(void) 3951 { 3952 return (EIO); 3953 } 3954 3955 /* 3956 * We've gotta define the op for sync separately, since the compiler gets 3957 * confused if we mix and match ANSI and normal style prototypes when 3958 * a "short" argument is present and spits out a warning. 3959 */ 3960 /*ARGSUSED*/ 3961 int 3962 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr) 3963 { 3964 return (EIO); 3965 } 3966 3967 vfs_t EIO_vfs; 3968 vfsops_t *EIO_vfsops; 3969 3970 /* 3971 * Called from startup() to initialize all loaded vfs's 3972 */ 3973 void 3974 vfsinit(void) 3975 { 3976 struct vfssw *vswp; 3977 int error; 3978 extern int vopstats_enabled; 3979 extern void vopstats_startup(); 3980 3981 static const fs_operation_def_t EIO_vfsops_template[] = { 3982 VFSNAME_MOUNT, { .error = vfs_EIO }, 3983 VFSNAME_UNMOUNT, { .error = vfs_EIO }, 3984 VFSNAME_ROOT, { .error = vfs_EIO }, 3985 VFSNAME_STATVFS, { .error = vfs_EIO }, 3986 VFSNAME_SYNC, { .vfs_sync = vfs_EIO_sync }, 3987 VFSNAME_VGET, { .error = vfs_EIO }, 3988 VFSNAME_MOUNTROOT, { .error = vfs_EIO }, 3989 VFSNAME_FREEVFS, { .error = vfs_EIO }, 3990 VFSNAME_VNSTATE, { .error = vfs_EIO }, 3991 NULL, NULL 3992 }; 3993 3994 static const fs_operation_def_t stray_vfsops_template[] = { 3995 VFSNAME_MOUNT, { .error = vfsstray }, 3996 VFSNAME_UNMOUNT, { .error = vfsstray }, 3997 VFSNAME_ROOT, { .error = vfsstray }, 3998 VFSNAME_STATVFS, { .error = vfsstray }, 3999 VFSNAME_SYNC, { .vfs_sync = vfsstray_sync }, 4000 VFSNAME_VGET, { .error = vfsstray }, 4001 VFSNAME_MOUNTROOT, { .error = vfsstray }, 4002 VFSNAME_FREEVFS, { .error = vfsstray }, 4003 VFSNAME_VNSTATE, { .error = vfsstray }, 4004 NULL, NULL 4005 }; 4006 4007 /* Create vfs cache */ 4008 vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs), 4009 sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0); 4010 4011 /* Initialize the vnode cache (file systems may use it during init). */ 4012 vn_create_cache(); 4013 4014 /* Setup event monitor framework */ 4015 fem_init(); 4016 4017 /* Initialize the dummy stray file system type. */ 4018 error = vfs_setfsops(0, stray_vfsops_template, NULL); 4019 4020 /* Initialize the dummy EIO file system. */ 4021 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops); 4022 if (error != 0) { 4023 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template"); 4024 /* Shouldn't happen, but not bad enough to panic */ 4025 } 4026 4027 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL); 4028 4029 /* 4030 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup 4031 * on this vfs can immediately notice it's invalid. 4032 */ 4033 EIO_vfs.vfs_flag |= VFS_UNMOUNTED; 4034 4035 /* 4036 * Call the init routines of non-loadable filesystems only. 4037 * Filesystems which are loaded as separate modules will be 4038 * initialized by the module loading code instead. 4039 */ 4040 4041 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 4042 RLOCK_VFSSW(); 4043 if (vswp->vsw_init != NULL) 4044 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name); 4045 RUNLOCK_VFSSW(); 4046 } 4047 4048 vopstats_startup(); 4049 4050 if (vopstats_enabled) { 4051 /* EIO_vfs can collect stats, but we don't retrieve them */ 4052 initialize_vopstats(&EIO_vfs.vfs_vopstats); 4053 EIO_vfs.vfs_fstypevsp = NULL; 4054 EIO_vfs.vfs_vskap = NULL; 4055 EIO_vfs.vfs_flag |= VFS_STATS; 4056 } 4057 4058 xattr_init(); 4059 } 4060 4061 vfs_t * 4062 vfs_alloc(int kmflag) 4063 { 4064 vfs_t *vfsp; 4065 4066 vfsp = kmem_cache_alloc(vfs_cache, kmflag); 4067 4068 /* 4069 * Do the simplest initialization here. 4070 * Everything else gets done in vfs_init() 4071 */ 4072 bzero(vfsp, sizeof (vfs_t)); 4073 return (vfsp); 4074 } 4075 4076 void 4077 vfs_free(vfs_t *vfsp) 4078 { 4079 /* 4080 * One would be tempted to assert that "vfsp->vfs_count == 0". 4081 * The problem is that this gets called out of domount() with 4082 * a partially initialized vfs and a vfs_count of 1. This is 4083 * also called from vfs_rele() with a vfs_count of 0. We can't 4084 * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully 4085 * returned. This is because VFS_MOUNT() fully initializes the 4086 * vfs structure and its associated data. VFS_RELE() will call 4087 * VFS_FREEVFS() which may panic the system if the data structures 4088 * aren't fully initialized from a successful VFS_MOUNT()). 4089 */ 4090 4091 /* If FEM was in use, make sure everything gets cleaned up */ 4092 if (vfsp->vfs_femhead) { 4093 ASSERT(vfsp->vfs_femhead->femh_list == NULL); 4094 mutex_destroy(&vfsp->vfs_femhead->femh_lock); 4095 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead))); 4096 vfsp->vfs_femhead = NULL; 4097 } 4098 4099 if (vfsp->vfs_implp) 4100 vfsimpl_teardown(vfsp); 4101 sema_destroy(&vfsp->vfs_reflock); 4102 kmem_cache_free(vfs_cache, vfsp); 4103 } 4104 4105 /* 4106 * Increments the vfs reference count by one atomically. 4107 */ 4108 void 4109 vfs_hold(vfs_t *vfsp) 4110 { 4111 atomic_add_32(&vfsp->vfs_count, 1); 4112 ASSERT(vfsp->vfs_count != 0); 4113 } 4114 4115 /* 4116 * Decrements the vfs reference count by one atomically. When 4117 * vfs reference count becomes zero, it calls the file system 4118 * specific vfs_freevfs() to free up the resources. 4119 */ 4120 void 4121 vfs_rele(vfs_t *vfsp) 4122 { 4123 ASSERT(vfsp->vfs_count != 0); 4124 if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) { 4125 VFS_FREEVFS(vfsp); 4126 if (vfsp->vfs_zone) 4127 zone_rele(vfsp->vfs_zone); 4128 vfs_freemnttab(vfsp); 4129 vfs_free(vfsp); 4130 } 4131 } 4132 4133 /* 4134 * Generic operations vector support. 4135 * 4136 * This is used to build operations vectors for both the vfs and vnode. 4137 * It's normally called only when a file system is loaded. 4138 * 4139 * There are many possible algorithms for this, including the following: 4140 * 4141 * (1) scan the list of known operations; for each, see if the file system 4142 * includes an entry for it, and fill it in as appropriate. 4143 * 4144 * (2) set up defaults for all known operations. scan the list of ops 4145 * supplied by the file system; for each which is both supplied and 4146 * known, fill it in. 4147 * 4148 * (3) sort the lists of known ops & supplied ops; scan the list, filling 4149 * in entries as we go. 4150 * 4151 * we choose (1) for simplicity, and because performance isn't critical here. 4152 * note that (2) could be sped up using a precomputed hash table on known ops. 4153 * (3) could be faster than either, but only if the lists were very large or 4154 * supplied in sorted order. 4155 * 4156 */ 4157 4158 int 4159 fs_build_vector(void *vector, int *unused_ops, 4160 const fs_operation_trans_def_t *translation, 4161 const fs_operation_def_t *operations) 4162 { 4163 int i, num_trans, num_ops, used; 4164 4165 /* 4166 * Count the number of translations and the number of supplied 4167 * operations. 4168 */ 4169 4170 { 4171 const fs_operation_trans_def_t *p; 4172 4173 for (num_trans = 0, p = translation; 4174 p->name != NULL; 4175 num_trans++, p++) 4176 ; 4177 } 4178 4179 { 4180 const fs_operation_def_t *p; 4181 4182 for (num_ops = 0, p = operations; 4183 p->name != NULL; 4184 num_ops++, p++) 4185 ; 4186 } 4187 4188 /* Walk through each operation known to our caller. There will be */ 4189 /* one entry in the supplied "translation table" for each. */ 4190 4191 used = 0; 4192 4193 for (i = 0; i < num_trans; i++) { 4194 int j, found; 4195 char *curname; 4196 fs_generic_func_p result; 4197 fs_generic_func_p *location; 4198 4199 curname = translation[i].name; 4200 4201 /* Look for a matching operation in the list supplied by the */ 4202 /* file system. */ 4203 4204 found = 0; 4205 4206 for (j = 0; j < num_ops; j++) { 4207 if (strcmp(operations[j].name, curname) == 0) { 4208 used++; 4209 found = 1; 4210 break; 4211 } 4212 } 4213 4214 /* 4215 * If the file system is using a "placeholder" for default 4216 * or error functions, grab the appropriate function out of 4217 * the translation table. If the file system didn't supply 4218 * this operation at all, use the default function. 4219 */ 4220 4221 if (found) { 4222 result = operations[j].func.fs_generic; 4223 if (result == fs_default) { 4224 result = translation[i].defaultFunc; 4225 } else if (result == fs_error) { 4226 result = translation[i].errorFunc; 4227 } else if (result == NULL) { 4228 /* Null values are PROHIBITED */ 4229 return (EINVAL); 4230 } 4231 } else { 4232 result = translation[i].defaultFunc; 4233 } 4234 4235 /* Now store the function into the operations vector. */ 4236 4237 location = (fs_generic_func_p *) 4238 (((char *)vector) + translation[i].offset); 4239 4240 *location = result; 4241 } 4242 4243 *unused_ops = num_ops - used; 4244 4245 return (0); 4246 } 4247 4248 /* Placeholder functions, should never be called. */ 4249 4250 int 4251 fs_error(void) 4252 { 4253 cmn_err(CE_PANIC, "fs_error called"); 4254 return (0); 4255 } 4256 4257 int 4258 fs_default(void) 4259 { 4260 cmn_err(CE_PANIC, "fs_default called"); 4261 return (0); 4262 } 4263 4264 #ifdef __sparc 4265 4266 /* 4267 * Part of the implementation of booting off a mirrored root 4268 * involves a change of dev_t for the root device. To 4269 * accomplish this, first remove the existing hash table 4270 * entry for the root device, convert to the new dev_t, 4271 * then re-insert in the hash table at the head of the list. 4272 */ 4273 void 4274 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype) 4275 { 4276 vfs_list_lock(); 4277 4278 vfs_hash_remove(vfsp); 4279 4280 vfsp->vfs_dev = ndev; 4281 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype); 4282 4283 vfs_hash_add(vfsp, 1); 4284 4285 vfs_list_unlock(); 4286 } 4287 4288 #else /* x86 NEWBOOT */ 4289 4290 #if defined(__x86) 4291 extern int hvmboot_rootconf(); 4292 #endif /* __x86 */ 4293 4294 int 4295 rootconf() 4296 { 4297 int error; 4298 struct vfssw *vsw; 4299 extern void pm_init(); 4300 char *fstyp, *fsmod; 4301 4302 getrootfs(&fstyp, &fsmod); 4303 4304 #if defined(__x86) 4305 /* 4306 * hvmboot_rootconf() is defined in the hvm_bootstrap misc module, 4307 * which lives in /platform/i86hvm, and hence is only available when 4308 * booted in an x86 hvm environment. If the hvm_bootstrap misc module 4309 * is not available then the modstub for this function will return 0. 4310 * If the hvm_bootstrap misc module is available it will be loaded 4311 * and hvmboot_rootconf() will be invoked. 4312 */ 4313 if (error = hvmboot_rootconf()) 4314 return (error); 4315 #endif /* __x86 */ 4316 4317 if (error = clboot_rootconf()) 4318 return (error); 4319 4320 if (modload("fs", fsmod) == -1) 4321 panic("Cannot _init %s module", fsmod); 4322 4323 RLOCK_VFSSW(); 4324 vsw = vfs_getvfsswbyname(fstyp); 4325 RUNLOCK_VFSSW(); 4326 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0); 4327 VFS_HOLD(rootvfs); 4328 4329 /* always mount readonly first */ 4330 rootvfs->vfs_flag |= VFS_RDONLY; 4331 4332 pm_init(); 4333 4334 if (netboot) 4335 (void) strplumb(); 4336 4337 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT); 4338 vfs_unrefvfssw(vsw); 4339 rootdev = rootvfs->vfs_dev; 4340 4341 if (error) 4342 panic("cannot mount root path %s", rootfs.bo_name); 4343 return (error); 4344 } 4345 4346 /* 4347 * XXX this is called by nfs only and should probably be removed 4348 * If booted with ASKNAME, prompt on the console for a filesystem 4349 * name and return it. 4350 */ 4351 void 4352 getfsname(char *askfor, char *name, size_t namelen) 4353 { 4354 if (boothowto & RB_ASKNAME) { 4355 printf("%s name: ", askfor); 4356 console_gets(name, namelen); 4357 } 4358 } 4359 4360 /* 4361 * If server_path exists, then we are booting a diskless 4362 * client. Otherwise, we default to ufs. Zfs should perhaps be 4363 * another property. 4364 */ 4365 static void 4366 getrootfs(char **fstypp, char **fsmodp) 4367 { 4368 extern char *strplumb_get_netdev_path(void); 4369 char *propstr = NULL; 4370 4371 /* check fstype property; it should be nfsdyn for diskless */ 4372 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4373 DDI_PROP_DONTPASS, "fstype", &propstr) 4374 == DDI_SUCCESS) { 4375 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME); 4376 ddi_prop_free(propstr); 4377 4378 /* 4379 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set, 4380 * assume the type of this root filesystem is 'zfs'. 4381 */ 4382 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4383 DDI_PROP_DONTPASS, "zfs-bootfs", &propstr) 4384 == DDI_SUCCESS) { 4385 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME); 4386 ddi_prop_free(propstr); 4387 } 4388 4389 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) { 4390 *fstypp = *fsmodp = rootfs.bo_fstype; 4391 return; 4392 } 4393 4394 ++netboot; 4395 /* 4396 * check if path to network interface is specified in bootpath 4397 * or by a hypervisor domain configuration file. 4398 * XXPV - enable strlumb_get_netdev_path() 4399 */ 4400 if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS, 4401 "xpv-nfsroot")) { 4402 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0"); 4403 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4404 DDI_PROP_DONTPASS, "bootpath", &propstr) 4405 == DDI_SUCCESS) { 4406 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME); 4407 ddi_prop_free(propstr); 4408 } else { 4409 /* attempt to determine netdev_path via boot_mac address */ 4410 netdev_path = strplumb_get_netdev_path(); 4411 if (netdev_path == NULL) 4412 panic("cannot find boot network interface"); 4413 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME); 4414 } 4415 *fstypp = rootfs.bo_fstype; 4416 *fsmodp = "nfs"; 4417 } 4418 #endif 4419 4420 /* 4421 * VFS feature routines 4422 */ 4423 4424 #define VFTINDEX(feature) (((feature) >> 32) & 0xFFFFFFFF) 4425 #define VFTBITS(feature) ((feature) & 0xFFFFFFFFLL) 4426 4427 /* Register a feature in the vfs */ 4428 void 4429 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature) 4430 { 4431 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */ 4432 if (vfsp->vfs_implp == NULL) 4433 return; 4434 4435 vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature); 4436 } 4437 4438 /* 4439 * Query a vfs for a feature. 4440 * Returns 1 if feature is present, 0 if not 4441 */ 4442 int 4443 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature) 4444 { 4445 int ret = 0; 4446 4447 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */ 4448 if (vfsp->vfs_implp == NULL) 4449 return (ret); 4450 4451 if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature)) 4452 ret = 1; 4453 4454 return (ret); 4455 } 4456 4457 /* 4458 * Propagate feature set from one vfs to another 4459 */ 4460 void 4461 vfs_propagate_features(vfs_t *from, vfs_t *to) 4462 { 4463 int i; 4464 4465 if (to->vfs_implp == NULL || from->vfs_implp == NULL) 4466 return; 4467 4468 for (i = 1; i <= to->vfs_featureset[0]; i++) { 4469 to->vfs_featureset[i] = from->vfs_featureset[i]; 4470 } 4471 } 4472