1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 
37 */ 38 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 #include <sys/types.h> 43 #include <sys/t_lock.h> 44 #include <sys/param.h> 45 #include <sys/errno.h> 46 #include <sys/user.h> 47 #include <sys/fstyp.h> 48 #include <sys/kmem.h> 49 #include <sys/systm.h> 50 #include <sys/proc.h> 51 #include <sys/mount.h> 52 #include <sys/vfs.h> 53 #include <sys/vfs_opreg.h> 54 #include <sys/fem.h> 55 #include <sys/mntent.h> 56 #include <sys/stat.h> 57 #include <sys/statvfs.h> 58 #include <sys/statfs.h> 59 #include <sys/cred.h> 60 #include <sys/vnode.h> 61 #include <sys/rwstlock.h> 62 #include <sys/dnlc.h> 63 #include <sys/file.h> 64 #include <sys/time.h> 65 #include <sys/atomic.h> 66 #include <sys/cmn_err.h> 67 #include <sys/buf.h> 68 #include <sys/swap.h> 69 #include <sys/debug.h> 70 #include <sys/vnode.h> 71 #include <sys/modctl.h> 72 #include <sys/ddi.h> 73 #include <sys/pathname.h> 74 #include <sys/bootconf.h> 75 #include <sys/dumphdr.h> 76 #include <sys/dc_ki.h> 77 #include <sys/poll.h> 78 #include <sys/sunddi.h> 79 #include <sys/sysmacros.h> 80 #include <sys/zone.h> 81 #include <sys/policy.h> 82 #include <sys/ctfs.h> 83 #include <sys/objfs.h> 84 #include <sys/console.h> 85 #include <sys/reboot.h> 86 87 #include <vm/page.h> 88 89 #include <fs/fs_subr.h> 90 91 /* Private interfaces to create vopstats-related data structures */ 92 extern void initialize_vopstats(vopstats_t *); 93 extern vopstats_t *get_fstype_vopstats(struct vfs *, struct vfssw *); 94 extern vsk_anchor_t *get_vskstat_anchor(struct vfs *); 95 96 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int); 97 static void vfs_setmntopt_nolock(mntopts_t *, const char *, 98 const char *, int, int); 99 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **); 100 static void vfs_freemnttab(struct vfs *); 101 static void vfs_freeopt(mntopt_t *); 102 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *); 103 static void vfs_swapopttbl(mntopts_t *, mntopts_t *); 104 static void 
vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int); 105 static void vfs_createopttbl_extend(mntopts_t *, const char *, 106 const mntopts_t *); 107 static char **vfs_copycancelopt_extend(char **const, int); 108 static void vfs_freecancelopt(char **); 109 static char *getrootfs(void); 110 static int getmacpath(dev_info_t *, void *); 111 static void vfs_mnttabvp_setup(void); 112 113 struct ipmnt { 114 struct ipmnt *mip_next; 115 dev_t mip_dev; 116 struct vfs *mip_vfsp; 117 }; 118 119 static kmutex_t vfs_miplist_mutex; 120 static struct ipmnt *vfs_miplist = NULL; 121 static struct ipmnt *vfs_miplist_end = NULL; 122 123 /* 124 * VFS global data. 125 */ 126 vnode_t *rootdir; /* pointer to root inode vnode. */ 127 vnode_t *devicesdir; /* pointer to inode of devices root */ 128 vnode_t *devdir; /* pointer to inode of dev root */ 129 130 char *server_rootpath; /* root path for diskless clients */ 131 char *server_hostname; /* hostname of diskless server */ 132 133 static struct vfs root; 134 static struct vfs devices; 135 static struct vfs dev; 136 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */ 137 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */ 138 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */ 139 /* must be power of 2! */ 140 timespec_t vfs_mnttab_ctime; /* mnttab created time */ 141 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */ 142 char *vfs_dummyfstype = "\0"; 143 struct pollhead vfs_pollhd; /* for mnttab pollers */ 144 struct vnode *vfs_mntdummyvp; /* to fake mnttab read/write for file events */ 145 int mntfstype; /* will be set once mnt fs is mounted */ 146 147 /* 148 * Table for generic options recognized in the VFS layer and acted 149 * on at this level before parsing file system specific options. 150 * The nosuid option is stronger than any of the devices and setuid 151 * options, so those are canceled when nosuid is seen. 
152 * 153 * All options which are added here need to be added to the 154 * list of standard options in usr/src/cmd/fs.d/fslib.c as well. 155 */ 156 /* 157 * VFS Mount options table 158 */ 159 static char *ro_cancel[] = { MNTOPT_RW, NULL }; 160 static char *rw_cancel[] = { MNTOPT_RO, NULL }; 161 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL }; 162 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES, 163 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL }; 164 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL }; 165 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL }; 166 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL }; 167 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL }; 168 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL }; 169 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL }; 170 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL }; 171 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL }; 172 173 static const mntopt_t mntopts[] = { 174 /* 175 * option name cancel options default arg flags 176 */ 177 { MNTOPT_REMOUNT, NULL, NULL, 178 MO_NODISPLAY, (void *)0 }, 179 { MNTOPT_RO, ro_cancel, NULL, 0, 180 (void *)0 }, 181 { MNTOPT_RW, rw_cancel, NULL, 0, 182 (void *)0 }, 183 { MNTOPT_SUID, suid_cancel, NULL, 0, 184 (void *)0 }, 185 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0, 186 (void *)0 }, 187 { MNTOPT_DEVICES, devices_cancel, NULL, 0, 188 (void *)0 }, 189 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0, 190 (void *)0 }, 191 { MNTOPT_SETUID, setuid_cancel, NULL, 0, 192 (void *)0 }, 193 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0, 194 (void *)0 }, 195 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0, 196 (void *)0 }, 197 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0, 198 (void *)0 }, 199 { MNTOPT_EXEC, exec_cancel, NULL, 0, 200 (void *)0 }, 201 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0, 202 (void *)0 }, 203 }; 204 205 const mntopts_t vfs_mntopts = { 206 sizeof (mntopts) / sizeof (mntopt_t), 207 (mntopt_t *)&mntopts[0] 
208 }; 209 210 /* 211 * File system operation dispatch functions. 212 */ 213 214 int 215 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 216 { 217 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr); 218 } 219 220 int 221 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr) 222 { 223 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr); 224 } 225 226 int 227 fsop_root(vfs_t *vfsp, vnode_t **vpp) 228 { 229 refstr_t *mntpt; 230 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp); 231 /* 232 * Make sure this root has a path. With lofs, it is possible to have 233 * a NULL mountpoint. 234 */ 235 if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { 236 mntpt = vfs_getmntpoint(vfsp); 237 vn_setpath_str(*vpp, refstr_value(mntpt), 238 strlen(refstr_value(mntpt))); 239 refstr_rele(mntpt); 240 } 241 242 return (ret); 243 } 244 245 int 246 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp) 247 { 248 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp); 249 } 250 251 int 252 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr) 253 { 254 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr); 255 } 256 257 int 258 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 259 { 260 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp); 261 } 262 263 int 264 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason) 265 { 266 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason); 267 } 268 269 void 270 fsop_freefs(vfs_t *vfsp) 271 { 272 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp); 273 } 274 275 int 276 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate) 277 { 278 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate)); 279 } 280 281 int 282 fsop_sync_by_kind(int fstype, short flag, cred_t *cr) 283 { 284 ASSERT((fstype >= 0) && (fstype < nfstype)); 285 286 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype])) 287 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr); 288 else 289 return (ENOTSUP); 290 } 291 292 /* 293 * File system initialization. 
vfs_setfsops() must be called from a file 294 * system's init routine. 295 */ 296 297 static int 298 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual, 299 int *unused_ops) 300 { 301 static const fs_operation_trans_def_t vfs_ops_table[] = { 302 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount), 303 fs_nosys, fs_nosys, 304 305 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount), 306 fs_nosys, fs_nosys, 307 308 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root), 309 fs_nosys, fs_nosys, 310 311 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs), 312 fs_nosys, fs_nosys, 313 314 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync), 315 (fs_generic_func_p) fs_sync, 316 (fs_generic_func_p) fs_sync, /* No errors allowed */ 317 318 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget), 319 fs_nosys, fs_nosys, 320 321 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot), 322 fs_nosys, fs_nosys, 323 324 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs), 325 (fs_generic_func_p)fs_freevfs, 326 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */ 327 328 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate), 329 (fs_generic_func_p)fs_nosys, 330 (fs_generic_func_p)fs_nosys, 331 332 NULL, 0, NULL, NULL 333 }; 334 335 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template)); 336 } 337 338 int 339 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual) 340 { 341 int error; 342 int unused_ops; 343 344 /* 345 * Verify that fstype refers to a valid fs. Note that 346 * 0 is valid since it's used to set "stray" ops. 347 */ 348 if ((fstype < 0) || (fstype >= nfstype)) 349 return (EINVAL); 350 351 if (!ALLOCATED_VFSSW(&vfssw[fstype])) 352 return (EINVAL); 353 354 /* Set up the operations vector. 
*/ 355 356 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops); 357 358 if (error != 0) 359 return (error); 360 361 vfssw[fstype].vsw_flag |= VSW_INSTALLED; 362 363 if (actual != NULL) 364 *actual = &vfssw[fstype].vsw_vfsops; 365 366 #if DEBUG 367 if (unused_ops != 0) 368 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied " 369 "but not used", vfssw[fstype].vsw_name, unused_ops); 370 #endif 371 372 return (0); 373 } 374 375 int 376 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual) 377 { 378 int error; 379 int unused_ops; 380 381 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP); 382 383 error = fs_copyfsops(template, *actual, &unused_ops); 384 if (error != 0) { 385 kmem_free(*actual, sizeof (vfsops_t)); 386 *actual = NULL; 387 return (error); 388 } 389 390 return (0); 391 } 392 393 /* 394 * Free a vfsops structure created as a result of vfs_makefsops(). 395 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use 396 * vfs_freevfsops_by_type(). 397 */ 398 void 399 vfs_freevfsops(vfsops_t *vfsops) 400 { 401 kmem_free(vfsops, sizeof (vfsops_t)); 402 } 403 404 /* 405 * Since the vfsops structure is part of the vfssw table and wasn't 406 * really allocated, we're not really freeing anything. We keep 407 * the name for consistency with vfs_freevfsops(). We do, however, 408 * need to take care of a little bookkeeping. 409 * NOTE: For a vfsops structure created by vfs_setfsops(), use 410 * vfs_freevfsops_by_type(). 411 */ 412 int 413 vfs_freevfsops_by_type(int fstype) 414 { 415 416 /* Verify that fstype refers to a loaded fs (and not fsid 0). 
*/ 417 if ((fstype <= 0) || (fstype >= nfstype)) 418 return (EINVAL); 419 420 WLOCK_VFSSW(); 421 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) { 422 WUNLOCK_VFSSW(); 423 return (EINVAL); 424 } 425 426 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED; 427 WUNLOCK_VFSSW(); 428 429 return (0); 430 } 431 432 /* Support routines used to reference vfs_op */ 433 434 /* Set the operations vector for a vfs */ 435 void 436 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops) 437 { 438 vfsops_t *op; 439 440 ASSERT(vfsp != NULL); 441 ASSERT(vfsops != NULL); 442 443 op = vfsp->vfs_op; 444 membar_consumer(); 445 if ((vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) && 446 casptr(&vfsp->vfs_op, op, vfsops) == op) { 447 return; 448 } 449 fsem_setvfsops(vfsp, vfsops); 450 } 451 452 /* Retrieve the operations vector for a vfs */ 453 vfsops_t * 454 vfs_getops(vfs_t *vfsp) 455 { 456 vfsops_t *op; 457 458 ASSERT(vfsp != NULL); 459 460 op = vfsp->vfs_op; 461 membar_consumer(); 462 if ((vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) && 463 op == vfsp->vfs_op) { 464 return (op); 465 } else { 466 return (fsem_getvfsops(vfsp)); 467 } 468 } 469 470 /* 471 * Returns non-zero (1) if the vfsops matches that of the vfs. 472 * Returns zero (0) if not. 473 */ 474 int 475 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops) 476 { 477 return (vfs_getops(vfsp) == vfsops); 478 } 479 480 /* 481 * Returns non-zero (1) if the file system has installed a non-default, 482 * non-error vfs_sync routine. Returns zero (0) otherwise. 483 */ 484 int 485 vfs_can_sync(vfs_t *vfsp) 486 { 487 /* vfs_sync() routine is not the default/error function */ 488 return (vfs_getops(vfsp)->vfs_sync != fs_sync); 489 } 490 491 /* 492 * Initialize a vfs structure. 
493 */ 494 void 495 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data) 496 { 497 vfsp->vfs_count = 0; 498 vfsp->vfs_next = vfsp; 499 vfsp->vfs_prev = vfsp; 500 vfsp->vfs_zone_next = vfsp; 501 vfsp->vfs_zone_prev = vfsp; 502 vfsp->vfs_flag = 0; 503 vfsp->vfs_data = (data); 504 vfsp->vfs_resource = NULL; 505 vfsp->vfs_mntpt = NULL; 506 vfsp->vfs_mntopts.mo_count = 0; 507 vfsp->vfs_mntopts.mo_list = NULL; 508 vfsp->vfs_implp = NULL; 509 vfsp->vfs_zone = NULL; 510 /* 511 * Note: Don't initialize any member of the vfs_impl_t structure 512 * here as it could be a problem for unbundled file systems. 513 */ 514 vfs_setops((vfsp), (op)); 515 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL); 516 } 517 518 /* 519 * Allocate and initialize the vfs implementation private data 520 * structure, vfs_impl_t. 521 */ 522 void 523 vfsimpl_setup(vfs_t *vfsp) 524 { 525 vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP); 526 /* Note that this are #define'd in vfs.h */ 527 vfsp->vfs_femhead = NULL; 528 vfsp->vfs_vskap = NULL; 529 vfsp->vfs_fstypevsp = NULL; 530 } 531 532 /* 533 * Release the vfs_impl_t structure, if it exists. Some unbundled 534 * filesystems may not use the newer version of vfs and thus 535 * would not contain this implementation private data structure. 536 */ 537 void 538 vfsimpl_teardown(vfs_t *vfsp) 539 { 540 vfs_impl_t *vip = vfsp->vfs_implp; 541 542 if (vip == NULL) 543 return; 544 545 if (vip->vi_femhead) { 546 ASSERT(vip->vi_femhead->femh_list == NULL); 547 mutex_destroy(&vip->vi_femhead->femh_lock); 548 kmem_free(vip->vi_femhead, sizeof (*(vip->vi_femhead))); 549 vip->vi_femhead = NULL; 550 } 551 552 kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t)); 553 vfsp->vfs_implp = NULL; 554 } 555 556 /* 557 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs, 558 * fstatvfs, and sysfs moved to common/syscall. 559 */ 560 561 /* 562 * Update every mounted file system. 
We call the vfs_sync operation of 563 * each file system type, passing it a NULL vfsp to indicate that all 564 * mounted file systems of that type should be updated. 565 */ 566 void 567 vfs_sync(int flag) 568 { 569 struct vfssw *vswp; 570 RLOCK_VFSSW(); 571 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 572 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 573 vfs_refvfssw(vswp); 574 RUNLOCK_VFSSW(); 575 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag, 576 CRED()); 577 vfs_unrefvfssw(vswp); 578 RLOCK_VFSSW(); 579 } 580 } 581 RUNLOCK_VFSSW(); 582 } 583 584 void 585 sync(void) 586 { 587 vfs_sync(0); 588 } 589 590 /* 591 * External routines. 592 */ 593 594 krwlock_t vfssw_lock; /* lock accesses to vfssw */ 595 596 /* 597 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(), 598 * but otherwise should be accessed only via vfs_list_lock() and 599 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list. 600 */ 601 static krwlock_t vfslist; 602 603 /* 604 * Mount devfs on /devices. 
This is done right after root is mounted 605 * to provide device access support for the system 606 */ 607 static void 608 vfs_mountdevices(void) 609 { 610 struct vfssw *vsw; 611 struct vnode *mvp; 612 struct mounta mounta = { /* fake mounta for devfs_mount() */ 613 NULL, 614 NULL, 615 MS_SYSSPACE, 616 NULL, 617 NULL, 618 0, 619 NULL, 620 0 621 }; 622 623 /* 624 * _init devfs module to fill in the vfssw 625 */ 626 if (modload("fs", "devfs") == -1) 627 panic("Cannot _init devfs module"); 628 629 /* 630 * Hold vfs 631 */ 632 RLOCK_VFSSW(); 633 vsw = vfs_getvfsswbyname("devfs"); 634 VFS_INIT(&devices, &vsw->vsw_vfsops, NULL); 635 VFS_HOLD(&devices); 636 637 /* 638 * Locate mount point 639 */ 640 if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 641 panic("Cannot find /devices"); 642 643 /* 644 * Perform the mount of /devices 645 */ 646 if (VFS_MOUNT(&devices, mvp, &mounta, CRED())) 647 panic("Cannot mount /devices"); 648 649 RUNLOCK_VFSSW(); 650 651 /* 652 * Set appropriate members and add to vfs list for mnttab display 653 */ 654 vfs_setresource(&devices, "/devices"); 655 vfs_setmntpoint(&devices, "/devices"); 656 657 /* 658 * Hold the root of /devices so it won't go away 659 */ 660 if (VFS_ROOT(&devices, &devicesdir)) 661 panic("vfs_mountdevices: not devices root"); 662 663 if (vfs_lock(&devices) != 0) { 664 VN_RELE(devicesdir); 665 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices"); 666 return; 667 } 668 669 if (vn_vfswlock(mvp) != 0) { 670 vfs_unlock(&devices); 671 VN_RELE(devicesdir); 672 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices"); 673 return; 674 } 675 676 vfs_add(mvp, &devices, 0); 677 vn_vfsunlock(mvp); 678 vfs_unlock(&devices); 679 VN_RELE(devicesdir); 680 } 681 682 /* 683 * mount the first instance of /dev to root and remain mounted 684 */ 685 static void 686 vfs_mountdev1(void) 687 { 688 struct vfssw *vsw; 689 struct vnode *mvp; 690 struct mounta mounta = { /* fake mounta for sdev_mount() */ 691 NULL, 692 NULL, 693 
MS_SYSSPACE | MS_OVERLAY, 694 NULL, 695 NULL, 696 0, 697 NULL, 698 0 699 }; 700 701 /* 702 * _init dev module to fill in the vfssw 703 */ 704 if (modload("fs", "dev") == -1) 705 cmn_err(CE_PANIC, "Cannot _init dev module\n"); 706 707 /* 708 * Hold vfs 709 */ 710 RLOCK_VFSSW(); 711 vsw = vfs_getvfsswbyname("dev"); 712 VFS_INIT(&dev, &vsw->vsw_vfsops, NULL); 713 VFS_HOLD(&dev); 714 715 /* 716 * Locate mount point 717 */ 718 if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 719 cmn_err(CE_PANIC, "Cannot find /dev\n"); 720 721 /* 722 * Perform the mount of /dev 723 */ 724 if (VFS_MOUNT(&dev, mvp, &mounta, CRED())) 725 cmn_err(CE_PANIC, "Cannot mount /dev 1\n"); 726 727 RUNLOCK_VFSSW(); 728 729 /* 730 * Set appropriate members and add to vfs list for mnttab display 731 */ 732 vfs_setresource(&dev, "/dev"); 733 vfs_setmntpoint(&dev, "/dev"); 734 735 /* 736 * Hold the root of /dev so it won't go away 737 */ 738 if (VFS_ROOT(&dev, &devdir)) 739 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root"); 740 741 if (vfs_lock(&dev) != 0) { 742 VN_RELE(devdir); 743 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev"); 744 return; 745 } 746 747 if (vn_vfswlock(mvp) != 0) { 748 vfs_unlock(&dev); 749 VN_RELE(devdir); 750 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev"); 751 return; 752 } 753 754 vfs_add(mvp, &dev, 0); 755 vn_vfsunlock(mvp); 756 vfs_unlock(&dev); 757 VN_RELE(devdir); 758 } 759 760 /* 761 * Mount required filesystem. This is done right after root is mounted. 
762 */ 763 static void 764 vfs_mountfs(char *module, char *spec, char *path) 765 { 766 struct vnode *mvp; 767 struct mounta mounta; 768 vfs_t *vfsp; 769 770 mounta.flags = MS_SYSSPACE | MS_DATA; 771 mounta.fstype = module; 772 mounta.spec = spec; 773 mounta.dir = path; 774 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) { 775 cmn_err(CE_WARN, "Cannot find %s", path); 776 return; 777 } 778 if (domount(NULL, &mounta, mvp, CRED(), &vfsp)) 779 cmn_err(CE_WARN, "Cannot mount %s", path); 780 else 781 VFS_RELE(vfsp); 782 VN_RELE(mvp); 783 } 784 785 /* 786 * vfs_mountroot is called by main() to mount the root filesystem. 787 */ 788 void 789 vfs_mountroot(void) 790 { 791 struct vnode *rvp = NULL; 792 char *path; 793 size_t plen; 794 struct vfssw *vswp; 795 796 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL); 797 rw_init(&vfslist, NULL, RW_DEFAULT, NULL); 798 799 /* 800 * Alloc the vfs hash bucket array and locks 801 */ 802 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP); 803 804 /* 805 * Call machine-dependent routine "rootconf" to choose a root 806 * file system type. 807 */ 808 if (rootconf()) 809 panic("vfs_mountroot: cannot mount root"); 810 /* 811 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir 812 * to point to it. These are used by lookuppn() so that it 813 * knows where to start from ('/' or '.'). 814 */ 815 vfs_setmntpoint(rootvfs, "/"); 816 if (VFS_ROOT(rootvfs, &rootdir)) 817 panic("vfs_mountroot: no root vnode"); 818 PTOU(curproc)->u_cdir = rootdir; 819 VN_HOLD(PTOU(curproc)->u_cdir); 820 PTOU(curproc)->u_rdir = NULL; 821 822 /* 823 * Setup the global zone's rootvp, now that it exists. 824 */ 825 global_zone->zone_rootvp = rootdir; 826 VN_HOLD(global_zone->zone_rootvp); 827 828 /* 829 * Notify the module code that it can begin using the 830 * root filesystem instead of the boot program's services. 
831 */ 832 modrootloaded = 1; 833 /* 834 * Set up mnttab information for root 835 */ 836 vfs_setresource(rootvfs, rootfs.bo_name); 837 838 /* 839 * Notify cluster software that the root filesystem is available. 840 */ 841 clboot_mountroot(); 842 843 /* Now that we're all done with the root FS, set up its vopstats */ 844 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) { 845 /* Set flag for statistics collection */ 846 if (vswp->vsw_flag & VSW_STATS) { 847 initialize_vopstats(&rootvfs->vfs_vopstats); 848 rootvfs->vfs_flag |= VFS_STATS; 849 rootvfs->vfs_fstypevsp = 850 get_fstype_vopstats(rootvfs, vswp); 851 rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs); 852 } 853 vfs_unrefvfssw(vswp); 854 } 855 856 /* 857 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab, 858 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc. 859 */ 860 vfs_mountdevices(); 861 vfs_mountdev1(); 862 863 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT); 864 vfs_mountfs("proc", "/proc", "/proc"); 865 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); 866 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); 867 vfs_mountfs("objfs", "objfs", OBJFS_ROOT); 868 869 if (getzoneid() == GLOBAL_ZONEID) { 870 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab"); 871 } 872 873 #ifdef __sparc 874 /* 875 * This bit of magic can go away when we convert sparc to 876 * the new boot architecture based on ramdisk. 877 * 878 * Booting off a mirrored root volume: 879 * At this point, we have booted and mounted root on a 880 * single component of the mirror. Complete the boot 881 * by configuring SVM and converting the root to the 882 * dev_t of the mirrored root device. This dev_t conversion 883 * only works because the underlying device doesn't change. 
884 */ 885 if (root_is_svm) { 886 if (svm_rootconf()) { 887 panic("vfs_mountroot: cannot remount root"); 888 } 889 890 /* 891 * mnttab should reflect the new root device 892 */ 893 vfs_lock_wait(rootvfs); 894 vfs_setresource(rootvfs, rootfs.bo_name); 895 vfs_unlock(rootvfs); 896 } 897 #endif /* __sparc */ 898 899 /* 900 * Look up the root device via devfs so that a dv_node is 901 * created for it. The vnode is never VN_RELE()ed. 902 * We allocate more than MAXPATHLEN so that the 903 * buffer passed to i_ddi_prompath_to_devfspath() is 904 * exactly MAXPATHLEN (the function expects a buffer 905 * of that length). 906 */ 907 plen = strlen("/devices"); 908 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); 909 (void) strcpy(path, "/devices"); 910 911 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) 912 != DDI_SUCCESS || 913 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { 914 915 /* NUL terminate in case "path" has garbage */ 916 path[plen + MAXPATHLEN - 1] = '\0'; 917 #ifdef DEBUG 918 cmn_err(CE_WARN, "!Cannot lookup root device: %s", path); 919 #endif 920 } 921 kmem_free(path, plen + MAXPATHLEN); 922 vfs_mnttabvp_setup(); 923 } 924 925 /* 926 * If remount failed and we're in a zone we need to check for the zone 927 * root path and strip it before the call to vfs_setpath(). 928 * 929 * If strpath doesn't begin with the zone_rootpath the original 930 * strpath is returned unchanged. 931 */ 932 static const char * 933 stripzonepath(const char *strpath) 934 { 935 char *str1, *str2; 936 int i; 937 zone_t *zonep = curproc->p_zone; 938 939 if (zonep->zone_rootpath == NULL || strpath == NULL) { 940 return (NULL); 941 } 942 943 /* 944 * we check for the end of the string at one past the 945 * current position because the zone_rootpath always 946 * ends with "/" but we don't want to strip that off. 
947 */ 948 str1 = zonep->zone_rootpath; 949 str2 = (char *)strpath; 950 ASSERT(str1[0] != '\0'); 951 for (i = 0; str1[i + 1] != '\0'; i++) { 952 if (str1[i] != str2[i]) 953 return ((char *)strpath); 954 } 955 return (&str2[i]); 956 } 957 958 /* 959 * Common mount code. Called from the system call entry point, from autofs, 960 * and from pxfs. 961 * 962 * Takes the effective file system type, mount arguments, the mount point 963 * vnode, flags specifying whether the mount is a remount and whether it 964 * should be entered into the vfs list, and credentials. Fills in its vfspp 965 * parameter with the mounted file system instance's vfs. 966 * 967 * Note that the effective file system type is specified as a string. It may 968 * be null, in which case it's determined from the mount arguments, and may 969 * differ from the type specified in the mount arguments; this is a hook to 970 * allow interposition when instantiating file system instances. 971 * 972 * The caller is responsible for releasing its own hold on the mount point 973 * vp (this routine does its own hold when necessary). 974 * Also note that for remounts, the mount point vp should be the vnode for 975 * the root of the file system rather than the vnode that the file system 976 * is mounted on top of. 977 */ 978 int 979 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, 980 struct vfs **vfspp) 981 { 982 struct vfssw *vswp; 983 vfsops_t *vfsops; 984 struct vfs *vfsp; 985 struct vnode *bvp; 986 dev_t bdev = 0; 987 mntopts_t mnt_mntopts; 988 int error = 0; 989 int copyout_error = 0; 990 int ovflags; 991 char *opts = uap->optptr; 992 char *inargs = opts; 993 int optlen = uap->optlen; 994 int remount; 995 int rdonly; 996 int nbmand = 0; 997 int delmip = 0; 998 int addmip = 0; 999 int splice = ((uap->flags & MS_NOSPLICE) == 0); 1000 int fromspace = (uap->flags & MS_SYSSPACE) ? 
1001 UIO_SYSSPACE : UIO_USERSPACE; 1002 char *resource = NULL, *mountpt = NULL; 1003 refstr_t *oldresource, *oldmntpt; 1004 struct pathname pn, rpn; 1005 vsk_anchor_t *vskap; 1006 1007 /* 1008 * The v_flag value for the mount point vp is permanently set 1009 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine 1010 * for mount point locking. 1011 */ 1012 mutex_enter(&vp->v_lock); 1013 vp->v_flag |= VVFSLOCK; 1014 mutex_exit(&vp->v_lock); 1015 1016 mnt_mntopts.mo_count = 0; 1017 /* 1018 * Find the ops vector to use to invoke the file system-specific mount 1019 * method. If the fsname argument is non-NULL, use it directly. 1020 * Otherwise, dig the file system type information out of the mount 1021 * arguments. 1022 * 1023 * A side effect is to hold the vfssw entry. 1024 * 1025 * Mount arguments can be specified in several ways, which are 1026 * distinguished by flag bit settings. The preferred way is to set 1027 * MS_OPTIONSTR, indicating an 8 argument mount with the file system 1028 * type supplied as a character string and the last two arguments 1029 * being a pointer to a character buffer and the size of the buffer. 1030 * On entry, the buffer holds a null terminated list of options; on 1031 * return, the string is the list of options the file system 1032 * recognized. If MS_DATA is set arguments five and six point to a 1033 * block of binary data which the file system interprets. 1034 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA 1035 * consistently with these conventions. To handle them, we check to 1036 * see whether the pointer to the file system name has a numeric value 1037 * less than 256. If so, we treat it as an index. 
1038 */ 1039 if (fsname != NULL) { 1040 if ((vswp = vfs_getvfssw(fsname)) == NULL) { 1041 return (EINVAL); 1042 } 1043 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) { 1044 size_t n; 1045 uint_t fstype; 1046 char name[FSTYPSZ]; 1047 1048 if ((fstype = (uintptr_t)uap->fstype) < 256) { 1049 RLOCK_VFSSW(); 1050 if (fstype == 0 || fstype >= nfstype || 1051 !ALLOCATED_VFSSW(&vfssw[fstype])) { 1052 RUNLOCK_VFSSW(); 1053 return (EINVAL); 1054 } 1055 (void) strcpy(name, vfssw[fstype].vsw_name); 1056 RUNLOCK_VFSSW(); 1057 if ((vswp = vfs_getvfssw(name)) == NULL) 1058 return (EINVAL); 1059 } else { 1060 /* 1061 * Handle either kernel or user address space. 1062 */ 1063 if (uap->flags & MS_SYSSPACE) { 1064 error = copystr(uap->fstype, name, 1065 FSTYPSZ, &n); 1066 } else { 1067 error = copyinstr(uap->fstype, name, 1068 FSTYPSZ, &n); 1069 } 1070 if (error) { 1071 if (error == ENAMETOOLONG) 1072 return (EINVAL); 1073 return (error); 1074 } 1075 if ((vswp = vfs_getvfssw(name)) == NULL) 1076 return (EINVAL); 1077 } 1078 } else { 1079 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL) 1080 return (EINVAL); 1081 } 1082 if (!VFS_INSTALLED(vswp)) 1083 return (EINVAL); 1084 vfsops = &vswp->vsw_vfsops; 1085 1086 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts); 1087 /* 1088 * Fetch mount options and parse them for generic vfs options 1089 */ 1090 if (uap->flags & MS_OPTIONSTR) { 1091 /* 1092 * Limit the buffer size 1093 */ 1094 if (optlen < 0 || optlen > MAX_MNTOPT_STR) { 1095 error = EINVAL; 1096 goto errout; 1097 } 1098 if ((uap->flags & MS_SYSSPACE) == 0) { 1099 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 1100 inargs[0] = '\0'; 1101 if (optlen) { 1102 error = copyinstr(opts, inargs, (size_t)optlen, 1103 NULL); 1104 if (error) { 1105 goto errout; 1106 } 1107 } 1108 } 1109 vfs_parsemntopts(&mnt_mntopts, inargs, 0); 1110 } 1111 /* 1112 * Flag bits override the options string. 
1113 */ 1114 if (uap->flags & MS_REMOUNT) 1115 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0); 1116 if (uap->flags & MS_RDONLY) 1117 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0); 1118 if (uap->flags & MS_NOSUID) 1119 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 1120 1121 /* 1122 * Check if this is a remount; must be set in the option string and 1123 * the file system must support a remount option. 1124 */ 1125 if (remount = vfs_optionisset_nolock(&mnt_mntopts, 1126 MNTOPT_REMOUNT, NULL)) { 1127 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) { 1128 error = ENOTSUP; 1129 goto errout; 1130 } 1131 uap->flags |= MS_REMOUNT; 1132 } 1133 1134 /* 1135 * uap->flags and vfs_optionisset() should agree. 1136 */ 1137 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) { 1138 uap->flags |= MS_RDONLY; 1139 } 1140 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) { 1141 uap->flags |= MS_NOSUID; 1142 } 1143 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL); 1144 ASSERT(splice || !remount); 1145 /* 1146 * If we are splicing the fs into the namespace, 1147 * perform mount point checks. 1148 * 1149 * We want to resolve the path for the mount point to eliminate 1150 * '.' and ".." and symlinks in mount points; we can't do the 1151 * same for the resource string, since it would turn 1152 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do 1153 * this before grabbing vn_vfswlock(), because otherwise we 1154 * would deadlock with lookuppn(). 1155 */ 1156 if (splice) { 1157 ASSERT(vp->v_count > 0); 1158 1159 /* 1160 * Pick up mount point and device from appropriate space. 1161 */ 1162 if (pn_get(uap->spec, fromspace, &pn) == 0) { 1163 resource = kmem_alloc(pn.pn_pathlen + 1, 1164 KM_SLEEP); 1165 (void) strcpy(resource, pn.pn_path); 1166 pn_free(&pn); 1167 } 1168 /* 1169 * Do a lookupname prior to taking the 1170 * writelock. 
Mark this as completed if 1171 * successful for later cleanup and addition to 1172 * the mount in progress table. 1173 */ 1174 if ((uap->flags & MS_GLOBAL) == 0 && 1175 lookupname(uap->spec, fromspace, 1176 FOLLOW, NULL, &bvp) == 0) { 1177 addmip = 1; 1178 } 1179 1180 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) { 1181 pathname_t *pnp; 1182 1183 if (*pn.pn_path != '/') { 1184 error = EINVAL; 1185 pn_free(&pn); 1186 goto errout; 1187 } 1188 pn_alloc(&rpn); 1189 /* 1190 * Kludge to prevent autofs from deadlocking with 1191 * itself when it calls domount(). 1192 * 1193 * If autofs is calling, it is because it is doing 1194 * (autofs) mounts in the process of an NFS mount. A 1195 * lookuppn() here would cause us to block waiting for 1196 * said NFS mount to complete, which can't since this 1197 * is the thread that was supposed to doing it. 1198 */ 1199 if (fromspace == UIO_USERSPACE) { 1200 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL, 1201 NULL)) == 0) { 1202 pnp = &rpn; 1203 } else { 1204 /* 1205 * The file disappeared or otherwise 1206 * became inaccessible since we opened 1207 * it; might as well fail the mount 1208 * since the mount point is no longer 1209 * accessible. 1210 */ 1211 pn_free(&rpn); 1212 pn_free(&pn); 1213 goto errout; 1214 } 1215 } else { 1216 pnp = &pn; 1217 } 1218 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP); 1219 (void) strcpy(mountpt, pnp->pn_path); 1220 1221 /* 1222 * If the addition of the zone's rootpath 1223 * would push us over a total path length 1224 * of MAXPATHLEN, we fail the mount with 1225 * ENAMETOOLONG, which is what we would have 1226 * gotten if we were trying to perform the same 1227 * mount in the global zone. 1228 * 1229 * strlen() doesn't count the trailing 1230 * '\0', but zone_rootpathlen counts both a 1231 * trailing '/' and the terminating '\0'. 
1232 */ 1233 if ((curproc->p_zone->zone_rootpathlen - 1 + 1234 strlen(mountpt)) > MAXPATHLEN || 1235 (resource != NULL && 1236 (curproc->p_zone->zone_rootpathlen - 1 + 1237 strlen(resource)) > MAXPATHLEN)) { 1238 error = ENAMETOOLONG; 1239 } 1240 1241 pn_free(&rpn); 1242 pn_free(&pn); 1243 } 1244 1245 if (error) 1246 goto errout; 1247 1248 /* 1249 * Prevent path name resolution from proceeding past 1250 * the mount point. 1251 */ 1252 if (vn_vfswlock(vp) != 0) { 1253 error = EBUSY; 1254 goto errout; 1255 } 1256 1257 /* 1258 * Verify that it's legitimate to establish a mount on 1259 * the prospective mount point. 1260 */ 1261 if (vn_mountedvfs(vp) != NULL) { 1262 /* 1263 * The mount point lock was obtained after some 1264 * other thread raced through and established a mount. 1265 */ 1266 vn_vfsunlock(vp); 1267 error = EBUSY; 1268 goto errout; 1269 } 1270 if (vp->v_flag & VNOMOUNT) { 1271 vn_vfsunlock(vp); 1272 error = EINVAL; 1273 goto errout; 1274 } 1275 } 1276 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) { 1277 uap->dataptr = NULL; 1278 uap->datalen = 0; 1279 } 1280 1281 /* 1282 * If this is a remount, we don't want to create a new VFS. 1283 * Instead, we pass the existing one with a remount flag. 1284 */ 1285 if (remount) { 1286 /* 1287 * Confirm that the mount point is the root vnode of the 1288 * file system that is being remounted. 1289 * This can happen if the user specifies a different 1290 * mount point directory pathname in the (re)mount command. 1291 * 1292 * Code below can only be reached if splice is true, so it's 1293 * safe to do vn_vfsunlock() here. 1294 */ 1295 if ((vp->v_flag & VROOT) == 0) { 1296 vn_vfsunlock(vp); 1297 error = ENOENT; 1298 goto errout; 1299 } 1300 /* 1301 * Disallow making file systems read-only unless file system 1302 * explicitly allows it in its vfssw. Ignore other flags. 
1303 */ 1304 if (rdonly && vn_is_readonly(vp) == 0 && 1305 (vswp->vsw_flag & VSW_CANRWRO) == 0) { 1306 vn_vfsunlock(vp); 1307 error = EINVAL; 1308 goto errout; 1309 } 1310 /* 1311 * Changing the NBMAND setting on remounts is permitted 1312 * but logged since it can lead to unexpected behavior. 1313 * We also counsel against using it for / and /usr. 1314 */ 1315 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) || 1316 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) { 1317 cmn_err(CE_WARN, "domount: nbmand turned %s via " 1318 "remounting %s", nbmand ? "on" : "off", 1319 refstr_value(vp->v_vfsp->vfs_mntpt)); 1320 } 1321 vfsp = vp->v_vfsp; 1322 ovflags = vfsp->vfs_flag; 1323 vfsp->vfs_flag |= VFS_REMOUNT; 1324 vfsp->vfs_flag &= ~VFS_RDONLY; 1325 } else { 1326 vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP); 1327 VFS_INIT(vfsp, vfsops, NULL); 1328 } 1329 1330 VFS_HOLD(vfsp); 1331 1332 /* 1333 * The vfs_reflock is not used anymore the code below explicitly 1334 * holds it preventing others accesing it directly. 1335 */ 1336 if ((sema_tryp(&vfsp->vfs_reflock) == 0) && 1337 !(vfsp->vfs_flag & VFS_REMOUNT)) 1338 cmn_err(CE_WARN, 1339 "mount type %s couldn't get vfs_reflock", vswp->vsw_name); 1340 1341 /* 1342 * Lock the vfs. If this is a remount we want to avoid spurious umount 1343 * failures that happen as a side-effect of fsflush() and other mount 1344 * and unmount operations that might be going on simultaneously and 1345 * may have locked the vfs currently. To not return EBUSY immediately 1346 * here we use vfs_lock_wait() instead vfs_lock() for the remount case. 
1347 */ 1348 if (!remount) { 1349 if (error = vfs_lock(vfsp)) { 1350 vfsp->vfs_flag = ovflags; 1351 if (splice) 1352 vn_vfsunlock(vp); 1353 if (vfsp->vfs_implp) 1354 vfsimpl_teardown(vfsp); 1355 kmem_free(vfsp, sizeof (struct vfs)); 1356 goto errout; 1357 } 1358 } else { 1359 vfs_lock_wait(vfsp); 1360 } 1361 1362 /* 1363 * Add device to mount in progress table, global mounts require special 1364 * handling. It is possible that we have already done the lookupname 1365 * on a spliced, non-global fs. If so, we don't want to do it again 1366 * since we cannot do a lookupname after taking the 1367 * wlock above. This case is for a non-spliced, non-global filesystem. 1368 */ 1369 if (!addmip) { 1370 if ((uap->flags & MS_GLOBAL) == 0 && 1371 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) { 1372 addmip = 1; 1373 } 1374 } 1375 1376 if (addmip) { 1377 bdev = bvp->v_rdev; 1378 VN_RELE(bvp); 1379 vfs_addmip(bdev, vfsp); 1380 addmip = 0; 1381 delmip = 1; 1382 } 1383 /* 1384 * Invalidate cached entry for the mount point. 1385 */ 1386 if (splice) 1387 dnlc_purge_vp(vp); 1388 1389 /* 1390 * If have an option string but the filesystem doesn't supply a 1391 * prototype options table, create a table with the global 1392 * options and sufficient room to accept all the options in the 1393 * string. Then parse the passed in option string 1394 * accepting all the options in the string. This gives us an 1395 * option table with all the proper cancel properties for the 1396 * global options. 1397 * 1398 * Filesystems that supply a prototype options table are handled 1399 * earlier in this function. 
1400 */ 1401 if (uap->flags & MS_OPTIONSTR) { 1402 if (!(vswp->vsw_flag & VSW_HASPROTO)) { 1403 mntopts_t tmp_mntopts; 1404 1405 tmp_mntopts.mo_count = 0; 1406 vfs_createopttbl_extend(&tmp_mntopts, inargs, 1407 &mnt_mntopts); 1408 vfs_parsemntopts(&tmp_mntopts, inargs, 1); 1409 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts); 1410 vfs_freeopttbl(&tmp_mntopts); 1411 } 1412 } 1413 1414 /* 1415 * Serialize with zone creations. 1416 */ 1417 mount_in_progress(); 1418 /* 1419 * Instantiate (or reinstantiate) the file system. If appropriate, 1420 * splice it into the file system name space. 1421 * 1422 * We want VFS_MOUNT() to be able to override the vfs_resource 1423 * string if necessary (ie, mntfs), and also for a remount to 1424 * change the same (necessary when remounting '/' during boot). 1425 * So we set up vfs_mntpt and vfs_resource to what we think they 1426 * should be, then hand off control to VFS_MOUNT() which can 1427 * override this. 1428 * 1429 * For safety's sake, when changing vfs_resource or vfs_mntpt of 1430 * a vfs which is on the vfs list (i.e. during a remount), we must 1431 * never set those fields to NULL. Several bits of code make 1432 * assumptions that the fields are always valid. 1433 */ 1434 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1435 if (remount) { 1436 if ((oldresource = vfsp->vfs_resource) != NULL) 1437 refstr_hold(oldresource); 1438 if ((oldmntpt = vfsp->vfs_mntpt) != NULL) 1439 refstr_hold(oldmntpt); 1440 } 1441 vfs_setresource(vfsp, resource); 1442 vfs_setmntpoint(vfsp, mountpt); 1443 1444 /* 1445 * going to mount on this vnode, so notify. 
1446 */ 1447 vnevent_mountedover(vp); 1448 error = VFS_MOUNT(vfsp, vp, uap, credp); 1449 1450 if (uap->flags & MS_RDONLY) 1451 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1452 if (uap->flags & MS_NOSUID) 1453 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0); 1454 if (uap->flags & MS_GLOBAL) 1455 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0); 1456 1457 if (error) { 1458 if (remount) { 1459 /* put back pre-remount options */ 1460 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1461 vfs_setmntpoint(vfsp, (stripzonepath( 1462 refstr_value(oldmntpt)))); 1463 if (oldmntpt) 1464 refstr_rele(oldmntpt); 1465 vfs_setresource(vfsp, (stripzonepath( 1466 refstr_value(oldresource)))); 1467 if (oldresource) 1468 refstr_rele(oldresource); 1469 vfsp->vfs_flag = ovflags; 1470 vfs_unlock(vfsp); 1471 VFS_RELE(vfsp); 1472 } else { 1473 vfs_unlock(vfsp); 1474 vfs_freemnttab(vfsp); 1475 if (vfsp->vfs_implp) 1476 vfsimpl_teardown(vfsp); 1477 kmem_free(vfsp, sizeof (struct vfs)); 1478 } 1479 } else { 1480 /* 1481 * Set the mount time to now 1482 */ 1483 vfsp->vfs_mtime = ddi_get_time(); 1484 if (remount) { 1485 vfsp->vfs_flag &= ~VFS_REMOUNT; 1486 if (oldresource) 1487 refstr_rele(oldresource); 1488 if (oldmntpt) 1489 refstr_rele(oldmntpt); 1490 } else if (splice) { 1491 /* 1492 * Link vfsp into the name space at the mount 1493 * point. Vfs_add() is responsible for 1494 * holding the mount point which will be 1495 * released when vfs_remove() is called. 1496 */ 1497 vfs_add(vp, vfsp, uap->flags); 1498 } else { 1499 /* 1500 * Hold the reference to file system which is 1501 * not linked into the name space. 
1502 */ 1503 vfsp->vfs_zone = NULL; 1504 VFS_HOLD(vfsp); 1505 vfsp->vfs_vnodecovered = NULL; 1506 } 1507 /* 1508 * Set flags for global options encountered 1509 */ 1510 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 1511 vfsp->vfs_flag |= VFS_RDONLY; 1512 else 1513 vfsp->vfs_flag &= ~VFS_RDONLY; 1514 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 1515 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES); 1516 } else { 1517 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 1518 vfsp->vfs_flag |= VFS_NODEVICES; 1519 else 1520 vfsp->vfs_flag &= ~VFS_NODEVICES; 1521 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 1522 vfsp->vfs_flag |= VFS_NOSETUID; 1523 else 1524 vfsp->vfs_flag &= ~VFS_NOSETUID; 1525 } 1526 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 1527 vfsp->vfs_flag |= VFS_NBMAND; 1528 else 1529 vfsp->vfs_flag &= ~VFS_NBMAND; 1530 1531 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 1532 vfsp->vfs_flag |= VFS_XATTR; 1533 else 1534 vfsp->vfs_flag &= ~VFS_XATTR; 1535 1536 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 1537 vfsp->vfs_flag |= VFS_NOEXEC; 1538 else 1539 vfsp->vfs_flag &= ~VFS_NOEXEC; 1540 1541 /* 1542 * Now construct the output option string of options 1543 * we recognized. 1544 */ 1545 if (uap->flags & MS_OPTIONSTR) { 1546 vfs_list_read_lock(); 1547 copyout_error = vfs_buildoptionstr( 1548 &vfsp->vfs_mntopts, inargs, optlen); 1549 vfs_list_unlock(); 1550 if (copyout_error == 0 && 1551 (uap->flags & MS_SYSSPACE) == 0) { 1552 copyout_error = copyoutstr(inargs, opts, 1553 optlen, NULL); 1554 } 1555 } 1556 1557 /* 1558 * If this isn't a remount, set up the vopstats before 1559 * anyone can touch this. We only allow spliced file 1560 * systems (file systems which are in the namespace) to 1561 * have the VFS_STATS flag set. 1562 * NOTE: PxFS mounts the underlying file system with 1563 * MS_NOSPLICE set and copies those vfs_flags to its private 1564 * vfs structure. 
As a result, PxFS should never have 1565 * the VFS_STATS flag or else we might access the vfs 1566 * statistics-related fields prior to them being 1567 * properly initialized. 1568 */ 1569 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) { 1570 initialize_vopstats(&vfsp->vfs_vopstats); 1571 /* 1572 * We need to set vfs_vskap to NULL because there's 1573 * a chance it won't be set below. This is checked 1574 * in teardown_vopstats() so we can't have garbage. 1575 */ 1576 vfsp->vfs_vskap = NULL; 1577 vfsp->vfs_flag |= VFS_STATS; 1578 vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp); 1579 } 1580 1581 if (vswp->vsw_flag & VSW_XID) 1582 vfsp->vfs_flag |= VFS_XID; 1583 1584 vfs_unlock(vfsp); 1585 } 1586 mount_completed(); 1587 if (splice) 1588 vn_vfsunlock(vp); 1589 1590 if ((error == 0) && (copyout_error == 0)) { 1591 if (!remount) { 1592 /* 1593 * Don't call get_vskstat_anchor() while holding 1594 * locks since it allocates memory and calls 1595 * VFS_STATVFS(). For NFS, the latter can generate 1596 * an over-the-wire call. 1597 */ 1598 vskap = get_vskstat_anchor(vfsp); 1599 /* Only take the lock if we have something to do */ 1600 if (vskap != NULL) { 1601 vfs_lock_wait(vfsp); 1602 if (vfsp->vfs_flag & VFS_STATS) { 1603 vfsp->vfs_vskap = vskap; 1604 } 1605 vfs_unlock(vfsp); 1606 } 1607 } 1608 /* Return vfsp to caller. */ 1609 *vfspp = vfsp; 1610 } 1611 errout: 1612 vfs_freeopttbl(&mnt_mntopts); 1613 if (resource != NULL) 1614 kmem_free(resource, strlen(resource) + 1); 1615 if (mountpt != NULL) 1616 kmem_free(mountpt, strlen(mountpt) + 1); 1617 /* 1618 * It is possible we errored prior to adding to mount in progress 1619 * table. Must free vnode we acquired with successful lookupname. 
1620 */ 1621 if (addmip) 1622 VN_RELE(bvp); 1623 if (delmip) 1624 vfs_delmip(vfsp); 1625 ASSERT(vswp != NULL); 1626 vfs_unrefvfssw(vswp); 1627 if (inargs != opts) 1628 kmem_free(inargs, MAX_MNTOPT_STR); 1629 if (copyout_error) { 1630 VFS_RELE(vfsp); 1631 error = copyout_error; 1632 } 1633 return (error); 1634 } 1635 1636 static void 1637 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath) 1638 { 1639 size_t len; 1640 refstr_t *ref; 1641 zone_t *zone = curproc->p_zone; 1642 char *sp; 1643 int have_list_lock = 0; 1644 1645 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp)); 1646 1647 /* 1648 * New path must be less than MAXPATHLEN because mntfs 1649 * will only display up to MAXPATHLEN bytes. This is currently 1650 * safe, because domount() uses pn_get(), and other callers 1651 * similarly cap the size to fewer than MAXPATHLEN bytes. 1652 */ 1653 1654 ASSERT(strlen(newpath) < MAXPATHLEN); 1655 1656 /* mntfs requires consistency while vfs list lock is held */ 1657 1658 if (VFS_ON_LIST(vfsp)) { 1659 have_list_lock = 1; 1660 vfs_list_lock(); 1661 } 1662 1663 if (*refp != NULL) 1664 refstr_rele(*refp); 1665 1666 /* Do we need to modify the path? */ 1667 1668 if (zone == global_zone || *newpath != '/') { 1669 ref = refstr_alloc(newpath); 1670 goto out; 1671 } 1672 1673 /* 1674 * Truncate the trailing '/' in the zoneroot, and merge 1675 * in the zone's rootpath with the "newpath" (resource 1676 * or mountpoint) passed in. 1677 * 1678 * The size of the required buffer is thus the size of 1679 * the buffer required for the passed-in newpath 1680 * (strlen(newpath) + 1), plus the size of the buffer 1681 * required to hold zone_rootpath (zone_rootpathlen) 1682 * minus one for one of the now-superfluous NUL 1683 * terminations, minus one for the trailing '/'. 1684 * 1685 * That gives us: 1686 * 1687 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1 1688 * 1689 * Which is what we have below. 
1690 */ 1691 1692 len = strlen(newpath) + zone->zone_rootpathlen - 1; 1693 sp = kmem_alloc(len, KM_SLEEP); 1694 1695 /* 1696 * Copy everything including the trailing slash, which 1697 * we then overwrite with the NUL character. 1698 */ 1699 1700 (void) strcpy(sp, zone->zone_rootpath); 1701 sp[zone->zone_rootpathlen - 2] = '\0'; 1702 (void) strcat(sp, newpath); 1703 1704 ref = refstr_alloc(sp); 1705 kmem_free(sp, len); 1706 out: 1707 *refp = ref; 1708 1709 if (have_list_lock) { 1710 vfs_mnttab_modtimeupd(); 1711 vfs_list_unlock(); 1712 } 1713 } 1714 1715 /* 1716 * Record a mounted resource name in a vfs structure. 1717 * If vfsp is already mounted, caller must hold the vfs lock. 1718 */ 1719 void 1720 vfs_setresource(struct vfs *vfsp, const char *resource) 1721 { 1722 if (resource == NULL || resource[0] == '\0') 1723 resource = VFS_NORESOURCE; 1724 vfs_setpath(vfsp, &vfsp->vfs_resource, resource); 1725 } 1726 1727 /* 1728 * Record a mount point name in a vfs structure. 1729 * If vfsp is already mounted, caller must hold the vfs lock. 1730 */ 1731 void 1732 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt) 1733 { 1734 if (mntpt == NULL || mntpt[0] == '\0') 1735 mntpt = VFS_NOMNTPT; 1736 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt); 1737 } 1738 1739 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */ 1740 1741 refstr_t * 1742 vfs_getresource(const struct vfs *vfsp) 1743 { 1744 refstr_t *resource; 1745 1746 vfs_list_read_lock(); 1747 resource = vfsp->vfs_resource; 1748 refstr_hold(resource); 1749 vfs_list_unlock(); 1750 1751 return (resource); 1752 } 1753 1754 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. 
*/ 1755 1756 refstr_t * 1757 vfs_getmntpoint(const struct vfs *vfsp) 1758 { 1759 refstr_t *mntpt; 1760 1761 vfs_list_read_lock(); 1762 mntpt = vfsp->vfs_mntpt; 1763 refstr_hold(mntpt); 1764 vfs_list_unlock(); 1765 1766 return (mntpt); 1767 } 1768 1769 /* 1770 * Create an empty options table with enough empty slots to hold all 1771 * The options in the options string passed as an argument. 1772 * Potentially prepend another options table. 1773 * 1774 * Note: caller is responsible for locking the vfs list, if needed, 1775 * to protect mops. 1776 */ 1777 static void 1778 vfs_createopttbl_extend(mntopts_t *mops, const char *opts, 1779 const mntopts_t *mtmpl) 1780 { 1781 const char *s = opts; 1782 uint_t count; 1783 1784 if (opts == NULL || *opts == '\0') { 1785 count = 0; 1786 } else { 1787 count = 1; 1788 1789 /* 1790 * Count number of options in the string 1791 */ 1792 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) { 1793 count++; 1794 s++; 1795 } 1796 } 1797 vfs_copyopttbl_extend(mtmpl, mops, count); 1798 } 1799 1800 /* 1801 * Create an empty options table with enough empty slots to hold all 1802 * The options in the options string passed as an argument. 1803 * 1804 * This function is *not* for general use by filesystems. 1805 * 1806 * Note: caller is responsible for locking the vfs list, if needed, 1807 * to protect mops. 
1808 */ 1809 void 1810 vfs_createopttbl(mntopts_t *mops, const char *opts) 1811 { 1812 vfs_createopttbl_extend(mops, opts, NULL); 1813 } 1814 1815 1816 /* 1817 * Swap two mount options tables 1818 */ 1819 static void 1820 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2) 1821 { 1822 uint_t tmpcnt; 1823 mntopt_t *tmplist; 1824 1825 tmpcnt = optbl2->mo_count; 1826 tmplist = optbl2->mo_list; 1827 optbl2->mo_count = optbl1->mo_count; 1828 optbl2->mo_list = optbl1->mo_list; 1829 optbl1->mo_count = tmpcnt; 1830 optbl1->mo_list = tmplist; 1831 } 1832 1833 static void 1834 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2) 1835 { 1836 vfs_list_lock(); 1837 vfs_swapopttbl_nolock(optbl1, optbl2); 1838 vfs_mnttab_modtimeupd(); 1839 vfs_list_unlock(); 1840 } 1841 1842 static char ** 1843 vfs_copycancelopt_extend(char **const moc, int extend) 1844 { 1845 int i = 0; 1846 int j; 1847 char **result; 1848 1849 if (moc != NULL) { 1850 for (; moc[i] != NULL; i++) 1851 /* count number of options to cancel */; 1852 } 1853 1854 if (i + extend == 0) 1855 return (NULL); 1856 1857 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP); 1858 1859 for (j = 0; j < i; j++) { 1860 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP); 1861 (void) strcpy(result[j], moc[j]); 1862 } 1863 for (; j <= i + extend; j++) 1864 result[j] = NULL; 1865 1866 return (result); 1867 } 1868 1869 static void 1870 vfs_copyopt(const mntopt_t *s, mntopt_t *d) 1871 { 1872 char *sp, *dp; 1873 1874 d->mo_flags = s->mo_flags; 1875 d->mo_data = s->mo_data; 1876 sp = s->mo_name; 1877 if (sp != NULL) { 1878 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1879 (void) strcpy(dp, sp); 1880 d->mo_name = dp; 1881 } else { 1882 d->mo_name = NULL; /* should never happen */ 1883 } 1884 1885 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0); 1886 1887 sp = s->mo_arg; 1888 if (sp != NULL) { 1889 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 1890 (void) strcpy(dp, sp); 1891 d->mo_arg = dp; 1892 } else { 1893 
d->mo_arg = NULL; 1894 } 1895 } 1896 1897 /* 1898 * Copy a mount options table, possibly allocating some spare 1899 * slots at the end. It is permissible to copy_extend the NULL table. 1900 */ 1901 static void 1902 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra) 1903 { 1904 uint_t i, count; 1905 mntopt_t *motbl; 1906 1907 /* 1908 * Clear out any existing stuff in the options table being initialized 1909 */ 1910 vfs_freeopttbl(dmo); 1911 count = (smo == NULL) ? 0 : smo->mo_count; 1912 if ((count + extra) == 0) /* nothing to do */ 1913 return; 1914 dmo->mo_count = count + extra; 1915 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP); 1916 dmo->mo_list = motbl; 1917 for (i = 0; i < count; i++) { 1918 vfs_copyopt(&smo->mo_list[i], &motbl[i]); 1919 } 1920 for (i = count; i < count + extra; i++) { 1921 motbl[i].mo_flags = MO_EMPTY; 1922 } 1923 } 1924 1925 /* 1926 * Copy a mount options table. 1927 * 1928 * This function is *not* for general use by filesystems. 1929 * 1930 * Note: caller is responsible for locking the vfs list, if needed, 1931 * to protect smo and dmo. 1932 */ 1933 void 1934 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo) 1935 { 1936 vfs_copyopttbl_extend(smo, dmo, 0); 1937 } 1938 1939 static char ** 1940 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2) 1941 { 1942 int c1 = 0; 1943 int c2 = 0; 1944 char **result; 1945 char **sp1, **sp2, **dp; 1946 1947 /* 1948 * First we count both lists of cancel options. 1949 * If either is NULL or has no elements, we return a copy of 1950 * the other. 
1951 */ 1952 if (mop1->mo_cancel != NULL) { 1953 for (; mop1->mo_cancel[c1] != NULL; c1++) 1954 /* count cancel options in mop1 */; 1955 } 1956 1957 if (c1 == 0) 1958 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0)); 1959 1960 if (mop2->mo_cancel != NULL) { 1961 for (; mop2->mo_cancel[c2] != NULL; c2++) 1962 /* count cancel options in mop2 */; 1963 } 1964 1965 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2); 1966 1967 if (c2 == 0) 1968 return (result); 1969 1970 /* 1971 * When we get here, we've got two sets of cancel options; 1972 * we need to merge the two sets. We know that the result 1973 * array has "c1+c2+1" entries and in the end we might shrink 1974 * it. 1975 * Result now has a copy of the c1 entries from mop1; we'll 1976 * now lookup all the entries of mop2 in mop1 and copy it if 1977 * it is unique. 1978 * This operation is O(n^2) but it's only called once per 1979 * filesystem per duplicate option. This is a situation 1980 * which doesn't arise with the filesystems in ON and 1981 * n is generally 1. 1982 */ 1983 1984 dp = &result[c1]; 1985 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) { 1986 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) { 1987 if (strcmp(*sp1, *sp2) == 0) 1988 break; 1989 } 1990 if (*sp1 == NULL) { 1991 /* 1992 * Option *sp2 not found in mop1, so copy it. 1993 * The calls to vfs_copycancelopt_extend() 1994 * guarantee that there's enough room. 1995 */ 1996 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP); 1997 (void) strcpy(*dp++, *sp2); 1998 } 1999 } 2000 if (dp != &result[c1+c2]) { 2001 size_t bytes = (dp - result + 1) * sizeof (char *); 2002 char **nres = kmem_alloc(bytes, KM_SLEEP); 2003 2004 bcopy(result, nres, bytes); 2005 kmem_free(result, (c1 + c2 + 1) * sizeof (char *)); 2006 result = nres; 2007 } 2008 return (result); 2009 } 2010 2011 /* 2012 * Merge two mount option tables (outer and inner) into one. This is very 2013 * similar to "merging" global variables and automatic variables in C. 
2014 * 2015 * This isn't (and doesn't have to be) fast. 2016 * 2017 * This function is *not* for general use by filesystems. 2018 * 2019 * Note: caller is responsible for locking the vfs list, if needed, 2020 * to protect omo, imo & dmo. 2021 */ 2022 void 2023 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo) 2024 { 2025 uint_t i, count; 2026 mntopt_t *mop, *motbl; 2027 uint_t freeidx; 2028 2029 /* 2030 * First determine how much space we need to allocate. 2031 */ 2032 count = omo->mo_count; 2033 for (i = 0; i < imo->mo_count; i++) { 2034 if (imo->mo_list[i].mo_flags & MO_EMPTY) 2035 continue; 2036 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL) 2037 count++; 2038 } 2039 ASSERT(count >= omo->mo_count && 2040 count <= omo->mo_count + imo->mo_count); 2041 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP); 2042 for (i = 0; i < omo->mo_count; i++) 2043 vfs_copyopt(&omo->mo_list[i], &motbl[i]); 2044 freeidx = omo->mo_count; 2045 for (i = 0; i < imo->mo_count; i++) { 2046 if (imo->mo_list[i].mo_flags & MO_EMPTY) 2047 continue; 2048 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) { 2049 char **newcanp; 2050 uint_t index = mop - omo->mo_list; 2051 2052 newcanp = vfs_mergecancelopts(mop, &motbl[index]); 2053 2054 vfs_freeopt(&motbl[index]); 2055 vfs_copyopt(&imo->mo_list[i], &motbl[index]); 2056 2057 vfs_freecancelopt(motbl[index].mo_cancel); 2058 motbl[index].mo_cancel = newcanp; 2059 } else { 2060 /* 2061 * If it's a new option, just copy it over to the first 2062 * free location. 2063 */ 2064 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]); 2065 } 2066 } 2067 dmo->mo_count = count; 2068 dmo->mo_list = motbl; 2069 } 2070 2071 /* 2072 * Functions to set and clear mount options in a mount options table. 2073 */ 2074 2075 /* 2076 * Clear a mount option, if it exists. 2077 * 2078 * The update_mnttab arg indicates whether mops is part of a vfs that is on 2079 * the vfs list. 
2080 */ 2081 static void 2082 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab) 2083 { 2084 struct mntopt *mop; 2085 uint_t i, count; 2086 2087 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 2088 2089 count = mops->mo_count; 2090 for (i = 0; i < count; i++) { 2091 mop = &mops->mo_list[i]; 2092 2093 if (mop->mo_flags & MO_EMPTY) 2094 continue; 2095 if (strcmp(opt, mop->mo_name)) 2096 continue; 2097 mop->mo_flags &= ~MO_SET; 2098 if (mop->mo_arg != NULL) { 2099 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2100 } 2101 mop->mo_arg = NULL; 2102 if (update_mnttab) 2103 vfs_mnttab_modtimeupd(); 2104 break; 2105 } 2106 } 2107 2108 void 2109 vfs_clearmntopt(struct vfs *vfsp, const char *opt) 2110 { 2111 int gotlock = 0; 2112 2113 if (VFS_ON_LIST(vfsp)) { 2114 gotlock = 1; 2115 vfs_list_lock(); 2116 } 2117 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock); 2118 if (gotlock) 2119 vfs_list_unlock(); 2120 } 2121 2122 2123 /* 2124 * Set a mount option on. If it's not found in the table, it's silently 2125 * ignored. If the option has MO_IGNORE set, it is still set unless the 2126 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag 2127 * bits can be used to toggle the MO_NODISPLAY bit for the option. 2128 * If the VFS_CREATEOPT flag bit is set then the first option slot with 2129 * MO_EMPTY set is created as the option passed in. 2130 * 2131 * The update_mnttab arg indicates whether mops is part of a vfs that is on 2132 * the vfs list. 
2133 */ 2134 static void 2135 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt, 2136 const char *arg, int flags, int update_mnttab) 2137 { 2138 mntopt_t *mop; 2139 uint_t i, count; 2140 char *sp; 2141 2142 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 2143 2144 if (flags & VFS_CREATEOPT) { 2145 if (vfs_hasopt(mops, opt) != NULL) { 2146 flags &= ~VFS_CREATEOPT; 2147 } 2148 } 2149 count = mops->mo_count; 2150 for (i = 0; i < count; i++) { 2151 mop = &mops->mo_list[i]; 2152 2153 if (mop->mo_flags & MO_EMPTY) { 2154 if ((flags & VFS_CREATEOPT) == 0) 2155 continue; 2156 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP); 2157 (void) strcpy(sp, opt); 2158 mop->mo_name = sp; 2159 if (arg != NULL) 2160 mop->mo_flags = MO_HASVALUE; 2161 else 2162 mop->mo_flags = 0; 2163 } else if (strcmp(opt, mop->mo_name)) { 2164 continue; 2165 } 2166 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT)) 2167 break; 2168 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) { 2169 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP); 2170 (void) strcpy(sp, arg); 2171 } else { 2172 sp = NULL; 2173 } 2174 if (mop->mo_arg != NULL) 2175 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2176 mop->mo_arg = sp; 2177 if (flags & VFS_DISPLAY) 2178 mop->mo_flags &= ~MO_NODISPLAY; 2179 if (flags & VFS_NODISPLAY) 2180 mop->mo_flags |= MO_NODISPLAY; 2181 mop->mo_flags |= MO_SET; 2182 if (mop->mo_cancel != NULL) { 2183 char **cp; 2184 2185 for (cp = mop->mo_cancel; *cp != NULL; cp++) 2186 vfs_clearmntopt_nolock(mops, *cp, 0); 2187 } 2188 if (update_mnttab) 2189 vfs_mnttab_modtimeupd(); 2190 break; 2191 } 2192 } 2193 2194 void 2195 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags) 2196 { 2197 int gotlock = 0; 2198 2199 if (VFS_ON_LIST(vfsp)) { 2200 gotlock = 1; 2201 vfs_list_lock(); 2202 } 2203 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock); 2204 if (gotlock) 2205 vfs_list_unlock(); 2206 } 2207 2208 2209 /* 2210 * Add a "tag" option to a mounted file system's 
options list. 2211 * 2212 * Note: caller is responsible for locking the vfs list, if needed, 2213 * to protect mops. 2214 */ 2215 static mntopt_t * 2216 vfs_addtag(mntopts_t *mops, const char *tag) 2217 { 2218 uint_t count; 2219 mntopt_t *mop, *motbl; 2220 2221 count = mops->mo_count + 1; 2222 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP); 2223 if (mops->mo_count) { 2224 size_t len = (count - 1) * sizeof (mntopt_t); 2225 2226 bcopy(mops->mo_list, motbl, len); 2227 kmem_free(mops->mo_list, len); 2228 } 2229 mops->mo_count = count; 2230 mops->mo_list = motbl; 2231 mop = &motbl[count - 1]; 2232 mop->mo_flags = MO_TAG; 2233 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP); 2234 (void) strcpy(mop->mo_name, tag); 2235 return (mop); 2236 } 2237 2238 /* 2239 * Allow users to set arbitrary "tags" in a vfs's mount options. 2240 * Broader use within the kernel is discouraged. 2241 */ 2242 int 2243 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2244 cred_t *cr) 2245 { 2246 vfs_t *vfsp; 2247 mntopts_t *mops; 2248 mntopt_t *mop; 2249 int found = 0; 2250 dev_t dev = makedevice(major, minor); 2251 int err = 0; 2252 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 2253 2254 /* 2255 * Find the desired mounted file system 2256 */ 2257 vfs_list_lock(); 2258 vfsp = rootvfs; 2259 do { 2260 if (vfsp->vfs_dev == dev && 2261 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2262 found = 1; 2263 break; 2264 } 2265 vfsp = vfsp->vfs_next; 2266 } while (vfsp != rootvfs); 2267 2268 if (!found) { 2269 err = EINVAL; 2270 goto out; 2271 } 2272 err = secpolicy_fs_config(cr, vfsp); 2273 if (err != 0) 2274 goto out; 2275 2276 mops = &vfsp->vfs_mntopts; 2277 /* 2278 * Add tag if it doesn't already exist 2279 */ 2280 if ((mop = vfs_hasopt(mops, tag)) == NULL) { 2281 int len; 2282 2283 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR); 2284 len = strlen(buf); 2285 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) { 2286 err = ENAMETOOLONG; 2287 goto out; 2288 
} 2289 mop = vfs_addtag(mops, tag); 2290 } 2291 if ((mop->mo_flags & MO_TAG) == 0) { 2292 err = EINVAL; 2293 goto out; 2294 } 2295 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1); 2296 out: 2297 vfs_list_unlock(); 2298 kmem_free(buf, MAX_MNTOPT_STR); 2299 return (err); 2300 } 2301 2302 /* 2303 * Allow users to remove arbitrary "tags" in a vfs's mount options. 2304 * Broader use within the kernel is discouraged. 2305 */ 2306 int 2307 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2308 cred_t *cr) 2309 { 2310 vfs_t *vfsp; 2311 mntopt_t *mop; 2312 int found = 0; 2313 dev_t dev = makedevice(major, minor); 2314 int err = 0; 2315 2316 /* 2317 * Find the desired mounted file system 2318 */ 2319 vfs_list_lock(); 2320 vfsp = rootvfs; 2321 do { 2322 if (vfsp->vfs_dev == dev && 2323 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2324 found = 1; 2325 break; 2326 } 2327 vfsp = vfsp->vfs_next; 2328 } while (vfsp != rootvfs); 2329 2330 if (!found) { 2331 err = EINVAL; 2332 goto out; 2333 } 2334 err = secpolicy_fs_config(cr, vfsp); 2335 if (err != 0) 2336 goto out; 2337 2338 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) { 2339 err = EINVAL; 2340 goto out; 2341 } 2342 if ((mop->mo_flags & MO_TAG) == 0) { 2343 err = EINVAL; 2344 goto out; 2345 } 2346 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1); 2347 out: 2348 vfs_list_unlock(); 2349 return (err); 2350 } 2351 2352 /* 2353 * Function to parse an option string and fill in a mount options table. 2354 * Unknown options are silently ignored. The input option string is modified 2355 * by replacing separators with nulls. If the create flag is set, options 2356 * not found in the table are just added on the fly. The table must have 2357 * an option slot marked MO_EMPTY to add an option on the fly. 2358 * 2359 * This function is *not* for general use by filesystems. 2360 * 2361 * Note: caller is responsible for locking the vfs list, if needed, 2362 * to protect mops.. 
2363 */ 2364 void 2365 vfs_parsemntopts(mntopts_t *mops, char *osp, int create) 2366 { 2367 char *s = osp, *p, *nextop, *valp, *cp, *ep; 2368 int setflg = VFS_NOFORCEOPT; 2369 2370 if (osp == NULL) 2371 return; 2372 while (*s != '\0') { 2373 p = strchr(s, ','); /* find next option */ 2374 if (p == NULL) { 2375 cp = NULL; 2376 p = s + strlen(s); 2377 } else { 2378 cp = p; /* save location of comma */ 2379 *p++ = '\0'; /* mark end and point to next option */ 2380 } 2381 nextop = p; 2382 p = strchr(s, '='); /* look for value */ 2383 if (p == NULL) { 2384 valp = NULL; /* no value supplied */ 2385 } else { 2386 ep = p; /* save location of equals */ 2387 *p++ = '\0'; /* end option and point to value */ 2388 valp = p; 2389 } 2390 /* 2391 * set option into options table 2392 */ 2393 if (create) 2394 setflg |= VFS_CREATEOPT; 2395 vfs_setmntopt_nolock(mops, s, valp, setflg, 0); 2396 if (cp != NULL) 2397 *cp = ','; /* restore the comma */ 2398 if (valp != NULL) 2399 *ep = '='; /* restore the equals */ 2400 s = nextop; 2401 } 2402 } 2403 2404 /* 2405 * Function to inquire if an option exists in a mount options table. 2406 * Returns a pointer to the option if it exists, else NULL. 2407 * 2408 * This function is *not* for general use by filesystems. 2409 * 2410 * Note: caller is responsible for locking the vfs list, if needed, 2411 * to protect mops. 2412 */ 2413 struct mntopt * 2414 vfs_hasopt(const mntopts_t *mops, const char *opt) 2415 { 2416 struct mntopt *mop; 2417 uint_t i, count; 2418 2419 count = mops->mo_count; 2420 for (i = 0; i < count; i++) { 2421 mop = &mops->mo_list[i]; 2422 2423 if (mop->mo_flags & MO_EMPTY) 2424 continue; 2425 if (strcmp(opt, mop->mo_name) == 0) 2426 return (mop); 2427 } 2428 return (NULL); 2429 } 2430 2431 /* 2432 * Function to inquire if an option is set in a mount options table. 2433 * Returns non-zero if set and fills in the arg pointer with a pointer to 2434 * the argument string or NULL if there is no argument string. 
2435 */ 2436 static int 2437 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp) 2438 { 2439 struct mntopt *mop; 2440 uint_t i, count; 2441 2442 count = mops->mo_count; 2443 for (i = 0; i < count; i++) { 2444 mop = &mops->mo_list[i]; 2445 2446 if (mop->mo_flags & MO_EMPTY) 2447 continue; 2448 if (strcmp(opt, mop->mo_name)) 2449 continue; 2450 if ((mop->mo_flags & MO_SET) == 0) 2451 return (0); 2452 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0) 2453 *argp = mop->mo_arg; 2454 return (1); 2455 } 2456 return (0); 2457 } 2458 2459 2460 int 2461 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp) 2462 { 2463 int ret; 2464 2465 vfs_list_read_lock(); 2466 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp); 2467 vfs_list_unlock(); 2468 return (ret); 2469 } 2470 2471 2472 /* 2473 * Construct a comma separated string of the options set in the given 2474 * mount table, return the string in the given buffer. Return non-zero if 2475 * the buffer would overflow. 2476 * 2477 * This function is *not* for general use by filesystems. 2478 * 2479 * Note: caller is responsible for locking the vfs list, if needed, 2480 * to protect mp. 
2481 */ 2482 int 2483 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len) 2484 { 2485 char *cp; 2486 uint_t i; 2487 2488 buf[0] = '\0'; 2489 cp = buf; 2490 for (i = 0; i < mp->mo_count; i++) { 2491 struct mntopt *mop; 2492 2493 mop = &mp->mo_list[i]; 2494 if (mop->mo_flags & MO_SET) { 2495 int optlen, comma = 0; 2496 2497 if (buf[0] != '\0') 2498 comma = 1; 2499 optlen = strlen(mop->mo_name); 2500 if (strlen(buf) + comma + optlen + 1 > len) 2501 goto err; 2502 if (comma) 2503 *cp++ = ','; 2504 (void) strcpy(cp, mop->mo_name); 2505 cp += optlen; 2506 /* 2507 * Append option value if there is one 2508 */ 2509 if (mop->mo_arg != NULL) { 2510 int arglen; 2511 2512 arglen = strlen(mop->mo_arg); 2513 if (strlen(buf) + arglen + 2 > len) 2514 goto err; 2515 *cp++ = '='; 2516 (void) strcpy(cp, mop->mo_arg); 2517 cp += arglen; 2518 } 2519 } 2520 } 2521 return (0); 2522 err: 2523 return (EOVERFLOW); 2524 } 2525 2526 static void 2527 vfs_freecancelopt(char **moc) 2528 { 2529 if (moc != NULL) { 2530 int ccnt = 0; 2531 char **cp; 2532 2533 for (cp = moc; *cp != NULL; cp++) { 2534 kmem_free(*cp, strlen(*cp) + 1); 2535 ccnt++; 2536 } 2537 kmem_free(moc, (ccnt + 1) * sizeof (char *)); 2538 } 2539 } 2540 2541 static void 2542 vfs_freeopt(mntopt_t *mop) 2543 { 2544 if (mop->mo_name != NULL) 2545 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1); 2546 2547 vfs_freecancelopt(mop->mo_cancel); 2548 2549 if (mop->mo_arg != NULL) 2550 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2551 } 2552 2553 /* 2554 * Free a mount options table 2555 * 2556 * This function is *not* for general use by filesystems. 2557 * 2558 * Note: caller is responsible for locking the vfs list, if needed, 2559 * to protect mp. 
2560 */ 2561 void 2562 vfs_freeopttbl(mntopts_t *mp) 2563 { 2564 uint_t i, count; 2565 2566 count = mp->mo_count; 2567 for (i = 0; i < count; i++) { 2568 vfs_freeopt(&mp->mo_list[i]); 2569 } 2570 if (count) { 2571 kmem_free(mp->mo_list, sizeof (mntopt_t) * count); 2572 mp->mo_count = 0; 2573 mp->mo_list = NULL; 2574 } 2575 } 2576 2577 2578 /* ARGSUSED */ 2579 static int 2580 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, 2581 caller_context_t *ct) 2582 { 2583 return (0); 2584 } 2585 2586 /* ARGSUSED */ 2587 static int 2588 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, 2589 caller_context_t *ct) 2590 { 2591 return (0); 2592 } 2593 2594 /* 2595 * The dummy vnode is currently used only by file events notification 2596 * module which is just interested in the timestamps. 2597 */ 2598 /* ARGSUSED */ 2599 static int 2600 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) 2601 { 2602 bzero(vap, sizeof (vattr_t)); 2603 vap->va_type = VREG; 2604 vap->va_nlink = 1; 2605 vap->va_ctime = vfs_mnttab_ctime; 2606 /* 2607 * it is ok to just copy mtime as the time will be monotonically 2608 * increasing. 
2609 */ 2610 vap->va_mtime = vfs_mnttab_mtime; 2611 vap->va_atime = vap->va_mtime; 2612 return (0); 2613 } 2614 2615 static void 2616 vfs_mnttabvp_setup(void) 2617 { 2618 vnode_t *tvp; 2619 vnodeops_t *vfs_mntdummyvnops; 2620 const fs_operation_def_t mnt_dummyvnodeops_template[] = { 2621 VOPNAME_READ, { .vop_read = vfs_mntdummyread }, 2622 VOPNAME_WRITE, { .vop_write = vfs_mntdummywrite }, 2623 VOPNAME_GETATTR, { .vop_getattr = vfs_mntdummygetattr }, 2624 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 2625 NULL, NULL 2626 }; 2627 2628 if (vn_make_ops("mnttab", mnt_dummyvnodeops_template, 2629 &vfs_mntdummyvnops) != 0) { 2630 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed"); 2631 /* Shouldn't happen, but not bad enough to panic */ 2632 return; 2633 } 2634 2635 /* 2636 * A global dummy vnode is allocated to represent mntfs files. 2637 * The mntfs file (/etc/mnttab) can be monitored for file events 2638 * and receive an event when mnttab changes. Dummy VOP calls 2639 * will be made on this vnode. The file events notification module 2640 * intercepts this vnode and delivers relevant events. 2641 */ 2642 tvp = vn_alloc(KM_SLEEP); 2643 tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE; 2644 vn_setops(tvp, vfs_mntdummyvnops); 2645 tvp->v_type = VREG; 2646 /* 2647 * The mnt dummy ops do not reference v_data. 2648 * No other module intercepting this vnode should either. 2649 * Just set it to point to itself. 
2650 */ 2651 tvp->v_data = (caddr_t)tvp; 2652 tvp->v_vfsp = rootvfs; 2653 vfs_mntdummyvp = tvp; 2654 } 2655 2656 /* 2657 * performs fake read/write ops 2658 */ 2659 static void 2660 vfs_mnttab_rwop(int rw) 2661 { 2662 struct uio uio; 2663 struct iovec iov; 2664 char buf[1]; 2665 2666 if (vfs_mntdummyvp == NULL) 2667 return; 2668 2669 bzero(&uio, sizeof (uio)); 2670 bzero(&iov, sizeof (iov)); 2671 iov.iov_base = buf; 2672 iov.iov_len = 0; 2673 uio.uio_iov = &iov; 2674 uio.uio_iovcnt = 1; 2675 uio.uio_loffset = 0; 2676 uio.uio_segflg = UIO_SYSSPACE; 2677 uio.uio_resid = 0; 2678 if (rw) { 2679 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL); 2680 } else { 2681 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL); 2682 } 2683 } 2684 2685 /* 2686 * Generate a write operation. 2687 */ 2688 void 2689 vfs_mnttab_writeop(void) 2690 { 2691 vfs_mnttab_rwop(1); 2692 } 2693 2694 /* 2695 * Generate a read operation. 2696 */ 2697 void 2698 vfs_mnttab_readop(void) 2699 { 2700 vfs_mnttab_rwop(0); 2701 } 2702 2703 /* 2704 * Free any mnttab information recorded in the vfs struct. 2705 * The vfs must not be on the vfs list. 
2706 */ 2707 static void 2708 vfs_freemnttab(struct vfs *vfsp) 2709 { 2710 ASSERT(!VFS_ON_LIST(vfsp)); 2711 2712 /* 2713 * Free device and mount point information 2714 */ 2715 if (vfsp->vfs_mntpt != NULL) { 2716 refstr_rele(vfsp->vfs_mntpt); 2717 vfsp->vfs_mntpt = NULL; 2718 } 2719 if (vfsp->vfs_resource != NULL) { 2720 refstr_rele(vfsp->vfs_resource); 2721 vfsp->vfs_resource = NULL; 2722 } 2723 /* 2724 * Now free mount options information 2725 */ 2726 vfs_freeopttbl(&vfsp->vfs_mntopts); 2727 } 2728 2729 /* 2730 * Return the last mnttab modification time 2731 */ 2732 void 2733 vfs_mnttab_modtime(timespec_t *ts) 2734 { 2735 ASSERT(RW_LOCK_HELD(&vfslist)); 2736 *ts = vfs_mnttab_mtime; 2737 } 2738 2739 /* 2740 * See if mnttab is changed 2741 */ 2742 void 2743 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp) 2744 { 2745 int changed; 2746 2747 *phpp = (struct pollhead *)NULL; 2748 2749 /* 2750 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime. 2751 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe 2752 * to not grab the vfs list lock because tv_sec is monotonically 2753 * increasing. 2754 */ 2755 2756 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) || 2757 (old->tv_sec != vfs_mnttab_mtime.tv_sec); 2758 if (!changed) { 2759 *phpp = &vfs_pollhd; 2760 } 2761 } 2762 2763 /* 2764 * Update the mnttab modification time and wake up any waiters for 2765 * mnttab changes 2766 */ 2767 void 2768 vfs_mnttab_modtimeupd() 2769 { 2770 hrtime_t oldhrt, newhrt; 2771 2772 ASSERT(RW_WRITE_HELD(&vfslist)); 2773 oldhrt = ts2hrt(&vfs_mnttab_mtime); 2774 gethrestime(&vfs_mnttab_mtime); 2775 newhrt = ts2hrt(&vfs_mnttab_mtime); 2776 if (oldhrt == (hrtime_t)0) 2777 vfs_mnttab_ctime = vfs_mnttab_mtime; 2778 /* 2779 * Attempt to provide unique mtime (like uniqtime but not). 
2780 */ 2781 if (newhrt == oldhrt) { 2782 newhrt++; 2783 hrt2ts(newhrt, &vfs_mnttab_mtime); 2784 } 2785 pollwakeup(&vfs_pollhd, (short)POLLRDBAND); 2786 vfs_mnttab_writeop(); 2787 } 2788 2789 int 2790 dounmount(struct vfs *vfsp, int flag, cred_t *cr) 2791 { 2792 vnode_t *coveredvp; 2793 int error; 2794 extern void teardown_vopstats(vfs_t *); 2795 2796 /* 2797 * Get covered vnode. This will be NULL if the vfs is not linked 2798 * into the file system name space (i.e., domount() with MNT_NOSPICE). 2799 */ 2800 coveredvp = vfsp->vfs_vnodecovered; 2801 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp)); 2802 2803 /* 2804 * Purge all dnlc entries for this vfs. 2805 */ 2806 (void) dnlc_purge_vfsp(vfsp, 0); 2807 2808 /* For forcible umount, skip VFS_SYNC() since it may hang */ 2809 if ((flag & MS_FORCE) == 0) 2810 (void) VFS_SYNC(vfsp, 0, cr); 2811 2812 /* 2813 * Lock the vfs to maintain fs status quo during unmount. This 2814 * has to be done after the sync because ufs_update tries to acquire 2815 * the vfs_reflock. 2816 */ 2817 vfs_lock_wait(vfsp); 2818 2819 if (error = VFS_UNMOUNT(vfsp, flag, cr)) { 2820 vfs_unlock(vfsp); 2821 if (coveredvp != NULL) 2822 vn_vfsunlock(coveredvp); 2823 } else if (coveredvp != NULL) { 2824 teardown_vopstats(vfsp); 2825 /* 2826 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered) 2827 * when it frees vfsp so we do a VN_HOLD() so we can 2828 * continue to use coveredvp afterwards. 2829 */ 2830 VN_HOLD(coveredvp); 2831 vfs_remove(vfsp); 2832 vn_vfsunlock(coveredvp); 2833 VN_RELE(coveredvp); 2834 } else { 2835 teardown_vopstats(vfsp); 2836 /* 2837 * Release the reference to vfs that is not linked 2838 * into the name space. 2839 */ 2840 vfs_unlock(vfsp); 2841 VFS_RELE(vfsp); 2842 } 2843 return (error); 2844 } 2845 2846 2847 /* 2848 * Vfs_unmountall() is called by uadmin() to unmount all 2849 * mounted file systems (except the root file system) during shutdown. 
2850 * It follows the existing locking protocol when traversing the vfs list 2851 * to sync and unmount vfses. Even though there should be no 2852 * other thread running while the system is shutting down, it is prudent 2853 * to still follow the locking protocol. 2854 */ 2855 void 2856 vfs_unmountall(void) 2857 { 2858 struct vfs *vfsp; 2859 struct vfs *prev_vfsp = NULL; 2860 int error; 2861 2862 /* 2863 * Toss all dnlc entries now so that the per-vfs sync 2864 * and unmount operations don't have to slog through 2865 * a bunch of uninteresting vnodes over and over again. 2866 */ 2867 dnlc_purge(); 2868 2869 vfs_list_lock(); 2870 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) { 2871 prev_vfsp = vfsp->vfs_prev; 2872 2873 if (vfs_lock(vfsp) != 0) 2874 continue; 2875 error = vn_vfswlock(vfsp->vfs_vnodecovered); 2876 vfs_unlock(vfsp); 2877 if (error) 2878 continue; 2879 2880 vfs_list_unlock(); 2881 2882 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED()); 2883 (void) dounmount(vfsp, 0, CRED()); 2884 2885 /* 2886 * Since we dropped the vfslist lock above we must 2887 * verify that next_vfsp still exists, else start over. 
2888 */ 2889 vfs_list_lock(); 2890 for (vfsp = rootvfs->vfs_prev; 2891 vfsp != rootvfs; vfsp = vfsp->vfs_prev) 2892 if (vfsp == prev_vfsp) 2893 break; 2894 if (vfsp == rootvfs && prev_vfsp != rootvfs) 2895 prev_vfsp = rootvfs->vfs_prev; 2896 } 2897 vfs_list_unlock(); 2898 } 2899 2900 /* 2901 * Called to add an entry to the end of the vfs mount in progress list 2902 */ 2903 void 2904 vfs_addmip(dev_t dev, struct vfs *vfsp) 2905 { 2906 struct ipmnt *mipp; 2907 2908 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP); 2909 mipp->mip_next = NULL; 2910 mipp->mip_dev = dev; 2911 mipp->mip_vfsp = vfsp; 2912 mutex_enter(&vfs_miplist_mutex); 2913 if (vfs_miplist_end != NULL) 2914 vfs_miplist_end->mip_next = mipp; 2915 else 2916 vfs_miplist = mipp; 2917 vfs_miplist_end = mipp; 2918 mutex_exit(&vfs_miplist_mutex); 2919 } 2920 2921 /* 2922 * Called to remove an entry from the mount in progress list 2923 * Either because the mount completed or it failed. 2924 */ 2925 void 2926 vfs_delmip(struct vfs *vfsp) 2927 { 2928 struct ipmnt *mipp, *mipprev; 2929 2930 mutex_enter(&vfs_miplist_mutex); 2931 mipprev = NULL; 2932 for (mipp = vfs_miplist; 2933 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) { 2934 mipprev = mipp; 2935 } 2936 if (mipp == NULL) 2937 return; /* shouldn't happen */ 2938 if (mipp == vfs_miplist_end) 2939 vfs_miplist_end = mipprev; 2940 if (mipprev == NULL) 2941 vfs_miplist = mipp->mip_next; 2942 else 2943 mipprev->mip_next = mipp->mip_next; 2944 mutex_exit(&vfs_miplist_mutex); 2945 kmem_free(mipp, sizeof (struct ipmnt)); 2946 } 2947 2948 /* 2949 * vfs_add is called by a specific filesystem's mount routine to add 2950 * the new vfs into the vfs list/hash and to cover the mounted-on vnode. 2951 * The vfs should already have been locked by the caller. 2952 * 2953 * coveredvp is NULL if this is the root. 
2954 */ 2955 void 2956 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag) 2957 { 2958 int newflag; 2959 2960 ASSERT(vfs_lock_held(vfsp)); 2961 VFS_HOLD(vfsp); 2962 newflag = vfsp->vfs_flag; 2963 if (mflag & MS_RDONLY) 2964 newflag |= VFS_RDONLY; 2965 else 2966 newflag &= ~VFS_RDONLY; 2967 if (mflag & MS_NOSUID) 2968 newflag |= (VFS_NOSETUID|VFS_NODEVICES); 2969 else 2970 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES); 2971 if (mflag & MS_NOMNTTAB) 2972 newflag |= VFS_NOMNTTAB; 2973 else 2974 newflag &= ~VFS_NOMNTTAB; 2975 2976 if (coveredvp != NULL) { 2977 ASSERT(vn_vfswlock_held(coveredvp)); 2978 coveredvp->v_vfsmountedhere = vfsp; 2979 VN_HOLD(coveredvp); 2980 } 2981 vfsp->vfs_vnodecovered = coveredvp; 2982 vfsp->vfs_flag = newflag; 2983 2984 vfs_list_add(vfsp); 2985 } 2986 2987 /* 2988 * Remove a vfs from the vfs list, null out the pointer from the 2989 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer 2990 * from the vfs to the covered vnode (vfs_vnodecovered). Release the 2991 * reference to the vfs and to the covered vnode. 2992 * 2993 * Called from dounmount after it's confirmed with the file system 2994 * that the unmount is legal. 2995 */ 2996 void 2997 vfs_remove(struct vfs *vfsp) 2998 { 2999 vnode_t *vp; 3000 3001 ASSERT(vfs_lock_held(vfsp)); 3002 3003 /* 3004 * Can't unmount root. Should never happen because fs will 3005 * be busy. 3006 */ 3007 if (vfsp == rootvfs) 3008 panic("vfs_remove: unmounting root"); 3009 3010 vfs_list_remove(vfsp); 3011 3012 /* 3013 * Unhook from the file system name space. 3014 */ 3015 vp = vfsp->vfs_vnodecovered; 3016 ASSERT(vn_vfswlock_held(vp)); 3017 vp->v_vfsmountedhere = NULL; 3018 vfsp->vfs_vnodecovered = NULL; 3019 VN_RELE(vp); 3020 3021 /* 3022 * Release lock and wakeup anybody waiting. 3023 */ 3024 vfs_unlock(vfsp); 3025 VFS_RELE(vfsp); 3026 } 3027 3028 /* 3029 * Lock a filesystem to prevent access to it while mounting, 3030 * unmounting and syncing. 
Return EBUSY immediately if lock 3031 * can't be acquired. 3032 */ 3033 int 3034 vfs_lock(vfs_t *vfsp) 3035 { 3036 vn_vfslocks_entry_t *vpvfsentry; 3037 3038 vpvfsentry = vn_vfslocks_getlock(vfsp); 3039 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) 3040 return (0); 3041 3042 vn_vfslocks_rele(vpvfsentry); 3043 return (EBUSY); 3044 } 3045 3046 int 3047 vfs_rlock(vfs_t *vfsp) 3048 { 3049 vn_vfslocks_entry_t *vpvfsentry; 3050 3051 vpvfsentry = vn_vfslocks_getlock(vfsp); 3052 3053 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) 3054 return (0); 3055 3056 vn_vfslocks_rele(vpvfsentry); 3057 return (EBUSY); 3058 } 3059 3060 void 3061 vfs_lock_wait(vfs_t *vfsp) 3062 { 3063 vn_vfslocks_entry_t *vpvfsentry; 3064 3065 vpvfsentry = vn_vfslocks_getlock(vfsp); 3066 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER); 3067 } 3068 3069 void 3070 vfs_rlock_wait(vfs_t *vfsp) 3071 { 3072 vn_vfslocks_entry_t *vpvfsentry; 3073 3074 vpvfsentry = vn_vfslocks_getlock(vfsp); 3075 rwst_enter(&vpvfsentry->ve_lock, RW_READER); 3076 } 3077 3078 /* 3079 * Unlock a locked filesystem. 3080 */ 3081 void 3082 vfs_unlock(vfs_t *vfsp) 3083 { 3084 vn_vfslocks_entry_t *vpvfsentry; 3085 3086 /* 3087 * vfs_unlock will mimic sema_v behaviour to fix 4748018. 3088 * And these changes should remain for the patch changes as it is. 3089 */ 3090 if (panicstr) 3091 return; 3092 3093 /* 3094 * ve_refcount needs to be dropped twice here. 3095 * 1. To release refernce after a call to vfs_locks_getlock() 3096 * 2. To release the reference from the locking routines like 3097 * vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,. 3098 */ 3099 3100 vpvfsentry = vn_vfslocks_getlock(vfsp); 3101 vn_vfslocks_rele(vpvfsentry); 3102 3103 rwst_exit(&vpvfsentry->ve_lock); 3104 vn_vfslocks_rele(vpvfsentry); 3105 } 3106 3107 /* 3108 * Utility routine that allows a filesystem to construct its 3109 * fsid in "the usual way" - by munging some underlying dev_t and 3110 * the filesystem type number into the 64-bit fsid. 
Note that 3111 * this implicitly relies on dev_t persistence to make filesystem 3112 * id's persistent. 3113 * 3114 * There's nothing to prevent an individual fs from constructing its 3115 * fsid in a different way, and indeed they should. 3116 * 3117 * Since we want fsids to be 32-bit quantities (so that they can be 3118 * exported identically by either 32-bit or 64-bit APIs, as well as 3119 * the fact that fsid's are "known" to NFS), we compress the device 3120 * number given down to 32-bits, and panic if that isn't possible. 3121 */ 3122 void 3123 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val) 3124 { 3125 if (!cmpldev((dev32_t *)&fsi->val[0], dev)) 3126 panic("device number too big for fsid!"); 3127 fsi->val[1] = val; 3128 } 3129 3130 int 3131 vfs_lock_held(vfs_t *vfsp) 3132 { 3133 int held; 3134 vn_vfslocks_entry_t *vpvfsentry; 3135 3136 /* 3137 * vfs_lock_held will mimic sema_held behaviour 3138 * if panicstr is set. And these changes should remain 3139 * for the patch changes as it is. 3140 */ 3141 if (panicstr) 3142 return (1); 3143 3144 vpvfsentry = vn_vfslocks_getlock(vfsp); 3145 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); 3146 3147 vn_vfslocks_rele(vpvfsentry); 3148 return (held); 3149 } 3150 3151 struct _kthread * 3152 vfs_lock_owner(vfs_t *vfsp) 3153 { 3154 struct _kthread *owner; 3155 vn_vfslocks_entry_t *vpvfsentry; 3156 3157 /* 3158 * vfs_wlock_held will mimic sema_held behaviour 3159 * if panicstr is set. And these changes should remain 3160 * for the patch changes as it is. 3161 */ 3162 if (panicstr) 3163 return (NULL); 3164 3165 vpvfsentry = vn_vfslocks_getlock(vfsp); 3166 owner = rwst_owner(&vpvfsentry->ve_lock); 3167 3168 vn_vfslocks_rele(vpvfsentry); 3169 return (owner); 3170 } 3171 3172 /* 3173 * vfs list locking. 3174 * 3175 * Rather than manipulate the vfslist lock directly, we abstract into lock 3176 * and unlock routines to allow the locking implementation to be changed for 3177 * clustering. 
3178 * 3179 * Whenever the vfs list is modified through its hash links, the overall list 3180 * lock must be obtained before locking the relevant hash bucket. But to see 3181 * whether a given vfs is on the list, it suffices to obtain the lock for the 3182 * hash bucket without getting the overall list lock. (See getvfs() below.) 3183 */ 3184 3185 void 3186 vfs_list_lock() 3187 { 3188 rw_enter(&vfslist, RW_WRITER); 3189 } 3190 3191 void 3192 vfs_list_read_lock() 3193 { 3194 rw_enter(&vfslist, RW_READER); 3195 } 3196 3197 void 3198 vfs_list_unlock() 3199 { 3200 rw_exit(&vfslist); 3201 } 3202 3203 /* 3204 * Low level worker routines for adding entries to and removing entries from 3205 * the vfs list. 3206 */ 3207 3208 static void 3209 vfs_hash_add(struct vfs *vfsp, int insert_at_head) 3210 { 3211 int vhno; 3212 struct vfs **hp; 3213 dev_t dev; 3214 3215 ASSERT(RW_WRITE_HELD(&vfslist)); 3216 3217 dev = expldev(vfsp->vfs_fsid.val[0]); 3218 vhno = VFSHASH(getmajor(dev), getminor(dev)); 3219 3220 mutex_enter(&rvfs_list[vhno].rvfs_lock); 3221 3222 /* 3223 * Link into the hash table, inserting it at the end, so that LOFS 3224 * with the same fsid as UFS (or other) file systems will not hide the 3225 * UFS. 3226 */ 3227 if (insert_at_head) { 3228 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head; 3229 rvfs_list[vhno].rvfs_head = vfsp; 3230 } else { 3231 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL; 3232 hp = &(*hp)->vfs_hash) 3233 continue; 3234 /* 3235 * hp now contains the address of the pointer to update 3236 * to effect the insertion. 
3237 */ 3238 vfsp->vfs_hash = NULL; 3239 *hp = vfsp; 3240 } 3241 3242 rvfs_list[vhno].rvfs_len++; 3243 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3244 } 3245 3246 3247 static void 3248 vfs_hash_remove(struct vfs *vfsp) 3249 { 3250 int vhno; 3251 struct vfs *tvfsp; 3252 dev_t dev; 3253 3254 ASSERT(RW_WRITE_HELD(&vfslist)); 3255 3256 dev = expldev(vfsp->vfs_fsid.val[0]); 3257 vhno = VFSHASH(getmajor(dev), getminor(dev)); 3258 3259 mutex_enter(&rvfs_list[vhno].rvfs_lock); 3260 3261 /* 3262 * Remove from hash. 3263 */ 3264 if (rvfs_list[vhno].rvfs_head == vfsp) { 3265 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash; 3266 rvfs_list[vhno].rvfs_len--; 3267 goto foundit; 3268 } 3269 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL; 3270 tvfsp = tvfsp->vfs_hash) { 3271 if (tvfsp->vfs_hash == vfsp) { 3272 tvfsp->vfs_hash = vfsp->vfs_hash; 3273 rvfs_list[vhno].rvfs_len--; 3274 goto foundit; 3275 } 3276 } 3277 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash"); 3278 3279 foundit: 3280 3281 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3282 } 3283 3284 3285 void 3286 vfs_list_add(struct vfs *vfsp) 3287 { 3288 zone_t *zone; 3289 3290 /* 3291 * The zone that owns the mount is the one that performed the mount. 3292 * Note that this isn't necessarily the same as the zone mounted into. 3293 * The corresponding zone_rele() will be done when the vfs_t is 3294 * being free'd. 3295 */ 3296 vfsp->vfs_zone = curproc->p_zone; 3297 zone_hold(vfsp->vfs_zone); 3298 3299 /* 3300 * Find the zone mounted into, and put this mount on its vfs list. 3301 */ 3302 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3303 ASSERT(zone != NULL); 3304 /* 3305 * Special casing for the root vfs. This structure is allocated 3306 * statically and hooked onto rootvfs at link time. During the 3307 * vfs_mountroot call at system startup time, the root file system's 3308 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct 3309 * as argument. 
The code below must detect and handle this special 3310 * case. The only apparent justification for this special casing is 3311 * to ensure that the root file system appears at the head of the 3312 * list. 3313 * 3314 * XXX: I'm assuming that it's ok to do normal list locking when 3315 * adding the entry for the root file system (this used to be 3316 * done with no locks held). 3317 */ 3318 vfs_list_lock(); 3319 /* 3320 * Link into the vfs list proper. 3321 */ 3322 if (vfsp == &root) { 3323 /* 3324 * Assert: This vfs is already on the list as its first entry. 3325 * Thus, there's nothing to do. 3326 */ 3327 ASSERT(rootvfs == vfsp); 3328 /* 3329 * Add it to the head of the global zone's vfslist. 3330 */ 3331 ASSERT(zone == global_zone); 3332 ASSERT(zone->zone_vfslist == NULL); 3333 zone->zone_vfslist = vfsp; 3334 } else { 3335 /* 3336 * Link to end of list using vfs_prev (as rootvfs is now a 3337 * doubly linked circular list) so list is in mount order for 3338 * mnttab use. 3339 */ 3340 rootvfs->vfs_prev->vfs_next = vfsp; 3341 vfsp->vfs_prev = rootvfs->vfs_prev; 3342 rootvfs->vfs_prev = vfsp; 3343 vfsp->vfs_next = rootvfs; 3344 3345 /* 3346 * Do it again for the zone-private list (which may be NULL). 3347 */ 3348 if (zone->zone_vfslist == NULL) { 3349 ASSERT(zone != global_zone); 3350 zone->zone_vfslist = vfsp; 3351 } else { 3352 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp; 3353 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev; 3354 zone->zone_vfslist->vfs_zone_prev = vfsp; 3355 vfsp->vfs_zone_next = zone->zone_vfslist; 3356 } 3357 } 3358 3359 /* 3360 * Link into the hash table, inserting it at the end, so that LOFS 3361 * with the same fsid as UFS (or other) file systems will not hide 3362 * the UFS. 
3363 */ 3364 vfs_hash_add(vfsp, 0); 3365 3366 /* 3367 * update the mnttab modification time 3368 */ 3369 vfs_mnttab_modtimeupd(); 3370 vfs_list_unlock(); 3371 zone_rele(zone); 3372 } 3373 3374 void 3375 vfs_list_remove(struct vfs *vfsp) 3376 { 3377 zone_t *zone; 3378 3379 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3380 ASSERT(zone != NULL); 3381 /* 3382 * Callers are responsible for preventing attempts to unmount the 3383 * root. 3384 */ 3385 ASSERT(vfsp != rootvfs); 3386 3387 vfs_list_lock(); 3388 3389 /* 3390 * Remove from hash. 3391 */ 3392 vfs_hash_remove(vfsp); 3393 3394 /* 3395 * Remove from vfs list. 3396 */ 3397 vfsp->vfs_prev->vfs_next = vfsp->vfs_next; 3398 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev; 3399 vfsp->vfs_next = vfsp->vfs_prev = NULL; 3400 3401 /* 3402 * Remove from zone-specific vfs list. 3403 */ 3404 if (zone->zone_vfslist == vfsp) 3405 zone->zone_vfslist = vfsp->vfs_zone_next; 3406 3407 if (vfsp->vfs_zone_next == vfsp) { 3408 ASSERT(vfsp->vfs_zone_prev == vfsp); 3409 ASSERT(zone->zone_vfslist == vfsp); 3410 zone->zone_vfslist = NULL; 3411 } 3412 3413 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next; 3414 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev; 3415 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL; 3416 3417 /* 3418 * update the mnttab modification time 3419 */ 3420 vfs_mnttab_modtimeupd(); 3421 vfs_list_unlock(); 3422 zone_rele(zone); 3423 } 3424 3425 struct vfs * 3426 getvfs(fsid_t *fsid) 3427 { 3428 struct vfs *vfsp; 3429 int val0 = fsid->val[0]; 3430 int val1 = fsid->val[1]; 3431 dev_t dev = expldev(val0); 3432 int vhno = VFSHASH(getmajor(dev), getminor(dev)); 3433 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock; 3434 3435 mutex_enter(hmp); 3436 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) { 3437 if (vfsp->vfs_fsid.val[0] == val0 && 3438 vfsp->vfs_fsid.val[1] == val1) { 3439 VFS_HOLD(vfsp); 3440 mutex_exit(hmp); 3441 return (vfsp); 3442 } 3443 } 3444 mutex_exit(hmp); 3445 return 
(NULL); 3446 } 3447 3448 /* 3449 * Search the vfs mount in progress list for a specified device/vfs entry. 3450 * Returns 0 if the first entry in the list that the device matches has the 3451 * given vfs pointer as well. If the device matches but a different vfs 3452 * pointer is encountered in the list before the given vfs pointer then 3453 * a 1 is returned. 3454 */ 3455 3456 int 3457 vfs_devmounting(dev_t dev, struct vfs *vfsp) 3458 { 3459 int retval = 0; 3460 struct ipmnt *mipp; 3461 3462 mutex_enter(&vfs_miplist_mutex); 3463 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) { 3464 if (mipp->mip_dev == dev) { 3465 if (mipp->mip_vfsp != vfsp) 3466 retval = 1; 3467 break; 3468 } 3469 } 3470 mutex_exit(&vfs_miplist_mutex); 3471 return (retval); 3472 } 3473 3474 /* 3475 * Search the vfs list for a specified device. Returns 1, if entry is found 3476 * or 0 if no suitable entry is found. 3477 */ 3478 3479 int 3480 vfs_devismounted(dev_t dev) 3481 { 3482 struct vfs *vfsp; 3483 int found; 3484 3485 vfs_list_read_lock(); 3486 vfsp = rootvfs; 3487 found = 0; 3488 do { 3489 if (vfsp->vfs_dev == dev) { 3490 found = 1; 3491 break; 3492 } 3493 vfsp = vfsp->vfs_next; 3494 } while (vfsp != rootvfs); 3495 3496 vfs_list_unlock(); 3497 return (found); 3498 } 3499 3500 /* 3501 * Search the vfs list for a specified device. Returns a pointer to it 3502 * or NULL if no suitable entry is found. The caller of this routine 3503 * is responsible for releasing the returned vfs pointer. 3504 */ 3505 struct vfs * 3506 vfs_dev2vfsp(dev_t dev) 3507 { 3508 struct vfs *vfsp; 3509 int found; 3510 3511 vfs_list_read_lock(); 3512 vfsp = rootvfs; 3513 found = 0; 3514 do { 3515 /* 3516 * The following could be made more efficient by making 3517 * the entire loop use vfs_zone_next if the call is from 3518 * a zone. The only callers, however, ustat(2) and 3519 * umount2(2), don't seem to justify the added 3520 * complexity at present. 
3521 */ 3522 if (vfsp->vfs_dev == dev && 3523 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt), 3524 curproc->p_zone)) { 3525 VFS_HOLD(vfsp); 3526 found = 1; 3527 break; 3528 } 3529 vfsp = vfsp->vfs_next; 3530 } while (vfsp != rootvfs); 3531 vfs_list_unlock(); 3532 return (found ? vfsp: NULL); 3533 } 3534 3535 /* 3536 * Search the vfs list for a specified mntpoint. Returns a pointer to it 3537 * or NULL if no suitable entry is found. The caller of this routine 3538 * is responsible for releasing the returned vfs pointer. 3539 * 3540 * Note that if multiple mntpoints match, the last one matching is 3541 * returned in an attempt to return the "top" mount when overlay 3542 * mounts are covering the same mount point. This is accomplished by starting 3543 * at the end of the list and working our way backwards, stopping at the first 3544 * matching mount. 3545 */ 3546 struct vfs * 3547 vfs_mntpoint2vfsp(const char *mp) 3548 { 3549 struct vfs *vfsp; 3550 struct vfs *retvfsp = NULL; 3551 zone_t *zone = curproc->p_zone; 3552 struct vfs *list; 3553 3554 vfs_list_read_lock(); 3555 if (getzoneid() == GLOBAL_ZONEID) { 3556 /* 3557 * The global zone may see filesystems in any zone. 3558 */ 3559 vfsp = rootvfs->vfs_prev; 3560 do { 3561 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) { 3562 retvfsp = vfsp; 3563 break; 3564 } 3565 vfsp = vfsp->vfs_prev; 3566 } while (vfsp != rootvfs->vfs_prev); 3567 } else if ((list = zone->zone_vfslist) != NULL) { 3568 const char *mntpt; 3569 3570 vfsp = list->vfs_zone_prev; 3571 do { 3572 mntpt = refstr_value(vfsp->vfs_mntpt); 3573 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); 3574 if (strcmp(mntpt, mp) == 0) { 3575 retvfsp = vfsp; 3576 break; 3577 } 3578 vfsp = vfsp->vfs_zone_prev; 3579 } while (vfsp != list->vfs_zone_prev); 3580 } 3581 if (retvfsp) 3582 VFS_HOLD(retvfsp); 3583 vfs_list_unlock(); 3584 return (retvfsp); 3585 } 3586 3587 /* 3588 * Search the vfs list for a specified vfsops. 3589 * if vfs entry is found then return 1, else 0. 
3590 */ 3591 int 3592 vfs_opsinuse(vfsops_t *ops) 3593 { 3594 struct vfs *vfsp; 3595 int found; 3596 3597 vfs_list_read_lock(); 3598 vfsp = rootvfs; 3599 found = 0; 3600 do { 3601 if (vfs_getops(vfsp) == ops) { 3602 found = 1; 3603 break; 3604 } 3605 vfsp = vfsp->vfs_next; 3606 } while (vfsp != rootvfs); 3607 vfs_list_unlock(); 3608 return (found); 3609 } 3610 3611 /* 3612 * Allocate an entry in vfssw for a file system type 3613 */ 3614 struct vfssw * 3615 allocate_vfssw(char *type) 3616 { 3617 struct vfssw *vswp; 3618 3619 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) { 3620 /* 3621 * The vfssw table uses the empty string to identify an 3622 * available entry; we cannot add any type which has 3623 * a leading NUL. The string length is limited to 3624 * the size of the st_fstype array in struct stat. 3625 */ 3626 return (NULL); 3627 } 3628 3629 ASSERT(VFSSW_WRITE_LOCKED()); 3630 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) 3631 if (!ALLOCATED_VFSSW(vswp)) { 3632 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP); 3633 (void) strcpy(vswp->vsw_name, type); 3634 ASSERT(vswp->vsw_count == 0); 3635 vswp->vsw_count = 1; 3636 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL); 3637 return (vswp); 3638 } 3639 return (NULL); 3640 } 3641 3642 /* 3643 * Impose additional layer of translation between vfstype names 3644 * and module names in the filesystem. 3645 */ 3646 static char * 3647 vfs_to_modname(char *vfstype) 3648 { 3649 if (strcmp(vfstype, "proc") == 0) { 3650 vfstype = "procfs"; 3651 } else if (strcmp(vfstype, "fd") == 0) { 3652 vfstype = "fdfs"; 3653 } else if (strncmp(vfstype, "nfs", 3) == 0) { 3654 vfstype = "nfs"; 3655 } 3656 3657 return (vfstype); 3658 } 3659 3660 /* 3661 * Find a vfssw entry given a file system type name. 3662 * Try to autoload the filesystem if it's not found. 3663 * If it's installed, return the vfssw locked to prevent unloading. 
3664 */ 3665 struct vfssw * 3666 vfs_getvfssw(char *type) 3667 { 3668 struct vfssw *vswp; 3669 char *modname; 3670 3671 RLOCK_VFSSW(); 3672 vswp = vfs_getvfsswbyname(type); 3673 modname = vfs_to_modname(type); 3674 3675 if (rootdir == NULL) { 3676 /* 3677 * If we haven't yet loaded the root file system, then our 3678 * _init won't be called until later. Allocate vfssw entry, 3679 * because mod_installfs won't be called. 3680 */ 3681 if (vswp == NULL) { 3682 RUNLOCK_VFSSW(); 3683 WLOCK_VFSSW(); 3684 if ((vswp = vfs_getvfsswbyname(type)) == NULL) { 3685 if ((vswp = allocate_vfssw(type)) == NULL) { 3686 WUNLOCK_VFSSW(); 3687 return (NULL); 3688 } 3689 } 3690 WUNLOCK_VFSSW(); 3691 RLOCK_VFSSW(); 3692 } 3693 if (!VFS_INSTALLED(vswp)) { 3694 RUNLOCK_VFSSW(); 3695 (void) modloadonly("fs", modname); 3696 } else 3697 RUNLOCK_VFSSW(); 3698 return (vswp); 3699 } 3700 3701 /* 3702 * Try to load the filesystem. Before calling modload(), we drop 3703 * our lock on the VFS switch table, and pick it up after the 3704 * module is loaded. However, there is a potential race: the 3705 * module could be unloaded after the call to modload() completes 3706 * but before we pick up the lock and drive on. Therefore, 3707 * we keep reloading the module until we've loaded the module 3708 * _and_ we have the lock on the VFS switch table. 3709 */ 3710 while (vswp == NULL || !VFS_INSTALLED(vswp)) { 3711 RUNLOCK_VFSSW(); 3712 if (modload("fs", modname) == -1) 3713 return (NULL); 3714 RLOCK_VFSSW(); 3715 if (vswp == NULL) 3716 if ((vswp = vfs_getvfsswbyname(type)) == NULL) 3717 break; 3718 } 3719 RUNLOCK_VFSSW(); 3720 3721 return (vswp); 3722 } 3723 3724 /* 3725 * Find a vfssw entry given a file system type name. 
3726 */ 3727 struct vfssw * 3728 vfs_getvfsswbyname(char *type) 3729 { 3730 struct vfssw *vswp; 3731 3732 ASSERT(VFSSW_LOCKED()); 3733 if (type == NULL || *type == '\0') 3734 return (NULL); 3735 3736 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3737 if (strcmp(type, vswp->vsw_name) == 0) { 3738 vfs_refvfssw(vswp); 3739 return (vswp); 3740 } 3741 } 3742 3743 return (NULL); 3744 } 3745 3746 /* 3747 * Find a vfssw entry given a set of vfsops. 3748 */ 3749 struct vfssw * 3750 vfs_getvfsswbyvfsops(vfsops_t *vfsops) 3751 { 3752 struct vfssw *vswp; 3753 3754 RLOCK_VFSSW(); 3755 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3756 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) { 3757 vfs_refvfssw(vswp); 3758 RUNLOCK_VFSSW(); 3759 return (vswp); 3760 } 3761 } 3762 RUNLOCK_VFSSW(); 3763 3764 return (NULL); 3765 } 3766 3767 /* 3768 * Reference a vfssw entry. 3769 */ 3770 void 3771 vfs_refvfssw(struct vfssw *vswp) 3772 { 3773 3774 mutex_enter(&vswp->vsw_lock); 3775 vswp->vsw_count++; 3776 mutex_exit(&vswp->vsw_lock); 3777 } 3778 3779 /* 3780 * Unreference a vfssw entry. 3781 */ 3782 void 3783 vfs_unrefvfssw(struct vfssw *vswp) 3784 { 3785 3786 mutex_enter(&vswp->vsw_lock); 3787 vswp->vsw_count--; 3788 mutex_exit(&vswp->vsw_lock); 3789 } 3790 3791 int sync_timeout = 30; /* timeout for syncing a page during panic */ 3792 int sync_timeleft; /* portion of sync_timeout remaining */ 3793 3794 static int sync_retries = 20; /* number of retries when not making progress */ 3795 static int sync_triesleft; /* portion of sync_retries remaining */ 3796 3797 static pgcnt_t old_pgcnt, new_pgcnt; 3798 static int new_bufcnt, old_bufcnt; 3799 3800 /* 3801 * Sync all of the mounted filesystems, and then wait for the actual i/o to 3802 * complete. We wait by counting the number of dirty pages and buffers, 3803 * pushing them out using bio_busy() and page_busy(), and then counting again. 
 * This routine is used by both the uadmin A_SHUTDOWN code and the SYNC
 * phase of the panic code (see comments in panic.c).  It should only
 * be used after some higher-level mechanism has quiesced the system so that
 * new writes are not being initiated while we are waiting for completion.
 *
 * To ensure finite running time, our algorithm uses two timeout mechanisms:
 * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
 * sync_triesleft (a progress counter used by the vfs_syncall() loop below).
 * Together these ensure that syncing terminates even if our i/o paths are
 * stuck.  The counters are declared above so they can be found easily in the
 * debugger.
 *
 * The sync_timeleft counter is reset by bio_busy() and page_busy() using the
 * vfs_syncprogress() subroutine whenever we make progress through the lists of
 * pages and buffers.  It is decremented and expired by the deadman() cyclic.
 * When vfs_syncall() decides it is done, we disable the deadman() counter by
 * setting sync_timeleft to zero.  This timer guards against vfs_syncall()
 * deadlocking or hanging inside of a broken filesystem or driver routine.
 *
 * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
 * sync_retries consecutive calls to bio_busy() and page_busy() without
 * decreasing either the number of dirty buffers or dirty pages below the
 * lowest count we have seen so far, we give up and return from vfs_syncall().
 *
 * Each loop iteration ends with a call to delay() one second to allow time for
 * i/o completion and to permit the user time to read our progress messages.
3829 */ 3830 void 3831 vfs_syncall(void) 3832 { 3833 if (rootdir == NULL && !modrootloaded) 3834 return; /* panic during boot - no filesystems yet */ 3835 3836 printf("syncing file systems..."); 3837 vfs_syncprogress(); 3838 sync(); 3839 3840 vfs_syncprogress(); 3841 sync_triesleft = sync_retries; 3842 3843 old_bufcnt = new_bufcnt = INT_MAX; 3844 old_pgcnt = new_pgcnt = ULONG_MAX; 3845 3846 while (sync_triesleft > 0) { 3847 old_bufcnt = MIN(old_bufcnt, new_bufcnt); 3848 old_pgcnt = MIN(old_pgcnt, new_pgcnt); 3849 3850 new_bufcnt = bio_busy(B_TRUE); 3851 new_pgcnt = page_busy(B_TRUE); 3852 vfs_syncprogress(); 3853 3854 if (new_bufcnt == 0 && new_pgcnt == 0) 3855 break; 3856 3857 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt) 3858 sync_triesleft = sync_retries; 3859 else 3860 sync_triesleft--; 3861 3862 if (new_bufcnt) 3863 printf(" [%d]", new_bufcnt); 3864 if (new_pgcnt) 3865 printf(" %lu", new_pgcnt); 3866 3867 delay(hz); 3868 } 3869 3870 if (new_bufcnt != 0 || new_pgcnt != 0) 3871 printf(" done (not all i/o completed)\n"); 3872 else 3873 printf(" done\n"); 3874 3875 sync_timeleft = 0; 3876 delay(hz); 3877 } 3878 3879 /* 3880 * If we are in the middle of the sync phase of panic, reset sync_timeleft to 3881 * sync_timeout to indicate that we are making progress and the deadman() 3882 * omnipresent cyclic should not yet time us out. Note that it is safe to 3883 * store to sync_timeleft here since the deadman() is firing at high-level 3884 * on top of us. If we are racing with the deadman(), either the deadman() 3885 * will decrement the old value and then we will reset it, or we will 3886 * reset it and then the deadman() will immediately decrement it. In either 3887 * case, correct behavior results. 3888 */ 3889 void 3890 vfs_syncprogress(void) 3891 { 3892 if (panicstr) 3893 sync_timeleft = sync_timeout; 3894 } 3895 3896 /* 3897 * Map VFS flags to statvfs flags. These shouldn't really be separate 3898 * flags at all. 
3899 */ 3900 uint_t 3901 vf_to_stf(uint_t vf) 3902 { 3903 uint_t stf = 0; 3904 3905 if (vf & VFS_RDONLY) 3906 stf |= ST_RDONLY; 3907 if (vf & VFS_NOSETUID) 3908 stf |= ST_NOSUID; 3909 if (vf & VFS_NOTRUNC) 3910 stf |= ST_NOTRUNC; 3911 3912 return (stf); 3913 } 3914 3915 /* 3916 * Entries for (illegal) fstype 0. 3917 */ 3918 /* ARGSUSED */ 3919 int 3920 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr) 3921 { 3922 cmn_err(CE_PANIC, "stray vfs operation"); 3923 return (0); 3924 } 3925 3926 /* 3927 * Entries for (illegal) fstype 0. 3928 */ 3929 int 3930 vfsstray(void) 3931 { 3932 cmn_err(CE_PANIC, "stray vfs operation"); 3933 return (0); 3934 } 3935 3936 /* 3937 * Support for dealing with forced UFS unmount and its interaction with 3938 * LOFS. Could be used by any filesystem. 3939 * See bug 1203132. 3940 */ 3941 int 3942 vfs_EIO(void) 3943 { 3944 return (EIO); 3945 } 3946 3947 /* 3948 * We've gotta define the op for sync separately, since the compiler gets 3949 * confused if we mix and match ANSI and normal style prototypes when 3950 * a "short" argument is present and spits out a warning. 
3951 */ 3952 /*ARGSUSED*/ 3953 int 3954 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr) 3955 { 3956 return (EIO); 3957 } 3958 3959 vfs_t EIO_vfs; 3960 vfsops_t *EIO_vfsops; 3961 3962 /* 3963 * Called from startup() to initialize all loaded vfs's 3964 */ 3965 void 3966 vfsinit(void) 3967 { 3968 struct vfssw *vswp; 3969 int error; 3970 extern int vopstats_enabled; 3971 extern void vopstats_startup(); 3972 3973 static const fs_operation_def_t EIO_vfsops_template[] = { 3974 VFSNAME_MOUNT, { .error = vfs_EIO }, 3975 VFSNAME_UNMOUNT, { .error = vfs_EIO }, 3976 VFSNAME_ROOT, { .error = vfs_EIO }, 3977 VFSNAME_STATVFS, { .error = vfs_EIO }, 3978 VFSNAME_SYNC, { .vfs_sync = vfs_EIO_sync }, 3979 VFSNAME_VGET, { .error = vfs_EIO }, 3980 VFSNAME_MOUNTROOT, { .error = vfs_EIO }, 3981 VFSNAME_FREEVFS, { .error = vfs_EIO }, 3982 VFSNAME_VNSTATE, { .error = vfs_EIO }, 3983 NULL, NULL 3984 }; 3985 3986 static const fs_operation_def_t stray_vfsops_template[] = { 3987 VFSNAME_MOUNT, { .error = vfsstray }, 3988 VFSNAME_UNMOUNT, { .error = vfsstray }, 3989 VFSNAME_ROOT, { .error = vfsstray }, 3990 VFSNAME_STATVFS, { .error = vfsstray }, 3991 VFSNAME_SYNC, { .vfs_sync = vfsstray_sync }, 3992 VFSNAME_VGET, { .error = vfsstray }, 3993 VFSNAME_MOUNTROOT, { .error = vfsstray }, 3994 VFSNAME_FREEVFS, { .error = vfsstray }, 3995 VFSNAME_VNSTATE, { .error = vfsstray }, 3996 NULL, NULL 3997 }; 3998 3999 /* Initialize the vnode cache (file systems may use it during init). */ 4000 4001 vn_create_cache(); 4002 4003 /* Setup event monitor framework */ 4004 4005 fem_init(); 4006 4007 /* Initialize the dummy stray file system type. */ 4008 error = vfs_setfsops(0, stray_vfsops_template, NULL); 4009 4010 /* Initialize the dummy EIO file system. 
*/ 4011 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops); 4012 if (error != 0) { 4013 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template"); 4014 /* Shouldn't happen, but not bad enough to panic */ 4015 } 4016 4017 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL); 4018 4019 /* 4020 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup 4021 * on this vfs can immediately notice it's invalid. 4022 */ 4023 EIO_vfs.vfs_flag |= VFS_UNMOUNTED; 4024 4025 /* 4026 * Call the init routines of non-loadable filesystems only. 4027 * Filesystems which are loaded as separate modules will be 4028 * initialized by the module loading code instead. 4029 */ 4030 4031 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 4032 RLOCK_VFSSW(); 4033 if (vswp->vsw_init != NULL) 4034 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name); 4035 RUNLOCK_VFSSW(); 4036 } 4037 4038 vopstats_startup(); 4039 4040 if (vopstats_enabled) { 4041 /* EIO_vfs can collect stats, but we don't retrieve them */ 4042 initialize_vopstats(&EIO_vfs.vfs_vopstats); 4043 EIO_vfs.vfs_fstypevsp = NULL; 4044 EIO_vfs.vfs_vskap = NULL; 4045 EIO_vfs.vfs_flag |= VFS_STATS; 4046 } 4047 } 4048 4049 /* 4050 * Increments the vfs reference count by one atomically. 4051 */ 4052 void 4053 vfs_hold(vfs_t *vfsp) 4054 { 4055 atomic_add_32(&vfsp->vfs_count, 1); 4056 ASSERT(vfsp->vfs_count != 0); 4057 } 4058 4059 /* 4060 * Decrements the vfs reference count by one atomically. When 4061 * vfs reference count becomes zero, it calls the file system 4062 * specific vfs_freevfs() to free up the resources. 
4063 */ 4064 void 4065 vfs_rele(vfs_t *vfsp) 4066 { 4067 ASSERT(vfsp->vfs_count != 0); 4068 if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) { 4069 VFS_FREEVFS(vfsp); 4070 if (vfsp->vfs_zone) 4071 zone_rele(vfsp->vfs_zone); 4072 vfs_freemnttab(vfsp); 4073 if (vfsp->vfs_implp) 4074 vfsimpl_teardown(vfsp); 4075 sema_destroy(&vfsp->vfs_reflock); 4076 kmem_free(vfsp, sizeof (*vfsp)); 4077 } 4078 } 4079 4080 /* 4081 * Generic operations vector support. 4082 * 4083 * This is used to build operations vectors for both the vfs and vnode. 4084 * It's normally called only when a file system is loaded. 4085 * 4086 * There are many possible algorithms for this, including the following: 4087 * 4088 * (1) scan the list of known operations; for each, see if the file system 4089 * includes an entry for it, and fill it in as appropriate. 4090 * 4091 * (2) set up defaults for all known operations. scan the list of ops 4092 * supplied by the file system; for each which is both supplied and 4093 * known, fill it in. 4094 * 4095 * (3) sort the lists of known ops & supplied ops; scan the list, filling 4096 * in entries as we go. 4097 * 4098 * we choose (1) for simplicity, and because performance isn't critical here. 4099 * note that (2) could be sped up using a precomputed hash table on known ops. 4100 * (3) could be faster than either, but only if the lists were very large or 4101 * supplied in sorted order. 4102 * 4103 */ 4104 4105 int 4106 fs_build_vector(void *vector, int *unused_ops, 4107 const fs_operation_trans_def_t *translation, 4108 const fs_operation_def_t *operations) 4109 { 4110 int i, num_trans, num_ops, used; 4111 4112 /* 4113 * Count the number of translations and the number of supplied 4114 * operations. 
4115 */ 4116 4117 { 4118 const fs_operation_trans_def_t *p; 4119 4120 for (num_trans = 0, p = translation; 4121 p->name != NULL; 4122 num_trans++, p++) 4123 ; 4124 } 4125 4126 { 4127 const fs_operation_def_t *p; 4128 4129 for (num_ops = 0, p = operations; 4130 p->name != NULL; 4131 num_ops++, p++) 4132 ; 4133 } 4134 4135 /* Walk through each operation known to our caller. There will be */ 4136 /* one entry in the supplied "translation table" for each. */ 4137 4138 used = 0; 4139 4140 for (i = 0; i < num_trans; i++) { 4141 int j, found; 4142 char *curname; 4143 fs_generic_func_p result; 4144 fs_generic_func_p *location; 4145 4146 curname = translation[i].name; 4147 4148 /* Look for a matching operation in the list supplied by the */ 4149 /* file system. */ 4150 4151 found = 0; 4152 4153 for (j = 0; j < num_ops; j++) { 4154 if (strcmp(operations[j].name, curname) == 0) { 4155 used++; 4156 found = 1; 4157 break; 4158 } 4159 } 4160 4161 /* 4162 * If the file system is using a "placeholder" for default 4163 * or error functions, grab the appropriate function out of 4164 * the translation table. If the file system didn't supply 4165 * this operation at all, use the default function. 4166 */ 4167 4168 if (found) { 4169 result = operations[j].func.fs_generic; 4170 if (result == fs_default) { 4171 result = translation[i].defaultFunc; 4172 } else if (result == fs_error) { 4173 result = translation[i].errorFunc; 4174 } else if (result == NULL) { 4175 /* Null values are PROHIBITED */ 4176 return (EINVAL); 4177 } 4178 } else { 4179 result = translation[i].defaultFunc; 4180 } 4181 4182 /* Now store the function into the operations vector. */ 4183 4184 location = (fs_generic_func_p *) 4185 (((char *)vector) + translation[i].offset); 4186 4187 *location = result; 4188 } 4189 4190 *unused_ops = num_ops - used; 4191 4192 return (0); 4193 } 4194 4195 /* Placeholder functions, should never be called. 
*/ 4196 4197 int 4198 fs_error(void) 4199 { 4200 cmn_err(CE_PANIC, "fs_error called"); 4201 return (0); 4202 } 4203 4204 int 4205 fs_default(void) 4206 { 4207 cmn_err(CE_PANIC, "fs_default called"); 4208 return (0); 4209 } 4210 4211 #ifdef __sparc 4212 4213 /* 4214 * Part of the implementation of booting off a mirrored root 4215 * involves a change of dev_t for the root device. To 4216 * accomplish this, first remove the existing hash table 4217 * entry for the root device, convert to the new dev_t, 4218 * then re-insert in the hash table at the head of the list. 4219 */ 4220 void 4221 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype) 4222 { 4223 vfs_list_lock(); 4224 4225 vfs_hash_remove(vfsp); 4226 4227 vfsp->vfs_dev = ndev; 4228 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype); 4229 4230 vfs_hash_add(vfsp, 1); 4231 4232 vfs_list_unlock(); 4233 } 4234 4235 #else /* x86 NEWBOOT */ 4236 4237 int 4238 rootconf() 4239 { 4240 int error; 4241 struct vfssw *vsw; 4242 extern void pm_init(); 4243 char *fstyp; 4244 4245 fstyp = getrootfs(); 4246 4247 if (error = clboot_rootconf()) 4248 return (error); 4249 4250 if (modload("fs", fstyp) == -1) 4251 panic("Cannot _init %s module", fstyp); 4252 4253 RLOCK_VFSSW(); 4254 vsw = vfs_getvfsswbyname(fstyp); 4255 RUNLOCK_VFSSW(); 4256 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0); 4257 VFS_HOLD(rootvfs); 4258 4259 /* always mount readonly first */ 4260 rootvfs->vfs_flag |= VFS_RDONLY; 4261 4262 pm_init(); 4263 4264 if (netboot) 4265 (void) strplumb(); 4266 4267 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT); 4268 vfs_unrefvfssw(vsw); 4269 rootdev = rootvfs->vfs_dev; 4270 4271 if (error) 4272 panic("cannot mount root path %s", rootfs.bo_name); 4273 return (error); 4274 } 4275 4276 /* 4277 * XXX this is called by nfs only and should probably be removed 4278 * If booted with ASKNAME, prompt on the console for a filesystem 4279 * name and return it. 
4280 */ 4281 void 4282 getfsname(char *askfor, char *name, size_t namelen) 4283 { 4284 if (boothowto & RB_ASKNAME) { 4285 printf("%s name: ", askfor); 4286 console_gets(name, namelen); 4287 } 4288 } 4289 4290 /* 4291 * If server_path exists, then we are booting a diskless 4292 * client. Otherwise, we default to ufs. Zfs should perhaps be 4293 * another property. 4294 */ 4295 static char * 4296 getrootfs(void) 4297 { 4298 extern char *strplumb_get_netdev_path(void); 4299 char *propstr = NULL; 4300 4301 /* check fstype property; it should be nfsdyn for diskless */ 4302 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4303 DDI_PROP_DONTPASS, "fstype", &propstr) 4304 == DDI_SUCCESS) { 4305 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME); 4306 ddi_prop_free(propstr); 4307 4308 /* 4309 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set, 4310 * assume the type of this root filesystem is 'zfs'. 4311 */ 4312 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4313 DDI_PROP_DONTPASS, "zfs-bootfs", &propstr) 4314 == DDI_SUCCESS) { 4315 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME); 4316 ddi_prop_free(propstr); 4317 } 4318 4319 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) 4320 return (rootfs.bo_fstype); 4321 4322 ++netboot; 4323 /* check if path to network interface is specified in bootpath */ 4324 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4325 DDI_PROP_DONTPASS, "bootpath", &propstr) 4326 == DDI_SUCCESS) { 4327 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME); 4328 ddi_prop_free(propstr); 4329 } else { 4330 /* attempt to determine netdev_path via boot_mac address */ 4331 netdev_path = strplumb_get_netdev_path(); 4332 if (netdev_path == NULL) 4333 panic("cannot find boot network interface"); 4334 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME); 4335 } 4336 return ("nfs"); 4337 } 4338 #endif 4339