1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2014, Joyent, Inc. All rights reserved. 25 * Copyright 2017 RackTop Systems. 26 * Copyright 2016 Nexenta Systems, Inc. 27 */ 28 29 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 30 /* All Rights Reserved */ 31 32 /* 33 * University Copyright- Copyright (c) 1982, 1986, 1988 34 * The Regents of the University of California 35 * All Rights Reserved 36 * 37 * University Acknowledgment- Portions of this document are derived from 38 * software developed by the University of California, Berkeley, and its 39 * contributors. 
40 */ 41 42 #include <sys/types.h> 43 #include <sys/t_lock.h> 44 #include <sys/param.h> 45 #include <sys/errno.h> 46 #include <sys/user.h> 47 #include <sys/fstyp.h> 48 #include <sys/kmem.h> 49 #include <sys/systm.h> 50 #include <sys/proc.h> 51 #include <sys/mount.h> 52 #include <sys/vfs.h> 53 #include <sys/vfs_opreg.h> 54 #include <sys/fem.h> 55 #include <sys/mntent.h> 56 #include <sys/stat.h> 57 #include <sys/statvfs.h> 58 #include <sys/statfs.h> 59 #include <sys/cred.h> 60 #include <sys/vnode.h> 61 #include <sys/rwstlock.h> 62 #include <sys/dnlc.h> 63 #include <sys/file.h> 64 #include <sys/time.h> 65 #include <sys/atomic.h> 66 #include <sys/cmn_err.h> 67 #include <sys/buf.h> 68 #include <sys/swap.h> 69 #include <sys/debug.h> 70 #include <sys/vnode.h> 71 #include <sys/modctl.h> 72 #include <sys/ddi.h> 73 #include <sys/pathname.h> 74 #include <sys/bootconf.h> 75 #include <sys/dumphdr.h> 76 #include <sys/dc_ki.h> 77 #include <sys/poll.h> 78 #include <sys/sunddi.h> 79 #include <sys/sysmacros.h> 80 #include <sys/zone.h> 81 #include <sys/policy.h> 82 #include <sys/ctfs.h> 83 #include <sys/objfs.h> 84 #include <sys/console.h> 85 #include <sys/reboot.h> 86 #include <sys/attr.h> 87 #include <sys/zio.h> 88 #include <sys/spa.h> 89 #include <sys/lofi.h> 90 #include <sys/bootprops.h> 91 #include <sys/avl.h> 92 93 #include <vm/page.h> 94 95 #include <fs/fs_subr.h> 96 /* Private interfaces to create vopstats-related data structures */ 97 extern void initialize_vopstats(vopstats_t *); 98 extern vopstats_t *get_fstype_vopstats(struct vfs *, struct vfssw *); 99 extern vsk_anchor_t *get_vskstat_anchor(struct vfs *); 100 101 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int); 102 static void vfs_setmntopt_nolock(mntopts_t *, const char *, 103 const char *, int, int); 104 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **); 105 static void vfs_freemnttab(struct vfs *); 106 static void vfs_freeopt(mntopt_t *); 107 static void 
vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *); 108 static void vfs_swapopttbl(mntopts_t *, mntopts_t *); 109 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int); 110 static void vfs_createopttbl_extend(mntopts_t *, const char *, 111 const mntopts_t *); 112 static char **vfs_copycancelopt_extend(char **const, int); 113 static void vfs_freecancelopt(char **); 114 static void getrootfs(char **, char **); 115 static int getmacpath(dev_info_t *, void *); 116 static void vfs_mnttabvp_setup(void); 117 118 struct ipmnt { 119 struct ipmnt *mip_next; 120 dev_t mip_dev; 121 struct vfs *mip_vfsp; 122 }; 123 124 static kmutex_t vfs_miplist_mutex; 125 static struct ipmnt *vfs_miplist = NULL; 126 static struct ipmnt *vfs_miplist_end = NULL; 127 128 static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */ 129 130 /* 131 * VFS global data. 132 */ 133 vnode_t *rootdir; /* pointer to root inode vnode. */ 134 vnode_t *devicesdir; /* pointer to inode of devices root */ 135 vnode_t *devdir; /* pointer to inode of dev root */ 136 137 char *server_rootpath; /* root path for diskless clients */ 138 char *server_hostname; /* hostname of diskless server */ 139 140 static struct vfs root; 141 static struct vfs devices; 142 static struct vfs dev; 143 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */ 144 avl_tree_t vfs_by_dev; /* avl tree to index mounted VFSs by dev */ 145 avl_tree_t vfs_by_mntpnt; /* avl tree to index mounted VFSs by mntpnt */ 146 uint64_t vfs_curr_mntix; /* counter to provide a unique mntix for 147 * entries in the above avl trees. 148 * protected by vfslist lock */ 149 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */ 150 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */ 151 /* must be power of 2! 
*/ 152 timespec_t vfs_mnttab_ctime; /* mnttab created time */ 153 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */ 154 char *vfs_dummyfstype = "\0"; 155 struct pollhead vfs_pollhd; /* for mnttab pollers */ 156 struct vnode *vfs_mntdummyvp; /* to fake mnttab read/write for file events */ 157 int mntfstype; /* will be set once mnt fs is mounted */ 158 159 /* 160 * Table for generic options recognized in the VFS layer and acted 161 * on at this level before parsing file system specific options. 162 * The nosuid option is stronger than any of the devices and setuid 163 * options, so those are canceled when nosuid is seen. 164 * 165 * All options which are added here need to be added to the 166 * list of standard options in usr/src/cmd/fs.d/fslib.c as well. 167 */ 168 /* 169 * VFS Mount options table 170 */ 171 static char *ro_cancel[] = { MNTOPT_RW, NULL }; 172 static char *rw_cancel[] = { MNTOPT_RO, NULL }; 173 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL }; 174 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES, 175 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL }; 176 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL }; 177 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL }; 178 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL }; 179 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL }; 180 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL }; 181 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL }; 182 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL }; 183 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL }; 184 static char *follow_cancel[] = { MNTOPT_NOFOLLOW, NULL }; 185 static char *nofollow_cancel[] = { MNTOPT_FOLLOW, NULL }; 186 187 static const mntopt_t mntopts[] = { 188 /* 189 * option name cancel options default arg flags 190 */ 191 { MNTOPT_REMOUNT, NULL, NULL, 192 MO_NODISPLAY, (void *)0 }, 193 { MNTOPT_RO, ro_cancel, NULL, 0, 194 (void *)0 }, 195 { MNTOPT_RW, rw_cancel, NULL, 0, 
196 (void *)0 }, 197 { MNTOPT_SUID, suid_cancel, NULL, 0, 198 (void *)0 }, 199 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0, 200 (void *)0 }, 201 { MNTOPT_DEVICES, devices_cancel, NULL, 0, 202 (void *)0 }, 203 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0, 204 (void *)0 }, 205 { MNTOPT_SETUID, setuid_cancel, NULL, 0, 206 (void *)0 }, 207 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0, 208 (void *)0 }, 209 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0, 210 (void *)0 }, 211 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0, 212 (void *)0 }, 213 { MNTOPT_EXEC, exec_cancel, NULL, 0, 214 (void *)0 }, 215 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0, 216 (void *)0 }, 217 { MNTOPT_FOLLOW, follow_cancel, NULL, 0, 218 (void *)0 }, 219 { MNTOPT_NOFOLLOW, nofollow_cancel, NULL, 0, 220 (void *)0 }, 221 }; 222 223 const mntopts_t vfs_mntopts = { 224 sizeof (mntopts) / sizeof (mntopt_t), 225 (mntopt_t *)&mntopts[0] 226 }; 227 228 /* 229 * File system operation dispatch functions. 230 */ 231 232 int 233 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 234 { 235 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr); 236 } 237 238 int 239 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr) 240 { 241 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr); 242 } 243 244 int 245 fsop_root(vfs_t *vfsp, vnode_t **vpp) 246 { 247 refstr_t *mntpt; 248 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp); 249 /* 250 * Make sure this root has a path. With lofs, it is possible to have 251 * a NULL mountpoint. 
252 */ 253 if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { 254 mntpt = vfs_getmntpoint(vfsp); 255 vn_setpath_str(*vpp, refstr_value(mntpt), 256 strlen(refstr_value(mntpt))); 257 refstr_rele(mntpt); 258 } 259 260 return (ret); 261 } 262 263 int 264 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp) 265 { 266 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp); 267 } 268 269 int 270 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr) 271 { 272 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr); 273 } 274 275 int 276 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 277 { 278 /* 279 * In order to handle system attribute fids in a manner 280 * transparent to the underlying fs, we embed the fid for 281 * the sysattr parent object in the sysattr fid and tack on 282 * some extra bytes that only the sysattr layer knows about. 283 * 284 * This guarantees that sysattr fids are larger than other fids 285 * for this vfs. If the vfs supports the sysattr view interface 286 * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size 287 * collision with XATTR_FIDSZ. 
288 */ 289 if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) && 290 fidp->fid_len == XATTR_FIDSZ) 291 return (xattr_dir_vget(vfsp, vpp, fidp)); 292 293 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp); 294 } 295 296 int 297 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason) 298 { 299 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason); 300 } 301 302 void 303 fsop_freefs(vfs_t *vfsp) 304 { 305 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp); 306 } 307 308 int 309 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate) 310 { 311 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate)); 312 } 313 314 int 315 fsop_sync_by_kind(int fstype, short flag, cred_t *cr) 316 { 317 ASSERT((fstype >= 0) && (fstype < nfstype)); 318 319 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype])) 320 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr); 321 else 322 return (ENOTSUP); 323 } 324 325 /* 326 * File system initialization. vfs_setfsops() must be called from a file 327 * system's init routine. 
328 */ 329 330 static int 331 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual, 332 int *unused_ops) 333 { 334 static const fs_operation_trans_def_t vfs_ops_table[] = { 335 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount), 336 fs_nosys, fs_nosys, 337 338 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount), 339 fs_nosys, fs_nosys, 340 341 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root), 342 fs_nosys, fs_nosys, 343 344 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs), 345 fs_nosys, fs_nosys, 346 347 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync), 348 (fs_generic_func_p) fs_sync, 349 (fs_generic_func_p) fs_sync, /* No errors allowed */ 350 351 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget), 352 fs_nosys, fs_nosys, 353 354 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot), 355 fs_nosys, fs_nosys, 356 357 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs), 358 (fs_generic_func_p)fs_freevfs, 359 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */ 360 361 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate), 362 (fs_generic_func_p)fs_nosys, 363 (fs_generic_func_p)fs_nosys, 364 365 NULL, 0, NULL, NULL 366 }; 367 368 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template)); 369 } 370 371 void 372 zfs_boot_init() { 373 374 if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0) 375 spa_boot_init(); 376 } 377 378 int 379 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual) 380 { 381 int error; 382 int unused_ops; 383 384 /* 385 * Verify that fstype refers to a valid fs. Note that 386 * 0 is valid since it's used to set "stray" ops. 387 */ 388 if ((fstype < 0) || (fstype >= nfstype)) 389 return (EINVAL); 390 391 if (!ALLOCATED_VFSSW(&vfssw[fstype])) 392 return (EINVAL); 393 394 /* Set up the operations vector. 
*/ 395 396 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops); 397 398 if (error != 0) 399 return (error); 400 401 vfssw[fstype].vsw_flag |= VSW_INSTALLED; 402 403 if (actual != NULL) 404 *actual = &vfssw[fstype].vsw_vfsops; 405 406 #if DEBUG 407 if (unused_ops != 0) 408 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied " 409 "but not used", vfssw[fstype].vsw_name, unused_ops); 410 #endif 411 412 return (0); 413 } 414 415 int 416 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual) 417 { 418 int error; 419 int unused_ops; 420 421 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP); 422 423 error = fs_copyfsops(template, *actual, &unused_ops); 424 if (error != 0) { 425 kmem_free(*actual, sizeof (vfsops_t)); 426 *actual = NULL; 427 return (error); 428 } 429 430 return (0); 431 } 432 433 /* 434 * Free a vfsops structure created as a result of vfs_makefsops(). 435 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use 436 * vfs_freevfsops_by_type(). 437 */ 438 void 439 vfs_freevfsops(vfsops_t *vfsops) 440 { 441 kmem_free(vfsops, sizeof (vfsops_t)); 442 } 443 444 /* 445 * Since the vfsops structure is part of the vfssw table and wasn't 446 * really allocated, we're not really freeing anything. We keep 447 * the name for consistency with vfs_freevfsops(). We do, however, 448 * need to take care of a little bookkeeping. 449 * NOTE: For a vfsops structure created by vfs_setfsops(), use 450 * vfs_freevfsops_by_type(). 451 */ 452 int 453 vfs_freevfsops_by_type(int fstype) 454 { 455 456 /* Verify that fstype refers to a loaded fs (and not fsid 0). 
*/ 457 if ((fstype <= 0) || (fstype >= nfstype)) 458 return (EINVAL); 459 460 WLOCK_VFSSW(); 461 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) { 462 WUNLOCK_VFSSW(); 463 return (EINVAL); 464 } 465 466 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED; 467 WUNLOCK_VFSSW(); 468 469 return (0); 470 } 471 472 /* Support routines used to reference vfs_op */ 473 474 /* Set the operations vector for a vfs */ 475 void 476 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops) 477 { 478 vfsops_t *op; 479 480 ASSERT(vfsp != NULL); 481 ASSERT(vfsops != NULL); 482 483 op = vfsp->vfs_op; 484 membar_consumer(); 485 if (vfsp->vfs_femhead == NULL && 486 atomic_cas_ptr(&vfsp->vfs_op, op, vfsops) == op) { 487 return; 488 } 489 fsem_setvfsops(vfsp, vfsops); 490 } 491 492 /* Retrieve the operations vector for a vfs */ 493 vfsops_t * 494 vfs_getops(vfs_t *vfsp) 495 { 496 vfsops_t *op; 497 498 ASSERT(vfsp != NULL); 499 500 op = vfsp->vfs_op; 501 membar_consumer(); 502 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) { 503 return (op); 504 } else { 505 return (fsem_getvfsops(vfsp)); 506 } 507 } 508 509 /* 510 * Returns non-zero (1) if the vfsops matches that of the vfs. 511 * Returns zero (0) if not. 512 */ 513 int 514 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops) 515 { 516 return (vfs_getops(vfsp) == vfsops); 517 } 518 519 /* 520 * Returns non-zero (1) if the file system has installed a non-default, 521 * non-error vfs_sync routine. Returns zero (0) otherwise. 522 */ 523 int 524 vfs_can_sync(vfs_t *vfsp) 525 { 526 /* vfs_sync() routine is not the default/error function */ 527 return (vfs_getops(vfsp)->vfs_sync != fs_sync); 528 } 529 530 /* 531 * Initialize a vfs structure. 
532 */ 533 void 534 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data) 535 { 536 /* Other initialization has been moved to vfs_alloc() */ 537 vfsp->vfs_count = 0; 538 vfsp->vfs_next = vfsp; 539 vfsp->vfs_prev = vfsp; 540 vfsp->vfs_zone_next = vfsp; 541 vfsp->vfs_zone_prev = vfsp; 542 vfsp->vfs_lofi_minor = 0; 543 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL); 544 vfsimpl_setup(vfsp); 545 vfsp->vfs_data = (data); 546 vfs_setops((vfsp), (op)); 547 } 548 549 /* 550 * Allocate and initialize the vfs implementation private data 551 * structure, vfs_impl_t. 552 */ 553 void 554 vfsimpl_setup(vfs_t *vfsp) 555 { 556 int i; 557 558 if (vfsp->vfs_implp != NULL) { 559 return; 560 } 561 562 vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP); 563 /* Note that these are #define'd in vfs.h */ 564 vfsp->vfs_vskap = NULL; 565 vfsp->vfs_fstypevsp = NULL; 566 567 /* Set size of counted array, then zero the array */ 568 vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1; 569 for (i = 1; i < VFS_FEATURE_MAXSZ; i++) { 570 vfsp->vfs_featureset[i] = 0; 571 } 572 } 573 574 /* 575 * Release the vfs_impl_t structure, if it exists. Some unbundled 576 * filesystems may not use the newer version of vfs and thus 577 * would not contain this implementation private data structure. 578 */ 579 void 580 vfsimpl_teardown(vfs_t *vfsp) 581 { 582 vfs_impl_t *vip = vfsp->vfs_implp; 583 584 if (vip == NULL) 585 return; 586 587 kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t)); 588 vfsp->vfs_implp = NULL; 589 } 590 591 /* 592 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs, 593 * fstatvfs, and sysfs moved to common/syscall. 594 */ 595 596 /* 597 * Update every mounted file system. We call the vfs_sync operation of 598 * each file system type, passing it a NULL vfsp to indicate that all 599 * mounted file systems of that type should be updated. 
600 */ 601 void 602 vfs_sync(int flag) 603 { 604 struct vfssw *vswp; 605 RLOCK_VFSSW(); 606 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 607 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 608 vfs_refvfssw(vswp); 609 RUNLOCK_VFSSW(); 610 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag, 611 CRED()); 612 vfs_unrefvfssw(vswp); 613 RLOCK_VFSSW(); 614 } 615 } 616 RUNLOCK_VFSSW(); 617 } 618 619 void 620 sync(void) 621 { 622 vfs_sync(0); 623 } 624 625 /* 626 * compare function for vfs_by_dev avl tree. compare dev first, then mntix 627 */ 628 static int 629 vfs_cmp_dev(const void *aa, const void *bb) 630 { 631 const vfs_t *a = aa; 632 const vfs_t *b = bb; 633 634 if (a->vfs_dev < b->vfs_dev) 635 return (-1); 636 if (a->vfs_dev > b->vfs_dev) 637 return (1); 638 if (a->vfs_mntix < b->vfs_mntix) 639 return (-1); 640 if (a->vfs_mntix > b->vfs_mntix) 641 return (1); 642 return (0); 643 } 644 645 /* 646 * compare function for vfs_by_mntpnt avl tree. compare mntpnt first, then mntix 647 */ 648 static int 649 vfs_cmp_mntpnt(const void *aa, const void *bb) 650 { 651 const vfs_t *a = aa; 652 const vfs_t *b = bb; 653 int ret; 654 655 ret = strcmp(refstr_value(a->vfs_mntpt), refstr_value(b->vfs_mntpt)); 656 if (ret < 0) 657 return (-1); 658 if (ret > 0) 659 return (1); 660 if (a->vfs_mntix < b->vfs_mntix) 661 return (-1); 662 if (a->vfs_mntix > b->vfs_mntix) 663 return (1); 664 return (0); 665 } 666 667 /* 668 * External routines. 669 */ 670 671 krwlock_t vfssw_lock; /* lock accesses to vfssw */ 672 673 /* 674 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(), 675 * but otherwise should be accessed only via vfs_list_lock() and 676 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list. 677 */ 678 static krwlock_t vfslist; 679 680 /* 681 * Mount devfs on /devices. 
This is done right after root is mounted 682 * to provide device access support for the system 683 */ 684 static void 685 vfs_mountdevices(void) 686 { 687 struct vfssw *vsw; 688 struct vnode *mvp; 689 struct mounta mounta = { /* fake mounta for devfs_mount() */ 690 NULL, 691 NULL, 692 MS_SYSSPACE, 693 NULL, 694 NULL, 695 0, 696 NULL, 697 0 698 }; 699 700 /* 701 * _init devfs module to fill in the vfssw 702 */ 703 if (modload("fs", "devfs") == -1) 704 panic("Cannot _init devfs module"); 705 706 /* 707 * Hold vfs 708 */ 709 RLOCK_VFSSW(); 710 vsw = vfs_getvfsswbyname("devfs"); 711 VFS_INIT(&devices, &vsw->vsw_vfsops, NULL); 712 VFS_HOLD(&devices); 713 714 /* 715 * Locate mount point 716 */ 717 if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 718 panic("Cannot find /devices"); 719 720 /* 721 * Perform the mount of /devices 722 */ 723 if (VFS_MOUNT(&devices, mvp, &mounta, CRED())) 724 panic("Cannot mount /devices"); 725 726 RUNLOCK_VFSSW(); 727 728 /* 729 * Set appropriate members and add to vfs list for mnttab display 730 */ 731 vfs_setresource(&devices, "/devices", 0); 732 vfs_setmntpoint(&devices, "/devices", 0); 733 734 /* 735 * Hold the root of /devices so it won't go away 736 */ 737 if (VFS_ROOT(&devices, &devicesdir)) 738 panic("vfs_mountdevices: not devices root"); 739 740 if (vfs_lock(&devices) != 0) { 741 VN_RELE(devicesdir); 742 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices"); 743 return; 744 } 745 746 if (vn_vfswlock(mvp) != 0) { 747 vfs_unlock(&devices); 748 VN_RELE(devicesdir); 749 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices"); 750 return; 751 } 752 753 vfs_add(mvp, &devices, 0); 754 vn_vfsunlock(mvp); 755 vfs_unlock(&devices); 756 VN_RELE(devicesdir); 757 } 758 759 /* 760 * mount the first instance of /dev to root and remain mounted 761 */ 762 static void 763 vfs_mountdev1(void) 764 { 765 struct vfssw *vsw; 766 struct vnode *mvp; 767 struct mounta mounta = { /* fake mounta for sdev_mount() */ 768 NULL, 769 NULL, 770 
MS_SYSSPACE | MS_OVERLAY, 771 NULL, 772 NULL, 773 0, 774 NULL, 775 0 776 }; 777 778 /* 779 * _init dev module to fill in the vfssw 780 */ 781 if (modload("fs", "dev") == -1) 782 cmn_err(CE_PANIC, "Cannot _init dev module\n"); 783 784 /* 785 * Hold vfs 786 */ 787 RLOCK_VFSSW(); 788 vsw = vfs_getvfsswbyname("dev"); 789 VFS_INIT(&dev, &vsw->vsw_vfsops, NULL); 790 VFS_HOLD(&dev); 791 792 /* 793 * Locate mount point 794 */ 795 if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 796 cmn_err(CE_PANIC, "Cannot find /dev\n"); 797 798 /* 799 * Perform the mount of /dev 800 */ 801 if (VFS_MOUNT(&dev, mvp, &mounta, CRED())) 802 cmn_err(CE_PANIC, "Cannot mount /dev 1\n"); 803 804 RUNLOCK_VFSSW(); 805 806 /* 807 * Set appropriate members and add to vfs list for mnttab display 808 */ 809 vfs_setresource(&dev, "/dev", 0); 810 vfs_setmntpoint(&dev, "/dev", 0); 811 812 /* 813 * Hold the root of /dev so it won't go away 814 */ 815 if (VFS_ROOT(&dev, &devdir)) 816 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root"); 817 818 if (vfs_lock(&dev) != 0) { 819 VN_RELE(devdir); 820 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev"); 821 return; 822 } 823 824 if (vn_vfswlock(mvp) != 0) { 825 vfs_unlock(&dev); 826 VN_RELE(devdir); 827 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev"); 828 return; 829 } 830 831 vfs_add(mvp, &dev, 0); 832 vn_vfsunlock(mvp); 833 vfs_unlock(&dev); 834 VN_RELE(devdir); 835 } 836 837 /* 838 * Mount required filesystem. This is done right after root is mounted. 
839 */ 840 static void 841 vfs_mountfs(char *module, char *spec, char *path) 842 { 843 struct vnode *mvp; 844 struct mounta mounta; 845 vfs_t *vfsp; 846 847 bzero(&mounta, sizeof (mounta)); 848 mounta.flags = MS_SYSSPACE | MS_DATA; 849 mounta.fstype = module; 850 mounta.spec = spec; 851 mounta.dir = path; 852 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) { 853 cmn_err(CE_WARN, "Cannot find %s", path); 854 return; 855 } 856 if (domount(NULL, &mounta, mvp, CRED(), &vfsp)) 857 cmn_err(CE_WARN, "Cannot mount %s", path); 858 else 859 VFS_RELE(vfsp); 860 VN_RELE(mvp); 861 } 862 863 /* 864 * vfs_mountroot is called by main() to mount the root filesystem. 865 */ 866 void 867 vfs_mountroot(void) 868 { 869 struct vnode *rvp = NULL; 870 char *path; 871 size_t plen; 872 struct vfssw *vswp; 873 proc_t *p; 874 875 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL); 876 rw_init(&vfslist, NULL, RW_DEFAULT, NULL); 877 878 /* 879 * Alloc the avl trees for quick indexing via dev and mountpoint 880 */ 881 avl_create(&vfs_by_dev, vfs_cmp_dev, sizeof(vfs_t), 882 offsetof(vfs_t, vfs_avldev)); 883 avl_create(&vfs_by_mntpnt, vfs_cmp_mntpnt, sizeof(vfs_t), 884 offsetof(vfs_t, vfs_avlmntpnt)); 885 886 /* 887 * Alloc the vfs hash bucket array and locks 888 */ 889 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP); 890 891 /* 892 * Call machine-dependent routine "rootconf" to choose a root 893 * file system type. 894 */ 895 if (rootconf()) 896 panic("vfs_mountroot: cannot mount root"); 897 /* 898 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir 899 * to point to it. These are used by lookuppn() so that it 900 * knows where to start from ('/' or '.'). 901 */ 902 vfs_setmntpoint(rootvfs, "/", 0); 903 if (VFS_ROOT(rootvfs, &rootdir)) 904 panic("vfs_mountroot: no root vnode"); 905 906 /* 907 * At this point, the process tree consists of p0 and possibly some 908 * direct children of p0. (i.e. 
there are no grandchildren) 909 * 910 * Walk through them all, setting their current directory. 911 */ 912 mutex_enter(&pidlock); 913 for (p = practive; p != NULL; p = p->p_next) { 914 ASSERT(p == &p0 || p->p_parent == &p0); 915 916 PTOU(p)->u_cdir = rootdir; 917 VN_HOLD(PTOU(p)->u_cdir); 918 PTOU(p)->u_rdir = NULL; 919 } 920 mutex_exit(&pidlock); 921 922 /* 923 * Setup the global zone's rootvp, now that it exists. 924 */ 925 global_zone->zone_rootvp = rootdir; 926 VN_HOLD(global_zone->zone_rootvp); 927 928 /* 929 * Notify the module code that it can begin using the 930 * root filesystem instead of the boot program's services. 931 */ 932 modrootloaded = 1; 933 934 /* 935 * Special handling for a ZFS root file system. 936 */ 937 zfs_boot_init(); 938 939 /* 940 * Set up mnttab information for root 941 */ 942 vfs_setresource(rootvfs, rootfs.bo_name, 0); 943 944 /* 945 * Notify cluster software that the root filesystem is available. 946 */ 947 clboot_mountroot(); 948 949 /* Now that we're all done with the root FS, set up its vopstats */ 950 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) { 951 /* Set flag for statistics collection */ 952 if (vswp->vsw_flag & VSW_STATS) { 953 initialize_vopstats(&rootvfs->vfs_vopstats); 954 rootvfs->vfs_flag |= VFS_STATS; 955 rootvfs->vfs_fstypevsp = 956 get_fstype_vopstats(rootvfs, vswp); 957 rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs); 958 } 959 vfs_unrefvfssw(vswp); 960 } 961 962 /* 963 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab, 964 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc. 
965 */ 966 vfs_mountdevices(); 967 vfs_mountdev1(); 968 969 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT); 970 vfs_mountfs("proc", "/proc", "/proc"); 971 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); 972 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); 973 vfs_mountfs("objfs", "objfs", OBJFS_ROOT); 974 vfs_mountfs("bootfs", "bootfs", "/system/boot"); 975 976 if (getzoneid() == GLOBAL_ZONEID) { 977 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab"); 978 } 979 980 if (strcmp(rootfs.bo_fstype, "zfs") != 0) { 981 /* 982 * Look up the root device via devfs so that a dv_node is 983 * created for it. The vnode is never VN_RELE()ed. 984 * We allocate more than MAXPATHLEN so that the 985 * buffer passed to i_ddi_prompath_to_devfspath() is 986 * exactly MAXPATHLEN (the function expects a buffer 987 * of that length). 988 */ 989 plen = strlen("/devices"); 990 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); 991 (void) strcpy(path, "/devices"); 992 993 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) 994 != DDI_SUCCESS || 995 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { 996 997 /* NUL terminate in case "path" has garbage */ 998 path[plen + MAXPATHLEN - 1] = '\0'; 999 #ifdef DEBUG 1000 cmn_err(CE_WARN, "!Cannot lookup root device: %s", 1001 path); 1002 #endif 1003 } 1004 kmem_free(path, plen + MAXPATHLEN); 1005 } 1006 1007 vfs_mnttabvp_setup(); 1008 } 1009 1010 /* 1011 * Check to see if our "block device" is actually a file. If so, 1012 * automatically add a lofi device, and keep track of this fact. 1013 */ 1014 static int 1015 lofi_add(const char *fsname, struct vfs *vfsp, 1016 mntopts_t *mntopts, struct mounta *uap) 1017 { 1018 int fromspace = (uap->flags & MS_SYSSPACE) ? 
1019 UIO_SYSSPACE : UIO_USERSPACE; 1020 struct lofi_ioctl *li = NULL; 1021 struct vnode *vp = NULL; 1022 struct pathname pn = { NULL }; 1023 ldi_ident_t ldi_id; 1024 ldi_handle_t ldi_hdl; 1025 vfssw_t *vfssw; 1026 int minor; 1027 int err = 0; 1028 1029 if ((vfssw = vfs_getvfssw(fsname)) == NULL) 1030 return (0); 1031 1032 if (!(vfssw->vsw_flag & VSW_CANLOFI)) { 1033 vfs_unrefvfssw(vfssw); 1034 return (0); 1035 } 1036 1037 vfs_unrefvfssw(vfssw); 1038 vfssw = NULL; 1039 1040 if (pn_get(uap->spec, fromspace, &pn) != 0) 1041 return (0); 1042 1043 if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0) 1044 goto out; 1045 1046 if (vp->v_type != VREG) 1047 goto out; 1048 1049 /* OK, this is a lofi mount. */ 1050 1051 if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) || 1052 vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) || 1053 vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) || 1054 vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) { 1055 err = EINVAL; 1056 goto out; 1057 } 1058 1059 ldi_id = ldi_ident_from_anon(); 1060 li = kmem_zalloc(sizeof (*li), KM_SLEEP); 1061 (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN); 1062 1063 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred, 1064 &ldi_hdl, ldi_id); 1065 1066 if (err) 1067 goto out2; 1068 1069 err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li, 1070 FREAD | FWRITE | FKIOCTL, kcred, &minor); 1071 1072 (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred); 1073 1074 if (!err) 1075 vfsp->vfs_lofi_minor = minor; 1076 1077 out2: 1078 ldi_ident_release(ldi_id); 1079 out: 1080 if (li != NULL) 1081 kmem_free(li, sizeof (*li)); 1082 if (vp != NULL) 1083 VN_RELE(vp); 1084 pn_free(&pn); 1085 return (err); 1086 } 1087 1088 static void 1089 lofi_remove(struct vfs *vfsp) 1090 { 1091 struct lofi_ioctl *li = NULL; 1092 ldi_ident_t ldi_id; 1093 ldi_handle_t ldi_hdl; 1094 int err; 1095 1096 if (vfsp->vfs_lofi_minor == 0) 1097 return; 1098 1099 ldi_id = ldi_ident_from_anon(); 1100 1101 li = kmem_zalloc(sizeof 
(*li), KM_SLEEP); 1102 li->li_minor = vfsp->vfs_lofi_minor; 1103 li->li_cleanup = B_TRUE; 1104 1105 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred, 1106 &ldi_hdl, ldi_id); 1107 1108 if (err) 1109 goto out; 1110 1111 err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li, 1112 FREAD | FWRITE | FKIOCTL, kcred, NULL); 1113 1114 (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred); 1115 1116 if (!err) 1117 vfsp->vfs_lofi_minor = 0; 1118 1119 out: 1120 ldi_ident_release(ldi_id); 1121 if (li != NULL) 1122 kmem_free(li, sizeof (*li)); 1123 } 1124 1125 /* 1126 * Common mount code. Called from the system call entry point, from autofs, 1127 * nfsv4 trigger mounts, and from pxfs. 1128 * 1129 * Takes the effective file system type, mount arguments, the mount point 1130 * vnode, flags specifying whether the mount is a remount and whether it 1131 * should be entered into the vfs list, and credentials. Fills in its vfspp 1132 * parameter with the mounted file system instance's vfs. 1133 * 1134 * Note that the effective file system type is specified as a string. It may 1135 * be null, in which case it's determined from the mount arguments, and may 1136 * differ from the type specified in the mount arguments; this is a hook to 1137 * allow interposition when instantiating file system instances. 1138 * 1139 * The caller is responsible for releasing its own hold on the mount point 1140 * vp (this routine does its own hold when necessary). 1141 * Also note that for remounts, the mount point vp should be the vnode for 1142 * the root of the file system rather than the vnode that the file system 1143 * is mounted on top of. 
1144 */ 1145 int 1146 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, 1147 struct vfs **vfspp) 1148 { 1149 struct vfssw *vswp; 1150 vfsops_t *vfsops; 1151 struct vfs *vfsp; 1152 struct vnode *bvp; 1153 dev_t bdev = 0; 1154 mntopts_t mnt_mntopts; 1155 int error = 0; 1156 int copyout_error = 0; 1157 int ovflags; 1158 char *opts = uap->optptr; 1159 char *inargs = opts; 1160 int optlen = uap->optlen; 1161 int remount; 1162 int rdonly; 1163 int nbmand = 0; 1164 int delmip = 0; 1165 int addmip = 0; 1166 int splice = ((uap->flags & MS_NOSPLICE) == 0); 1167 int fromspace = (uap->flags & MS_SYSSPACE) ? 1168 UIO_SYSSPACE : UIO_USERSPACE; 1169 char *resource = NULL, *mountpt = NULL; 1170 refstr_t *oldresource, *oldmntpt; 1171 struct pathname pn, rpn; 1172 vsk_anchor_t *vskap; 1173 char fstname[FSTYPSZ]; 1174 zone_t *zone; 1175 1176 /* 1177 * The v_flag value for the mount point vp is permanently set 1178 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine 1179 * for mount point locking. 1180 */ 1181 mutex_enter(&vp->v_lock); 1182 vp->v_flag |= VVFSLOCK; 1183 mutex_exit(&vp->v_lock); 1184 1185 mnt_mntopts.mo_count = 0; 1186 /* 1187 * Find the ops vector to use to invoke the file system-specific mount 1188 * method. If the fsname argument is non-NULL, use it directly. 1189 * Otherwise, dig the file system type information out of the mount 1190 * arguments. 1191 * 1192 * A side effect is to hold the vfssw entry. 1193 * 1194 * Mount arguments can be specified in several ways, which are 1195 * distinguished by flag bit settings. The preferred way is to set 1196 * MS_OPTIONSTR, indicating an 8 argument mount with the file system 1197 * type supplied as a character string and the last two arguments 1198 * being a pointer to a character buffer and the size of the buffer. 1199 * On entry, the buffer holds a null terminated list of options; on 1200 * return, the string is the list of options the file system 1201 * recognized. 
If MS_DATA is set arguments five and six point to a 1202 * block of binary data which the file system interprets. 1203 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA 1204 * consistently with these conventions. To handle them, we check to 1205 * see whether the pointer to the file system name has a numeric value 1206 * less than 256. If so, we treat it as an index. 1207 */ 1208 if (fsname != NULL) { 1209 if ((vswp = vfs_getvfssw(fsname)) == NULL) { 1210 return (EINVAL); 1211 } 1212 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) { 1213 size_t n; 1214 uint_t fstype; 1215 1216 fsname = fstname; 1217 1218 if ((fstype = (uintptr_t)uap->fstype) < 256) { 1219 RLOCK_VFSSW(); 1220 if (fstype == 0 || fstype >= nfstype || 1221 !ALLOCATED_VFSSW(&vfssw[fstype])) { 1222 RUNLOCK_VFSSW(); 1223 return (EINVAL); 1224 } 1225 (void) strcpy(fsname, vfssw[fstype].vsw_name); 1226 RUNLOCK_VFSSW(); 1227 if ((vswp = vfs_getvfssw(fsname)) == NULL) 1228 return (EINVAL); 1229 } else { 1230 /* 1231 * Handle either kernel or user address space. 
1232 */ 1233 if (uap->flags & MS_SYSSPACE) { 1234 error = copystr(uap->fstype, fsname, 1235 FSTYPSZ, &n); 1236 } else { 1237 error = copyinstr(uap->fstype, fsname, 1238 FSTYPSZ, &n); 1239 } 1240 if (error) { 1241 if (error == ENAMETOOLONG) 1242 return (EINVAL); 1243 return (error); 1244 } 1245 if ((vswp = vfs_getvfssw(fsname)) == NULL) 1246 return (EINVAL); 1247 } 1248 } else { 1249 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL) 1250 return (EINVAL); 1251 fsname = vswp->vsw_name; 1252 } 1253 if (!VFS_INSTALLED(vswp)) 1254 return (EINVAL); 1255 1256 if ((error = secpolicy_fs_allowed_mount(fsname)) != 0) { 1257 vfs_unrefvfssw(vswp); 1258 return (error); 1259 } 1260 1261 vfsops = &vswp->vsw_vfsops; 1262 1263 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts); 1264 /* 1265 * Fetch mount options and parse them for generic vfs options 1266 */ 1267 if (uap->flags & MS_OPTIONSTR) { 1268 /* 1269 * Limit the buffer size 1270 */ 1271 if (optlen < 0 || optlen > MAX_MNTOPT_STR) { 1272 error = EINVAL; 1273 goto errout; 1274 } 1275 if ((uap->flags & MS_SYSSPACE) == 0) { 1276 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 1277 inargs[0] = '\0'; 1278 if (optlen) { 1279 error = copyinstr(opts, inargs, (size_t)optlen, 1280 NULL); 1281 if (error) { 1282 goto errout; 1283 } 1284 } 1285 } 1286 vfs_parsemntopts(&mnt_mntopts, inargs, 0); 1287 } 1288 /* 1289 * Flag bits override the options string. 1290 */ 1291 if (uap->flags & MS_REMOUNT) 1292 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0); 1293 if (uap->flags & MS_RDONLY) 1294 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0); 1295 if (uap->flags & MS_NOSUID) 1296 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 1297 1298 /* 1299 * Check if this is a remount; must be set in the option string and 1300 * the file system must support a remount option. 
1301 */ 1302 if (remount = vfs_optionisset_nolock(&mnt_mntopts, 1303 MNTOPT_REMOUNT, NULL)) { 1304 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) { 1305 error = ENOTSUP; 1306 goto errout; 1307 } 1308 uap->flags |= MS_REMOUNT; 1309 } 1310 1311 /* 1312 * uap->flags and vfs_optionisset() should agree. 1313 */ 1314 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) { 1315 uap->flags |= MS_RDONLY; 1316 } 1317 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) { 1318 uap->flags |= MS_NOSUID; 1319 } 1320 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL); 1321 ASSERT(splice || !remount); 1322 /* 1323 * If we are splicing the fs into the namespace, 1324 * perform mount point checks. 1325 * 1326 * We want to resolve the path for the mount point to eliminate 1327 * '.' and ".." and symlinks in mount points; we can't do the 1328 * same for the resource string, since it would turn 1329 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do 1330 * this before grabbing vn_vfswlock(), because otherwise we 1331 * would deadlock with lookuppn(). 1332 */ 1333 if (splice) { 1334 ASSERT(vp->v_count > 0); 1335 1336 /* 1337 * Pick up mount point and device from appropriate space. 1338 */ 1339 if (pn_get(uap->spec, fromspace, &pn) == 0) { 1340 resource = kmem_alloc(pn.pn_pathlen + 1, 1341 KM_SLEEP); 1342 (void) strcpy(resource, pn.pn_path); 1343 pn_free(&pn); 1344 } 1345 /* 1346 * Do a lookupname prior to taking the 1347 * writelock. Mark this as completed if 1348 * successful for later cleanup and addition to 1349 * the mount in progress table. 
1350 */ 1351 if ((uap->flags & MS_GLOBAL) == 0 && 1352 lookupname(uap->spec, fromspace, 1353 FOLLOW, NULL, &bvp) == 0) { 1354 addmip = 1; 1355 } 1356 1357 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) { 1358 pathname_t *pnp; 1359 1360 if (*pn.pn_path != '/') { 1361 error = EINVAL; 1362 pn_free(&pn); 1363 goto errout; 1364 } 1365 pn_alloc(&rpn); 1366 /* 1367 * Kludge to prevent autofs from deadlocking with 1368 * itself when it calls domount(). 1369 * 1370 * If autofs is calling, it is because it is doing 1371 * (autofs) mounts in the process of an NFS mount. A 1372 * lookuppn() here would cause us to block waiting for 1373 * said NFS mount to complete, which can't since this 1374 * is the thread that was supposed to doing it. 1375 */ 1376 if (fromspace == UIO_USERSPACE) { 1377 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL, 1378 NULL)) == 0) { 1379 pnp = &rpn; 1380 } else { 1381 /* 1382 * The file disappeared or otherwise 1383 * became inaccessible since we opened 1384 * it; might as well fail the mount 1385 * since the mount point is no longer 1386 * accessible. 1387 */ 1388 pn_free(&rpn); 1389 pn_free(&pn); 1390 goto errout; 1391 } 1392 } else { 1393 pnp = &pn; 1394 } 1395 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP); 1396 (void) strcpy(mountpt, pnp->pn_path); 1397 1398 /* 1399 * If the addition of the zone's rootpath 1400 * would push us over a total path length 1401 * of MAXPATHLEN, we fail the mount with 1402 * ENAMETOOLONG, which is what we would have 1403 * gotten if we were trying to perform the same 1404 * mount in the global zone. 1405 * 1406 * strlen() doesn't count the trailing 1407 * '\0', but zone_rootpathlen counts both a 1408 * trailing '/' and the terminating '\0'. 
1409 */ 1410 if ((curproc->p_zone->zone_rootpathlen - 1 + 1411 strlen(mountpt)) > MAXPATHLEN || 1412 (resource != NULL && 1413 (curproc->p_zone->zone_rootpathlen - 1 + 1414 strlen(resource)) > MAXPATHLEN)) { 1415 error = ENAMETOOLONG; 1416 } 1417 1418 pn_free(&rpn); 1419 pn_free(&pn); 1420 } 1421 1422 if (error) 1423 goto errout; 1424 1425 /* 1426 * Prevent path name resolution from proceeding past 1427 * the mount point. 1428 */ 1429 if (vn_vfswlock(vp) != 0) { 1430 error = EBUSY; 1431 goto errout; 1432 } 1433 1434 /* 1435 * Verify that it's legitimate to establish a mount on 1436 * the prospective mount point. 1437 */ 1438 if (vn_mountedvfs(vp) != NULL) { 1439 /* 1440 * The mount point lock was obtained after some 1441 * other thread raced through and established a mount. 1442 */ 1443 vn_vfsunlock(vp); 1444 error = EBUSY; 1445 goto errout; 1446 } 1447 if (vp->v_flag & VNOMOUNT) { 1448 vn_vfsunlock(vp); 1449 error = EINVAL; 1450 goto errout; 1451 } 1452 } 1453 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) { 1454 uap->dataptr = NULL; 1455 uap->datalen = 0; 1456 } 1457 1458 /* 1459 * If this is a remount, we don't want to create a new VFS. 1460 * Instead, we pass the existing one with a remount flag. 1461 */ 1462 if (remount) { 1463 /* 1464 * Confirm that the mount point is the root vnode of the 1465 * file system that is being remounted. 1466 * This can happen if the user specifies a different 1467 * mount point directory pathname in the (re)mount command. 1468 * 1469 * Code below can only be reached if splice is true, so it's 1470 * safe to do vn_vfsunlock() here. 1471 */ 1472 if ((vp->v_flag & VROOT) == 0) { 1473 vn_vfsunlock(vp); 1474 error = ENOENT; 1475 goto errout; 1476 } 1477 /* 1478 * Disallow making file systems read-only unless file system 1479 * explicitly allows it in its vfssw. Ignore other flags. 
1480 */ 1481 if (rdonly && vn_is_readonly(vp) == 0 && 1482 (vswp->vsw_flag & VSW_CANRWRO) == 0) { 1483 vn_vfsunlock(vp); 1484 error = EINVAL; 1485 goto errout; 1486 } 1487 /* 1488 * Disallow changing the NBMAND disposition of the file 1489 * system on remounts. 1490 */ 1491 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) || 1492 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) { 1493 vn_vfsunlock(vp); 1494 error = EINVAL; 1495 goto errout; 1496 } 1497 vfsp = vp->v_vfsp; 1498 ovflags = vfsp->vfs_flag; 1499 vfsp->vfs_flag |= VFS_REMOUNT; 1500 vfsp->vfs_flag &= ~VFS_RDONLY; 1501 } else { 1502 vfsp = vfs_alloc(KM_SLEEP); 1503 VFS_INIT(vfsp, vfsops, NULL); 1504 } 1505 1506 VFS_HOLD(vfsp); 1507 1508 if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) { 1509 if (!remount) { 1510 if (splice) 1511 vn_vfsunlock(vp); 1512 vfs_free(vfsp); 1513 } else { 1514 vn_vfsunlock(vp); 1515 VFS_RELE(vfsp); 1516 } 1517 goto errout; 1518 } 1519 1520 /* 1521 * PRIV_SYS_MOUNT doesn't mean you can become root. 1522 */ 1523 if (vfsp->vfs_lofi_minor != 0) { 1524 uap->flags |= MS_NOSUID; 1525 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 1526 } 1527 1528 /* 1529 * The vfs_reflock is not used anymore the code below explicitly 1530 * holds it preventing others accesing it directly. 1531 */ 1532 if ((sema_tryp(&vfsp->vfs_reflock) == 0) && 1533 !(vfsp->vfs_flag & VFS_REMOUNT)) 1534 cmn_err(CE_WARN, 1535 "mount type %s couldn't get vfs_reflock", vswp->vsw_name); 1536 1537 /* 1538 * Lock the vfs. If this is a remount we want to avoid spurious umount 1539 * failures that happen as a side-effect of fsflush() and other mount 1540 * and unmount operations that might be going on simultaneously and 1541 * may have locked the vfs currently. To not return EBUSY immediately 1542 * here we use vfs_lock_wait() instead vfs_lock() for the remount case. 
1543 */ 1544 if (!remount) { 1545 if (error = vfs_lock(vfsp)) { 1546 vfsp->vfs_flag = ovflags; 1547 1548 lofi_remove(vfsp); 1549 1550 if (splice) 1551 vn_vfsunlock(vp); 1552 vfs_free(vfsp); 1553 goto errout; 1554 } 1555 } else { 1556 vfs_lock_wait(vfsp); 1557 } 1558 1559 /* 1560 * Add device to mount in progress table, global mounts require special 1561 * handling. It is possible that we have already done the lookupname 1562 * on a spliced, non-global fs. If so, we don't want to do it again 1563 * since we cannot do a lookupname after taking the 1564 * wlock above. This case is for a non-spliced, non-global filesystem. 1565 */ 1566 if (!addmip) { 1567 if ((uap->flags & MS_GLOBAL) == 0 && 1568 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) { 1569 addmip = 1; 1570 } 1571 } 1572 1573 if (addmip) { 1574 vnode_t *lvp = NULL; 1575 1576 error = vfs_get_lofi(vfsp, &lvp); 1577 if (error > 0) { 1578 lofi_remove(vfsp); 1579 1580 if (splice) 1581 vn_vfsunlock(vp); 1582 vfs_unlock(vfsp); 1583 1584 if (remount) { 1585 VFS_RELE(vfsp); 1586 } else { 1587 vfs_free(vfsp); 1588 } 1589 1590 goto errout; 1591 } else if (error == -1) { 1592 bdev = bvp->v_rdev; 1593 VN_RELE(bvp); 1594 } else { 1595 bdev = lvp->v_rdev; 1596 VN_RELE(lvp); 1597 VN_RELE(bvp); 1598 } 1599 1600 vfs_addmip(bdev, vfsp); 1601 addmip = 0; 1602 delmip = 1; 1603 } 1604 /* 1605 * Invalidate cached entry for the mount point. 1606 */ 1607 if (splice) 1608 dnlc_purge_vp(vp); 1609 1610 /* 1611 * If have an option string but the filesystem doesn't supply a 1612 * prototype options table, create a table with the global 1613 * options and sufficient room to accept all the options in the 1614 * string. Then parse the passed in option string 1615 * accepting all the options in the string. This gives us an 1616 * option table with all the proper cancel properties for the 1617 * global options. 1618 * 1619 * Filesystems that supply a prototype options table are handled 1620 * earlier in this function. 
1621 */ 1622 if (uap->flags & MS_OPTIONSTR) { 1623 if (!(vswp->vsw_flag & VSW_HASPROTO)) { 1624 mntopts_t tmp_mntopts; 1625 1626 tmp_mntopts.mo_count = 0; 1627 vfs_createopttbl_extend(&tmp_mntopts, inargs, 1628 &mnt_mntopts); 1629 vfs_parsemntopts(&tmp_mntopts, inargs, 1); 1630 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts); 1631 vfs_freeopttbl(&tmp_mntopts); 1632 } 1633 } 1634 1635 /* 1636 * Serialize with zone state transitions. 1637 * See vfs_list_add; zone mounted into is: 1638 * zone_find_by_path(refstr_value(vfsp->vfs_mntpt)) 1639 * not the zone doing the mount (curproc->p_zone), but if we're already 1640 * inside a NGZ, then we know what zone we are. 1641 */ 1642 if (INGLOBALZONE(curproc)) { 1643 zone = zone_find_by_path(mountpt); 1644 ASSERT(zone != NULL); 1645 } else { 1646 zone = curproc->p_zone; 1647 /* 1648 * zone_find_by_path does a hold, so do one here too so that 1649 * we can do a zone_rele after mount_completed. 1650 */ 1651 zone_hold(zone); 1652 } 1653 mount_in_progress(zone); 1654 /* 1655 * Instantiate (or reinstantiate) the file system. If appropriate, 1656 * splice it into the file system name space. 1657 * 1658 * We want VFS_MOUNT() to be able to override the vfs_resource 1659 * string if necessary (ie, mntfs), and also for a remount to 1660 * change the same (necessary when remounting '/' during boot). 1661 * So we set up vfs_mntpt and vfs_resource to what we think they 1662 * should be, then hand off control to VFS_MOUNT() which can 1663 * override this. 1664 * 1665 * For safety's sake, when changing vfs_resource or vfs_mntpt of 1666 * a vfs which is on the vfs list (i.e. during a remount), we must 1667 * never set those fields to NULL. Several bits of code make 1668 * assumptions that the fields are always valid. 
1669 */ 1670 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1671 if (remount) { 1672 if ((oldresource = vfsp->vfs_resource) != NULL) 1673 refstr_hold(oldresource); 1674 if ((oldmntpt = vfsp->vfs_mntpt) != NULL) 1675 refstr_hold(oldmntpt); 1676 } 1677 vfs_setresource(vfsp, resource, 0); 1678 vfs_setmntpoint(vfsp, mountpt, 0); 1679 1680 /* 1681 * going to mount on this vnode, so notify. 1682 */ 1683 vnevent_mountedover(vp, NULL); 1684 error = VFS_MOUNT(vfsp, vp, uap, credp); 1685 1686 if (uap->flags & MS_RDONLY) 1687 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1688 if (uap->flags & MS_NOSUID) 1689 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0); 1690 if (uap->flags & MS_GLOBAL) 1691 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0); 1692 1693 if (error) { 1694 lofi_remove(vfsp); 1695 1696 if (remount) { 1697 /* put back pre-remount options */ 1698 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1699 vfs_setmntpoint(vfsp, refstr_value(oldmntpt), 1700 VFSSP_VERBATIM); 1701 if (oldmntpt) 1702 refstr_rele(oldmntpt); 1703 vfs_setresource(vfsp, refstr_value(oldresource), 1704 VFSSP_VERBATIM); 1705 if (oldresource) 1706 refstr_rele(oldresource); 1707 vfsp->vfs_flag = ovflags; 1708 vfs_unlock(vfsp); 1709 VFS_RELE(vfsp); 1710 } else { 1711 vfs_unlock(vfsp); 1712 vfs_freemnttab(vfsp); 1713 vfs_free(vfsp); 1714 } 1715 } else { 1716 /* 1717 * Set the mount time to now 1718 */ 1719 vfsp->vfs_mtime = ddi_get_time(); 1720 if (remount) { 1721 vfsp->vfs_flag &= ~VFS_REMOUNT; 1722 if (oldresource) 1723 refstr_rele(oldresource); 1724 if (oldmntpt) 1725 refstr_rele(oldmntpt); 1726 } else if (splice) { 1727 /* 1728 * Link vfsp into the name space at the mount 1729 * point. Vfs_add() is responsible for 1730 * holding the mount point which will be 1731 * released when vfs_remove() is called. 1732 */ 1733 vfs_add(vp, vfsp, uap->flags); 1734 } else { 1735 /* 1736 * Hold the reference to file system which is 1737 * not linked into the name space. 
1738 */ 1739 vfsp->vfs_zone = NULL; 1740 VFS_HOLD(vfsp); 1741 vfsp->vfs_vnodecovered = NULL; 1742 } 1743 /* 1744 * Set flags for global options encountered 1745 */ 1746 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 1747 vfsp->vfs_flag |= VFS_RDONLY; 1748 else 1749 vfsp->vfs_flag &= ~VFS_RDONLY; 1750 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 1751 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES); 1752 } else { 1753 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 1754 vfsp->vfs_flag |= VFS_NODEVICES; 1755 else 1756 vfsp->vfs_flag &= ~VFS_NODEVICES; 1757 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 1758 vfsp->vfs_flag |= VFS_NOSETUID; 1759 else 1760 vfsp->vfs_flag &= ~VFS_NOSETUID; 1761 } 1762 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 1763 vfsp->vfs_flag |= VFS_NBMAND; 1764 else 1765 vfsp->vfs_flag &= ~VFS_NBMAND; 1766 1767 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 1768 vfsp->vfs_flag |= VFS_XATTR; 1769 else 1770 vfsp->vfs_flag &= ~VFS_XATTR; 1771 1772 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 1773 vfsp->vfs_flag |= VFS_NOEXEC; 1774 else 1775 vfsp->vfs_flag &= ~VFS_NOEXEC; 1776 1777 if (vfs_optionisset(vfsp, MNTOPT_NOFOLLOW, NULL)) 1778 vfsp->vfs_flag |= VFS_NOFOLLOW; 1779 else 1780 vfsp->vfs_flag &= ~VFS_NOFOLLOW; 1781 1782 /* 1783 * Now construct the output option string of options 1784 * we recognized. 1785 */ 1786 if (uap->flags & MS_OPTIONSTR) { 1787 vfs_list_read_lock(); 1788 copyout_error = vfs_buildoptionstr( 1789 &vfsp->vfs_mntopts, inargs, optlen); 1790 vfs_list_unlock(); 1791 if (copyout_error == 0 && 1792 (uap->flags & MS_SYSSPACE) == 0) { 1793 copyout_error = copyoutstr(inargs, opts, 1794 optlen, NULL); 1795 } 1796 } 1797 1798 /* 1799 * If this isn't a remount, set up the vopstats before 1800 * anyone can touch this. We only allow spliced file 1801 * systems (file systems which are in the namespace) to 1802 * have the VFS_STATS flag set. 
1803 * NOTE: PxFS mounts the underlying file system with 1804 * MS_NOSPLICE set and copies those vfs_flags to its private 1805 * vfs structure. As a result, PxFS should never have 1806 * the VFS_STATS flag or else we might access the vfs 1807 * statistics-related fields prior to them being 1808 * properly initialized. 1809 */ 1810 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) { 1811 initialize_vopstats(&vfsp->vfs_vopstats); 1812 /* 1813 * We need to set vfs_vskap to NULL because there's 1814 * a chance it won't be set below. This is checked 1815 * in teardown_vopstats() so we can't have garbage. 1816 */ 1817 vfsp->vfs_vskap = NULL; 1818 vfsp->vfs_flag |= VFS_STATS; 1819 vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp); 1820 } 1821 1822 if (vswp->vsw_flag & VSW_XID) 1823 vfsp->vfs_flag |= VFS_XID; 1824 1825 vfs_unlock(vfsp); 1826 } 1827 mount_completed(zone); 1828 zone_rele(zone); 1829 if (splice) 1830 vn_vfsunlock(vp); 1831 1832 if ((error == 0) && (copyout_error == 0)) { 1833 if (!remount) { 1834 /* 1835 * Don't call get_vskstat_anchor() while holding 1836 * locks since it allocates memory and calls 1837 * VFS_STATVFS(). For NFS, the latter can generate 1838 * an over-the-wire call. 1839 */ 1840 vskap = get_vskstat_anchor(vfsp); 1841 /* Only take the lock if we have something to do */ 1842 if (vskap != NULL) { 1843 vfs_lock_wait(vfsp); 1844 if (vfsp->vfs_flag & VFS_STATS) { 1845 vfsp->vfs_vskap = vskap; 1846 } 1847 vfs_unlock(vfsp); 1848 } 1849 } 1850 /* Return vfsp to caller. */ 1851 *vfspp = vfsp; 1852 } 1853 errout: 1854 vfs_freeopttbl(&mnt_mntopts); 1855 if (resource != NULL) 1856 kmem_free(resource, strlen(resource) + 1); 1857 if (mountpt != NULL) 1858 kmem_free(mountpt, strlen(mountpt) + 1); 1859 /* 1860 * It is possible we errored prior to adding to mount in progress 1861 * table. Must free vnode we acquired with successful lookupname. 
1862 */ 1863 if (addmip) 1864 VN_RELE(bvp); 1865 if (delmip) 1866 vfs_delmip(vfsp); 1867 ASSERT(vswp != NULL); 1868 vfs_unrefvfssw(vswp); 1869 if (inargs != opts) 1870 kmem_free(inargs, MAX_MNTOPT_STR); 1871 if (copyout_error) { 1872 lofi_remove(vfsp); 1873 VFS_RELE(vfsp); 1874 error = copyout_error; 1875 } 1876 return (error); 1877 } 1878 1879 static void 1880 vfs_setpath( 1881 struct vfs *vfsp, /* vfs being updated */ 1882 refstr_t **refp, /* Ref-count string to contain the new path */ 1883 const char *newpath, /* Path to add to refp (above) */ 1884 uint32_t flag) /* flag */ 1885 { 1886 size_t len; 1887 refstr_t *ref; 1888 zone_t *zone = curproc->p_zone; 1889 char *sp; 1890 int have_list_lock = 0; 1891 1892 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp)); 1893 1894 /* 1895 * New path must be less than MAXPATHLEN because mntfs 1896 * will only display up to MAXPATHLEN bytes. This is currently 1897 * safe, because domount() uses pn_get(), and other callers 1898 * similarly cap the size to fewer than MAXPATHLEN bytes. 1899 */ 1900 1901 ASSERT(strlen(newpath) < MAXPATHLEN); 1902 1903 /* mntfs requires consistency while vfs list lock is held */ 1904 1905 if (VFS_ON_LIST(vfsp)) { 1906 have_list_lock = 1; 1907 vfs_list_lock(); 1908 } 1909 1910 if (*refp != NULL) 1911 refstr_rele(*refp); 1912 1913 /* 1914 * If we are in a non-global zone then we prefix the supplied path, 1915 * newpath, with the zone's root path, with two exceptions. The first 1916 * is where we have been explicitly directed to avoid doing so; this 1917 * will be the case following a failed remount, where the path supplied 1918 * will be a saved version which must now be restored. The second 1919 * exception is where newpath is not a pathname but a descriptive name, 1920 * e.g. "procfs". 
1921 */ 1922 if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') { 1923 ref = refstr_alloc(newpath); 1924 goto out; 1925 } 1926 1927 /* 1928 * Truncate the trailing '/' in the zoneroot, and merge 1929 * in the zone's rootpath with the "newpath" (resource 1930 * or mountpoint) passed in. 1931 * 1932 * The size of the required buffer is thus the size of 1933 * the buffer required for the passed-in newpath 1934 * (strlen(newpath) + 1), plus the size of the buffer 1935 * required to hold zone_rootpath (zone_rootpathlen) 1936 * minus one for one of the now-superfluous NUL 1937 * terminations, minus one for the trailing '/'. 1938 * 1939 * That gives us: 1940 * 1941 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1 1942 * 1943 * Which is what we have below. 1944 */ 1945 1946 len = strlen(newpath) + zone->zone_rootpathlen - 1; 1947 sp = kmem_alloc(len, KM_SLEEP); 1948 1949 /* 1950 * Copy everything including the trailing slash, which 1951 * we then overwrite with the NUL character. 1952 */ 1953 1954 (void) strcpy(sp, zone->zone_rootpath); 1955 sp[zone->zone_rootpathlen - 2] = '\0'; 1956 (void) strcat(sp, newpath); 1957 1958 ref = refstr_alloc(sp); 1959 kmem_free(sp, len); 1960 out: 1961 *refp = ref; 1962 1963 if (have_list_lock) { 1964 vfs_mnttab_modtimeupd(); 1965 vfs_list_unlock(); 1966 } 1967 } 1968 1969 /* 1970 * Record a mounted resource name in a vfs structure. 1971 * If vfsp is already mounted, caller must hold the vfs lock. 1972 */ 1973 void 1974 vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag) 1975 { 1976 if (resource == NULL || resource[0] == '\0') 1977 resource = VFS_NORESOURCE; 1978 vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag); 1979 } 1980 1981 /* 1982 * Record a mount point name in a vfs structure. 1983 * If vfsp is already mounted, caller must hold the vfs lock. 
1984 */ 1985 void 1986 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag) 1987 { 1988 if (mntpt == NULL || mntpt[0] == '\0') 1989 mntpt = VFS_NOMNTPT; 1990 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag); 1991 } 1992 1993 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */ 1994 1995 refstr_t * 1996 vfs_getresource(const struct vfs *vfsp) 1997 { 1998 refstr_t *resource; 1999 2000 vfs_list_read_lock(); 2001 resource = vfsp->vfs_resource; 2002 refstr_hold(resource); 2003 vfs_list_unlock(); 2004 2005 return (resource); 2006 } 2007 2008 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */ 2009 2010 refstr_t * 2011 vfs_getmntpoint(const struct vfs *vfsp) 2012 { 2013 refstr_t *mntpt; 2014 2015 vfs_list_read_lock(); 2016 mntpt = vfsp->vfs_mntpt; 2017 refstr_hold(mntpt); 2018 vfs_list_unlock(); 2019 2020 return (mntpt); 2021 } 2022 2023 /* 2024 * Create an empty options table with enough empty slots to hold all 2025 * The options in the options string passed as an argument. 2026 * Potentially prepend another options table. 2027 * 2028 * Note: caller is responsible for locking the vfs list, if needed, 2029 * to protect mops. 2030 */ 2031 static void 2032 vfs_createopttbl_extend(mntopts_t *mops, const char *opts, 2033 const mntopts_t *mtmpl) 2034 { 2035 const char *s = opts; 2036 uint_t count; 2037 2038 if (opts == NULL || *opts == '\0') { 2039 count = 0; 2040 } else { 2041 count = 1; 2042 2043 /* 2044 * Count number of options in the string 2045 */ 2046 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) { 2047 count++; 2048 s++; 2049 } 2050 } 2051 vfs_copyopttbl_extend(mtmpl, mops, count); 2052 } 2053 2054 /* 2055 * Create an empty options table with enough empty slots to hold all 2056 * The options in the options string passed as an argument. 2057 * 2058 * This function is *not* for general use by filesystems. 
2059 * 2060 * Note: caller is responsible for locking the vfs list, if needed, 2061 * to protect mops. 2062 */ 2063 void 2064 vfs_createopttbl(mntopts_t *mops, const char *opts) 2065 { 2066 vfs_createopttbl_extend(mops, opts, NULL); 2067 } 2068 2069 2070 /* 2071 * Swap two mount options tables 2072 */ 2073 static void 2074 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2) 2075 { 2076 uint_t tmpcnt; 2077 mntopt_t *tmplist; 2078 2079 tmpcnt = optbl2->mo_count; 2080 tmplist = optbl2->mo_list; 2081 optbl2->mo_count = optbl1->mo_count; 2082 optbl2->mo_list = optbl1->mo_list; 2083 optbl1->mo_count = tmpcnt; 2084 optbl1->mo_list = tmplist; 2085 } 2086 2087 static void 2088 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2) 2089 { 2090 vfs_list_lock(); 2091 vfs_swapopttbl_nolock(optbl1, optbl2); 2092 vfs_mnttab_modtimeupd(); 2093 vfs_list_unlock(); 2094 } 2095 2096 static char ** 2097 vfs_copycancelopt_extend(char **const moc, int extend) 2098 { 2099 int i = 0; 2100 int j; 2101 char **result; 2102 2103 if (moc != NULL) { 2104 for (; moc[i] != NULL; i++) 2105 /* count number of options to cancel */; 2106 } 2107 2108 if (i + extend == 0) 2109 return (NULL); 2110 2111 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP); 2112 2113 for (j = 0; j < i; j++) { 2114 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP); 2115 (void) strcpy(result[j], moc[j]); 2116 } 2117 for (; j <= i + extend; j++) 2118 result[j] = NULL; 2119 2120 return (result); 2121 } 2122 2123 static void 2124 vfs_copyopt(const mntopt_t *s, mntopt_t *d) 2125 { 2126 char *sp, *dp; 2127 2128 d->mo_flags = s->mo_flags; 2129 d->mo_data = s->mo_data; 2130 sp = s->mo_name; 2131 if (sp != NULL) { 2132 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 2133 (void) strcpy(dp, sp); 2134 d->mo_name = dp; 2135 } else { 2136 d->mo_name = NULL; /* should never happen */ 2137 } 2138 2139 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0); 2140 2141 sp = s->mo_arg; 2142 if (sp != NULL) { 2143 dp = 
kmem_alloc(strlen(sp) + 1, KM_SLEEP); 2144 (void) strcpy(dp, sp); 2145 d->mo_arg = dp; 2146 } else { 2147 d->mo_arg = NULL; 2148 } 2149 } 2150 2151 /* 2152 * Copy a mount options table, possibly allocating some spare 2153 * slots at the end. It is permissible to copy_extend the NULL table. 2154 */ 2155 static void 2156 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra) 2157 { 2158 uint_t i, count; 2159 mntopt_t *motbl; 2160 2161 /* 2162 * Clear out any existing stuff in the options table being initialized 2163 */ 2164 vfs_freeopttbl(dmo); 2165 count = (smo == NULL) ? 0 : smo->mo_count; 2166 if ((count + extra) == 0) /* nothing to do */ 2167 return; 2168 dmo->mo_count = count + extra; 2169 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP); 2170 dmo->mo_list = motbl; 2171 for (i = 0; i < count; i++) { 2172 vfs_copyopt(&smo->mo_list[i], &motbl[i]); 2173 } 2174 for (i = count; i < count + extra; i++) { 2175 motbl[i].mo_flags = MO_EMPTY; 2176 } 2177 } 2178 2179 /* 2180 * Copy a mount options table. 2181 * 2182 * This function is *not* for general use by filesystems. 2183 * 2184 * Note: caller is responsible for locking the vfs list, if needed, 2185 * to protect smo and dmo. 2186 */ 2187 void 2188 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo) 2189 { 2190 vfs_copyopttbl_extend(smo, dmo, 0); 2191 } 2192 2193 static char ** 2194 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2) 2195 { 2196 int c1 = 0; 2197 int c2 = 0; 2198 char **result; 2199 char **sp1, **sp2, **dp; 2200 2201 /* 2202 * First we count both lists of cancel options. 2203 * If either is NULL or has no elements, we return a copy of 2204 * the other. 
2205 */ 2206 if (mop1->mo_cancel != NULL) { 2207 for (; mop1->mo_cancel[c1] != NULL; c1++) 2208 /* count cancel options in mop1 */; 2209 } 2210 2211 if (c1 == 0) 2212 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0)); 2213 2214 if (mop2->mo_cancel != NULL) { 2215 for (; mop2->mo_cancel[c2] != NULL; c2++) 2216 /* count cancel options in mop2 */; 2217 } 2218 2219 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2); 2220 2221 if (c2 == 0) 2222 return (result); 2223 2224 /* 2225 * When we get here, we've got two sets of cancel options; 2226 * we need to merge the two sets. We know that the result 2227 * array has "c1+c2+1" entries and in the end we might shrink 2228 * it. 2229 * Result now has a copy of the c1 entries from mop1; we'll 2230 * now lookup all the entries of mop2 in mop1 and copy it if 2231 * it is unique. 2232 * This operation is O(n^2) but it's only called once per 2233 * filesystem per duplicate option. This is a situation 2234 * which doesn't arise with the filesystems in ON and 2235 * n is generally 1. 2236 */ 2237 2238 dp = &result[c1]; 2239 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) { 2240 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) { 2241 if (strcmp(*sp1, *sp2) == 0) 2242 break; 2243 } 2244 if (*sp1 == NULL) { 2245 /* 2246 * Option *sp2 not found in mop1, so copy it. 2247 * The calls to vfs_copycancelopt_extend() 2248 * guarantee that there's enough room. 2249 */ 2250 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP); 2251 (void) strcpy(*dp++, *sp2); 2252 } 2253 } 2254 if (dp != &result[c1+c2]) { 2255 size_t bytes = (dp - result + 1) * sizeof (char *); 2256 char **nres = kmem_alloc(bytes, KM_SLEEP); 2257 2258 bcopy(result, nres, bytes); 2259 kmem_free(result, (c1 + c2 + 1) * sizeof (char *)); 2260 result = nres; 2261 } 2262 return (result); 2263 } 2264 2265 /* 2266 * Merge two mount option tables (outer and inner) into one. This is very 2267 * similar to "merging" global variables and automatic variables in C. 
2268 * 2269 * This isn't (and doesn't have to be) fast. 2270 * 2271 * This function is *not* for general use by filesystems. 2272 * 2273 * Note: caller is responsible for locking the vfs list, if needed, 2274 * to protect omo, imo & dmo. 2275 */ 2276 void 2277 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo) 2278 { 2279 uint_t i, count; 2280 mntopt_t *mop, *motbl; 2281 uint_t freeidx; 2282 2283 /* 2284 * First determine how much space we need to allocate. 2285 */ 2286 count = omo->mo_count; 2287 for (i = 0; i < imo->mo_count; i++) { 2288 if (imo->mo_list[i].mo_flags & MO_EMPTY) 2289 continue; 2290 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL) 2291 count++; 2292 } 2293 ASSERT(count >= omo->mo_count && 2294 count <= omo->mo_count + imo->mo_count); 2295 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP); 2296 for (i = 0; i < omo->mo_count; i++) 2297 vfs_copyopt(&omo->mo_list[i], &motbl[i]); 2298 freeidx = omo->mo_count; 2299 for (i = 0; i < imo->mo_count; i++) { 2300 if (imo->mo_list[i].mo_flags & MO_EMPTY) 2301 continue; 2302 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) { 2303 char **newcanp; 2304 uint_t index = mop - omo->mo_list; 2305 2306 newcanp = vfs_mergecancelopts(mop, &motbl[index]); 2307 2308 vfs_freeopt(&motbl[index]); 2309 vfs_copyopt(&imo->mo_list[i], &motbl[index]); 2310 2311 vfs_freecancelopt(motbl[index].mo_cancel); 2312 motbl[index].mo_cancel = newcanp; 2313 } else { 2314 /* 2315 * If it's a new option, just copy it over to the first 2316 * free location. 2317 */ 2318 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]); 2319 } 2320 } 2321 dmo->mo_count = count; 2322 dmo->mo_list = motbl; 2323 } 2324 2325 /* 2326 * Functions to set and clear mount options in a mount options table. 2327 */ 2328 2329 /* 2330 * Clear a mount option, if it exists. 2331 * 2332 * The update_mnttab arg indicates whether mops is part of a vfs that is on 2333 * the vfs list. 
2334 */ 2335 static void 2336 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab) 2337 { 2338 struct mntopt *mop; 2339 uint_t i, count; 2340 2341 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 2342 2343 count = mops->mo_count; 2344 for (i = 0; i < count; i++) { 2345 mop = &mops->mo_list[i]; 2346 2347 if (mop->mo_flags & MO_EMPTY) 2348 continue; 2349 if (strcmp(opt, mop->mo_name)) 2350 continue; 2351 mop->mo_flags &= ~MO_SET; 2352 if (mop->mo_arg != NULL) { 2353 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2354 } 2355 mop->mo_arg = NULL; 2356 if (update_mnttab) 2357 vfs_mnttab_modtimeupd(); 2358 break; 2359 } 2360 } 2361 2362 void 2363 vfs_clearmntopt(struct vfs *vfsp, const char *opt) 2364 { 2365 int gotlock = 0; 2366 2367 if (VFS_ON_LIST(vfsp)) { 2368 gotlock = 1; 2369 vfs_list_lock(); 2370 } 2371 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock); 2372 if (gotlock) 2373 vfs_list_unlock(); 2374 } 2375 2376 2377 /* 2378 * Set a mount option on. If it's not found in the table, it's silently 2379 * ignored. If the option has MO_IGNORE set, it is still set unless the 2380 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag 2381 * bits can be used to toggle the MO_NODISPLAY bit for the option. 2382 * If the VFS_CREATEOPT flag bit is set then the first option slot with 2383 * MO_EMPTY set is created as the option passed in. 2384 * 2385 * The update_mnttab arg indicates whether mops is part of a vfs that is on 2386 * the vfs list. 
2387 */ 2388 static void 2389 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt, 2390 const char *arg, int flags, int update_mnttab) 2391 { 2392 mntopt_t *mop; 2393 uint_t i, count; 2394 char *sp; 2395 2396 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 2397 2398 if (flags & VFS_CREATEOPT) { 2399 if (vfs_hasopt(mops, opt) != NULL) { 2400 flags &= ~VFS_CREATEOPT; 2401 } 2402 } 2403 count = mops->mo_count; 2404 for (i = 0; i < count; i++) { 2405 mop = &mops->mo_list[i]; 2406 2407 if (mop->mo_flags & MO_EMPTY) { 2408 if ((flags & VFS_CREATEOPT) == 0) 2409 continue; 2410 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP); 2411 (void) strcpy(sp, opt); 2412 mop->mo_name = sp; 2413 if (arg != NULL) 2414 mop->mo_flags = MO_HASVALUE; 2415 else 2416 mop->mo_flags = 0; 2417 } else if (strcmp(opt, mop->mo_name)) { 2418 continue; 2419 } 2420 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT)) 2421 break; 2422 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) { 2423 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP); 2424 (void) strcpy(sp, arg); 2425 } else { 2426 sp = NULL; 2427 } 2428 if (mop->mo_arg != NULL) 2429 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2430 mop->mo_arg = sp; 2431 if (flags & VFS_DISPLAY) 2432 mop->mo_flags &= ~MO_NODISPLAY; 2433 if (flags & VFS_NODISPLAY) 2434 mop->mo_flags |= MO_NODISPLAY; 2435 mop->mo_flags |= MO_SET; 2436 if (mop->mo_cancel != NULL) { 2437 char **cp; 2438 2439 for (cp = mop->mo_cancel; *cp != NULL; cp++) 2440 vfs_clearmntopt_nolock(mops, *cp, 0); 2441 } 2442 if (update_mnttab) 2443 vfs_mnttab_modtimeupd(); 2444 break; 2445 } 2446 } 2447 2448 void 2449 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags) 2450 { 2451 int gotlock = 0; 2452 2453 if (VFS_ON_LIST(vfsp)) { 2454 gotlock = 1; 2455 vfs_list_lock(); 2456 } 2457 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock); 2458 if (gotlock) 2459 vfs_list_unlock(); 2460 } 2461 2462 2463 /* 2464 * Add a "tag" option to a mounted file system's 
options list. 2465 * 2466 * Note: caller is responsible for locking the vfs list, if needed, 2467 * to protect mops. 2468 */ 2469 static mntopt_t * 2470 vfs_addtag(mntopts_t *mops, const char *tag) 2471 { 2472 uint_t count; 2473 mntopt_t *mop, *motbl; 2474 2475 count = mops->mo_count + 1; 2476 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP); 2477 if (mops->mo_count) { 2478 size_t len = (count - 1) * sizeof (mntopt_t); 2479 2480 bcopy(mops->mo_list, motbl, len); 2481 kmem_free(mops->mo_list, len); 2482 } 2483 mops->mo_count = count; 2484 mops->mo_list = motbl; 2485 mop = &motbl[count - 1]; 2486 mop->mo_flags = MO_TAG; 2487 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP); 2488 (void) strcpy(mop->mo_name, tag); 2489 return (mop); 2490 } 2491 2492 /* 2493 * Allow users to set arbitrary "tags" in a vfs's mount options. 2494 * Broader use within the kernel is discouraged. 2495 */ 2496 int 2497 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2498 cred_t *cr) 2499 { 2500 vfs_t *vfsp; 2501 mntopts_t *mops; 2502 mntopt_t *mop; 2503 int found = 0; 2504 dev_t dev = makedevice(major, minor); 2505 int err = 0; 2506 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 2507 2508 /* 2509 * Find the desired mounted file system 2510 */ 2511 vfs_list_lock(); 2512 vfsp = rootvfs; 2513 do { 2514 if (vfsp->vfs_dev == dev && 2515 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2516 found = 1; 2517 break; 2518 } 2519 vfsp = vfsp->vfs_next; 2520 } while (vfsp != rootvfs); 2521 2522 if (!found) { 2523 err = EINVAL; 2524 goto out; 2525 } 2526 err = secpolicy_fs_config(cr, vfsp); 2527 if (err != 0) 2528 goto out; 2529 2530 mops = &vfsp->vfs_mntopts; 2531 /* 2532 * Add tag if it doesn't already exist 2533 */ 2534 if ((mop = vfs_hasopt(mops, tag)) == NULL) { 2535 int len; 2536 2537 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR); 2538 len = strlen(buf); 2539 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) { 2540 err = ENAMETOOLONG; 2541 goto out; 2542 
} 2543 mop = vfs_addtag(mops, tag); 2544 } 2545 if ((mop->mo_flags & MO_TAG) == 0) { 2546 err = EINVAL; 2547 goto out; 2548 } 2549 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1); 2550 out: 2551 vfs_list_unlock(); 2552 kmem_free(buf, MAX_MNTOPT_STR); 2553 return (err); 2554 } 2555 2556 /* 2557 * Allow users to remove arbitrary "tags" in a vfs's mount options. 2558 * Broader use within the kernel is discouraged. 2559 */ 2560 int 2561 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2562 cred_t *cr) 2563 { 2564 vfs_t *vfsp; 2565 mntopt_t *mop; 2566 int found = 0; 2567 dev_t dev = makedevice(major, minor); 2568 int err = 0; 2569 2570 /* 2571 * Find the desired mounted file system 2572 */ 2573 vfs_list_lock(); 2574 vfsp = rootvfs; 2575 do { 2576 if (vfsp->vfs_dev == dev && 2577 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2578 found = 1; 2579 break; 2580 } 2581 vfsp = vfsp->vfs_next; 2582 } while (vfsp != rootvfs); 2583 2584 if (!found) { 2585 err = EINVAL; 2586 goto out; 2587 } 2588 err = secpolicy_fs_config(cr, vfsp); 2589 if (err != 0) 2590 goto out; 2591 2592 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) { 2593 err = EINVAL; 2594 goto out; 2595 } 2596 if ((mop->mo_flags & MO_TAG) == 0) { 2597 err = EINVAL; 2598 goto out; 2599 } 2600 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1); 2601 out: 2602 vfs_list_unlock(); 2603 return (err); 2604 } 2605 2606 /* 2607 * Function to parse an option string and fill in a mount options table. 2608 * Unknown options are silently ignored. The input option string is modified 2609 * by replacing separators with nulls. If the create flag is set, options 2610 * not found in the table are just added on the fly. The table must have 2611 * an option slot marked MO_EMPTY to add an option on the fly. 2612 * 2613 * This function is *not* for general use by filesystems. 2614 * 2615 * Note: caller is responsible for locking the vfs list, if needed, 2616 * to protect mops.. 
2617 */ 2618 void 2619 vfs_parsemntopts(mntopts_t *mops, char *osp, int create) 2620 { 2621 char *s = osp, *p, *nextop, *valp, *cp, *ep; 2622 int setflg = VFS_NOFORCEOPT; 2623 2624 if (osp == NULL) 2625 return; 2626 while (*s != '\0') { 2627 p = strchr(s, ','); /* find next option */ 2628 if (p == NULL) { 2629 cp = NULL; 2630 p = s + strlen(s); 2631 } else { 2632 cp = p; /* save location of comma */ 2633 *p++ = '\0'; /* mark end and point to next option */ 2634 } 2635 nextop = p; 2636 p = strchr(s, '='); /* look for value */ 2637 if (p == NULL) { 2638 valp = NULL; /* no value supplied */ 2639 } else { 2640 ep = p; /* save location of equals */ 2641 *p++ = '\0'; /* end option and point to value */ 2642 valp = p; 2643 } 2644 /* 2645 * set option into options table 2646 */ 2647 if (create) 2648 setflg |= VFS_CREATEOPT; 2649 vfs_setmntopt_nolock(mops, s, valp, setflg, 0); 2650 if (cp != NULL) 2651 *cp = ','; /* restore the comma */ 2652 if (valp != NULL) 2653 *ep = '='; /* restore the equals */ 2654 s = nextop; 2655 } 2656 } 2657 2658 /* 2659 * Function to inquire if an option exists in a mount options table. 2660 * Returns a pointer to the option if it exists, else NULL. 2661 * 2662 * This function is *not* for general use by filesystems. 2663 * 2664 * Note: caller is responsible for locking the vfs list, if needed, 2665 * to protect mops. 2666 */ 2667 struct mntopt * 2668 vfs_hasopt(const mntopts_t *mops, const char *opt) 2669 { 2670 struct mntopt *mop; 2671 uint_t i, count; 2672 2673 count = mops->mo_count; 2674 for (i = 0; i < count; i++) { 2675 mop = &mops->mo_list[i]; 2676 2677 if (mop->mo_flags & MO_EMPTY) 2678 continue; 2679 if (strcmp(opt, mop->mo_name) == 0) 2680 return (mop); 2681 } 2682 return (NULL); 2683 } 2684 2685 /* 2686 * Function to inquire if an option is set in a mount options table. 2687 * Returns non-zero if set and fills in the arg pointer with a pointer to 2688 * the argument string or NULL if there is no argument string. 
2689 */ 2690 static int 2691 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp) 2692 { 2693 struct mntopt *mop; 2694 uint_t i, count; 2695 2696 count = mops->mo_count; 2697 for (i = 0; i < count; i++) { 2698 mop = &mops->mo_list[i]; 2699 2700 if (mop->mo_flags & MO_EMPTY) 2701 continue; 2702 if (strcmp(opt, mop->mo_name)) 2703 continue; 2704 if ((mop->mo_flags & MO_SET) == 0) 2705 return (0); 2706 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0) 2707 *argp = mop->mo_arg; 2708 return (1); 2709 } 2710 return (0); 2711 } 2712 2713 2714 int 2715 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp) 2716 { 2717 int ret; 2718 2719 vfs_list_read_lock(); 2720 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp); 2721 vfs_list_unlock(); 2722 return (ret); 2723 } 2724 2725 2726 /* 2727 * Construct a comma separated string of the options set in the given 2728 * mount table, return the string in the given buffer. Return non-zero if 2729 * the buffer would overflow. 2730 * 2731 * This function is *not* for general use by filesystems. 2732 * 2733 * Note: caller is responsible for locking the vfs list, if needed, 2734 * to protect mp. 
2735 */ 2736 int 2737 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len) 2738 { 2739 char *cp; 2740 uint_t i; 2741 2742 buf[0] = '\0'; 2743 cp = buf; 2744 for (i = 0; i < mp->mo_count; i++) { 2745 struct mntopt *mop; 2746 2747 mop = &mp->mo_list[i]; 2748 if (mop->mo_flags & MO_SET) { 2749 int optlen, comma = 0; 2750 2751 if (buf[0] != '\0') 2752 comma = 1; 2753 optlen = strlen(mop->mo_name); 2754 if (strlen(buf) + comma + optlen + 1 > len) 2755 goto err; 2756 if (comma) 2757 *cp++ = ','; 2758 (void) strcpy(cp, mop->mo_name); 2759 cp += optlen; 2760 /* 2761 * Append option value if there is one 2762 */ 2763 if (mop->mo_arg != NULL) { 2764 int arglen; 2765 2766 arglen = strlen(mop->mo_arg); 2767 if (strlen(buf) + arglen + 2 > len) 2768 goto err; 2769 *cp++ = '='; 2770 (void) strcpy(cp, mop->mo_arg); 2771 cp += arglen; 2772 } 2773 } 2774 } 2775 return (0); 2776 err: 2777 return (EOVERFLOW); 2778 } 2779 2780 static void 2781 vfs_freecancelopt(char **moc) 2782 { 2783 if (moc != NULL) { 2784 int ccnt = 0; 2785 char **cp; 2786 2787 for (cp = moc; *cp != NULL; cp++) { 2788 kmem_free(*cp, strlen(*cp) + 1); 2789 ccnt++; 2790 } 2791 kmem_free(moc, (ccnt + 1) * sizeof (char *)); 2792 } 2793 } 2794 2795 static void 2796 vfs_freeopt(mntopt_t *mop) 2797 { 2798 if (mop->mo_name != NULL) 2799 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1); 2800 2801 vfs_freecancelopt(mop->mo_cancel); 2802 2803 if (mop->mo_arg != NULL) 2804 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2805 } 2806 2807 /* 2808 * Free a mount options table 2809 * 2810 * This function is *not* for general use by filesystems. 2811 * 2812 * Note: caller is responsible for locking the vfs list, if needed, 2813 * to protect mp. 
2814 */ 2815 void 2816 vfs_freeopttbl(mntopts_t *mp) 2817 { 2818 uint_t i, count; 2819 2820 count = mp->mo_count; 2821 for (i = 0; i < count; i++) { 2822 vfs_freeopt(&mp->mo_list[i]); 2823 } 2824 if (count) { 2825 kmem_free(mp->mo_list, sizeof (mntopt_t) * count); 2826 mp->mo_count = 0; 2827 mp->mo_list = NULL; 2828 } 2829 } 2830 2831 2832 /* ARGSUSED */ 2833 static int 2834 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, 2835 caller_context_t *ct) 2836 { 2837 return (0); 2838 } 2839 2840 /* ARGSUSED */ 2841 static int 2842 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, 2843 caller_context_t *ct) 2844 { 2845 return (0); 2846 } 2847 2848 /* 2849 * The dummy vnode is currently used only by file events notification 2850 * module which is just interested in the timestamps. 2851 */ 2852 /* ARGSUSED */ 2853 static int 2854 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2855 caller_context_t *ct) 2856 { 2857 bzero(vap, sizeof (vattr_t)); 2858 vap->va_type = VREG; 2859 vap->va_nlink = 1; 2860 vap->va_ctime = vfs_mnttab_ctime; 2861 /* 2862 * it is ok to just copy mtime as the time will be monotonically 2863 * increasing. 
2864 */ 2865 vap->va_mtime = vfs_mnttab_mtime; 2866 vap->va_atime = vap->va_mtime; 2867 return (0); 2868 } 2869 2870 static void 2871 vfs_mnttabvp_setup(void) 2872 { 2873 vnode_t *tvp; 2874 vnodeops_t *vfs_mntdummyvnops; 2875 const fs_operation_def_t mnt_dummyvnodeops_template[] = { 2876 VOPNAME_READ, { .vop_read = vfs_mntdummyread }, 2877 VOPNAME_WRITE, { .vop_write = vfs_mntdummywrite }, 2878 VOPNAME_GETATTR, { .vop_getattr = vfs_mntdummygetattr }, 2879 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 2880 NULL, NULL 2881 }; 2882 2883 if (vn_make_ops("mnttab", mnt_dummyvnodeops_template, 2884 &vfs_mntdummyvnops) != 0) { 2885 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed"); 2886 /* Shouldn't happen, but not bad enough to panic */ 2887 return; 2888 } 2889 2890 /* 2891 * A global dummy vnode is allocated to represent mntfs files. 2892 * The mntfs file (/etc/mnttab) can be monitored for file events 2893 * and receive an event when mnttab changes. Dummy VOP calls 2894 * will be made on this vnode. The file events notification module 2895 * intercepts this vnode and delivers relevant events. 2896 */ 2897 tvp = vn_alloc(KM_SLEEP); 2898 tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE; 2899 vn_setops(tvp, vfs_mntdummyvnops); 2900 tvp->v_type = VREG; 2901 /* 2902 * The mnt dummy ops do not reference v_data. 2903 * No other module intercepting this vnode should either. 2904 * Just set it to point to itself. 
2905 */ 2906 tvp->v_data = (caddr_t)tvp; 2907 tvp->v_vfsp = rootvfs; 2908 vfs_mntdummyvp = tvp; 2909 } 2910 2911 /* 2912 * performs fake read/write ops 2913 */ 2914 static void 2915 vfs_mnttab_rwop(int rw) 2916 { 2917 struct uio uio; 2918 struct iovec iov; 2919 char buf[1]; 2920 2921 if (vfs_mntdummyvp == NULL) 2922 return; 2923 2924 bzero(&uio, sizeof (uio)); 2925 bzero(&iov, sizeof (iov)); 2926 iov.iov_base = buf; 2927 iov.iov_len = 0; 2928 uio.uio_iov = &iov; 2929 uio.uio_iovcnt = 1; 2930 uio.uio_loffset = 0; 2931 uio.uio_segflg = UIO_SYSSPACE; 2932 uio.uio_resid = 0; 2933 if (rw) { 2934 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL); 2935 } else { 2936 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL); 2937 } 2938 } 2939 2940 /* 2941 * Generate a write operation. 2942 */ 2943 void 2944 vfs_mnttab_writeop(void) 2945 { 2946 vfs_mnttab_rwop(1); 2947 } 2948 2949 /* 2950 * Generate a read operation. 2951 */ 2952 void 2953 vfs_mnttab_readop(void) 2954 { 2955 vfs_mnttab_rwop(0); 2956 } 2957 2958 /* 2959 * Free any mnttab information recorded in the vfs struct. 2960 * The vfs must not be on the vfs list. 
 */
static void
vfs_freemnttab(struct vfs *vfsp)
{
	ASSERT(!VFS_ON_LIST(vfsp));

	/*
	 * Free device and mount point information.  The pointers are
	 * cleared after their references are dropped.
	 */
	if (vfsp->vfs_mntpt != NULL) {
		refstr_rele(vfsp->vfs_mntpt);
		vfsp->vfs_mntpt = NULL;
	}
	if (vfsp->vfs_resource != NULL) {
		refstr_rele(vfsp->vfs_resource);
		vfsp->vfs_resource = NULL;
	}
	/*
	 * Now free mount options information
	 */
	vfs_freeopttbl(&vfsp->vfs_mntopts);
}

/*
 * Return the last mnttab modification time in *ts.
 * The caller must hold the vfs list lock (reader or writer).
 */
void
vfs_mnttab_modtime(timespec_t *ts)
{
	ASSERT(RW_LOCK_HELD(&vfslist));
	*ts = vfs_mnttab_mtime;
}

/*
 * See if mnttab is changed
 *
 * "old" is the modification time the caller last saw.  *phpp is set to
 * NULL if mnttab has changed since then, and to the mnttab pollhead
 * (&vfs_pollhd) if it has not.
 */
void
vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
{
	int changed;

	*phpp = (struct pollhead *)NULL;

	/*
	 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
	 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
	 * to not grab the vfs list lock because tv_sec is monotonically
	 * increasing.
	 */

	changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
	    (old->tv_sec != vfs_mnttab_mtime.tv_sec);
	if (!changed) {
		*phpp = &vfs_pollhd;
	}
}

/*
 * Provide a unique and monotonically-increasing timestamp.
 *
 * The last value handed out is kept in a static variable that is updated
 * with a compare-and-swap, so no lock is taken here.
 */
void
vfs_mono_time(timespec_t *ts)
{
	static volatile hrtime_t hrt;		/* The saved time. */
	hrtime_t	newhrt, oldhrt;		/* For effecting the CAS. */
	timespec_t	newts;

	/*
	 * Try gethrestime() first, but be prepared to fabricate a sensible
	 * answer at the first sign of any trouble.
	 */
	gethrestime(&newts);
	newhrt = ts2hrt(&newts);
	for (;;) {
		oldhrt = hrt;
		/* Never go backwards or repeat: use saved time + 1 instead. */
		if (newhrt <= hrt)
			newhrt = hrt + 1;
		/* Retry if the saved time changed since oldhrt was sampled. */
		if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt)
			break;
	}
	hrt2ts(newhrt, ts);
}

/*
 * Update the mnttab modification time and wake up any waiters for
 * mnttab changes.  The caller must hold the vfs list lock as writer.
 */
void
vfs_mnttab_modtimeupd()
{
	hrtime_t oldhrt, newhrt;

	ASSERT(RW_WRITE_HELD(&vfslist));
	oldhrt = ts2hrt(&vfs_mnttab_mtime);
	gethrestime(&vfs_mnttab_mtime);
	newhrt = ts2hrt(&vfs_mnttab_mtime);
	/* A zero previous mtime means first update: it also sets ctime. */
	if (oldhrt == (hrtime_t)0)
		vfs_mnttab_ctime = vfs_mnttab_mtime;
	/*
	 * Attempt to provide unique mtime (like uniqtime but not).
	 */
	if (newhrt == oldhrt) {
		newhrt++;
		hrt2ts(newhrt, &vfs_mnttab_mtime);
	}
	pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
	/* Fake a write on the mnttab dummy vnode. */
	vfs_mnttab_writeop();
}

/*
 * Unmount the file system vfsp.  If the vfs covers a vnode, the caller
 * must already hold that vnode's vfs write lock (asserted below); that lock
 * is released here whether or not the unmount succeeds.  Returns 0 on
 * success or the error returned by VFS_UNMOUNT().
 */
int
dounmount(struct vfs *vfsp, int flag, cred_t *cr)
{
	vnode_t *coveredvp;
	int error;
	extern void teardown_vopstats(vfs_t *);

	/*
	 * Get covered vnode. This will be NULL if the vfs is not linked
	 * into the file system name space (i.e., domount() with MNT_NOSPICE).
	 */
	coveredvp = vfsp->vfs_vnodecovered;
	ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));

	/*
	 * Purge all dnlc entries for this vfs.
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/* For forcible umount, skip VFS_SYNC() since it may hang */
	if ((flag & MS_FORCE) == 0)
		(void) VFS_SYNC(vfsp, 0, cr);

	/*
	 * Lock the vfs to maintain fs status quo during unmount.  This
	 * has to be done after the sync because ufs_update tries to acquire
	 * the vfs_reflock.
	 */
	vfs_lock_wait(vfsp);

	if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
		/* Unmount refused: drop both locks, leave the mount as is. */
		vfs_unlock(vfsp);
		if (coveredvp != NULL)
			vn_vfsunlock(coveredvp);
	} else if (coveredvp != NULL) {
		teardown_vopstats(vfsp);
		/*
		 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
		 * when it frees vfsp so we do a VN_HOLD() so we can
		 * continue to use coveredvp afterwards.
		 */
		VN_HOLD(coveredvp);
		vfs_remove(vfsp);
		vn_vfsunlock(coveredvp);
		VN_RELE(coveredvp);
	} else {
		teardown_vopstats(vfsp);
		/*
		 * Release the reference to vfs that is not linked
		 * into the name space.
		 */
		vfs_unlock(vfsp);
		VFS_RELE(vfsp);
	}
	return (error);
}


/*
 * Vfs_unmountall() is called by uadmin() to unmount all
 * mounted file systems (except the root file system) during shutdown.
 * It follows the existing locking protocol when traversing the vfs list
 * to sync and unmount vfses. Even though there should be no
 * other thread running while the system is shutting down, it is prudent
 * to still follow the locking protocol.
 */
void
vfs_unmountall(void)
{
	struct vfs *vfsp;
	struct vfs *prev_vfsp = NULL;
	int error;

	/*
	 * Toss all dnlc entries now so that the per-vfs sync
	 * and unmount operations don't have to slog through
	 * a bunch of uninteresting vnodes over and over again.
	 */
	dnlc_purge();

	vfs_list_lock();
	/* Walk the list backwards from the vfs just before the root. */
	for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
		prev_vfsp = vfsp->vfs_prev;

		/* Skip any vfs whose locks cannot be obtained right now. */
		if (vfs_lock(vfsp) != 0)
			continue;
		error = vn_vfswlock(vfsp->vfs_vnodecovered);
		vfs_unlock(vfsp);
		if (error)
			continue;

		vfs_list_unlock();

		(void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
		(void) dounmount(vfsp, 0, CRED());

		/*
		 * Since we dropped the vfslist lock above we must
		 * verify that prev_vfsp still exists, else start over.
		 */
		vfs_list_lock();
		for (vfsp = rootvfs->vfs_prev;
		    vfsp != rootvfs; vfsp = vfsp->vfs_prev)
			if (vfsp == prev_vfsp)
				break;
		if (vfsp == rootvfs && prev_vfsp != rootvfs)
			prev_vfsp = rootvfs->vfs_prev;
	}
	vfs_list_unlock();
}

/*
 * Called to add an entry to the end of the vfs mount in progress list
 */
void
vfs_addmip(dev_t dev, struct vfs *vfsp)
{
	struct ipmnt *mipp;

	/* Build the entry before taking the list mutex. */
	mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
	mipp->mip_next = NULL;
	mipp->mip_dev = dev;
	mipp->mip_vfsp = vfsp;
	mutex_enter(&vfs_miplist_mutex);
	/* Append after the current tail, or start the list if it is empty. */
	if (vfs_miplist_end != NULL)
		vfs_miplist_end->mip_next = mipp;
	else
		vfs_miplist = mipp;
	vfs_miplist_end = mipp;
	mutex_exit(&vfs_miplist_mutex);
}

/*
 * Called to remove an entry from the mount in progress list
 * Either because the mount completed or it failed.
3203 */ 3204 void 3205 vfs_delmip(struct vfs *vfsp) 3206 { 3207 struct ipmnt *mipp, *mipprev; 3208 3209 mutex_enter(&vfs_miplist_mutex); 3210 mipprev = NULL; 3211 for (mipp = vfs_miplist; 3212 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) { 3213 mipprev = mipp; 3214 } 3215 if (mipp == NULL) 3216 return; /* shouldn't happen */ 3217 if (mipp == vfs_miplist_end) 3218 vfs_miplist_end = mipprev; 3219 if (mipprev == NULL) 3220 vfs_miplist = mipp->mip_next; 3221 else 3222 mipprev->mip_next = mipp->mip_next; 3223 mutex_exit(&vfs_miplist_mutex); 3224 kmem_free(mipp, sizeof (struct ipmnt)); 3225 } 3226 3227 /* 3228 * vfs_add is called by a specific filesystem's mount routine to add 3229 * the new vfs into the vfs list/hash and to cover the mounted-on vnode. 3230 * The vfs should already have been locked by the caller. 3231 * 3232 * coveredvp is NULL if this is the root. 3233 */ 3234 void 3235 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag) 3236 { 3237 int newflag; 3238 3239 ASSERT(vfs_lock_held(vfsp)); 3240 VFS_HOLD(vfsp); 3241 newflag = vfsp->vfs_flag; 3242 if (mflag & MS_RDONLY) 3243 newflag |= VFS_RDONLY; 3244 else 3245 newflag &= ~VFS_RDONLY; 3246 if (mflag & MS_NOSUID) 3247 newflag |= (VFS_NOSETUID|VFS_NODEVICES); 3248 else 3249 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES); 3250 if (mflag & MS_NOMNTTAB) 3251 newflag |= VFS_NOMNTTAB; 3252 else 3253 newflag &= ~VFS_NOMNTTAB; 3254 3255 if (coveredvp != NULL) { 3256 ASSERT(vn_vfswlock_held(coveredvp)); 3257 coveredvp->v_vfsmountedhere = vfsp; 3258 VN_HOLD(coveredvp); 3259 } 3260 vfsp->vfs_vnodecovered = coveredvp; 3261 vfsp->vfs_flag = newflag; 3262 3263 vfs_list_add(vfsp); 3264 } 3265 3266 /* 3267 * Remove a vfs from the vfs list, null out the pointer from the 3268 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer 3269 * from the vfs to the covered vnode (vfs_vnodecovered). Release the 3270 * reference to the vfs and to the covered vnode. 
 *
 * Called from dounmount after it's confirmed with the file system
 * that the unmount is legal.
 *
 * The caller must hold the vfs lock and the covered vnode's vfs write
 * lock.  The vfs lock is released here; the covered vnode's lock is not.
 */
void
vfs_remove(struct vfs *vfsp)
{
	vnode_t *vp;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * Can't unmount root.  Should never happen because fs will
	 * be busy.
	 */
	if (vfsp == rootvfs)
		panic("vfs_remove: unmounting root");

	vfs_list_remove(vfsp);

	/*
	 * Unhook from the file system name space.
	 */
	vp = vfsp->vfs_vnodecovered;
	ASSERT(vn_vfswlock_held(vp));
	vp->v_vfsmountedhere = NULL;
	vfsp->vfs_vnodecovered = NULL;
	VN_RELE(vp);

	/*
	 * Release lock and wakeup anybody waiting.
	 */
	vfs_unlock(vfsp);
	VFS_RELE(vfsp);
}

/*
 * Lock a filesystem to prevent access to it while mounting,
 * unmounting and syncing.  Return EBUSY immediately if lock
 * can't be acquired.
 *
 * On success the lock entry obtained from vn_vfslocks_getlock() stays
 * referenced; the reference is dropped only on the EBUSY path here.
 */
int
vfs_lock(vfs_t *vfsp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	vpvfsentry = vn_vfslocks_getlock(vfsp);
	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
		return (0);

	/* Lock not acquired: give back the entry reference. */
	vn_vfslocks_rele(vpvfsentry);
	return (EBUSY);
}

/*
 * Reader variant of vfs_lock(): try to take the filesystem lock as
 * RW_READER, returning EBUSY immediately if it can't be acquired.
 */
int
vfs_rlock(vfs_t *vfsp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	vpvfsentry = vn_vfslocks_getlock(vfsp);

	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
		return (0);

	/* Lock not acquired: give back the entry reference. */
	vn_vfslocks_rele(vpvfsentry);
	return (EBUSY);
}

/*
 * Take the filesystem lock as RW_WRITER via rwst_enter() instead of the
 * try-enter used by vfs_lock().  The lock entry reference obtained here is
 * not released in this function.
 */
void
vfs_lock_wait(vfs_t *vfsp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	vpvfsentry = vn_vfslocks_getlock(vfsp);
	rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
}

/*
 * Take the filesystem lock as RW_READER via rwst_enter() instead of the
 * try-enter used by vfs_rlock().  The lock entry reference obtained here
 * is not released in this function.
 */
void
vfs_rlock_wait(vfs_t *vfsp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	vpvfsentry = vn_vfslocks_getlock(vfsp);
	rwst_enter(&vpvfsentry->ve_lock, RW_READER);
}

/*
 * Unlock a locked filesystem.
 */
void
vfs_unlock(vfs_t *vfsp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * vfs_unlock will mimic sema_v behaviour to fix 4748018.
	 * And these changes should remain for the patch changes as it is.
	 */
	if (panicstr)
		return;

	/*
	 * ve_refcount needs to be dropped twice here.
	 * 1. To release the reference taken by the call to
	 *    vn_vfslocks_getlock() just below.
	 * 2. To release the reference left held by the locking routines
	 *    (vfs_lock/vfs_rlock/vfs_lock_wait/vfs_rlock_wait).
	 */

	vpvfsentry = vn_vfslocks_getlock(vfsp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}

/*
 * Utility routine that allows a filesystem to construct its
 * fsid in "the usual way" - by munging some underlying dev_t and
 * the filesystem type number into the 64-bit fsid.  Note that
 * this implicitly relies on dev_t persistence to make filesystem
 * id's persistent.
 *
 * There's nothing to prevent an individual fs from constructing its
 * fsid in a different way, and indeed they should.
 *
 * Since we want fsids to be 32-bit quantities (so that they can be
 * exported identically by either 32-bit or 64-bit APIs, as well as
 * the fact that fsid's are "known" to NFS), we compress the device
 * number given down to 32-bits, and panic if that isn't possible.
 *
 * The compressed device number is stored in fsi->val[0] and "val" in
 * fsi->val[1].
 */
void
vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
{
	if (!cmpldev((dev32_t *)&fsi->val[0], dev))
		panic("device number too big for fsid!");
	fsi->val[1] = val;
}

/*
 * Return non-zero if the filesystem lock of vfsp is held as writer.
 * Always reports the lock as held once the system is panicking.
 */
int
vfs_lock_held(vfs_t *vfsp)
{
	int held;
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * vfs_lock_held will mimic sema_held behaviour
	 * if panicstr is set.  And these changes should remain
	 * for the patch changes as it is.
	 */
	if (panicstr)
		return (1);

	vpvfsentry = vn_vfslocks_getlock(vfsp);
	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);

	/* Only a query: drop the entry reference taken above. */
	vn_vfslocks_rele(vpvfsentry);
	return (held);
}

/*
 * Return the owner of the filesystem lock of vfsp as reported by
 * rwst_owner(), or NULL once the system is panicking.
 */
struct _kthread *
vfs_lock_owner(vfs_t *vfsp)
{
	struct _kthread *owner;
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * vfs_lock_owner will mimic sema_held behaviour
	 * if panicstr is set.  And these changes should remain
	 * for the patch changes as it is.
	 */
	if (panicstr)
		return (NULL);

	vpvfsentry = vn_vfslocks_getlock(vfsp);
	owner = rwst_owner(&vpvfsentry->ve_lock);

	/* Only a query: drop the entry reference taken above. */
	vn_vfslocks_rele(vpvfsentry);
	return (owner);
}

/*
 * vfs list locking.
 *
 * Rather than manipulate the vfslist lock directly, we abstract into lock
 * and unlock routines to allow the locking implementation to be changed for
 * clustering.
 *
 * Whenever the vfs list is modified through its hash links, the overall list
 * lock must be obtained before locking the relevant hash bucket.  But to see
 * whether a given vfs is on the list, it suffices to obtain the lock for the
 * hash bucket without getting the overall list lock.  (See getvfs() below.)
 */

/* Acquire the vfs list lock as writer. */
void
vfs_list_lock()
{
	rw_enter(&vfslist, RW_WRITER);
}

/* Acquire the vfs list lock as reader. */
void
vfs_list_read_lock()
{
	rw_enter(&vfslist, RW_READER);
}

/* Release the vfs list lock, whichever way it was acquired. */
void
vfs_list_unlock()
{
	rw_exit(&vfslist);
}

/*
 * Low level worker routines for adding entries to and removing entries from
 * the vfs list.
3485 */ 3486 3487 static void 3488 vfs_hash_add(struct vfs *vfsp, int insert_at_head) 3489 { 3490 int vhno; 3491 struct vfs **hp; 3492 dev_t dev; 3493 3494 ASSERT(RW_WRITE_HELD(&vfslist)); 3495 3496 dev = expldev(vfsp->vfs_fsid.val[0]); 3497 vhno = VFSHASH(getmajor(dev), getminor(dev)); 3498 3499 mutex_enter(&rvfs_list[vhno].rvfs_lock); 3500 3501 /* 3502 * Link into the hash table, inserting it at the end, so that LOFS 3503 * with the same fsid as UFS (or other) file systems will not hide the 3504 * UFS. 3505 */ 3506 if (insert_at_head) { 3507 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head; 3508 rvfs_list[vhno].rvfs_head = vfsp; 3509 } else { 3510 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL; 3511 hp = &(*hp)->vfs_hash) 3512 continue; 3513 /* 3514 * hp now contains the address of the pointer to update 3515 * to effect the insertion. 3516 */ 3517 vfsp->vfs_hash = NULL; 3518 *hp = vfsp; 3519 } 3520 3521 rvfs_list[vhno].rvfs_len++; 3522 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3523 } 3524 3525 3526 static void 3527 vfs_hash_remove(struct vfs *vfsp) 3528 { 3529 int vhno; 3530 struct vfs *tvfsp; 3531 dev_t dev; 3532 3533 ASSERT(RW_WRITE_HELD(&vfslist)); 3534 3535 dev = expldev(vfsp->vfs_fsid.val[0]); 3536 vhno = VFSHASH(getmajor(dev), getminor(dev)); 3537 3538 mutex_enter(&rvfs_list[vhno].rvfs_lock); 3539 3540 /* 3541 * Remove from hash. 
3542 */ 3543 if (rvfs_list[vhno].rvfs_head == vfsp) { 3544 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash; 3545 rvfs_list[vhno].rvfs_len--; 3546 goto foundit; 3547 } 3548 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL; 3549 tvfsp = tvfsp->vfs_hash) { 3550 if (tvfsp->vfs_hash == vfsp) { 3551 tvfsp->vfs_hash = vfsp->vfs_hash; 3552 rvfs_list[vhno].rvfs_len--; 3553 goto foundit; 3554 } 3555 } 3556 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash"); 3557 3558 foundit: 3559 3560 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3561 } 3562 3563 3564 void 3565 vfs_list_add(struct vfs *vfsp) 3566 { 3567 zone_t *zone; 3568 3569 /* 3570 * Typically, the vfs_t will have been created on behalf of the file 3571 * system in vfs_init, where it will have been provided with a 3572 * vfs_impl_t. This, however, might be lacking if the vfs_t was created 3573 * by an unbundled file system. We therefore check for such an example 3574 * before stamping the vfs_t with its creation time for the benefit of 3575 * mntfs. 3576 */ 3577 if (vfsp->vfs_implp == NULL) 3578 vfsimpl_setup(vfsp); 3579 vfs_mono_time(&vfsp->vfs_hrctime); 3580 3581 /* 3582 * The zone that owns the mount is the one that performed the mount. 3583 * Note that this isn't necessarily the same as the zone mounted into. 3584 * The corresponding zone_rele_ref() will be done when the vfs_t 3585 * is being free'd. 3586 */ 3587 vfsp->vfs_zone = curproc->p_zone; 3588 zone_init_ref(&vfsp->vfs_implp->vi_zone_ref); 3589 zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref, 3590 ZONE_REF_VFS); 3591 3592 /* 3593 * Find the zone mounted into, and put this mount on its vfs list. 3594 */ 3595 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3596 ASSERT(zone != NULL); 3597 /* 3598 * Special casing for the root vfs. This structure is allocated 3599 * statically and hooked onto rootvfs at link time. 
During the 3600 * vfs_mountroot call at system startup time, the root file system's 3601 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct 3602 * as argument. The code below must detect and handle this special 3603 * case. The only apparent justification for this special casing is 3604 * to ensure that the root file system appears at the head of the 3605 * list. 3606 * 3607 * XXX: I'm assuming that it's ok to do normal list locking when 3608 * adding the entry for the root file system (this used to be 3609 * done with no locks held). 3610 */ 3611 vfs_list_lock(); 3612 /* 3613 * Link into the vfs list proper. 3614 */ 3615 if (vfsp == &root) { 3616 /* 3617 * Assert: This vfs is already on the list as its first entry. 3618 * Thus, there's nothing to do. 3619 */ 3620 ASSERT(rootvfs == vfsp); 3621 /* 3622 * Add it to the head of the global zone's vfslist. 3623 */ 3624 ASSERT(zone == global_zone); 3625 ASSERT(zone->zone_vfslist == NULL); 3626 zone->zone_vfslist = vfsp; 3627 } else { 3628 /* 3629 * Link to end of list using vfs_prev (as rootvfs is now a 3630 * doubly linked circular list) so list is in mount order for 3631 * mnttab use. 3632 */ 3633 rootvfs->vfs_prev->vfs_next = vfsp; 3634 vfsp->vfs_prev = rootvfs->vfs_prev; 3635 rootvfs->vfs_prev = vfsp; 3636 vfsp->vfs_next = rootvfs; 3637 3638 /* 3639 * Do it again for the zone-private list (which may be NULL). 3640 */ 3641 if (zone->zone_vfslist == NULL) { 3642 ASSERT(zone != global_zone); 3643 zone->zone_vfslist = vfsp; 3644 } else { 3645 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp; 3646 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev; 3647 zone->zone_vfslist->vfs_zone_prev = vfsp; 3648 vfsp->vfs_zone_next = zone->zone_vfslist; 3649 } 3650 } 3651 3652 /* 3653 * Link into the hash table, inserting it at the end, so that LOFS 3654 * with the same fsid as UFS (or other) file systems will not hide 3655 * the UFS. 
3656 */ 3657 vfs_hash_add(vfsp, 0); 3658 3659 /* 3660 * Link into tree indexed by mntpoint, for vfs_mntpoint2vfsp 3661 * mntix discerns entries with the same key 3662 */ 3663 vfsp->vfs_mntix = ++vfs_curr_mntix; 3664 avl_add(&vfs_by_dev, vfsp); 3665 3666 /* 3667 * Link into tree indexed by dev, for vfs_devismounted 3668 */ 3669 avl_add(&vfs_by_mntpnt, vfsp); 3670 3671 /* 3672 * update the mnttab modification time 3673 */ 3674 vfs_mnttab_modtimeupd(); 3675 vfs_list_unlock(); 3676 zone_rele(zone); 3677 } 3678 3679 void 3680 vfs_list_remove(struct vfs *vfsp) 3681 { 3682 zone_t *zone; 3683 3684 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3685 ASSERT(zone != NULL); 3686 /* 3687 * Callers are responsible for preventing attempts to unmount the 3688 * root. 3689 */ 3690 ASSERT(vfsp != rootvfs); 3691 3692 vfs_list_lock(); 3693 3694 /* 3695 * Remove from avl trees 3696 */ 3697 avl_remove(&vfs_by_mntpnt, vfsp); 3698 avl_remove(&vfs_by_dev, vfsp); 3699 3700 /* 3701 * Remove from hash. 3702 */ 3703 vfs_hash_remove(vfsp); 3704 3705 /* 3706 * Remove from vfs list. 3707 */ 3708 vfsp->vfs_prev->vfs_next = vfsp->vfs_next; 3709 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev; 3710 vfsp->vfs_next = vfsp->vfs_prev = NULL; 3711 3712 /* 3713 * Remove from zone-specific vfs list. 
3714 */ 3715 if (zone->zone_vfslist == vfsp) 3716 zone->zone_vfslist = vfsp->vfs_zone_next; 3717 3718 if (vfsp->vfs_zone_next == vfsp) { 3719 ASSERT(vfsp->vfs_zone_prev == vfsp); 3720 ASSERT(zone->zone_vfslist == vfsp); 3721 zone->zone_vfslist = NULL; 3722 } 3723 3724 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next; 3725 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev; 3726 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL; 3727 3728 /* 3729 * update the mnttab modification time 3730 */ 3731 vfs_mnttab_modtimeupd(); 3732 vfs_list_unlock(); 3733 zone_rele(zone); 3734 } 3735 3736 struct vfs * 3737 getvfs(fsid_t *fsid) 3738 { 3739 struct vfs *vfsp; 3740 int val0 = fsid->val[0]; 3741 int val1 = fsid->val[1]; 3742 dev_t dev = expldev(val0); 3743 int vhno = VFSHASH(getmajor(dev), getminor(dev)); 3744 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock; 3745 3746 mutex_enter(hmp); 3747 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) { 3748 if (vfsp->vfs_fsid.val[0] == val0 && 3749 vfsp->vfs_fsid.val[1] == val1) { 3750 VFS_HOLD(vfsp); 3751 mutex_exit(hmp); 3752 return (vfsp); 3753 } 3754 } 3755 mutex_exit(hmp); 3756 return (NULL); 3757 } 3758 3759 /* 3760 * Search the vfs mount in progress list for a specified device/vfs entry. 3761 * Returns 0 if the first entry in the list that the device matches has the 3762 * given vfs pointer as well. If the device matches but a different vfs 3763 * pointer is encountered in the list before the given vfs pointer then 3764 * a 1 is returned. 3765 */ 3766 3767 int 3768 vfs_devmounting(dev_t dev, struct vfs *vfsp) 3769 { 3770 int retval = 0; 3771 struct ipmnt *mipp; 3772 3773 mutex_enter(&vfs_miplist_mutex); 3774 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) { 3775 if (mipp->mip_dev == dev) { 3776 if (mipp->mip_vfsp != vfsp) 3777 retval = 1; 3778 break; 3779 } 3780 } 3781 mutex_exit(&vfs_miplist_mutex); 3782 return (retval); 3783 } 3784 3785 /* 3786 * Search the vfs list for a specified device. 
Returns 1, if entry is found 3787 * or 0 if no suitable entry is found. 3788 */ 3789 3790 int 3791 vfs_devismounted(dev_t dev) 3792 { 3793 struct vfs *vfsp; 3794 int found = 0; 3795 struct vfs search; 3796 avl_index_t index; 3797 3798 search.vfs_dev = dev; 3799 search.vfs_mntix = 0; 3800 3801 vfs_list_read_lock(); 3802 3803 /* 3804 * there might be several entries with the same dev in the tree, 3805 * only discerned by mntix. To find the first, we start with a mntix 3806 * of 0. The search will fail. The following avl_nearest will give 3807 * us the actual first entry. 3808 */ 3809 VERIFY(avl_find(&vfs_by_dev, &search, &index) == NULL); 3810 vfsp = avl_nearest(&vfs_by_dev, index, AVL_AFTER); 3811 3812 if (vfsp != NULL && vfsp->vfs_dev == dev) 3813 found = 1; 3814 3815 vfs_list_unlock(); 3816 return (found); 3817 } 3818 3819 /* 3820 * Search the vfs list for a specified device. Returns a pointer to it 3821 * or NULL if no suitable entry is found. The caller of this routine 3822 * is responsible for releasing the returned vfs pointer. 3823 */ 3824 struct vfs * 3825 vfs_dev2vfsp(dev_t dev) 3826 { 3827 struct vfs *vfsp; 3828 int found; 3829 struct vfs search; 3830 avl_index_t index; 3831 3832 search.vfs_dev = dev; 3833 search.vfs_mntix = 0; 3834 3835 vfs_list_read_lock(); 3836 3837 /* 3838 * there might be several entries with the same dev in the tree, 3839 * only discerned by mntix. To find the first, we start with a mntix 3840 * of 0. The search will fail. The following avl_nearest will give 3841 * us the actual first entry. 3842 */ 3843 VERIFY(avl_find(&vfs_by_dev, &search, &index) == NULL); 3844 vfsp = avl_nearest(&vfs_by_dev, index, AVL_AFTER); 3845 3846 found = 0; 3847 while (vfsp != NULL && vfsp->vfs_dev == dev) { 3848 /* 3849 * The following could be made more efficient by making 3850 * the entire loop use vfs_zone_next if the call is from 3851 * a zone. 
The only callers, however, ustat(2) and 3852 * umount2(2), don't seem to justify the added 3853 * complexity at present. 3854 */ 3855 if (ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt), 3856 curproc->p_zone)) { 3857 VFS_HOLD(vfsp); 3858 found = 1; 3859 break; 3860 } 3861 vfsp = AVL_NEXT(&vfs_by_dev, vfsp); 3862 } 3863 vfs_list_unlock(); 3864 return (found ? vfsp : NULL); 3865 } 3866 3867 /* 3868 * Search the vfs list for a specified mntpoint. Returns a pointer to it 3869 * or NULL if no suitable entry is found. The caller of this routine 3870 * is responsible for releasing the returned vfs pointer. 3871 * 3872 * Note that if multiple mntpoints match, the last one matching is 3873 * returned in an attempt to return the "top" mount when overlay 3874 * mounts are covering the same mount point. This is accomplished by starting 3875 * at the end of the list and working our way backwards, stopping at the first 3876 * matching mount. 3877 */ 3878 struct vfs * 3879 vfs_mntpoint2vfsp(const char *mp) 3880 { 3881 struct vfs *vfsp; 3882 struct vfs *retvfsp = NULL; 3883 zone_t *zone = curproc->p_zone; 3884 struct vfs *list; 3885 3886 vfs_list_read_lock(); 3887 if (getzoneid() == GLOBAL_ZONEID) { 3888 /* 3889 * The global zone may see filesystems in any zone. 3890 */ 3891 struct vfs search; 3892 search.vfs_mntpt = refstr_alloc(mp); 3893 search.vfs_mntix = UINT64_MAX; 3894 avl_index_t index; 3895 3896 /* 3897 * there might be several entries with the same mntpnt in the 3898 * tree, only discerned by mntix. To find the last, we start 3899 * with a mntix of UINT64_MAX. The search will fail. The 3900 * following avl_nearest will give us the actual last entry 3901 * matching the mntpnt. 
3902 */ 3903 VERIFY(avl_find(&vfs_by_mntpnt, &search, &index) == 0); 3904 vfsp = avl_nearest(&vfs_by_mntpnt, index, AVL_BEFORE); 3905 3906 refstr_rele(search.vfs_mntpt); 3907 3908 if (vfsp != NULL && 3909 strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) 3910 retvfsp = vfsp; 3911 } else if ((list = zone->zone_vfslist) != NULL) { 3912 const char *mntpt; 3913 3914 vfsp = list->vfs_zone_prev; 3915 do { 3916 mntpt = refstr_value(vfsp->vfs_mntpt); 3917 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); 3918 if (strcmp(mntpt, mp) == 0) { 3919 retvfsp = vfsp; 3920 break; 3921 } 3922 vfsp = vfsp->vfs_zone_prev; 3923 } while (vfsp != list->vfs_zone_prev); 3924 } 3925 if (retvfsp) 3926 VFS_HOLD(retvfsp); 3927 vfs_list_unlock(); 3928 return (retvfsp); 3929 } 3930 3931 /* 3932 * Search the vfs list for a specified vfsops. 3933 * if vfs entry is found then return 1, else 0. 3934 */ 3935 int 3936 vfs_opsinuse(vfsops_t *ops) 3937 { 3938 struct vfs *vfsp; 3939 int found; 3940 3941 vfs_list_read_lock(); 3942 vfsp = rootvfs; 3943 found = 0; 3944 do { 3945 if (vfs_getops(vfsp) == ops) { 3946 found = 1; 3947 break; 3948 } 3949 vfsp = vfsp->vfs_next; 3950 } while (vfsp != rootvfs); 3951 vfs_list_unlock(); 3952 return (found); 3953 } 3954 3955 /* 3956 * Allocate an entry in vfssw for a file system type 3957 */ 3958 struct vfssw * 3959 allocate_vfssw(const char *type) 3960 { 3961 struct vfssw *vswp; 3962 3963 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) { 3964 /* 3965 * The vfssw table uses the empty string to identify an 3966 * available entry; we cannot add any type which has 3967 * a leading NUL. The string length is limited to 3968 * the size of the st_fstype array in struct stat. 
3969 */ 3970 return (NULL); 3971 } 3972 3973 ASSERT(VFSSW_WRITE_LOCKED()); 3974 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) 3975 if (!ALLOCATED_VFSSW(vswp)) { 3976 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP); 3977 (void) strcpy(vswp->vsw_name, type); 3978 ASSERT(vswp->vsw_count == 0); 3979 vswp->vsw_count = 1; 3980 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL); 3981 return (vswp); 3982 } 3983 return (NULL); 3984 } 3985 3986 /* 3987 * Impose additional layer of translation between vfstype names 3988 * and module names in the filesystem. 3989 */ 3990 static const char * 3991 vfs_to_modname(const char *vfstype) 3992 { 3993 if (strcmp(vfstype, "proc") == 0) { 3994 vfstype = "procfs"; 3995 } else if (strcmp(vfstype, "fd") == 0) { 3996 vfstype = "fdfs"; 3997 } else if (strncmp(vfstype, "nfs", 3) == 0) { 3998 vfstype = "nfs"; 3999 } 4000 4001 return (vfstype); 4002 } 4003 4004 /* 4005 * Find a vfssw entry given a file system type name. 4006 * Try to autoload the filesystem if it's not found. 4007 * If it's installed, return the vfssw locked to prevent unloading. 4008 */ 4009 struct vfssw * 4010 vfs_getvfssw(const char *type) 4011 { 4012 struct vfssw *vswp; 4013 const char *modname; 4014 4015 RLOCK_VFSSW(); 4016 vswp = vfs_getvfsswbyname(type); 4017 modname = vfs_to_modname(type); 4018 4019 if (rootdir == NULL) { 4020 /* 4021 * If we haven't yet loaded the root file system, then our 4022 * _init won't be called until later. Allocate vfssw entry, 4023 * because mod_installfs won't be called. 
4024 */ 4025 if (vswp == NULL) { 4026 RUNLOCK_VFSSW(); 4027 WLOCK_VFSSW(); 4028 if ((vswp = vfs_getvfsswbyname(type)) == NULL) { 4029 if ((vswp = allocate_vfssw(type)) == NULL) { 4030 WUNLOCK_VFSSW(); 4031 return (NULL); 4032 } 4033 } 4034 WUNLOCK_VFSSW(); 4035 RLOCK_VFSSW(); 4036 } 4037 if (!VFS_INSTALLED(vswp)) { 4038 RUNLOCK_VFSSW(); 4039 (void) modloadonly("fs", modname); 4040 } else 4041 RUNLOCK_VFSSW(); 4042 return (vswp); 4043 } 4044 4045 /* 4046 * Try to load the filesystem. Before calling modload(), we drop 4047 * our lock on the VFS switch table, and pick it up after the 4048 * module is loaded. However, there is a potential race: the 4049 * module could be unloaded after the call to modload() completes 4050 * but before we pick up the lock and drive on. Therefore, 4051 * we keep reloading the module until we've loaded the module 4052 * _and_ we have the lock on the VFS switch table. 4053 */ 4054 while (vswp == NULL || !VFS_INSTALLED(vswp)) { 4055 RUNLOCK_VFSSW(); 4056 if (modload("fs", modname) == -1) 4057 return (NULL); 4058 RLOCK_VFSSW(); 4059 if (vswp == NULL) 4060 if ((vswp = vfs_getvfsswbyname(type)) == NULL) 4061 break; 4062 } 4063 RUNLOCK_VFSSW(); 4064 4065 return (vswp); 4066 } 4067 4068 /* 4069 * Find a vfssw entry given a file system type name. 4070 */ 4071 struct vfssw * 4072 vfs_getvfsswbyname(const char *type) 4073 { 4074 struct vfssw *vswp; 4075 4076 ASSERT(VFSSW_LOCKED()); 4077 if (type == NULL || *type == '\0') 4078 return (NULL); 4079 4080 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 4081 if (strcmp(type, vswp->vsw_name) == 0) { 4082 vfs_refvfssw(vswp); 4083 return (vswp); 4084 } 4085 } 4086 4087 return (NULL); 4088 } 4089 4090 /* 4091 * Find a vfssw entry given a set of vfsops. 
4092 */ 4093 struct vfssw * 4094 vfs_getvfsswbyvfsops(vfsops_t *vfsops) 4095 { 4096 struct vfssw *vswp; 4097 4098 RLOCK_VFSSW(); 4099 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 4100 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) { 4101 vfs_refvfssw(vswp); 4102 RUNLOCK_VFSSW(); 4103 return (vswp); 4104 } 4105 } 4106 RUNLOCK_VFSSW(); 4107 4108 return (NULL); 4109 } 4110 4111 /* 4112 * Reference a vfssw entry. 4113 */ 4114 void 4115 vfs_refvfssw(struct vfssw *vswp) 4116 { 4117 4118 mutex_enter(&vswp->vsw_lock); 4119 vswp->vsw_count++; 4120 mutex_exit(&vswp->vsw_lock); 4121 } 4122 4123 /* 4124 * Unreference a vfssw entry. 4125 */ 4126 void 4127 vfs_unrefvfssw(struct vfssw *vswp) 4128 { 4129 4130 mutex_enter(&vswp->vsw_lock); 4131 vswp->vsw_count--; 4132 mutex_exit(&vswp->vsw_lock); 4133 } 4134 4135 int sync_timeout = 30; /* timeout for syncing a page during panic */ 4136 int sync_timeleft; /* portion of sync_timeout remaining */ 4137 4138 static int sync_retries = 20; /* number of retries when not making progress */ 4139 static int sync_triesleft; /* portion of sync_retries remaining */ 4140 4141 static pgcnt_t old_pgcnt, new_pgcnt; 4142 static int new_bufcnt, old_bufcnt; 4143 4144 /* 4145 * Sync all of the mounted filesystems, and then wait for the actual i/o to 4146 * complete. We wait by counting the number of dirty pages and buffers, 4147 * pushing them out using bio_busy() and page_busy(), and then counting again. 4148 * This routine is used during both the uadmin A_SHUTDOWN code as well as 4149 * the SYNC phase of the panic code (see comments in panic.c). It should only 4150 * be used after some higher-level mechanism has quiesced the system so that 4151 * new writes are not being initiated while we are waiting for completion. 
4152 * 4153 * To ensure finite running time, our algorithm uses two timeout mechanisms: 4154 * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and 4155 * sync_triesleft (a progress counter used by the vfs_syncall() loop below). 4156 * Together these ensure that syncing completes if our i/o paths are stuck. 4157 * The counters are declared above so they can be found easily in the debugger. 4158 * 4159 * The sync_timeleft counter is reset by bio_busy() and page_busy() using the 4160 * vfs_syncprogress() subroutine whenever we make progress through the lists of 4161 * pages and buffers. It is decremented and expired by the deadman() cyclic. 4162 * When vfs_syncall() decides it is done, we disable the deadman() counter by 4163 * setting sync_timeleft to zero. This timer guards against vfs_syncall() 4164 * deadlocking or hanging inside of a broken filesystem or driver routine. 4165 * 4166 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make 4167 * sync_retries consecutive calls to bio_busy() and page_busy() without 4168 * decreasing either the number of dirty buffers or dirty pages below the 4169 * lowest count we have seen so far, we give up and return from vfs_syncall(). 4170 * 4171 * Each loop iteration ends with a call to delay() one second to allow time for 4172 * i/o completion and to permit the user time to read our progress messages. 
4173 */ 4174 void 4175 vfs_syncall(void) 4176 { 4177 if (rootdir == NULL && !modrootloaded) 4178 return; /* panic during boot - no filesystems yet */ 4179 4180 printf("syncing file systems..."); 4181 vfs_syncprogress(); 4182 sync(); 4183 4184 vfs_syncprogress(); 4185 sync_triesleft = sync_retries; 4186 4187 old_bufcnt = new_bufcnt = INT_MAX; 4188 old_pgcnt = new_pgcnt = ULONG_MAX; 4189 4190 while (sync_triesleft > 0) { 4191 old_bufcnt = MIN(old_bufcnt, new_bufcnt); 4192 old_pgcnt = MIN(old_pgcnt, new_pgcnt); 4193 4194 new_bufcnt = bio_busy(B_TRUE); 4195 new_pgcnt = page_busy(B_TRUE); 4196 vfs_syncprogress(); 4197 4198 if (new_bufcnt == 0 && new_pgcnt == 0) 4199 break; 4200 4201 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt) 4202 sync_triesleft = sync_retries; 4203 else 4204 sync_triesleft--; 4205 4206 if (new_bufcnt) 4207 printf(" [%d]", new_bufcnt); 4208 if (new_pgcnt) 4209 printf(" %lu", new_pgcnt); 4210 4211 delay(hz); 4212 } 4213 4214 if (new_bufcnt != 0 || new_pgcnt != 0) 4215 printf(" done (not all i/o completed)\n"); 4216 else 4217 printf(" done\n"); 4218 4219 sync_timeleft = 0; 4220 delay(hz); 4221 } 4222 4223 /* 4224 * If we are in the middle of the sync phase of panic, reset sync_timeleft to 4225 * sync_timeout to indicate that we are making progress and the deadman() 4226 * omnipresent cyclic should not yet time us out. Note that it is safe to 4227 * store to sync_timeleft here since the deadman() is firing at high-level 4228 * on top of us. If we are racing with the deadman(), either the deadman() 4229 * will decrement the old value and then we will reset it, or we will 4230 * reset it and then the deadman() will immediately decrement it. In either 4231 * case, correct behavior results. 4232 */ 4233 void 4234 vfs_syncprogress(void) 4235 { 4236 if (panicstr) 4237 sync_timeleft = sync_timeout; 4238 } 4239 4240 /* 4241 * Map VFS flags to statvfs flags. These shouldn't really be separate 4242 * flags at all. 
4243 */ 4244 uint_t 4245 vf_to_stf(uint_t vf) 4246 { 4247 uint_t stf = 0; 4248 4249 if (vf & VFS_RDONLY) 4250 stf |= ST_RDONLY; 4251 if (vf & VFS_NOSETUID) 4252 stf |= ST_NOSUID; 4253 if (vf & VFS_NOTRUNC) 4254 stf |= ST_NOTRUNC; 4255 4256 return (stf); 4257 } 4258 4259 /* 4260 * Entries for (illegal) fstype 0. 4261 */ 4262 /* ARGSUSED */ 4263 int 4264 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr) 4265 { 4266 cmn_err(CE_PANIC, "stray vfs operation"); 4267 return (0); 4268 } 4269 4270 /* 4271 * Entries for (illegal) fstype 0. 4272 */ 4273 int 4274 vfsstray(void) 4275 { 4276 cmn_err(CE_PANIC, "stray vfs operation"); 4277 return (0); 4278 } 4279 4280 /* 4281 * Support for dealing with forced UFS unmount and its interaction with 4282 * LOFS. Could be used by any filesystem. 4283 * See bug 1203132. 4284 */ 4285 int 4286 vfs_EIO(void) 4287 { 4288 return (EIO); 4289 } 4290 4291 /* 4292 * We've gotta define the op for sync separately, since the compiler gets 4293 * confused if we mix and match ANSI and normal style prototypes when 4294 * a "short" argument is present and spits out a warning. 
4295 */ 4296 /*ARGSUSED*/ 4297 int 4298 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr) 4299 { 4300 return (EIO); 4301 } 4302 4303 vfs_t EIO_vfs; 4304 vfsops_t *EIO_vfsops; 4305 4306 /* 4307 * Called from startup() to initialize all loaded vfs's 4308 */ 4309 void 4310 vfsinit(void) 4311 { 4312 struct vfssw *vswp; 4313 int error; 4314 extern int vopstats_enabled; 4315 extern void vopstats_startup(); 4316 4317 static const fs_operation_def_t EIO_vfsops_template[] = { 4318 VFSNAME_MOUNT, { .error = vfs_EIO }, 4319 VFSNAME_UNMOUNT, { .error = vfs_EIO }, 4320 VFSNAME_ROOT, { .error = vfs_EIO }, 4321 VFSNAME_STATVFS, { .error = vfs_EIO }, 4322 VFSNAME_SYNC, { .vfs_sync = vfs_EIO_sync }, 4323 VFSNAME_VGET, { .error = vfs_EIO }, 4324 VFSNAME_MOUNTROOT, { .error = vfs_EIO }, 4325 VFSNAME_FREEVFS, { .error = vfs_EIO }, 4326 VFSNAME_VNSTATE, { .error = vfs_EIO }, 4327 NULL, NULL 4328 }; 4329 4330 static const fs_operation_def_t stray_vfsops_template[] = { 4331 VFSNAME_MOUNT, { .error = vfsstray }, 4332 VFSNAME_UNMOUNT, { .error = vfsstray }, 4333 VFSNAME_ROOT, { .error = vfsstray }, 4334 VFSNAME_STATVFS, { .error = vfsstray }, 4335 VFSNAME_SYNC, { .vfs_sync = vfsstray_sync }, 4336 VFSNAME_VGET, { .error = vfsstray }, 4337 VFSNAME_MOUNTROOT, { .error = vfsstray }, 4338 VFSNAME_FREEVFS, { .error = vfsstray }, 4339 VFSNAME_VNSTATE, { .error = vfsstray }, 4340 NULL, NULL 4341 }; 4342 4343 /* Create vfs cache */ 4344 vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs), 4345 sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0); 4346 4347 /* Initialize the vnode cache (file systems may use it during init). */ 4348 vn_create_cache(); 4349 4350 /* Setup event monitor framework */ 4351 fem_init(); 4352 4353 /* Initialize the dummy stray file system type. */ 4354 error = vfs_setfsops(0, stray_vfsops_template, NULL); 4355 4356 /* Initialize the dummy EIO file system. 
*/ 4357 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops); 4358 if (error != 0) { 4359 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template"); 4360 /* Shouldn't happen, but not bad enough to panic */ 4361 } 4362 4363 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL); 4364 4365 /* 4366 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup 4367 * on this vfs can immediately notice it's invalid. 4368 */ 4369 EIO_vfs.vfs_flag |= VFS_UNMOUNTED; 4370 4371 /* 4372 * Call the init routines of non-loadable filesystems only. 4373 * Filesystems which are loaded as separate modules will be 4374 * initialized by the module loading code instead. 4375 */ 4376 4377 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 4378 RLOCK_VFSSW(); 4379 if (vswp->vsw_init != NULL) 4380 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name); 4381 RUNLOCK_VFSSW(); 4382 } 4383 4384 vopstats_startup(); 4385 4386 if (vopstats_enabled) { 4387 /* EIO_vfs can collect stats, but we don't retrieve them */ 4388 initialize_vopstats(&EIO_vfs.vfs_vopstats); 4389 EIO_vfs.vfs_fstypevsp = NULL; 4390 EIO_vfs.vfs_vskap = NULL; 4391 EIO_vfs.vfs_flag |= VFS_STATS; 4392 } 4393 4394 xattr_init(); 4395 4396 reparse_point_init(); 4397 } 4398 4399 vfs_t * 4400 vfs_alloc(int kmflag) 4401 { 4402 vfs_t *vfsp; 4403 4404 vfsp = kmem_cache_alloc(vfs_cache, kmflag); 4405 4406 /* 4407 * Do the simplest initialization here. 4408 * Everything else gets done in vfs_init() 4409 */ 4410 bzero(vfsp, sizeof (vfs_t)); 4411 return (vfsp); 4412 } 4413 4414 void 4415 vfs_free(vfs_t *vfsp) 4416 { 4417 /* 4418 * One would be tempted to assert that "vfsp->vfs_count == 0". 4419 * The problem is that this gets called out of domount() with 4420 * a partially initialized vfs and a vfs_count of 1. This is 4421 * also called from vfs_rele() with a vfs_count of 0. We can't 4422 * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully 4423 * returned. 
This is because VFS_MOUNT() fully initializes the 4424 * vfs structure and its associated data. VFS_RELE() will call 4425 * VFS_FREEVFS() which may panic the system if the data structures 4426 * aren't fully initialized from a successful VFS_MOUNT()). 4427 */ 4428 4429 /* If FEM was in use, make sure everything gets cleaned up */ 4430 if (vfsp->vfs_femhead) { 4431 ASSERT(vfsp->vfs_femhead->femh_list == NULL); 4432 mutex_destroy(&vfsp->vfs_femhead->femh_lock); 4433 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead))); 4434 vfsp->vfs_femhead = NULL; 4435 } 4436 4437 if (vfsp->vfs_implp) 4438 vfsimpl_teardown(vfsp); 4439 sema_destroy(&vfsp->vfs_reflock); 4440 kmem_cache_free(vfs_cache, vfsp); 4441 } 4442 4443 /* 4444 * Increments the vfs reference count by one atomically. 4445 */ 4446 void 4447 vfs_hold(vfs_t *vfsp) 4448 { 4449 atomic_inc_32(&vfsp->vfs_count); 4450 ASSERT(vfsp->vfs_count != 0); 4451 } 4452 4453 /* 4454 * Decrements the vfs reference count by one atomically. When 4455 * vfs reference count becomes zero, it calls the file system 4456 * specific vfs_freevfs() to free up the resources. 4457 */ 4458 void 4459 vfs_rele(vfs_t *vfsp) 4460 { 4461 ASSERT(vfsp->vfs_count != 0); 4462 if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) { 4463 VFS_FREEVFS(vfsp); 4464 lofi_remove(vfsp); 4465 if (vfsp->vfs_zone) 4466 zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, 4467 ZONE_REF_VFS); 4468 vfs_freemnttab(vfsp); 4469 vfs_free(vfsp); 4470 } 4471 } 4472 4473 /* 4474 * Generic operations vector support. 4475 * 4476 * This is used to build operations vectors for both the vfs and vnode. 4477 * It's normally called only when a file system is loaded. 4478 * 4479 * There are many possible algorithms for this, including the following: 4480 * 4481 * (1) scan the list of known operations; for each, see if the file system 4482 * includes an entry for it, and fill it in as appropriate. 4483 * 4484 * (2) set up defaults for all known operations. 
scan the list of ops 4485 * supplied by the file system; for each which is both supplied and 4486 * known, fill it in. 4487 * 4488 * (3) sort the lists of known ops & supplied ops; scan the list, filling 4489 * in entries as we go. 4490 * 4491 * we choose (1) for simplicity, and because performance isn't critical here. 4492 * note that (2) could be sped up using a precomputed hash table on known ops. 4493 * (3) could be faster than either, but only if the lists were very large or 4494 * supplied in sorted order. 4495 * 4496 */ 4497 4498 int 4499 fs_build_vector(void *vector, int *unused_ops, 4500 const fs_operation_trans_def_t *translation, 4501 const fs_operation_def_t *operations) 4502 { 4503 int i, num_trans, num_ops, used; 4504 4505 /* 4506 * Count the number of translations and the number of supplied 4507 * operations. 4508 */ 4509 4510 { 4511 const fs_operation_trans_def_t *p; 4512 4513 for (num_trans = 0, p = translation; 4514 p->name != NULL; 4515 num_trans++, p++) 4516 ; 4517 } 4518 4519 { 4520 const fs_operation_def_t *p; 4521 4522 for (num_ops = 0, p = operations; 4523 p->name != NULL; 4524 num_ops++, p++) 4525 ; 4526 } 4527 4528 /* Walk through each operation known to our caller. There will be */ 4529 /* one entry in the supplied "translation table" for each. */ 4530 4531 used = 0; 4532 4533 for (i = 0; i < num_trans; i++) { 4534 int j, found; 4535 char *curname; 4536 fs_generic_func_p result; 4537 fs_generic_func_p *location; 4538 4539 curname = translation[i].name; 4540 4541 /* Look for a matching operation in the list supplied by the */ 4542 /* file system. */ 4543 4544 found = 0; 4545 4546 for (j = 0; j < num_ops; j++) { 4547 if (strcmp(operations[j].name, curname) == 0) { 4548 used++; 4549 found = 1; 4550 break; 4551 } 4552 } 4553 4554 /* 4555 * If the file system is using a "placeholder" for default 4556 * or error functions, grab the appropriate function out of 4557 * the translation table. 
If the file system didn't supply 4558 * this operation at all, use the default function. 4559 */ 4560 4561 if (found) { 4562 result = operations[j].func.fs_generic; 4563 if (result == fs_default) { 4564 result = translation[i].defaultFunc; 4565 } else if (result == fs_error) { 4566 result = translation[i].errorFunc; 4567 } else if (result == NULL) { 4568 /* Null values are PROHIBITED */ 4569 return (EINVAL); 4570 } 4571 } else { 4572 result = translation[i].defaultFunc; 4573 } 4574 4575 /* Now store the function into the operations vector. */ 4576 4577 location = (fs_generic_func_p *) 4578 (((char *)vector) + translation[i].offset); 4579 4580 *location = result; 4581 } 4582 4583 *unused_ops = num_ops - used; 4584 4585 return (0); 4586 } 4587 4588 /* Placeholder functions, should never be called. */ 4589 4590 int 4591 fs_error(void) 4592 { 4593 cmn_err(CE_PANIC, "fs_error called"); 4594 return (0); 4595 } 4596 4597 int 4598 fs_default(void) 4599 { 4600 cmn_err(CE_PANIC, "fs_default called"); 4601 return (0); 4602 } 4603 4604 #ifdef __sparc 4605 4606 /* 4607 * Part of the implementation of booting off a mirrored root 4608 * involves a change of dev_t for the root device. To 4609 * accomplish this, first remove the existing hash table 4610 * entry for the root device, convert to the new dev_t, 4611 * then re-insert in the hash table at the head of the list. 
4612 */ 4613 void 4614 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype) 4615 { 4616 vfs_list_lock(); 4617 4618 vfs_hash_remove(vfsp); 4619 4620 vfsp->vfs_dev = ndev; 4621 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype); 4622 4623 vfs_hash_add(vfsp, 1); 4624 4625 vfs_list_unlock(); 4626 } 4627 4628 #else /* x86 NEWBOOT */ 4629 4630 #if defined(__x86) 4631 extern int hvmboot_rootconf(); 4632 #endif /* __x86 */ 4633 4634 extern ib_boot_prop_t *iscsiboot_prop; 4635 4636 int 4637 rootconf() 4638 { 4639 int error; 4640 struct vfssw *vsw; 4641 extern void pm_init(); 4642 char *fstyp, *fsmod; 4643 int ret = -1; 4644 4645 getrootfs(&fstyp, &fsmod); 4646 4647 #if defined(__x86) 4648 /* 4649 * hvmboot_rootconf() is defined in the hvm_bootstrap misc module, 4650 * which lives in /platform/i86hvm, and hence is only available when 4651 * booted in an x86 hvm environment. If the hvm_bootstrap misc module 4652 * is not available then the modstub for this function will return 0. 4653 * If the hvm_bootstrap misc module is available it will be loaded 4654 * and hvmboot_rootconf() will be invoked. 
4655 */ 4656 if (error = hvmboot_rootconf()) 4657 return (error); 4658 #endif /* __x86 */ 4659 4660 if (error = clboot_rootconf()) 4661 return (error); 4662 4663 if (modload("fs", fsmod) == -1) 4664 panic("Cannot _init %s module", fsmod); 4665 4666 RLOCK_VFSSW(); 4667 vsw = vfs_getvfsswbyname(fstyp); 4668 RUNLOCK_VFSSW(); 4669 if (vsw == NULL) { 4670 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp); 4671 return (ENXIO); 4672 } 4673 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0); 4674 VFS_HOLD(rootvfs); 4675 4676 /* always mount readonly first */ 4677 rootvfs->vfs_flag |= VFS_RDONLY; 4678 4679 pm_init(); 4680 4681 if (netboot && iscsiboot_prop) { 4682 cmn_err(CE_WARN, "NFS boot and iSCSI boot" 4683 " shouldn't happen in the same time"); 4684 return (EINVAL); 4685 } 4686 4687 if (netboot || iscsiboot_prop) { 4688 ret = strplumb(); 4689 if (ret != 0) { 4690 cmn_err(CE_WARN, "Cannot plumb network device %d", ret); 4691 return (EFAULT); 4692 } 4693 } 4694 4695 if ((ret == 0) && iscsiboot_prop) { 4696 ret = modload("drv", "iscsi"); 4697 /* -1 indicates fail */ 4698 if (ret == -1) { 4699 cmn_err(CE_WARN, "Failed to load iscsi module"); 4700 iscsi_boot_prop_free(); 4701 return (EINVAL); 4702 } else { 4703 if (!i_ddi_attach_pseudo_node("iscsi")) { 4704 cmn_err(CE_WARN, 4705 "Failed to attach iscsi driver"); 4706 iscsi_boot_prop_free(); 4707 return (ENODEV); 4708 } 4709 } 4710 } 4711 4712 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT); 4713 vfs_unrefvfssw(vsw); 4714 rootdev = rootvfs->vfs_dev; 4715 4716 if (error) 4717 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n", 4718 rootfs.bo_name, fstyp); 4719 else 4720 cmn_err(CE_CONT, "?root on %s fstype %s\n", 4721 rootfs.bo_name, fstyp); 4722 return (error); 4723 } 4724 4725 /* 4726 * XXX this is called by nfs only and should probably be removed 4727 * If booted with ASKNAME, prompt on the console for a filesystem 4728 * name and return it. 
4729 */ 4730 void 4731 getfsname(char *askfor, char *name, size_t namelen) 4732 { 4733 if (boothowto & RB_ASKNAME) { 4734 printf("%s name: ", askfor); 4735 console_gets(name, namelen); 4736 } 4737 } 4738 4739 /* 4740 * Init the root filesystem type (rootfs.bo_fstype) from the "fstype" 4741 * property. 4742 * 4743 * Filesystem types starting with the prefix "nfs" are diskless clients; 4744 * init the root filename name (rootfs.bo_name), too. 4745 * 4746 * If we are booting via NFS we currently have these options: 4747 * nfs - dynamically choose NFS V2, V3, or V4 (default) 4748 * nfs2 - force NFS V2 4749 * nfs3 - force NFS V3 4750 * nfs4 - force NFS V4 4751 * Because we need to maintain backward compatibility with the naming 4752 * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c) 4753 * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs". The dynamic 4754 * nfs module will map the type back to either "nfs", "nfs3", or "nfs4". 4755 * This is only for root filesystems, all other uses will expect 4756 * that "nfs" == NFS V2. 4757 */ 4758 static void 4759 getrootfs(char **fstypp, char **fsmodp) 4760 { 4761 extern char *strplumb_get_netdev_path(void); 4762 char *propstr = NULL; 4763 4764 /* 4765 * Check fstype property; for diskless it should be one of "nfs", 4766 * "nfs2", "nfs3" or "nfs4". 4767 */ 4768 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4769 DDI_PROP_DONTPASS, "fstype", &propstr) 4770 == DDI_SUCCESS) { 4771 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME); 4772 ddi_prop_free(propstr); 4773 4774 /* 4775 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set, 4776 * assume the type of this root filesystem is 'zfs'. 
4777 */ 4778 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4779 DDI_PROP_DONTPASS, "zfs-bootfs", &propstr) 4780 == DDI_SUCCESS) { 4781 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME); 4782 ddi_prop_free(propstr); 4783 } 4784 4785 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) { 4786 *fstypp = *fsmodp = rootfs.bo_fstype; 4787 return; 4788 } 4789 4790 ++netboot; 4791 4792 if (strcmp(rootfs.bo_fstype, "nfs2") == 0) 4793 (void) strcpy(rootfs.bo_fstype, "nfs"); 4794 else if (strcmp(rootfs.bo_fstype, "nfs") == 0) 4795 (void) strcpy(rootfs.bo_fstype, "nfsdyn"); 4796 4797 /* 4798 * check if path to network interface is specified in bootpath 4799 * or by a hypervisor domain configuration file. 4800 * XXPV - enable strlumb_get_netdev_path() 4801 */ 4802 if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS, 4803 "xpv-nfsroot")) { 4804 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0"); 4805 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4806 DDI_PROP_DONTPASS, "bootpath", &propstr) 4807 == DDI_SUCCESS) { 4808 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME); 4809 ddi_prop_free(propstr); 4810 } else { 4811 /* attempt to determine netdev_path via boot_mac address */ 4812 netdev_path = strplumb_get_netdev_path(); 4813 if (netdev_path == NULL) 4814 panic("cannot find boot network interface"); 4815 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME); 4816 } 4817 *fstypp = rootfs.bo_fstype; 4818 *fsmodp = "nfs"; 4819 } 4820 #endif 4821 4822 /* 4823 * VFS feature routines 4824 */ 4825 4826 #define VFTINDEX(feature) (((feature) >> 32) & 0xFFFFFFFF) 4827 #define VFTBITS(feature) ((feature) & 0xFFFFFFFFLL) 4828 4829 /* Register a feature in the vfs */ 4830 void 4831 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature) 4832 { 4833 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */ 4834 if (vfsp->vfs_implp == NULL) 4835 return; 4836 4837 vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature); 4838 } 4839 
4840 void 4841 vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature) 4842 { 4843 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */ 4844 if (vfsp->vfs_implp == NULL) 4845 return; 4846 vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature); 4847 } 4848 4849 /* 4850 * Query a vfs for a feature. 4851 * Returns 1 if feature is present, 0 if not 4852 */ 4853 int 4854 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature) 4855 { 4856 int ret = 0; 4857 4858 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */ 4859 if (vfsp->vfs_implp == NULL) 4860 return (ret); 4861 4862 if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature)) 4863 ret = 1; 4864 4865 return (ret); 4866 } 4867 4868 /* 4869 * Propagate feature set from one vfs to another 4870 */ 4871 void 4872 vfs_propagate_features(vfs_t *from, vfs_t *to) 4873 { 4874 int i; 4875 4876 if (to->vfs_implp == NULL || from->vfs_implp == NULL) 4877 return; 4878 4879 for (i = 1; i <= to->vfs_featureset[0]; i++) { 4880 to->vfs_featureset[i] = from->vfs_featureset[i]; 4881 } 4882 } 4883 4884 #define LOFINODE_PATH "/dev/lofi/%d" 4885 4886 /* 4887 * Return the vnode for the lofi node if there's a lofi mount in place. 4888 * Returns -1 when there's no lofi node, 0 on success, and > 0 on 4889 * failure. 4890 */ 4891 int 4892 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp) 4893 { 4894 char *path = NULL; 4895 int strsize; 4896 int err; 4897 4898 if (vfsp->vfs_lofi_minor == 0) { 4899 *vpp = NULL; 4900 return (-1); 4901 } 4902 4903 strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_minor); 4904 path = kmem_alloc(strsize + 1, KM_SLEEP); 4905 (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_minor); 4906 4907 /* 4908 * We may be inside a zone, so we need to use the /dev path, but 4909 * it's created asynchronously, so we wait here. 
4910 */ 4911 for (;;) { 4912 err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp); 4913 4914 if (err != ENOENT) 4915 break; 4916 4917 if ((err = delay_sig(hz / 8)) == EINTR) 4918 break; 4919 } 4920 4921 if (err) 4922 *vpp = NULL; 4923 4924 kmem_free(path, strsize + 1); 4925 return (err); 4926 } 4927