1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2014, Joyent, Inc. All rights reserved. 24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2016 Toomas Soome <tsoome@me.com> 26 * Copyright (c) 2016 by Delphix. All rights reserved. 27 */ 28 29 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 30 /* All Rights Reserved */ 31 32 /* 33 * University Copyright- Copyright (c) 1982, 1986, 1988 34 * The Regents of the University of California 35 * All Rights Reserved 36 * 37 * University Acknowledgment- Portions of this document are derived from 38 * software developed by the University of California, Berkeley, and its 39 * contributors. 40 */ 41 42 #include <sys/types.h> 43 #include <sys/t_lock.h> 44 #include <sys/param.h> 45 #include <sys/errno.h> 46 #include <sys/user.h> 47 #include <sys/fstyp.h> 48 #include <sys/kmem.h> 49 #include <sys/systm.h> 50 #include <sys/proc.h> 51 #include <sys/mount.h> 52 #include <sys/vfs.h> 53 #include <sys/vfs_opreg.h> 54 #include <sys/fem.h> 55 #include <sys/mntent.h> 56 #include <sys/stat.h> 57 #include <sys/statvfs.h> 58 #include <sys/statfs.h> 59 #include <sys/cred.h> 60 #include <sys/vnode.h> 61 #include <sys/rwstlock.h> 62 #include <sys/dnlc.h> 63 #include <sys/file.h> 64 #include <sys/time.h> 65 #include <sys/atomic.h> 66 #include <sys/cmn_err.h> 67 #include <sys/buf.h> 68 #include <sys/swap.h> 69 #include <sys/debug.h> 70 #include <sys/vnode.h> 71 #include <sys/modctl.h> 72 #include <sys/ddi.h> 73 #include <sys/pathname.h> 74 #include <sys/bootconf.h> 75 #include <sys/dumphdr.h> 76 #include <sys/dc_ki.h> 77 #include <sys/poll.h> 78 #include <sys/sunddi.h> 79 #include <sys/sysmacros.h> 80 #include <sys/zone.h> 81 #include <sys/policy.h> 82 #include <sys/ctfs.h> 83 #include <sys/objfs.h> 84 #include <sys/console.h> 85 #include <sys/reboot.h> 86 #include <sys/attr.h> 87 #include <sys/zio.h> 88 #include <sys/spa.h> 89 #include <sys/lofi.h> 90 #include <sys/bootprops.h> 91 92 #include <vm/page.h> 93 94 #include <fs/fs_subr.h> 95 /* Private interfaces to create vopstats-related data structures */ 96 extern void initialize_vopstats(vopstats_t *); 97 extern vopstats_t *get_fstype_vopstats(struct vfs *, struct vfssw *); 98 extern vsk_anchor_t *get_vskstat_anchor(struct vfs *); 99 100 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int); 101 static void vfs_setmntopt_nolock(mntopts_t *, const char *, 102 const char *, int, int); 103 static int vfs_optionisset_nolock(const mntopts_t *, const char *, char **); 104 static void vfs_freemnttab(struct vfs *); 105 static void vfs_freeopt(mntopt_t *); 106 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *); 107 static void vfs_swapopttbl(mntopts_t *, mntopts_t *); 108 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int); 109 static void vfs_createopttbl_extend(mntopts_t *, const char *, 110 const mntopts_t *); 111 static char **vfs_copycancelopt_extend(char **const, int); 112 static void vfs_freecancelopt(char **); 113 static void getrootfs(char **, char **); 114 static int getmacpath(dev_info_t *, void *); 115 static void vfs_mnttabvp_setup(void); 116 117 struct ipmnt { 118 struct ipmnt *mip_next; 119 dev_t mip_dev; 120 struct vfs *mip_vfsp; 121 }; 122 123 static kmutex_t vfs_miplist_mutex; 124 static struct ipmnt *vfs_miplist = NULL; 125 static struct ipmnt *vfs_miplist_end = NULL; 126 127 static kmem_cache_t *vfs_cache; /* Pointer to VFS kmem cache */ 128 129 /* 130 * VFS global data. 131 */ 132 vnode_t *rootdir; /* pointer to root inode vnode. */ 133 vnode_t *devicesdir; /* pointer to inode of devices root */ 134 vnode_t *devdir; /* pointer to inode of dev root */ 135 136 char *server_rootpath; /* root path for diskless clients */ 137 char *server_hostname; /* hostname of diskless server */ 138 139 static struct vfs root; 140 static struct vfs devices; 141 static struct vfs dev; 142 struct vfs *rootvfs = &root; /* pointer to root vfs; head of VFS list. */ 143 rvfs_t *rvfs_list; /* array of vfs ptrs for vfs hash list */ 144 int vfshsz = 512; /* # of heads/locks in vfs hash arrays */ 145 /* must be power of 2! */ 146 timespec_t vfs_mnttab_ctime; /* mnttab created time */ 147 timespec_t vfs_mnttab_mtime; /* mnttab last modified time */ 148 char *vfs_dummyfstype = "\0"; 149 struct pollhead vfs_pollhd; /* for mnttab pollers */ 150 struct vnode *vfs_mntdummyvp; /* to fake mnttab read/write for file events */ 151 int mntfstype; /* will be set once mnt fs is mounted */ 152 153 /* 154 * Table for generic options recognized in the VFS layer and acted 155 * on at this level before parsing file system specific options. 156 * The nosuid option is stronger than any of the devices and setuid 157 * options, so those are canceled when nosuid is seen. 158 * 159 * All options which are added here need to be added to the 160 * list of standard options in usr/src/cmd/fs.d/fslib.c as well. 161 */ 162 /* 163 * VFS Mount options table 164 */ 165 static char *ro_cancel[] = { MNTOPT_RW, NULL }; 166 static char *rw_cancel[] = { MNTOPT_RO, NULL }; 167 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL }; 168 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES, 169 MNTOPT_NOSETUID, MNTOPT_SETUID, NULL }; 170 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL }; 171 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL }; 172 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL }; 173 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL }; 174 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL }; 175 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL }; 176 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL }; 177 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL }; 178 179 static const mntopt_t mntopts[] = { 180 /* 181 * option name cancel options default arg flags 182 */ 183 { MNTOPT_REMOUNT, NULL, NULL, 184 MO_NODISPLAY, (void *)0 }, 185 { MNTOPT_RO, ro_cancel, NULL, 0, 186 (void *)0 }, 187 { MNTOPT_RW, rw_cancel, NULL, 0, 188 (void *)0 }, 189 { MNTOPT_SUID, suid_cancel, NULL, 0, 190 (void *)0 }, 191 { MNTOPT_NOSUID, nosuid_cancel, NULL, 0, 192 (void *)0 }, 193 { MNTOPT_DEVICES, devices_cancel, NULL, 0, 194 (void *)0 }, 195 { MNTOPT_NODEVICES, nodevices_cancel, NULL, 0, 196 (void *)0 }, 197 { MNTOPT_SETUID, setuid_cancel, NULL, 0, 198 (void *)0 }, 199 { MNTOPT_NOSETUID, nosetuid_cancel, NULL, 0, 200 (void *)0 }, 201 { MNTOPT_NBMAND, nbmand_cancel, NULL, 0, 202 (void *)0 }, 203 { MNTOPT_NONBMAND, nonbmand_cancel, NULL, 0, 204 (void *)0 }, 205 { MNTOPT_EXEC, exec_cancel, NULL, 0, 206 (void *)0 }, 207 { MNTOPT_NOEXEC, noexec_cancel, NULL, 0, 208 (void *)0 }, 209 }; 210 211 const mntopts_t vfs_mntopts = { 212 sizeof (mntopts) / sizeof (mntopt_t), 213 (mntopt_t *)&mntopts[0] 214 }; 215 216 /* 217 * File system operation dispatch functions. 218 */ 219 220 int 221 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 222 { 223 return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr); 224 } 225 226 int 227 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr) 228 { 229 return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr); 230 } 231 232 int 233 fsop_root(vfs_t *vfsp, vnode_t **vpp) 234 { 235 refstr_t *mntpt; 236 int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp); 237 /* 238 * Make sure this root has a path. With lofs, it is possible to have 239 * a NULL mountpoint. 240 */ 241 if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { 242 mntpt = vfs_getmntpoint(vfsp); 243 vn_setpath_str(*vpp, refstr_value(mntpt), 244 strlen(refstr_value(mntpt))); 245 refstr_rele(mntpt); 246 } 247 248 return (ret); 249 } 250 251 int 252 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp) 253 { 254 return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp); 255 } 256 257 int 258 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr) 259 { 260 return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr); 261 } 262 263 int 264 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 265 { 266 /* 267 * In order to handle system attribute fids in a manner 268 * transparent to the underlying fs, we embed the fid for 269 * the sysattr parent object in the sysattr fid and tack on 270 * some extra bytes that only the sysattr layer knows about. 271 * 272 * This guarantees that sysattr fids are larger than other fids 273 * for this vfs. If the vfs supports the sysattr view interface 274 * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size 275 * collision with XATTR_FIDSZ. 276 */ 277 if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) && 278 fidp->fid_len == XATTR_FIDSZ) 279 return (xattr_dir_vget(vfsp, vpp, fidp)); 280 281 return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp); 282 } 283 284 int 285 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason) 286 { 287 return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason); 288 } 289 290 void 291 fsop_freefs(vfs_t *vfsp) 292 { 293 (*(vfsp)->vfs_op->vfs_freevfs)(vfsp); 294 } 295 296 int 297 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate) 298 { 299 return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate)); 300 } 301 302 int 303 fsop_sync_by_kind(int fstype, short flag, cred_t *cr) 304 { 305 ASSERT((fstype >= 0) && (fstype < nfstype)); 306 307 if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype])) 308 return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr); 309 else 310 return (ENOTSUP); 311 } 312 313 /* 314 * File system initialization. vfs_setfsops() must be called from a file 315 * system's init routine. 316 */ 317 318 static int 319 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual, 320 int *unused_ops) 321 { 322 static const fs_operation_trans_def_t vfs_ops_table[] = { 323 VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount), 324 fs_nosys, fs_nosys, 325 326 VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount), 327 fs_nosys, fs_nosys, 328 329 VFSNAME_ROOT, offsetof(vfsops_t, vfs_root), 330 fs_nosys, fs_nosys, 331 332 VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs), 333 fs_nosys, fs_nosys, 334 335 VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync), 336 (fs_generic_func_p) fs_sync, 337 (fs_generic_func_p) fs_sync, /* No errors allowed */ 338 339 VFSNAME_VGET, offsetof(vfsops_t, vfs_vget), 340 fs_nosys, fs_nosys, 341 342 VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot), 343 fs_nosys, fs_nosys, 344 345 VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs), 346 (fs_generic_func_p)fs_freevfs, 347 (fs_generic_func_p)fs_freevfs, /* Shouldn't fail */ 348 349 VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate), 350 (fs_generic_func_p)fs_nosys, 351 (fs_generic_func_p)fs_nosys, 352 353 NULL, 0, NULL, NULL 354 }; 355 356 return (fs_build_vector(actual, unused_ops, vfs_ops_table, template)); 357 } 358 359 void 360 zfs_boot_init(void) 361 { 362 if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0) 363 spa_boot_init(); 364 } 365 366 int 367 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual) 368 { 369 int error; 370 int unused_ops; 371 372 /* 373 * Verify that fstype refers to a valid fs. Note that 374 * 0 is valid since it's used to set "stray" ops. 375 */ 376 if ((fstype < 0) || (fstype >= nfstype)) 377 return (EINVAL); 378 379 if (!ALLOCATED_VFSSW(&vfssw[fstype])) 380 return (EINVAL); 381 382 /* Set up the operations vector. */ 383 384 error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops); 385 386 if (error != 0) 387 return (error); 388 389 vfssw[fstype].vsw_flag |= VSW_INSTALLED; 390 391 if (actual != NULL) 392 *actual = &vfssw[fstype].vsw_vfsops; 393 394 #if DEBUG 395 if (unused_ops != 0) 396 cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied " 397 "but not used", vfssw[fstype].vsw_name, unused_ops); 398 #endif 399 400 return (0); 401 } 402 403 int 404 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual) 405 { 406 int error; 407 int unused_ops; 408 409 *actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP); 410 411 error = fs_copyfsops(template, *actual, &unused_ops); 412 if (error != 0) { 413 kmem_free(*actual, sizeof (vfsops_t)); 414 *actual = NULL; 415 return (error); 416 } 417 418 return (0); 419 } 420 421 /* 422 * Free a vfsops structure created as a result of vfs_makefsops(). 423 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use 424 * vfs_freevfsops_by_type(). 425 */ 426 void 427 vfs_freevfsops(vfsops_t *vfsops) 428 { 429 kmem_free(vfsops, sizeof (vfsops_t)); 430 } 431 432 /* 433 * Since the vfsops structure is part of the vfssw table and wasn't 434 * really allocated, we're not really freeing anything. We keep 435 * the name for consistency with vfs_freevfsops(). We do, however, 436 * need to take care of a little bookkeeping. 437 * NOTE: For a vfsops structure created by vfs_setfsops(), use 438 * vfs_freevfsops_by_type(). 439 */ 440 int 441 vfs_freevfsops_by_type(int fstype) 442 { 443 444 /* Verify that fstype refers to a loaded fs (and not fsid 0). */ 445 if ((fstype <= 0) || (fstype >= nfstype)) 446 return (EINVAL); 447 448 WLOCK_VFSSW(); 449 if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) { 450 WUNLOCK_VFSSW(); 451 return (EINVAL); 452 } 453 454 vfssw[fstype].vsw_flag &= ~VSW_INSTALLED; 455 WUNLOCK_VFSSW(); 456 457 return (0); 458 } 459 460 /* Support routines used to reference vfs_op */ 461 462 /* Set the operations vector for a vfs */ 463 void 464 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops) 465 { 466 vfsops_t *op; 467 468 ASSERT(vfsp != NULL); 469 ASSERT(vfsops != NULL); 470 471 op = vfsp->vfs_op; 472 membar_consumer(); 473 if (vfsp->vfs_femhead == NULL && 474 atomic_cas_ptr(&vfsp->vfs_op, op, vfsops) == op) { 475 return; 476 } 477 fsem_setvfsops(vfsp, vfsops); 478 } 479 480 /* Retrieve the operations vector for a vfs */ 481 vfsops_t * 482 vfs_getops(vfs_t *vfsp) 483 { 484 vfsops_t *op; 485 486 ASSERT(vfsp != NULL); 487 488 op = vfsp->vfs_op; 489 membar_consumer(); 490 if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) { 491 return (op); 492 } else { 493 return (fsem_getvfsops(vfsp)); 494 } 495 } 496 497 /* 498 * Returns non-zero (1) if the vfsops matches that of the vfs. 499 * Returns zero (0) if not. 500 */ 501 int 502 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops) 503 { 504 return (vfs_getops(vfsp) == vfsops); 505 } 506 507 /* 508 * Returns non-zero (1) if the file system has installed a non-default, 509 * non-error vfs_sync routine. Returns zero (0) otherwise. 510 */ 511 int 512 vfs_can_sync(vfs_t *vfsp) 513 { 514 /* vfs_sync() routine is not the default/error function */ 515 return (vfs_getops(vfsp)->vfs_sync != fs_sync); 516 } 517 518 /* 519 * Initialize a vfs structure. 520 */ 521 void 522 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data) 523 { 524 /* Other initialization has been moved to vfs_alloc() */ 525 vfsp->vfs_count = 0; 526 vfsp->vfs_next = vfsp; 527 vfsp->vfs_prev = vfsp; 528 vfsp->vfs_zone_next = vfsp; 529 vfsp->vfs_zone_prev = vfsp; 530 vfsp->vfs_lofi_id = 0; 531 sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL); 532 vfsimpl_setup(vfsp); 533 vfsp->vfs_data = (data); 534 vfs_setops((vfsp), (op)); 535 } 536 537 /* 538 * Allocate and initialize the vfs implementation private data 539 * structure, vfs_impl_t. 540 */ 541 void 542 vfsimpl_setup(vfs_t *vfsp) 543 { 544 int i; 545 546 if (vfsp->vfs_implp != NULL) { 547 return; 548 } 549 550 vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP); 551 /* Note that these are #define'd in vfs.h */ 552 vfsp->vfs_vskap = NULL; 553 vfsp->vfs_fstypevsp = NULL; 554 555 /* Set size of counted array, then zero the array */ 556 vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1; 557 for (i = 1; i < VFS_FEATURE_MAXSZ; i++) { 558 vfsp->vfs_featureset[i] = 0; 559 } 560 } 561 562 /* 563 * Release the vfs_impl_t structure, if it exists. Some unbundled 564 * filesystems may not use the newer version of vfs and thus 565 * would not contain this implementation private data structure. 566 */ 567 void 568 vfsimpl_teardown(vfs_t *vfsp) 569 { 570 vfs_impl_t *vip = vfsp->vfs_implp; 571 572 if (vip == NULL) 573 return; 574 575 kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t)); 576 vfsp->vfs_implp = NULL; 577 } 578 579 /* 580 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs, 581 * fstatvfs, and sysfs moved to common/syscall. 582 */ 583 584 /* 585 * Update every mounted file system. We call the vfs_sync operation of 586 * each file system type, passing it a NULL vfsp to indicate that all 587 * mounted file systems of that type should be updated. 588 */ 589 void 590 vfs_sync(int flag) 591 { 592 struct vfssw *vswp; 593 RLOCK_VFSSW(); 594 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 595 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) { 596 vfs_refvfssw(vswp); 597 RUNLOCK_VFSSW(); 598 (void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag, 599 CRED()); 600 vfs_unrefvfssw(vswp); 601 RLOCK_VFSSW(); 602 } 603 } 604 RUNLOCK_VFSSW(); 605 } 606 607 void 608 sync(void) 609 { 610 vfs_sync(0); 611 } 612 613 /* 614 * External routines. 615 */ 616 617 krwlock_t vfssw_lock; /* lock accesses to vfssw */ 618 619 /* 620 * Lock for accessing the vfs linked list. Initialized in vfs_mountroot(), 621 * but otherwise should be accessed only via vfs_list_lock() and 622 * vfs_list_unlock(). Also used to protect the timestamp for mods to the list. 623 */ 624 static krwlock_t vfslist; 625 626 /* 627 * Mount devfs on /devices. This is done right after root is mounted 628 * to provide device access support for the system 629 */ 630 static void 631 vfs_mountdevices(void) 632 { 633 struct vfssw *vsw; 634 struct vnode *mvp; 635 struct mounta mounta = { /* fake mounta for devfs_mount() */ 636 NULL, 637 NULL, 638 MS_SYSSPACE, 639 NULL, 640 NULL, 641 0, 642 NULL, 643 0 644 }; 645 646 /* 647 * _init devfs module to fill in the vfssw 648 */ 649 if (modload("fs", "devfs") == -1) 650 panic("Cannot _init devfs module"); 651 652 /* 653 * Hold vfs 654 */ 655 RLOCK_VFSSW(); 656 vsw = vfs_getvfsswbyname("devfs"); 657 VFS_INIT(&devices, &vsw->vsw_vfsops, NULL); 658 VFS_HOLD(&devices); 659 660 /* 661 * Locate mount point 662 */ 663 if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 664 panic("Cannot find /devices"); 665 666 /* 667 * Perform the mount of /devices 668 */ 669 if (VFS_MOUNT(&devices, mvp, &mounta, CRED())) 670 panic("Cannot mount /devices"); 671 672 RUNLOCK_VFSSW(); 673 674 /* 675 * Set appropriate members and add to vfs list for mnttab display 676 */ 677 vfs_setresource(&devices, "/devices", 0); 678 vfs_setmntpoint(&devices, "/devices", 0); 679 680 /* 681 * Hold the root of /devices so it won't go away 682 */ 683 if (VFS_ROOT(&devices, &devicesdir)) 684 panic("vfs_mountdevices: not devices root"); 685 686 if (vfs_lock(&devices) != 0) { 687 VN_RELE(devicesdir); 688 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices"); 689 return; 690 } 691 692 if (vn_vfswlock(mvp) != 0) { 693 vfs_unlock(&devices); 694 VN_RELE(devicesdir); 695 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices"); 696 return; 697 } 698 699 vfs_add(mvp, &devices, 0); 700 vn_vfsunlock(mvp); 701 vfs_unlock(&devices); 702 VN_RELE(devicesdir); 703 } 704 705 /* 706 * mount the first instance of /dev to root and remain mounted 707 */ 708 static void 709 vfs_mountdev1(void) 710 { 711 struct vfssw *vsw; 712 struct vnode *mvp; 713 struct mounta mounta = { /* fake mounta for sdev_mount() */ 714 NULL, 715 NULL, 716 MS_SYSSPACE | MS_OVERLAY, 717 NULL, 718 NULL, 719 0, 720 NULL, 721 0 722 }; 723 724 /* 725 * _init dev module to fill in the vfssw 726 */ 727 if (modload("fs", "dev") == -1) 728 cmn_err(CE_PANIC, "Cannot _init dev module\n"); 729 730 /* 731 * Hold vfs 732 */ 733 RLOCK_VFSSW(); 734 vsw = vfs_getvfsswbyname("dev"); 735 VFS_INIT(&dev, &vsw->vsw_vfsops, NULL); 736 VFS_HOLD(&dev); 737 738 /* 739 * Locate mount point 740 */ 741 if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) 742 cmn_err(CE_PANIC, "Cannot find /dev\n"); 743 744 /* 745 * Perform the mount of /dev 746 */ 747 if (VFS_MOUNT(&dev, mvp, &mounta, CRED())) 748 cmn_err(CE_PANIC, "Cannot mount /dev 1\n"); 749 750 RUNLOCK_VFSSW(); 751 752 /* 753 * Set appropriate members and add to vfs list for mnttab display 754 */ 755 vfs_setresource(&dev, "/dev", 0); 756 vfs_setmntpoint(&dev, "/dev", 0); 757 758 /* 759 * Hold the root of /dev so it won't go away 760 */ 761 if (VFS_ROOT(&dev, &devdir)) 762 cmn_err(CE_PANIC, "vfs_mountdev1: not dev root"); 763 764 if (vfs_lock(&dev) != 0) { 765 VN_RELE(devdir); 766 cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev"); 767 return; 768 } 769 770 if (vn_vfswlock(mvp) != 0) { 771 vfs_unlock(&dev); 772 VN_RELE(devdir); 773 cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev"); 774 return; 775 } 776 777 vfs_add(mvp, &dev, 0); 778 vn_vfsunlock(mvp); 779 vfs_unlock(&dev); 780 VN_RELE(devdir); 781 } 782 783 /* 784 * Mount required filesystem. This is done right after root is mounted. 785 */ 786 static void 787 vfs_mountfs(char *module, char *spec, char *path) 788 { 789 struct vnode *mvp; 790 struct mounta mounta; 791 vfs_t *vfsp; 792 793 mounta.flags = MS_SYSSPACE | MS_DATA; 794 mounta.fstype = module; 795 mounta.spec = spec; 796 mounta.dir = path; 797 if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) { 798 cmn_err(CE_WARN, "Cannot find %s", path); 799 return; 800 } 801 if (domount(NULL, &mounta, mvp, CRED(), &vfsp)) 802 cmn_err(CE_WARN, "Cannot mount %s", path); 803 else 804 VFS_RELE(vfsp); 805 VN_RELE(mvp); 806 } 807 808 /* 809 * vfs_mountroot is called by main() to mount the root filesystem. 810 */ 811 void 812 vfs_mountroot(void) 813 { 814 struct vnode *rvp = NULL; 815 char *path; 816 size_t plen; 817 struct vfssw *vswp; 818 proc_t *p; 819 820 rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL); 821 rw_init(&vfslist, NULL, RW_DEFAULT, NULL); 822 823 /* 824 * Alloc the vfs hash bucket array and locks 825 */ 826 rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP); 827 828 /* 829 * Call machine-dependent routine "rootconf" to choose a root 830 * file system type. 831 */ 832 if (rootconf()) 833 panic("vfs_mountroot: cannot mount root"); 834 /* 835 * Get vnode for '/'. Set up rootdir, u.u_rdir and u.u_cdir 836 * to point to it. These are used by lookuppn() so that it 837 * knows where to start from ('/' or '.'). 838 */ 839 vfs_setmntpoint(rootvfs, "/", 0); 840 if (VFS_ROOT(rootvfs, &rootdir)) 841 panic("vfs_mountroot: no root vnode"); 842 843 /* 844 * At this point, the process tree consists of p0 and possibly some 845 * direct children of p0. (i.e. there are no grandchildren) 846 * 847 * Walk through them all, setting their current directory. 848 */ 849 mutex_enter(&pidlock); 850 for (p = practive; p != NULL; p = p->p_next) { 851 ASSERT(p == &p0 || p->p_parent == &p0); 852 853 PTOU(p)->u_cdir = rootdir; 854 VN_HOLD(PTOU(p)->u_cdir); 855 PTOU(p)->u_rdir = NULL; 856 } 857 mutex_exit(&pidlock); 858 859 /* 860 * Setup the global zone's rootvp, now that it exists. 861 */ 862 global_zone->zone_rootvp = rootdir; 863 VN_HOLD(global_zone->zone_rootvp); 864 865 /* 866 * Notify the module code that it can begin using the 867 * root filesystem instead of the boot program's services. 868 */ 869 modrootloaded = 1; 870 871 /* 872 * Special handling for a ZFS root file system. 873 */ 874 zfs_boot_init(); 875 876 /* 877 * Set up mnttab information for root 878 */ 879 vfs_setresource(rootvfs, rootfs.bo_name, 0); 880 881 /* 882 * Notify cluster software that the root filesystem is available. 883 */ 884 clboot_mountroot(); 885 886 /* Now that we're all done with the root FS, set up its vopstats */ 887 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) { 888 /* Set flag for statistics collection */ 889 if (vswp->vsw_flag & VSW_STATS) { 890 initialize_vopstats(&rootvfs->vfs_vopstats); 891 rootvfs->vfs_flag |= VFS_STATS; 892 rootvfs->vfs_fstypevsp = 893 get_fstype_vopstats(rootvfs, vswp); 894 rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs); 895 } 896 vfs_unrefvfssw(vswp); 897 } 898 899 /* 900 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab, 901 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc. 902 */ 903 vfs_mountdevices(); 904 vfs_mountdev1(); 905 906 vfs_mountfs("ctfs", "ctfs", CTFS_ROOT); 907 vfs_mountfs("proc", "/proc", "/proc"); 908 vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); 909 vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); 910 vfs_mountfs("objfs", "objfs", OBJFS_ROOT); 911 vfs_mountfs("bootfs", "bootfs", "/system/boot"); 912 913 if (getzoneid() == GLOBAL_ZONEID) { 914 vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab"); 915 } 916 917 #ifdef __sparc 918 /* 919 * This bit of magic can go away when we convert sparc to 920 * the new boot architecture based on ramdisk. 921 * 922 * Booting off a mirrored root volume: 923 * At this point, we have booted and mounted root on a 924 * single component of the mirror. Complete the boot 925 * by configuring SVM and converting the root to the 926 * dev_t of the mirrored root device. This dev_t conversion 927 * only works because the underlying device doesn't change. 928 */ 929 if (root_is_svm) { 930 if (svm_rootconf()) { 931 panic("vfs_mountroot: cannot remount root"); 932 } 933 934 /* 935 * mnttab should reflect the new root device 936 */ 937 vfs_lock_wait(rootvfs); 938 vfs_setresource(rootvfs, rootfs.bo_name, 0); 939 vfs_unlock(rootvfs); 940 } 941 #endif /* __sparc */ 942 943 if (strcmp(rootfs.bo_fstype, "zfs") != 0) { 944 /* 945 * Look up the root device via devfs so that a dv_node is 946 * created for it. The vnode is never VN_RELE()ed. 947 * We allocate more than MAXPATHLEN so that the 948 * buffer passed to i_ddi_prompath_to_devfspath() is 949 * exactly MAXPATHLEN (the function expects a buffer 950 * of that length). 951 */ 952 plen = strlen("/devices"); 953 path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); 954 (void) strcpy(path, "/devices"); 955 956 if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) 957 != DDI_SUCCESS || 958 lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { 959 960 /* NUL terminate in case "path" has garbage */ 961 path[plen + MAXPATHLEN - 1] = '\0'; 962 #ifdef DEBUG 963 cmn_err(CE_WARN, "!Cannot lookup root device: %s", 964 path); 965 #endif 966 } 967 kmem_free(path, plen + MAXPATHLEN); 968 } 969 970 vfs_mnttabvp_setup(); 971 } 972 973 /* 974 * Check to see if our "block device" is actually a file. If so, 975 * automatically add a lofi device, and keep track of this fact. 976 */ 977 static int 978 lofi_add(const char *fsname, struct vfs *vfsp, 979 mntopts_t *mntopts, struct mounta *uap) 980 { 981 int fromspace = (uap->flags & MS_SYSSPACE) ? 982 UIO_SYSSPACE : UIO_USERSPACE; 983 struct lofi_ioctl *li = NULL; 984 struct vnode *vp = NULL; 985 struct pathname pn = { NULL }; 986 ldi_ident_t ldi_id; 987 ldi_handle_t ldi_hdl; 988 vfssw_t *vfssw; 989 int id; 990 int err = 0; 991 992 if ((vfssw = vfs_getvfssw(fsname)) == NULL) 993 return (0); 994 995 if (!(vfssw->vsw_flag & VSW_CANLOFI)) { 996 vfs_unrefvfssw(vfssw); 997 return (0); 998 } 999 1000 vfs_unrefvfssw(vfssw); 1001 vfssw = NULL; 1002 1003 if (pn_get(uap->spec, fromspace, &pn) != 0) 1004 return (0); 1005 1006 if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0) 1007 goto out; 1008 1009 if (vp->v_type != VREG) 1010 goto out; 1011 1012 /* OK, this is a lofi mount. */ 1013 1014 if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) || 1015 vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) || 1016 vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) || 1017 vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) { 1018 err = EINVAL; 1019 goto out; 1020 } 1021 1022 ldi_id = ldi_ident_from_anon(); 1023 li = kmem_zalloc(sizeof (*li), KM_SLEEP); 1024 (void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN); 1025 1026 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred, 1027 &ldi_hdl, ldi_id); 1028 1029 if (err) 1030 goto out2; 1031 1032 err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li, 1033 FREAD | FWRITE | FKIOCTL, kcred, &id); 1034 1035 (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred); 1036 1037 if (!err) 1038 vfsp->vfs_lofi_id = id; 1039 1040 out2: 1041 ldi_ident_release(ldi_id); 1042 out: 1043 if (li != NULL) 1044 kmem_free(li, sizeof (*li)); 1045 if (vp != NULL) 1046 VN_RELE(vp); 1047 pn_free(&pn); 1048 return (err); 1049 } 1050 1051 static void 1052 lofi_remove(struct vfs *vfsp) 1053 { 1054 struct lofi_ioctl *li = NULL; 1055 ldi_ident_t ldi_id; 1056 ldi_handle_t ldi_hdl; 1057 int err; 1058 1059 if (vfsp->vfs_lofi_id == 0) 1060 return; 1061 1062 ldi_id = ldi_ident_from_anon(); 1063 1064 li = kmem_zalloc(sizeof (*li), KM_SLEEP); 1065 li->li_id = vfsp->vfs_lofi_id; 1066 li->li_cleanup = B_TRUE; 1067 1068 err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE, kcred, 1069 &ldi_hdl, ldi_id); 1070 1071 if (err) 1072 goto out; 1073 1074 err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li, 1075 FREAD | FWRITE | FKIOCTL, kcred, NULL); 1076 1077 (void) ldi_close(ldi_hdl, FREAD | FWRITE, kcred); 1078 1079 if (!err) 1080 vfsp->vfs_lofi_id = 0; 1081 1082 out: 1083 ldi_ident_release(ldi_id); 1084 if (li != NULL) 1085 kmem_free(li, sizeof (*li)); 1086 } 1087 1088 /* 1089 * Common mount code. Called from the system call entry point, from autofs, 1090 * nfsv4 trigger mounts, and from pxfs. 1091 * 1092 * Takes the effective file system type, mount arguments, the mount point 1093 * vnode, flags specifying whether the mount is a remount and whether it 1094 * should be entered into the vfs list, and credentials. Fills in its vfspp 1095 * parameter with the mounted file system instance's vfs. 1096 * 1097 * Note that the effective file system type is specified as a string. It may 1098 * be null, in which case it's determined from the mount arguments, and may 1099 * differ from the type specified in the mount arguments; this is a hook to 1100 * allow interposition when instantiating file system instances. 1101 * 1102 * The caller is responsible for releasing its own hold on the mount point 1103 * vp (this routine does its own hold when necessary). 1104 * Also note that for remounts, the mount point vp should be the vnode for 1105 * the root of the file system rather than the vnode that the file system 1106 * is mounted on top of. 1107 */ 1108 int 1109 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, 1110 struct vfs **vfspp) 1111 { 1112 struct vfssw *vswp; 1113 vfsops_t *vfsops; 1114 struct vfs *vfsp; 1115 struct vnode *bvp; 1116 dev_t bdev = 0; 1117 mntopts_t mnt_mntopts; 1118 int error = 0; 1119 int copyout_error = 0; 1120 int ovflags; 1121 char *opts = uap->optptr; 1122 char *inargs = opts; 1123 int optlen = uap->optlen; 1124 int remount; 1125 int rdonly; 1126 int nbmand = 0; 1127 int delmip = 0; 1128 int addmip = 0; 1129 int splice = ((uap->flags & MS_NOSPLICE) == 0); 1130 int fromspace = (uap->flags & MS_SYSSPACE) ? 1131 UIO_SYSSPACE : UIO_USERSPACE; 1132 char *resource = NULL, *mountpt = NULL; 1133 refstr_t *oldresource, *oldmntpt; 1134 struct pathname pn, rpn; 1135 vsk_anchor_t *vskap; 1136 char fstname[FSTYPSZ]; 1137 zone_t *zone; 1138 1139 /* 1140 * The v_flag value for the mount point vp is permanently set 1141 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine 1142 * for mount point locking. 1143 */ 1144 mutex_enter(&vp->v_lock); 1145 vp->v_flag |= VVFSLOCK; 1146 mutex_exit(&vp->v_lock); 1147 1148 mnt_mntopts.mo_count = 0; 1149 /* 1150 * Find the ops vector to use to invoke the file system-specific mount 1151 * method. If the fsname argument is non-NULL, use it directly. 1152 * Otherwise, dig the file system type information out of the mount 1153 * arguments. 1154 * 1155 * A side effect is to hold the vfssw entry. 1156 * 1157 * Mount arguments can be specified in several ways, which are 1158 * distinguished by flag bit settings. The preferred way is to set 1159 * MS_OPTIONSTR, indicating an 8 argument mount with the file system 1160 * type supplied as a character string and the last two arguments 1161 * being a pointer to a character buffer and the size of the buffer. 1162 * On entry, the buffer holds a null terminated list of options; on 1163 * return, the string is the list of options the file system 1164 * recognized. If MS_DATA is set arguments five and six point to a 1165 * block of binary data which the file system interprets. 1166 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA 1167 * consistently with these conventions. To handle them, we check to 1168 * see whether the pointer to the file system name has a numeric value 1169 * less than 256. If so, we treat it as an index. 1170 */ 1171 if (fsname != NULL) { 1172 if ((vswp = vfs_getvfssw(fsname)) == NULL) { 1173 return (EINVAL); 1174 } 1175 } else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) { 1176 size_t n; 1177 uint_t fstype; 1178 1179 fsname = fstname; 1180 1181 if ((fstype = (uintptr_t)uap->fstype) < 256) { 1182 RLOCK_VFSSW(); 1183 if (fstype == 0 || fstype >= nfstype || 1184 !ALLOCATED_VFSSW(&vfssw[fstype])) { 1185 RUNLOCK_VFSSW(); 1186 return (EINVAL); 1187 } 1188 (void) strcpy(fsname, vfssw[fstype].vsw_name); 1189 RUNLOCK_VFSSW(); 1190 if ((vswp = vfs_getvfssw(fsname)) == NULL) 1191 return (EINVAL); 1192 } else { 1193 /* 1194 * Handle either kernel or user address space. 1195 */ 1196 if (uap->flags & MS_SYSSPACE) { 1197 error = copystr(uap->fstype, fsname, 1198 FSTYPSZ, &n); 1199 } else { 1200 error = copyinstr(uap->fstype, fsname, 1201 FSTYPSZ, &n); 1202 } 1203 if (error) { 1204 if (error == ENAMETOOLONG) 1205 return (EINVAL); 1206 return (error); 1207 } 1208 if ((vswp = vfs_getvfssw(fsname)) == NULL) 1209 return (EINVAL); 1210 } 1211 } else { 1212 if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL) 1213 return (EINVAL); 1214 fsname = vswp->vsw_name; 1215 } 1216 if (!VFS_INSTALLED(vswp)) 1217 return (EINVAL); 1218 1219 if ((error = secpolicy_fs_allowed_mount(fsname)) != 0) { 1220 vfs_unrefvfssw(vswp); 1221 return (error); 1222 } 1223 1224 vfsops = &vswp->vsw_vfsops; 1225 1226 vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts); 1227 /* 1228 * Fetch mount options and parse them for generic vfs options 1229 */ 1230 if (uap->flags & MS_OPTIONSTR) { 1231 /* 1232 * Limit the buffer size 1233 */ 1234 if (optlen < 0 || optlen > MAX_MNTOPT_STR) { 1235 error = EINVAL; 1236 goto errout; 1237 } 1238 if ((uap->flags & MS_SYSSPACE) == 0) { 1239 inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 1240 inargs[0] = '\0'; 1241 if (optlen) { 1242 error = copyinstr(opts, inargs, (size_t)optlen, 1243 NULL); 1244 if (error) { 1245 goto errout; 1246 } 1247 } 1248 } 1249 vfs_parsemntopts(&mnt_mntopts, inargs, 0); 1250 } 1251 /* 1252 * Flag bits override the options string. 1253 */ 1254 if (uap->flags & MS_REMOUNT) 1255 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0); 1256 if (uap->flags & MS_RDONLY) 1257 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0); 1258 if (uap->flags & MS_NOSUID) 1259 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 1260 1261 /* 1262 * Check if this is a remount; must be set in the option string and 1263 * the file system must support a remount option. 1264 */ 1265 if (remount = vfs_optionisset_nolock(&mnt_mntopts, 1266 MNTOPT_REMOUNT, NULL)) { 1267 if (!(vswp->vsw_flag & VSW_CANREMOUNT)) { 1268 error = ENOTSUP; 1269 goto errout; 1270 } 1271 uap->flags |= MS_REMOUNT; 1272 } 1273 1274 /* 1275 * uap->flags and vfs_optionisset() should agree. 1276 */ 1277 if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) { 1278 uap->flags |= MS_RDONLY; 1279 } 1280 if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) { 1281 uap->flags |= MS_NOSUID; 1282 } 1283 nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL); 1284 ASSERT(splice || !remount); 1285 /* 1286 * If we are splicing the fs into the namespace, 1287 * perform mount point checks. 1288 * 1289 * We want to resolve the path for the mount point to eliminate 1290 * '.' and ".." and symlinks in mount points; we can't do the 1291 * same for the resource string, since it would turn 1292 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...". We need to do 1293 * this before grabbing vn_vfswlock(), because otherwise we 1294 * would deadlock with lookuppn(). 1295 */ 1296 if (splice) { 1297 ASSERT(vp->v_count > 0); 1298 1299 /* 1300 * Pick up mount point and device from appropriate space. 1301 */ 1302 if (pn_get(uap->spec, fromspace, &pn) == 0) { 1303 resource = kmem_alloc(pn.pn_pathlen + 1, 1304 KM_SLEEP); 1305 (void) strcpy(resource, pn.pn_path); 1306 pn_free(&pn); 1307 } 1308 /* 1309 * Do a lookupname prior to taking the 1310 * writelock. Mark this as completed if 1311 * successful for later cleanup and addition to 1312 * the mount in progress table. 1313 */ 1314 if ((uap->flags & MS_GLOBAL) == 0 && 1315 lookupname(uap->spec, fromspace, 1316 FOLLOW, NULL, &bvp) == 0) { 1317 addmip = 1; 1318 } 1319 1320 if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) { 1321 pathname_t *pnp; 1322 1323 if (*pn.pn_path != '/') { 1324 error = EINVAL; 1325 pn_free(&pn); 1326 goto errout; 1327 } 1328 pn_alloc(&rpn); 1329 /* 1330 * Kludge to prevent autofs from deadlocking with 1331 * itself when it calls domount(). 1332 * 1333 * If autofs is calling, it is because it is doing 1334 * (autofs) mounts in the process of an NFS mount. A 1335 * lookuppn() here would cause us to block waiting for 1336 * said NFS mount to complete, which can't since this 1337 * is the thread that was supposed to doing it. 1338 */ 1339 if (fromspace == UIO_USERSPACE) { 1340 if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL, 1341 NULL)) == 0) { 1342 pnp = &rpn; 1343 } else { 1344 /* 1345 * The file disappeared or otherwise 1346 * became inaccessible since we opened 1347 * it; might as well fail the mount 1348 * since the mount point is no longer 1349 * accessible. 1350 */ 1351 pn_free(&rpn); 1352 pn_free(&pn); 1353 goto errout; 1354 } 1355 } else { 1356 pnp = &pn; 1357 } 1358 mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP); 1359 (void) strcpy(mountpt, pnp->pn_path); 1360 1361 /* 1362 * If the addition of the zone's rootpath 1363 * would push us over a total path length 1364 * of MAXPATHLEN, we fail the mount with 1365 * ENAMETOOLONG, which is what we would have 1366 * gotten if we were trying to perform the same 1367 * mount in the global zone. 1368 * 1369 * strlen() doesn't count the trailing 1370 * '\0', but zone_rootpathlen counts both a 1371 * trailing '/' and the terminating '\0'. 1372 */ 1373 if ((curproc->p_zone->zone_rootpathlen - 1 + 1374 strlen(mountpt)) > MAXPATHLEN || 1375 (resource != NULL && 1376 (curproc->p_zone->zone_rootpathlen - 1 + 1377 strlen(resource)) > MAXPATHLEN)) { 1378 error = ENAMETOOLONG; 1379 } 1380 1381 pn_free(&rpn); 1382 pn_free(&pn); 1383 } 1384 1385 if (error) 1386 goto errout; 1387 1388 /* 1389 * Prevent path name resolution from proceeding past 1390 * the mount point. 1391 */ 1392 if (vn_vfswlock(vp) != 0) { 1393 error = EBUSY; 1394 goto errout; 1395 } 1396 1397 /* 1398 * Verify that it's legitimate to establish a mount on 1399 * the prospective mount point. 1400 */ 1401 if (vn_mountedvfs(vp) != NULL) { 1402 /* 1403 * The mount point lock was obtained after some 1404 * other thread raced through and established a mount. 1405 */ 1406 vn_vfsunlock(vp); 1407 error = EBUSY; 1408 goto errout; 1409 } 1410 if (vp->v_flag & VNOMOUNT) { 1411 vn_vfsunlock(vp); 1412 error = EINVAL; 1413 goto errout; 1414 } 1415 } 1416 if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) { 1417 uap->dataptr = NULL; 1418 uap->datalen = 0; 1419 } 1420 1421 /* 1422 * If this is a remount, we don't want to create a new VFS. 1423 * Instead, we pass the existing one with a remount flag. 1424 */ 1425 if (remount) { 1426 /* 1427 * Confirm that the mount point is the root vnode of the 1428 * file system that is being remounted. 1429 * This can happen if the user specifies a different 1430 * mount point directory pathname in the (re)mount command. 1431 * 1432 * Code below can only be reached if splice is true, so it's 1433 * safe to do vn_vfsunlock() here. 1434 */ 1435 if ((vp->v_flag & VROOT) == 0) { 1436 vn_vfsunlock(vp); 1437 error = ENOENT; 1438 goto errout; 1439 } 1440 /* 1441 * Disallow making file systems read-only unless file system 1442 * explicitly allows it in its vfssw. Ignore other flags. 1443 */ 1444 if (rdonly && vn_is_readonly(vp) == 0 && 1445 (vswp->vsw_flag & VSW_CANRWRO) == 0) { 1446 vn_vfsunlock(vp); 1447 error = EINVAL; 1448 goto errout; 1449 } 1450 /* 1451 * Disallow changing the NBMAND disposition of the file 1452 * system on remounts. 1453 */ 1454 if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) || 1455 (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) { 1456 vn_vfsunlock(vp); 1457 error = EINVAL; 1458 goto errout; 1459 } 1460 vfsp = vp->v_vfsp; 1461 ovflags = vfsp->vfs_flag; 1462 vfsp->vfs_flag |= VFS_REMOUNT; 1463 vfsp->vfs_flag &= ~VFS_RDONLY; 1464 } else { 1465 vfsp = vfs_alloc(KM_SLEEP); 1466 VFS_INIT(vfsp, vfsops, NULL); 1467 } 1468 1469 VFS_HOLD(vfsp); 1470 1471 if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) { 1472 if (!remount) { 1473 if (splice) 1474 vn_vfsunlock(vp); 1475 vfs_free(vfsp); 1476 } else { 1477 vn_vfsunlock(vp); 1478 VFS_RELE(vfsp); 1479 } 1480 goto errout; 1481 } 1482 1483 /* 1484 * PRIV_SYS_MOUNT doesn't mean you can become root. 1485 */ 1486 if (vfsp->vfs_lofi_id != 0) { 1487 uap->flags |= MS_NOSUID; 1488 vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0); 1489 } 1490 1491 /* 1492 * The vfs_reflock is not used anymore the code below explicitly 1493 * holds it preventing others accesing it directly. 1494 */ 1495 if ((sema_tryp(&vfsp->vfs_reflock) == 0) && 1496 !(vfsp->vfs_flag & VFS_REMOUNT)) 1497 cmn_err(CE_WARN, 1498 "mount type %s couldn't get vfs_reflock", vswp->vsw_name); 1499 1500 /* 1501 * Lock the vfs. If this is a remount we want to avoid spurious umount 1502 * failures that happen as a side-effect of fsflush() and other mount 1503 * and unmount operations that might be going on simultaneously and 1504 * may have locked the vfs currently. To not return EBUSY immediately 1505 * here we use vfs_lock_wait() instead vfs_lock() for the remount case. 1506 */ 1507 if (!remount) { 1508 if (error = vfs_lock(vfsp)) { 1509 vfsp->vfs_flag = ovflags; 1510 1511 lofi_remove(vfsp); 1512 1513 if (splice) 1514 vn_vfsunlock(vp); 1515 vfs_free(vfsp); 1516 goto errout; 1517 } 1518 } else { 1519 vfs_lock_wait(vfsp); 1520 } 1521 1522 /* 1523 * Add device to mount in progress table, global mounts require special 1524 * handling. It is possible that we have already done the lookupname 1525 * on a spliced, non-global fs. If so, we don't want to do it again 1526 * since we cannot do a lookupname after taking the 1527 * wlock above. This case is for a non-spliced, non-global filesystem. 1528 */ 1529 if (!addmip) { 1530 if ((uap->flags & MS_GLOBAL) == 0 && 1531 lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) { 1532 addmip = 1; 1533 } 1534 } 1535 1536 if (addmip) { 1537 vnode_t *lvp = NULL; 1538 1539 error = vfs_get_lofi(vfsp, &lvp); 1540 if (error > 0) { 1541 lofi_remove(vfsp); 1542 1543 if (splice) 1544 vn_vfsunlock(vp); 1545 vfs_unlock(vfsp); 1546 1547 if (remount) { 1548 VFS_RELE(vfsp); 1549 } else { 1550 vfs_free(vfsp); 1551 } 1552 1553 goto errout; 1554 } else if (error == -1) { 1555 bdev = bvp->v_rdev; 1556 VN_RELE(bvp); 1557 } else { 1558 bdev = lvp->v_rdev; 1559 VN_RELE(lvp); 1560 VN_RELE(bvp); 1561 } 1562 1563 vfs_addmip(bdev, vfsp); 1564 addmip = 0; 1565 delmip = 1; 1566 } 1567 /* 1568 * Invalidate cached entry for the mount point. 1569 */ 1570 if (splice) 1571 dnlc_purge_vp(vp); 1572 1573 /* 1574 * If have an option string but the filesystem doesn't supply a 1575 * prototype options table, create a table with the global 1576 * options and sufficient room to accept all the options in the 1577 * string. Then parse the passed in option string 1578 * accepting all the options in the string. This gives us an 1579 * option table with all the proper cancel properties for the 1580 * global options. 1581 * 1582 * Filesystems that supply a prototype options table are handled 1583 * earlier in this function. 1584 */ 1585 if (uap->flags & MS_OPTIONSTR) { 1586 if (!(vswp->vsw_flag & VSW_HASPROTO)) { 1587 mntopts_t tmp_mntopts; 1588 1589 tmp_mntopts.mo_count = 0; 1590 vfs_createopttbl_extend(&tmp_mntopts, inargs, 1591 &mnt_mntopts); 1592 vfs_parsemntopts(&tmp_mntopts, inargs, 1); 1593 vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts); 1594 vfs_freeopttbl(&tmp_mntopts); 1595 } 1596 } 1597 1598 /* 1599 * Serialize with zone state transitions. 1600 * See vfs_list_add; zone mounted into is: 1601 * zone_find_by_path(refstr_value(vfsp->vfs_mntpt)) 1602 * not the zone doing the mount (curproc->p_zone), but if we're already 1603 * inside a NGZ, then we know what zone we are. 1604 */ 1605 if (INGLOBALZONE(curproc)) { 1606 zone = zone_find_by_path(mountpt); 1607 ASSERT(zone != NULL); 1608 } else { 1609 zone = curproc->p_zone; 1610 /* 1611 * zone_find_by_path does a hold, so do one here too so that 1612 * we can do a zone_rele after mount_completed. 1613 */ 1614 zone_hold(zone); 1615 } 1616 mount_in_progress(zone); 1617 /* 1618 * Instantiate (or reinstantiate) the file system. If appropriate, 1619 * splice it into the file system name space. 1620 * 1621 * We want VFS_MOUNT() to be able to override the vfs_resource 1622 * string if necessary (ie, mntfs), and also for a remount to 1623 * change the same (necessary when remounting '/' during boot). 1624 * So we set up vfs_mntpt and vfs_resource to what we think they 1625 * should be, then hand off control to VFS_MOUNT() which can 1626 * override this. 1627 * 1628 * For safety's sake, when changing vfs_resource or vfs_mntpt of 1629 * a vfs which is on the vfs list (i.e. during a remount), we must 1630 * never set those fields to NULL. Several bits of code make 1631 * assumptions that the fields are always valid. 1632 */ 1633 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1634 if (remount) { 1635 if ((oldresource = vfsp->vfs_resource) != NULL) 1636 refstr_hold(oldresource); 1637 if ((oldmntpt = vfsp->vfs_mntpt) != NULL) 1638 refstr_hold(oldmntpt); 1639 } 1640 vfs_setresource(vfsp, resource, 0); 1641 vfs_setmntpoint(vfsp, mountpt, 0); 1642 1643 /* 1644 * going to mount on this vnode, so notify. 1645 */ 1646 vnevent_mountedover(vp, NULL); 1647 error = VFS_MOUNT(vfsp, vp, uap, credp); 1648 1649 if (uap->flags & MS_RDONLY) 1650 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1651 if (uap->flags & MS_NOSUID) 1652 vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0); 1653 if (uap->flags & MS_GLOBAL) 1654 vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0); 1655 1656 if (error) { 1657 lofi_remove(vfsp); 1658 1659 if (remount) { 1660 /* put back pre-remount options */ 1661 vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts); 1662 vfs_setmntpoint(vfsp, refstr_value(oldmntpt), 1663 VFSSP_VERBATIM); 1664 if (oldmntpt) 1665 refstr_rele(oldmntpt); 1666 vfs_setresource(vfsp, refstr_value(oldresource), 1667 VFSSP_VERBATIM); 1668 if (oldresource) 1669 refstr_rele(oldresource); 1670 vfsp->vfs_flag = ovflags; 1671 vfs_unlock(vfsp); 1672 VFS_RELE(vfsp); 1673 } else { 1674 vfs_unlock(vfsp); 1675 vfs_freemnttab(vfsp); 1676 vfs_free(vfsp); 1677 } 1678 } else { 1679 /* 1680 * Set the mount time to now 1681 */ 1682 vfsp->vfs_mtime = ddi_get_time(); 1683 if (remount) { 1684 vfsp->vfs_flag &= ~VFS_REMOUNT; 1685 if (oldresource) 1686 refstr_rele(oldresource); 1687 if (oldmntpt) 1688 refstr_rele(oldmntpt); 1689 } else if (splice) { 1690 /* 1691 * Link vfsp into the name space at the mount 1692 * point. Vfs_add() is responsible for 1693 * holding the mount point which will be 1694 * released when vfs_remove() is called. 1695 */ 1696 vfs_add(vp, vfsp, uap->flags); 1697 } else { 1698 /* 1699 * Hold the reference to file system which is 1700 * not linked into the name space. 1701 */ 1702 vfsp->vfs_zone = NULL; 1703 VFS_HOLD(vfsp); 1704 vfsp->vfs_vnodecovered = NULL; 1705 } 1706 /* 1707 * Set flags for global options encountered 1708 */ 1709 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 1710 vfsp->vfs_flag |= VFS_RDONLY; 1711 else 1712 vfsp->vfs_flag &= ~VFS_RDONLY; 1713 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 1714 vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES); 1715 } else { 1716 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 1717 vfsp->vfs_flag |= VFS_NODEVICES; 1718 else 1719 vfsp->vfs_flag &= ~VFS_NODEVICES; 1720 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 1721 vfsp->vfs_flag |= VFS_NOSETUID; 1722 else 1723 vfsp->vfs_flag &= ~VFS_NOSETUID; 1724 } 1725 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 1726 vfsp->vfs_flag |= VFS_NBMAND; 1727 else 1728 vfsp->vfs_flag &= ~VFS_NBMAND; 1729 1730 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 1731 vfsp->vfs_flag |= VFS_XATTR; 1732 else 1733 vfsp->vfs_flag &= ~VFS_XATTR; 1734 1735 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 1736 vfsp->vfs_flag |= VFS_NOEXEC; 1737 else 1738 vfsp->vfs_flag &= ~VFS_NOEXEC; 1739 1740 /* 1741 * Now construct the output option string of options 1742 * we recognized. 1743 */ 1744 if (uap->flags & MS_OPTIONSTR) { 1745 vfs_list_read_lock(); 1746 copyout_error = vfs_buildoptionstr( 1747 &vfsp->vfs_mntopts, inargs, optlen); 1748 vfs_list_unlock(); 1749 if (copyout_error == 0 && 1750 (uap->flags & MS_SYSSPACE) == 0) { 1751 copyout_error = copyoutstr(inargs, opts, 1752 optlen, NULL); 1753 } 1754 } 1755 1756 /* 1757 * If this isn't a remount, set up the vopstats before 1758 * anyone can touch this. We only allow spliced file 1759 * systems (file systems which are in the namespace) to 1760 * have the VFS_STATS flag set. 1761 * NOTE: PxFS mounts the underlying file system with 1762 * MS_NOSPLICE set and copies those vfs_flags to its private 1763 * vfs structure. As a result, PxFS should never have 1764 * the VFS_STATS flag or else we might access the vfs 1765 * statistics-related fields prior to them being 1766 * properly initialized. 1767 */ 1768 if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) { 1769 initialize_vopstats(&vfsp->vfs_vopstats); 1770 /* 1771 * We need to set vfs_vskap to NULL because there's 1772 * a chance it won't be set below. This is checked 1773 * in teardown_vopstats() so we can't have garbage. 1774 */ 1775 vfsp->vfs_vskap = NULL; 1776 vfsp->vfs_flag |= VFS_STATS; 1777 vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp); 1778 } 1779 1780 if (vswp->vsw_flag & VSW_XID) 1781 vfsp->vfs_flag |= VFS_XID; 1782 1783 vfs_unlock(vfsp); 1784 } 1785 mount_completed(zone); 1786 zone_rele(zone); 1787 if (splice) 1788 vn_vfsunlock(vp); 1789 1790 if ((error == 0) && (copyout_error == 0)) { 1791 if (!remount) { 1792 /* 1793 * Don't call get_vskstat_anchor() while holding 1794 * locks since it allocates memory and calls 1795 * VFS_STATVFS(). For NFS, the latter can generate 1796 * an over-the-wire call. 1797 */ 1798 vskap = get_vskstat_anchor(vfsp); 1799 /* Only take the lock if we have something to do */ 1800 if (vskap != NULL) { 1801 vfs_lock_wait(vfsp); 1802 if (vfsp->vfs_flag & VFS_STATS) { 1803 vfsp->vfs_vskap = vskap; 1804 } 1805 vfs_unlock(vfsp); 1806 } 1807 } 1808 /* Return vfsp to caller. */ 1809 *vfspp = vfsp; 1810 } 1811 errout: 1812 vfs_freeopttbl(&mnt_mntopts); 1813 if (resource != NULL) 1814 kmem_free(resource, strlen(resource) + 1); 1815 if (mountpt != NULL) 1816 kmem_free(mountpt, strlen(mountpt) + 1); 1817 /* 1818 * It is possible we errored prior to adding to mount in progress 1819 * table. Must free vnode we acquired with successful lookupname. 1820 */ 1821 if (addmip) 1822 VN_RELE(bvp); 1823 if (delmip) 1824 vfs_delmip(vfsp); 1825 ASSERT(vswp != NULL); 1826 vfs_unrefvfssw(vswp); 1827 if (inargs != opts) 1828 kmem_free(inargs, MAX_MNTOPT_STR); 1829 if (copyout_error) { 1830 lofi_remove(vfsp); 1831 VFS_RELE(vfsp); 1832 error = copyout_error; 1833 } 1834 return (error); 1835 } 1836 1837 static void 1838 vfs_setpath( 1839 struct vfs *vfsp, /* vfs being updated */ 1840 refstr_t **refp, /* Ref-count string to contain the new path */ 1841 const char *newpath, /* Path to add to refp (above) */ 1842 uint32_t flag) /* flag */ 1843 { 1844 size_t len; 1845 refstr_t *ref; 1846 zone_t *zone = curproc->p_zone; 1847 char *sp; 1848 int have_list_lock = 0; 1849 1850 ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp)); 1851 1852 /* 1853 * New path must be less than MAXPATHLEN because mntfs 1854 * will only display up to MAXPATHLEN bytes. This is currently 1855 * safe, because domount() uses pn_get(), and other callers 1856 * similarly cap the size to fewer than MAXPATHLEN bytes. 1857 */ 1858 1859 ASSERT(strlen(newpath) < MAXPATHLEN); 1860 1861 /* mntfs requires consistency while vfs list lock is held */ 1862 1863 if (VFS_ON_LIST(vfsp)) { 1864 have_list_lock = 1; 1865 vfs_list_lock(); 1866 } 1867 1868 if (*refp != NULL) 1869 refstr_rele(*refp); 1870 1871 /* 1872 * If we are in a non-global zone then we prefix the supplied path, 1873 * newpath, with the zone's root path, with two exceptions. The first 1874 * is where we have been explicitly directed to avoid doing so; this 1875 * will be the case following a failed remount, where the path supplied 1876 * will be a saved version which must now be restored. The second 1877 * exception is where newpath is not a pathname but a descriptive name, 1878 * e.g. "procfs". 1879 */ 1880 if (zone == global_zone || (flag & VFSSP_VERBATIM) || *newpath != '/') { 1881 ref = refstr_alloc(newpath); 1882 goto out; 1883 } 1884 1885 /* 1886 * Truncate the trailing '/' in the zoneroot, and merge 1887 * in the zone's rootpath with the "newpath" (resource 1888 * or mountpoint) passed in. 1889 * 1890 * The size of the required buffer is thus the size of 1891 * the buffer required for the passed-in newpath 1892 * (strlen(newpath) + 1), plus the size of the buffer 1893 * required to hold zone_rootpath (zone_rootpathlen) 1894 * minus one for one of the now-superfluous NUL 1895 * terminations, minus one for the trailing '/'. 1896 * 1897 * That gives us: 1898 * 1899 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1 1900 * 1901 * Which is what we have below. 1902 */ 1903 1904 len = strlen(newpath) + zone->zone_rootpathlen - 1; 1905 sp = kmem_alloc(len, KM_SLEEP); 1906 1907 /* 1908 * Copy everything including the trailing slash, which 1909 * we then overwrite with the NUL character. 1910 */ 1911 1912 (void) strcpy(sp, zone->zone_rootpath); 1913 sp[zone->zone_rootpathlen - 2] = '\0'; 1914 (void) strcat(sp, newpath); 1915 1916 ref = refstr_alloc(sp); 1917 kmem_free(sp, len); 1918 out: 1919 *refp = ref; 1920 1921 if (have_list_lock) { 1922 vfs_mnttab_modtimeupd(); 1923 vfs_list_unlock(); 1924 } 1925 } 1926 1927 /* 1928 * Record a mounted resource name in a vfs structure. 1929 * If vfsp is already mounted, caller must hold the vfs lock. 1930 */ 1931 void 1932 vfs_setresource(struct vfs *vfsp, const char *resource, uint32_t flag) 1933 { 1934 if (resource == NULL || resource[0] == '\0') 1935 resource = VFS_NORESOURCE; 1936 vfs_setpath(vfsp, &vfsp->vfs_resource, resource, flag); 1937 } 1938 1939 /* 1940 * Record a mount point name in a vfs structure. 1941 * If vfsp is already mounted, caller must hold the vfs lock. 1942 */ 1943 void 1944 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt, uint32_t flag) 1945 { 1946 if (mntpt == NULL || mntpt[0] == '\0') 1947 mntpt = VFS_NOMNTPT; 1948 vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt, flag); 1949 } 1950 1951 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */ 1952 1953 refstr_t * 1954 vfs_getresource(const struct vfs *vfsp) 1955 { 1956 refstr_t *resource; 1957 1958 vfs_list_read_lock(); 1959 resource = vfsp->vfs_resource; 1960 refstr_hold(resource); 1961 vfs_list_unlock(); 1962 1963 return (resource); 1964 } 1965 1966 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */ 1967 1968 refstr_t * 1969 vfs_getmntpoint(const struct vfs *vfsp) 1970 { 1971 refstr_t *mntpt; 1972 1973 vfs_list_read_lock(); 1974 mntpt = vfsp->vfs_mntpt; 1975 refstr_hold(mntpt); 1976 vfs_list_unlock(); 1977 1978 return (mntpt); 1979 } 1980 1981 /* 1982 * Create an empty options table with enough empty slots to hold all 1983 * The options in the options string passed as an argument. 1984 * Potentially prepend another options table. 1985 * 1986 * Note: caller is responsible for locking the vfs list, if needed, 1987 * to protect mops. 1988 */ 1989 static void 1990 vfs_createopttbl_extend(mntopts_t *mops, const char *opts, 1991 const mntopts_t *mtmpl) 1992 { 1993 const char *s = opts; 1994 uint_t count; 1995 1996 if (opts == NULL || *opts == '\0') { 1997 count = 0; 1998 } else { 1999 count = 1; 2000 2001 /* 2002 * Count number of options in the string 2003 */ 2004 for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) { 2005 count++; 2006 s++; 2007 } 2008 } 2009 vfs_copyopttbl_extend(mtmpl, mops, count); 2010 } 2011 2012 /* 2013 * Create an empty options table with enough empty slots to hold all 2014 * The options in the options string passed as an argument. 2015 * 2016 * This function is *not* for general use by filesystems. 2017 * 2018 * Note: caller is responsible for locking the vfs list, if needed, 2019 * to protect mops. 2020 */ 2021 void 2022 vfs_createopttbl(mntopts_t *mops, const char *opts) 2023 { 2024 vfs_createopttbl_extend(mops, opts, NULL); 2025 } 2026 2027 2028 /* 2029 * Swap two mount options tables 2030 */ 2031 static void 2032 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2) 2033 { 2034 uint_t tmpcnt; 2035 mntopt_t *tmplist; 2036 2037 tmpcnt = optbl2->mo_count; 2038 tmplist = optbl2->mo_list; 2039 optbl2->mo_count = optbl1->mo_count; 2040 optbl2->mo_list = optbl1->mo_list; 2041 optbl1->mo_count = tmpcnt; 2042 optbl1->mo_list = tmplist; 2043 } 2044 2045 static void 2046 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2) 2047 { 2048 vfs_list_lock(); 2049 vfs_swapopttbl_nolock(optbl1, optbl2); 2050 vfs_mnttab_modtimeupd(); 2051 vfs_list_unlock(); 2052 } 2053 2054 static char ** 2055 vfs_copycancelopt_extend(char **const moc, int extend) 2056 { 2057 int i = 0; 2058 int j; 2059 char **result; 2060 2061 if (moc != NULL) { 2062 for (; moc[i] != NULL; i++) 2063 /* count number of options to cancel */; 2064 } 2065 2066 if (i + extend == 0) 2067 return (NULL); 2068 2069 result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP); 2070 2071 for (j = 0; j < i; j++) { 2072 result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP); 2073 (void) strcpy(result[j], moc[j]); 2074 } 2075 for (; j <= i + extend; j++) 2076 result[j] = NULL; 2077 2078 return (result); 2079 } 2080 2081 static void 2082 vfs_copyopt(const mntopt_t *s, mntopt_t *d) 2083 { 2084 char *sp, *dp; 2085 2086 d->mo_flags = s->mo_flags; 2087 d->mo_data = s->mo_data; 2088 sp = s->mo_name; 2089 if (sp != NULL) { 2090 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 2091 (void) strcpy(dp, sp); 2092 d->mo_name = dp; 2093 } else { 2094 d->mo_name = NULL; /* should never happen */ 2095 } 2096 2097 d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0); 2098 2099 sp = s->mo_arg; 2100 if (sp != NULL) { 2101 dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP); 2102 (void) strcpy(dp, sp); 2103 d->mo_arg = dp; 2104 } else { 2105 d->mo_arg = NULL; 2106 } 2107 } 2108 2109 /* 2110 * Copy a mount options table, possibly allocating some spare 2111 * slots at the end. It is permissible to copy_extend the NULL table. 2112 */ 2113 static void 2114 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra) 2115 { 2116 uint_t i, count; 2117 mntopt_t *motbl; 2118 2119 /* 2120 * Clear out any existing stuff in the options table being initialized 2121 */ 2122 vfs_freeopttbl(dmo); 2123 count = (smo == NULL) ? 0 : smo->mo_count; 2124 if ((count + extra) == 0) /* nothing to do */ 2125 return; 2126 dmo->mo_count = count + extra; 2127 motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP); 2128 dmo->mo_list = motbl; 2129 for (i = 0; i < count; i++) { 2130 vfs_copyopt(&smo->mo_list[i], &motbl[i]); 2131 } 2132 for (i = count; i < count + extra; i++) { 2133 motbl[i].mo_flags = MO_EMPTY; 2134 } 2135 } 2136 2137 /* 2138 * Copy a mount options table. 2139 * 2140 * This function is *not* for general use by filesystems. 2141 * 2142 * Note: caller is responsible for locking the vfs list, if needed, 2143 * to protect smo and dmo. 2144 */ 2145 void 2146 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo) 2147 { 2148 vfs_copyopttbl_extend(smo, dmo, 0); 2149 } 2150 2151 static char ** 2152 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2) 2153 { 2154 int c1 = 0; 2155 int c2 = 0; 2156 char **result; 2157 char **sp1, **sp2, **dp; 2158 2159 /* 2160 * First we count both lists of cancel options. 2161 * If either is NULL or has no elements, we return a copy of 2162 * the other. 2163 */ 2164 if (mop1->mo_cancel != NULL) { 2165 for (; mop1->mo_cancel[c1] != NULL; c1++) 2166 /* count cancel options in mop1 */; 2167 } 2168 2169 if (c1 == 0) 2170 return (vfs_copycancelopt_extend(mop2->mo_cancel, 0)); 2171 2172 if (mop2->mo_cancel != NULL) { 2173 for (; mop2->mo_cancel[c2] != NULL; c2++) 2174 /* count cancel options in mop2 */; 2175 } 2176 2177 result = vfs_copycancelopt_extend(mop1->mo_cancel, c2); 2178 2179 if (c2 == 0) 2180 return (result); 2181 2182 /* 2183 * When we get here, we've got two sets of cancel options; 2184 * we need to merge the two sets. We know that the result 2185 * array has "c1+c2+1" entries and in the end we might shrink 2186 * it. 2187 * Result now has a copy of the c1 entries from mop1; we'll 2188 * now lookup all the entries of mop2 in mop1 and copy it if 2189 * it is unique. 2190 * This operation is O(n^2) but it's only called once per 2191 * filesystem per duplicate option. This is a situation 2192 * which doesn't arise with the filesystems in ON and 2193 * n is generally 1. 2194 */ 2195 2196 dp = &result[c1]; 2197 for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) { 2198 for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) { 2199 if (strcmp(*sp1, *sp2) == 0) 2200 break; 2201 } 2202 if (*sp1 == NULL) { 2203 /* 2204 * Option *sp2 not found in mop1, so copy it. 2205 * The calls to vfs_copycancelopt_extend() 2206 * guarantee that there's enough room. 2207 */ 2208 *dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP); 2209 (void) strcpy(*dp++, *sp2); 2210 } 2211 } 2212 if (dp != &result[c1+c2]) { 2213 size_t bytes = (dp - result + 1) * sizeof (char *); 2214 char **nres = kmem_alloc(bytes, KM_SLEEP); 2215 2216 bcopy(result, nres, bytes); 2217 kmem_free(result, (c1 + c2 + 1) * sizeof (char *)); 2218 result = nres; 2219 } 2220 return (result); 2221 } 2222 2223 /* 2224 * Merge two mount option tables (outer and inner) into one. This is very 2225 * similar to "merging" global variables and automatic variables in C. 2226 * 2227 * This isn't (and doesn't have to be) fast. 2228 * 2229 * This function is *not* for general use by filesystems. 2230 * 2231 * Note: caller is responsible for locking the vfs list, if needed, 2232 * to protect omo, imo & dmo. 2233 */ 2234 void 2235 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo) 2236 { 2237 uint_t i, count; 2238 mntopt_t *mop, *motbl; 2239 uint_t freeidx; 2240 2241 /* 2242 * First determine how much space we need to allocate. 2243 */ 2244 count = omo->mo_count; 2245 for (i = 0; i < imo->mo_count; i++) { 2246 if (imo->mo_list[i].mo_flags & MO_EMPTY) 2247 continue; 2248 if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL) 2249 count++; 2250 } 2251 ASSERT(count >= omo->mo_count && 2252 count <= omo->mo_count + imo->mo_count); 2253 motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP); 2254 for (i = 0; i < omo->mo_count; i++) 2255 vfs_copyopt(&omo->mo_list[i], &motbl[i]); 2256 freeidx = omo->mo_count; 2257 for (i = 0; i < imo->mo_count; i++) { 2258 if (imo->mo_list[i].mo_flags & MO_EMPTY) 2259 continue; 2260 if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) { 2261 char **newcanp; 2262 uint_t index = mop - omo->mo_list; 2263 2264 newcanp = vfs_mergecancelopts(mop, &motbl[index]); 2265 2266 vfs_freeopt(&motbl[index]); 2267 vfs_copyopt(&imo->mo_list[i], &motbl[index]); 2268 2269 vfs_freecancelopt(motbl[index].mo_cancel); 2270 motbl[index].mo_cancel = newcanp; 2271 } else { 2272 /* 2273 * If it's a new option, just copy it over to the first 2274 * free location. 2275 */ 2276 vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]); 2277 } 2278 } 2279 dmo->mo_count = count; 2280 dmo->mo_list = motbl; 2281 } 2282 2283 /* 2284 * Functions to set and clear mount options in a mount options table. 2285 */ 2286 2287 /* 2288 * Clear a mount option, if it exists. 2289 * 2290 * The update_mnttab arg indicates whether mops is part of a vfs that is on 2291 * the vfs list. 2292 */ 2293 static void 2294 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab) 2295 { 2296 struct mntopt *mop; 2297 uint_t i, count; 2298 2299 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 2300 2301 count = mops->mo_count; 2302 for (i = 0; i < count; i++) { 2303 mop = &mops->mo_list[i]; 2304 2305 if (mop->mo_flags & MO_EMPTY) 2306 continue; 2307 if (strcmp(opt, mop->mo_name)) 2308 continue; 2309 mop->mo_flags &= ~MO_SET; 2310 if (mop->mo_arg != NULL) { 2311 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2312 } 2313 mop->mo_arg = NULL; 2314 if (update_mnttab) 2315 vfs_mnttab_modtimeupd(); 2316 break; 2317 } 2318 } 2319 2320 void 2321 vfs_clearmntopt(struct vfs *vfsp, const char *opt) 2322 { 2323 int gotlock = 0; 2324 2325 if (VFS_ON_LIST(vfsp)) { 2326 gotlock = 1; 2327 vfs_list_lock(); 2328 } 2329 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock); 2330 if (gotlock) 2331 vfs_list_unlock(); 2332 } 2333 2334 2335 /* 2336 * Set a mount option on. If it's not found in the table, it's silently 2337 * ignored. If the option has MO_IGNORE set, it is still set unless the 2338 * VFS_NOFORCEOPT bit is set in the flags. Also, VFS_DISPLAY/VFS_NODISPLAY flag 2339 * bits can be used to toggle the MO_NODISPLAY bit for the option. 2340 * If the VFS_CREATEOPT flag bit is set then the first option slot with 2341 * MO_EMPTY set is created as the option passed in. 2342 * 2343 * The update_mnttab arg indicates whether mops is part of a vfs that is on 2344 * the vfs list. 2345 */ 2346 static void 2347 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt, 2348 const char *arg, int flags, int update_mnttab) 2349 { 2350 mntopt_t *mop; 2351 uint_t i, count; 2352 char *sp; 2353 2354 ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist)); 2355 2356 if (flags & VFS_CREATEOPT) { 2357 if (vfs_hasopt(mops, opt) != NULL) { 2358 flags &= ~VFS_CREATEOPT; 2359 } 2360 } 2361 count = mops->mo_count; 2362 for (i = 0; i < count; i++) { 2363 mop = &mops->mo_list[i]; 2364 2365 if (mop->mo_flags & MO_EMPTY) { 2366 if ((flags & VFS_CREATEOPT) == 0) 2367 continue; 2368 sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP); 2369 (void) strcpy(sp, opt); 2370 mop->mo_name = sp; 2371 if (arg != NULL) 2372 mop->mo_flags = MO_HASVALUE; 2373 else 2374 mop->mo_flags = 0; 2375 } else if (strcmp(opt, mop->mo_name)) { 2376 continue; 2377 } 2378 if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT)) 2379 break; 2380 if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) { 2381 sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP); 2382 (void) strcpy(sp, arg); 2383 } else { 2384 sp = NULL; 2385 } 2386 if (mop->mo_arg != NULL) 2387 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2388 mop->mo_arg = sp; 2389 if (flags & VFS_DISPLAY) 2390 mop->mo_flags &= ~MO_NODISPLAY; 2391 if (flags & VFS_NODISPLAY) 2392 mop->mo_flags |= MO_NODISPLAY; 2393 mop->mo_flags |= MO_SET; 2394 if (mop->mo_cancel != NULL) { 2395 char **cp; 2396 2397 for (cp = mop->mo_cancel; *cp != NULL; cp++) 2398 vfs_clearmntopt_nolock(mops, *cp, 0); 2399 } 2400 if (update_mnttab) 2401 vfs_mnttab_modtimeupd(); 2402 break; 2403 } 2404 } 2405 2406 void 2407 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags) 2408 { 2409 int gotlock = 0; 2410 2411 if (VFS_ON_LIST(vfsp)) { 2412 gotlock = 1; 2413 vfs_list_lock(); 2414 } 2415 vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock); 2416 if (gotlock) 2417 vfs_list_unlock(); 2418 } 2419 2420 2421 /* 2422 * Add a "tag" option to a mounted file system's options list. 2423 * 2424 * Note: caller is responsible for locking the vfs list, if needed, 2425 * to protect mops. 2426 */ 2427 static mntopt_t * 2428 vfs_addtag(mntopts_t *mops, const char *tag) 2429 { 2430 uint_t count; 2431 mntopt_t *mop, *motbl; 2432 2433 count = mops->mo_count + 1; 2434 motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP); 2435 if (mops->mo_count) { 2436 size_t len = (count - 1) * sizeof (mntopt_t); 2437 2438 bcopy(mops->mo_list, motbl, len); 2439 kmem_free(mops->mo_list, len); 2440 } 2441 mops->mo_count = count; 2442 mops->mo_list = motbl; 2443 mop = &motbl[count - 1]; 2444 mop->mo_flags = MO_TAG; 2445 mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP); 2446 (void) strcpy(mop->mo_name, tag); 2447 return (mop); 2448 } 2449 2450 /* 2451 * Allow users to set arbitrary "tags" in a vfs's mount options. 2452 * Broader use within the kernel is discouraged. 2453 */ 2454 int 2455 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2456 cred_t *cr) 2457 { 2458 vfs_t *vfsp; 2459 mntopts_t *mops; 2460 mntopt_t *mop; 2461 int found = 0; 2462 dev_t dev = makedevice(major, minor); 2463 int err = 0; 2464 char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP); 2465 2466 /* 2467 * Find the desired mounted file system 2468 */ 2469 vfs_list_lock(); 2470 vfsp = rootvfs; 2471 do { 2472 if (vfsp->vfs_dev == dev && 2473 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2474 found = 1; 2475 break; 2476 } 2477 vfsp = vfsp->vfs_next; 2478 } while (vfsp != rootvfs); 2479 2480 if (!found) { 2481 err = EINVAL; 2482 goto out; 2483 } 2484 err = secpolicy_fs_config(cr, vfsp); 2485 if (err != 0) 2486 goto out; 2487 2488 mops = &vfsp->vfs_mntopts; 2489 /* 2490 * Add tag if it doesn't already exist 2491 */ 2492 if ((mop = vfs_hasopt(mops, tag)) == NULL) { 2493 int len; 2494 2495 (void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR); 2496 len = strlen(buf); 2497 if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) { 2498 err = ENAMETOOLONG; 2499 goto out; 2500 } 2501 mop = vfs_addtag(mops, tag); 2502 } 2503 if ((mop->mo_flags & MO_TAG) == 0) { 2504 err = EINVAL; 2505 goto out; 2506 } 2507 vfs_setmntopt_nolock(mops, tag, NULL, 0, 1); 2508 out: 2509 vfs_list_unlock(); 2510 kmem_free(buf, MAX_MNTOPT_STR); 2511 return (err); 2512 } 2513 2514 /* 2515 * Allow users to remove arbitrary "tags" in a vfs's mount options. 2516 * Broader use within the kernel is discouraged. 2517 */ 2518 int 2519 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag, 2520 cred_t *cr) 2521 { 2522 vfs_t *vfsp; 2523 mntopt_t *mop; 2524 int found = 0; 2525 dev_t dev = makedevice(major, minor); 2526 int err = 0; 2527 2528 /* 2529 * Find the desired mounted file system 2530 */ 2531 vfs_list_lock(); 2532 vfsp = rootvfs; 2533 do { 2534 if (vfsp->vfs_dev == dev && 2535 strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) { 2536 found = 1; 2537 break; 2538 } 2539 vfsp = vfsp->vfs_next; 2540 } while (vfsp != rootvfs); 2541 2542 if (!found) { 2543 err = EINVAL; 2544 goto out; 2545 } 2546 err = secpolicy_fs_config(cr, vfsp); 2547 if (err != 0) 2548 goto out; 2549 2550 if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) { 2551 err = EINVAL; 2552 goto out; 2553 } 2554 if ((mop->mo_flags & MO_TAG) == 0) { 2555 err = EINVAL; 2556 goto out; 2557 } 2558 vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1); 2559 out: 2560 vfs_list_unlock(); 2561 return (err); 2562 } 2563 2564 /* 2565 * Function to parse an option string and fill in a mount options table. 2566 * Unknown options are silently ignored. The input option string is modified 2567 * by replacing separators with nulls. If the create flag is set, options 2568 * not found in the table are just added on the fly. The table must have 2569 * an option slot marked MO_EMPTY to add an option on the fly. 2570 * 2571 * This function is *not* for general use by filesystems. 2572 * 2573 * Note: caller is responsible for locking the vfs list, if needed, 2574 * to protect mops.. 2575 */ 2576 void 2577 vfs_parsemntopts(mntopts_t *mops, char *osp, int create) 2578 { 2579 char *s = osp, *p, *nextop, *valp, *cp, *ep; 2580 int setflg = VFS_NOFORCEOPT; 2581 2582 if (osp == NULL) 2583 return; 2584 while (*s != '\0') { 2585 p = strchr(s, ','); /* find next option */ 2586 if (p == NULL) { 2587 cp = NULL; 2588 p = s + strlen(s); 2589 } else { 2590 cp = p; /* save location of comma */ 2591 *p++ = '\0'; /* mark end and point to next option */ 2592 } 2593 nextop = p; 2594 p = strchr(s, '='); /* look for value */ 2595 if (p == NULL) { 2596 valp = NULL; /* no value supplied */ 2597 } else { 2598 ep = p; /* save location of equals */ 2599 *p++ = '\0'; /* end option and point to value */ 2600 valp = p; 2601 } 2602 /* 2603 * set option into options table 2604 */ 2605 if (create) 2606 setflg |= VFS_CREATEOPT; 2607 vfs_setmntopt_nolock(mops, s, valp, setflg, 0); 2608 if (cp != NULL) 2609 *cp = ','; /* restore the comma */ 2610 if (valp != NULL) 2611 *ep = '='; /* restore the equals */ 2612 s = nextop; 2613 } 2614 } 2615 2616 /* 2617 * Function to inquire if an option exists in a mount options table. 2618 * Returns a pointer to the option if it exists, else NULL. 2619 * 2620 * This function is *not* for general use by filesystems. 2621 * 2622 * Note: caller is responsible for locking the vfs list, if needed, 2623 * to protect mops. 2624 */ 2625 struct mntopt * 2626 vfs_hasopt(const mntopts_t *mops, const char *opt) 2627 { 2628 struct mntopt *mop; 2629 uint_t i, count; 2630 2631 count = mops->mo_count; 2632 for (i = 0; i < count; i++) { 2633 mop = &mops->mo_list[i]; 2634 2635 if (mop->mo_flags & MO_EMPTY) 2636 continue; 2637 if (strcmp(opt, mop->mo_name) == 0) 2638 return (mop); 2639 } 2640 return (NULL); 2641 } 2642 2643 /* 2644 * Function to inquire if an option is set in a mount options table. 2645 * Returns non-zero if set and fills in the arg pointer with a pointer to 2646 * the argument string or NULL if there is no argument string. 2647 */ 2648 static int 2649 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp) 2650 { 2651 struct mntopt *mop; 2652 uint_t i, count; 2653 2654 count = mops->mo_count; 2655 for (i = 0; i < count; i++) { 2656 mop = &mops->mo_list[i]; 2657 2658 if (mop->mo_flags & MO_EMPTY) 2659 continue; 2660 if (strcmp(opt, mop->mo_name)) 2661 continue; 2662 if ((mop->mo_flags & MO_SET) == 0) 2663 return (0); 2664 if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0) 2665 *argp = mop->mo_arg; 2666 return (1); 2667 } 2668 return (0); 2669 } 2670 2671 2672 int 2673 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp) 2674 { 2675 int ret; 2676 2677 vfs_list_read_lock(); 2678 ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp); 2679 vfs_list_unlock(); 2680 return (ret); 2681 } 2682 2683 2684 /* 2685 * Construct a comma separated string of the options set in the given 2686 * mount table, return the string in the given buffer. Return non-zero if 2687 * the buffer would overflow. 2688 * 2689 * This function is *not* for general use by filesystems. 2690 * 2691 * Note: caller is responsible for locking the vfs list, if needed, 2692 * to protect mp. 2693 */ 2694 int 2695 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len) 2696 { 2697 char *cp; 2698 uint_t i; 2699 2700 buf[0] = '\0'; 2701 cp = buf; 2702 for (i = 0; i < mp->mo_count; i++) { 2703 struct mntopt *mop; 2704 2705 mop = &mp->mo_list[i]; 2706 if (mop->mo_flags & MO_SET) { 2707 int optlen, comma = 0; 2708 2709 if (buf[0] != '\0') 2710 comma = 1; 2711 optlen = strlen(mop->mo_name); 2712 if (strlen(buf) + comma + optlen + 1 > len) 2713 goto err; 2714 if (comma) 2715 *cp++ = ','; 2716 (void) strcpy(cp, mop->mo_name); 2717 cp += optlen; 2718 /* 2719 * Append option value if there is one 2720 */ 2721 if (mop->mo_arg != NULL) { 2722 int arglen; 2723 2724 arglen = strlen(mop->mo_arg); 2725 if (strlen(buf) + arglen + 2 > len) 2726 goto err; 2727 *cp++ = '='; 2728 (void) strcpy(cp, mop->mo_arg); 2729 cp += arglen; 2730 } 2731 } 2732 } 2733 return (0); 2734 err: 2735 return (EOVERFLOW); 2736 } 2737 2738 static void 2739 vfs_freecancelopt(char **moc) 2740 { 2741 if (moc != NULL) { 2742 int ccnt = 0; 2743 char **cp; 2744 2745 for (cp = moc; *cp != NULL; cp++) { 2746 kmem_free(*cp, strlen(*cp) + 1); 2747 ccnt++; 2748 } 2749 kmem_free(moc, (ccnt + 1) * sizeof (char *)); 2750 } 2751 } 2752 2753 static void 2754 vfs_freeopt(mntopt_t *mop) 2755 { 2756 if (mop->mo_name != NULL) 2757 kmem_free(mop->mo_name, strlen(mop->mo_name) + 1); 2758 2759 vfs_freecancelopt(mop->mo_cancel); 2760 2761 if (mop->mo_arg != NULL) 2762 kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1); 2763 } 2764 2765 /* 2766 * Free a mount options table 2767 * 2768 * This function is *not* for general use by filesystems. 2769 * 2770 * Note: caller is responsible for locking the vfs list, if needed, 2771 * to protect mp. 2772 */ 2773 void 2774 vfs_freeopttbl(mntopts_t *mp) 2775 { 2776 uint_t i, count; 2777 2778 count = mp->mo_count; 2779 for (i = 0; i < count; i++) { 2780 vfs_freeopt(&mp->mo_list[i]); 2781 } 2782 if (count) { 2783 kmem_free(mp->mo_list, sizeof (mntopt_t) * count); 2784 mp->mo_count = 0; 2785 mp->mo_list = NULL; 2786 } 2787 } 2788 2789 2790 /* ARGSUSED */ 2791 static int 2792 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, 2793 caller_context_t *ct) 2794 { 2795 return (0); 2796 } 2797 2798 /* ARGSUSED */ 2799 static int 2800 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, 2801 caller_context_t *ct) 2802 { 2803 return (0); 2804 } 2805 2806 /* 2807 * The dummy vnode is currently used only by file events notification 2808 * module which is just interested in the timestamps. 2809 */ 2810 /* ARGSUSED */ 2811 static int 2812 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2813 caller_context_t *ct) 2814 { 2815 bzero(vap, sizeof (vattr_t)); 2816 vap->va_type = VREG; 2817 vap->va_nlink = 1; 2818 vap->va_ctime = vfs_mnttab_ctime; 2819 /* 2820 * it is ok to just copy mtime as the time will be monotonically 2821 * increasing. 2822 */ 2823 vap->va_mtime = vfs_mnttab_mtime; 2824 vap->va_atime = vap->va_mtime; 2825 return (0); 2826 } 2827 2828 static void 2829 vfs_mnttabvp_setup(void) 2830 { 2831 vnode_t *tvp; 2832 vnodeops_t *vfs_mntdummyvnops; 2833 const fs_operation_def_t mnt_dummyvnodeops_template[] = { 2834 VOPNAME_READ, { .vop_read = vfs_mntdummyread }, 2835 VOPNAME_WRITE, { .vop_write = vfs_mntdummywrite }, 2836 VOPNAME_GETATTR, { .vop_getattr = vfs_mntdummygetattr }, 2837 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 2838 NULL, NULL 2839 }; 2840 2841 if (vn_make_ops("mnttab", mnt_dummyvnodeops_template, 2842 &vfs_mntdummyvnops) != 0) { 2843 cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed"); 2844 /* Shouldn't happen, but not bad enough to panic */ 2845 return; 2846 } 2847 2848 /* 2849 * A global dummy vnode is allocated to represent mntfs files. 2850 * The mntfs file (/etc/mnttab) can be monitored for file events 2851 * and receive an event when mnttab changes. Dummy VOP calls 2852 * will be made on this vnode. The file events notification module 2853 * intercepts this vnode and delivers relevant events. 2854 */ 2855 tvp = vn_alloc(KM_SLEEP); 2856 tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE; 2857 vn_setops(tvp, vfs_mntdummyvnops); 2858 tvp->v_type = VREG; 2859 /* 2860 * The mnt dummy ops do not reference v_data. 2861 * No other module intercepting this vnode should either. 2862 * Just set it to point to itself. 2863 */ 2864 tvp->v_data = (caddr_t)tvp; 2865 tvp->v_vfsp = rootvfs; 2866 vfs_mntdummyvp = tvp; 2867 } 2868 2869 /* 2870 * performs fake read/write ops 2871 */ 2872 static void 2873 vfs_mnttab_rwop(int rw) 2874 { 2875 struct uio uio; 2876 struct iovec iov; 2877 char buf[1]; 2878 2879 if (vfs_mntdummyvp == NULL) 2880 return; 2881 2882 bzero(&uio, sizeof (uio)); 2883 bzero(&iov, sizeof (iov)); 2884 iov.iov_base = buf; 2885 iov.iov_len = 0; 2886 uio.uio_iov = &iov; 2887 uio.uio_iovcnt = 1; 2888 uio.uio_loffset = 0; 2889 uio.uio_segflg = UIO_SYSSPACE; 2890 uio.uio_resid = 0; 2891 if (rw) { 2892 (void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL); 2893 } else { 2894 (void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL); 2895 } 2896 } 2897 2898 /* 2899 * Generate a write operation. 2900 */ 2901 void 2902 vfs_mnttab_writeop(void) 2903 { 2904 vfs_mnttab_rwop(1); 2905 } 2906 2907 /* 2908 * Generate a read operation. 2909 */ 2910 void 2911 vfs_mnttab_readop(void) 2912 { 2913 vfs_mnttab_rwop(0); 2914 } 2915 2916 /* 2917 * Free any mnttab information recorded in the vfs struct. 2918 * The vfs must not be on the vfs list. 2919 */ 2920 static void 2921 vfs_freemnttab(struct vfs *vfsp) 2922 { 2923 ASSERT(!VFS_ON_LIST(vfsp)); 2924 2925 /* 2926 * Free device and mount point information 2927 */ 2928 if (vfsp->vfs_mntpt != NULL) { 2929 refstr_rele(vfsp->vfs_mntpt); 2930 vfsp->vfs_mntpt = NULL; 2931 } 2932 if (vfsp->vfs_resource != NULL) { 2933 refstr_rele(vfsp->vfs_resource); 2934 vfsp->vfs_resource = NULL; 2935 } 2936 /* 2937 * Now free mount options information 2938 */ 2939 vfs_freeopttbl(&vfsp->vfs_mntopts); 2940 } 2941 2942 /* 2943 * Return the last mnttab modification time 2944 */ 2945 void 2946 vfs_mnttab_modtime(timespec_t *ts) 2947 { 2948 ASSERT(RW_LOCK_HELD(&vfslist)); 2949 *ts = vfs_mnttab_mtime; 2950 } 2951 2952 /* 2953 * See if mnttab is changed 2954 */ 2955 void 2956 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp) 2957 { 2958 int changed; 2959 2960 *phpp = (struct pollhead *)NULL; 2961 2962 /* 2963 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime. 2964 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe 2965 * to not grab the vfs list lock because tv_sec is monotonically 2966 * increasing. 2967 */ 2968 2969 changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) || 2970 (old->tv_sec != vfs_mnttab_mtime.tv_sec); 2971 if (!changed) { 2972 *phpp = &vfs_pollhd; 2973 } 2974 } 2975 2976 /* Provide a unique and monotonically-increasing timestamp. */ 2977 void 2978 vfs_mono_time(timespec_t *ts) 2979 { 2980 static volatile hrtime_t hrt; /* The saved time. */ 2981 hrtime_t newhrt, oldhrt; /* For effecting the CAS. */ 2982 timespec_t newts; 2983 2984 /* 2985 * Try gethrestime() first, but be prepared to fabricate a sensible 2986 * answer at the first sign of any trouble. 2987 */ 2988 gethrestime(&newts); 2989 newhrt = ts2hrt(&newts); 2990 for (;;) { 2991 oldhrt = hrt; 2992 if (newhrt <= hrt) 2993 newhrt = hrt + 1; 2994 if (atomic_cas_64((uint64_t *)&hrt, oldhrt, newhrt) == oldhrt) 2995 break; 2996 } 2997 hrt2ts(newhrt, ts); 2998 } 2999 3000 /* 3001 * Update the mnttab modification time and wake up any waiters for 3002 * mnttab changes 3003 */ 3004 void 3005 vfs_mnttab_modtimeupd() 3006 { 3007 hrtime_t oldhrt, newhrt; 3008 3009 ASSERT(RW_WRITE_HELD(&vfslist)); 3010 oldhrt = ts2hrt(&vfs_mnttab_mtime); 3011 gethrestime(&vfs_mnttab_mtime); 3012 newhrt = ts2hrt(&vfs_mnttab_mtime); 3013 if (oldhrt == (hrtime_t)0) 3014 vfs_mnttab_ctime = vfs_mnttab_mtime; 3015 /* 3016 * Attempt to provide unique mtime (like uniqtime but not). 3017 */ 3018 if (newhrt == oldhrt) { 3019 newhrt++; 3020 hrt2ts(newhrt, &vfs_mnttab_mtime); 3021 } 3022 pollwakeup(&vfs_pollhd, (short)POLLRDBAND); 3023 vfs_mnttab_writeop(); 3024 } 3025 3026 int 3027 dounmount(struct vfs *vfsp, int flag, cred_t *cr) 3028 { 3029 vnode_t *coveredvp; 3030 int error; 3031 extern void teardown_vopstats(vfs_t *); 3032 3033 /* 3034 * Get covered vnode. This will be NULL if the vfs is not linked 3035 * into the file system name space (i.e., domount() with MNT_NOSPICE). 3036 */ 3037 coveredvp = vfsp->vfs_vnodecovered; 3038 ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp)); 3039 3040 /* 3041 * Purge all dnlc entries for this vfs. 3042 */ 3043 (void) dnlc_purge_vfsp(vfsp, 0); 3044 3045 /* For forcible umount, skip VFS_SYNC() since it may hang */ 3046 if ((flag & MS_FORCE) == 0) 3047 (void) VFS_SYNC(vfsp, 0, cr); 3048 3049 /* 3050 * Lock the vfs to maintain fs status quo during unmount. This 3051 * has to be done after the sync because ufs_update tries to acquire 3052 * the vfs_reflock. 3053 */ 3054 vfs_lock_wait(vfsp); 3055 3056 if (error = VFS_UNMOUNT(vfsp, flag, cr)) { 3057 vfs_unlock(vfsp); 3058 if (coveredvp != NULL) 3059 vn_vfsunlock(coveredvp); 3060 } else if (coveredvp != NULL) { 3061 teardown_vopstats(vfsp); 3062 /* 3063 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered) 3064 * when it frees vfsp so we do a VN_HOLD() so we can 3065 * continue to use coveredvp afterwards. 3066 */ 3067 VN_HOLD(coveredvp); 3068 vfs_remove(vfsp); 3069 vn_vfsunlock(coveredvp); 3070 VN_RELE(coveredvp); 3071 } else { 3072 teardown_vopstats(vfsp); 3073 /* 3074 * Release the reference to vfs that is not linked 3075 * into the name space. 3076 */ 3077 vfs_unlock(vfsp); 3078 VFS_RELE(vfsp); 3079 } 3080 return (error); 3081 } 3082 3083 3084 /* 3085 * Vfs_unmountall() is called by uadmin() to unmount all 3086 * mounted file systems (except the root file system) during shutdown. 3087 * It follows the existing locking protocol when traversing the vfs list 3088 * to sync and unmount vfses. Even though there should be no 3089 * other thread running while the system is shutting down, it is prudent 3090 * to still follow the locking protocol. 3091 */ 3092 void 3093 vfs_unmountall(void) 3094 { 3095 struct vfs *vfsp; 3096 struct vfs *prev_vfsp = NULL; 3097 int error; 3098 3099 /* 3100 * Toss all dnlc entries now so that the per-vfs sync 3101 * and unmount operations don't have to slog through 3102 * a bunch of uninteresting vnodes over and over again. 3103 */ 3104 dnlc_purge(); 3105 3106 vfs_list_lock(); 3107 for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) { 3108 prev_vfsp = vfsp->vfs_prev; 3109 3110 if (vfs_lock(vfsp) != 0) 3111 continue; 3112 error = vn_vfswlock(vfsp->vfs_vnodecovered); 3113 vfs_unlock(vfsp); 3114 if (error) 3115 continue; 3116 3117 vfs_list_unlock(); 3118 3119 (void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED()); 3120 (void) dounmount(vfsp, 0, CRED()); 3121 3122 /* 3123 * Since we dropped the vfslist lock above we must 3124 * verify that next_vfsp still exists, else start over. 3125 */ 3126 vfs_list_lock(); 3127 for (vfsp = rootvfs->vfs_prev; 3128 vfsp != rootvfs; vfsp = vfsp->vfs_prev) 3129 if (vfsp == prev_vfsp) 3130 break; 3131 if (vfsp == rootvfs && prev_vfsp != rootvfs) 3132 prev_vfsp = rootvfs->vfs_prev; 3133 } 3134 vfs_list_unlock(); 3135 } 3136 3137 /* 3138 * Called to add an entry to the end of the vfs mount in progress list 3139 */ 3140 void 3141 vfs_addmip(dev_t dev, struct vfs *vfsp) 3142 { 3143 struct ipmnt *mipp; 3144 3145 mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP); 3146 mipp->mip_next = NULL; 3147 mipp->mip_dev = dev; 3148 mipp->mip_vfsp = vfsp; 3149 mutex_enter(&vfs_miplist_mutex); 3150 if (vfs_miplist_end != NULL) 3151 vfs_miplist_end->mip_next = mipp; 3152 else 3153 vfs_miplist = mipp; 3154 vfs_miplist_end = mipp; 3155 mutex_exit(&vfs_miplist_mutex); 3156 } 3157 3158 /* 3159 * Called to remove an entry from the mount in progress list 3160 * Either because the mount completed or it failed. 3161 */ 3162 void 3163 vfs_delmip(struct vfs *vfsp) 3164 { 3165 struct ipmnt *mipp, *mipprev; 3166 3167 mutex_enter(&vfs_miplist_mutex); 3168 mipprev = NULL; 3169 for (mipp = vfs_miplist; 3170 mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) { 3171 mipprev = mipp; 3172 } 3173 if (mipp == NULL) 3174 return; /* shouldn't happen */ 3175 if (mipp == vfs_miplist_end) 3176 vfs_miplist_end = mipprev; 3177 if (mipprev == NULL) 3178 vfs_miplist = mipp->mip_next; 3179 else 3180 mipprev->mip_next = mipp->mip_next; 3181 mutex_exit(&vfs_miplist_mutex); 3182 kmem_free(mipp, sizeof (struct ipmnt)); 3183 } 3184 3185 /* 3186 * vfs_add is called by a specific filesystem's mount routine to add 3187 * the new vfs into the vfs list/hash and to cover the mounted-on vnode. 3188 * The vfs should already have been locked by the caller. 3189 * 3190 * coveredvp is NULL if this is the root. 3191 */ 3192 void 3193 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag) 3194 { 3195 int newflag; 3196 3197 ASSERT(vfs_lock_held(vfsp)); 3198 VFS_HOLD(vfsp); 3199 newflag = vfsp->vfs_flag; 3200 if (mflag & MS_RDONLY) 3201 newflag |= VFS_RDONLY; 3202 else 3203 newflag &= ~VFS_RDONLY; 3204 if (mflag & MS_NOSUID) 3205 newflag |= (VFS_NOSETUID|VFS_NODEVICES); 3206 else 3207 newflag &= ~(VFS_NOSETUID|VFS_NODEVICES); 3208 if (mflag & MS_NOMNTTAB) 3209 newflag |= VFS_NOMNTTAB; 3210 else 3211 newflag &= ~VFS_NOMNTTAB; 3212 3213 if (coveredvp != NULL) { 3214 ASSERT(vn_vfswlock_held(coveredvp)); 3215 coveredvp->v_vfsmountedhere = vfsp; 3216 VN_HOLD(coveredvp); 3217 } 3218 vfsp->vfs_vnodecovered = coveredvp; 3219 vfsp->vfs_flag = newflag; 3220 3221 vfs_list_add(vfsp); 3222 } 3223 3224 /* 3225 * Remove a vfs from the vfs list, null out the pointer from the 3226 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer 3227 * from the vfs to the covered vnode (vfs_vnodecovered). Release the 3228 * reference to the vfs and to the covered vnode. 3229 * 3230 * Called from dounmount after it's confirmed with the file system 3231 * that the unmount is legal. 3232 */ 3233 void 3234 vfs_remove(struct vfs *vfsp) 3235 { 3236 vnode_t *vp; 3237 3238 ASSERT(vfs_lock_held(vfsp)); 3239 3240 /* 3241 * Can't unmount root. Should never happen because fs will 3242 * be busy. 3243 */ 3244 if (vfsp == rootvfs) 3245 panic("vfs_remove: unmounting root"); 3246 3247 vfs_list_remove(vfsp); 3248 3249 /* 3250 * Unhook from the file system name space. 3251 */ 3252 vp = vfsp->vfs_vnodecovered; 3253 ASSERT(vn_vfswlock_held(vp)); 3254 vp->v_vfsmountedhere = NULL; 3255 vfsp->vfs_vnodecovered = NULL; 3256 VN_RELE(vp); 3257 3258 /* 3259 * Release lock and wakeup anybody waiting. 3260 */ 3261 vfs_unlock(vfsp); 3262 VFS_RELE(vfsp); 3263 } 3264 3265 /* 3266 * Lock a filesystem to prevent access to it while mounting, 3267 * unmounting and syncing. Return EBUSY immediately if lock 3268 * can't be acquired. 3269 */ 3270 int 3271 vfs_lock(vfs_t *vfsp) 3272 { 3273 vn_vfslocks_entry_t *vpvfsentry; 3274 3275 vpvfsentry = vn_vfslocks_getlock(vfsp); 3276 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) 3277 return (0); 3278 3279 vn_vfslocks_rele(vpvfsentry); 3280 return (EBUSY); 3281 } 3282 3283 int 3284 vfs_rlock(vfs_t *vfsp) 3285 { 3286 vn_vfslocks_entry_t *vpvfsentry; 3287 3288 vpvfsentry = vn_vfslocks_getlock(vfsp); 3289 3290 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) 3291 return (0); 3292 3293 vn_vfslocks_rele(vpvfsentry); 3294 return (EBUSY); 3295 } 3296 3297 void 3298 vfs_lock_wait(vfs_t *vfsp) 3299 { 3300 vn_vfslocks_entry_t *vpvfsentry; 3301 3302 vpvfsentry = vn_vfslocks_getlock(vfsp); 3303 rwst_enter(&vpvfsentry->ve_lock, RW_WRITER); 3304 } 3305 3306 void 3307 vfs_rlock_wait(vfs_t *vfsp) 3308 { 3309 vn_vfslocks_entry_t *vpvfsentry; 3310 3311 vpvfsentry = vn_vfslocks_getlock(vfsp); 3312 rwst_enter(&vpvfsentry->ve_lock, RW_READER); 3313 } 3314 3315 /* 3316 * Unlock a locked filesystem. 3317 */ 3318 void 3319 vfs_unlock(vfs_t *vfsp) 3320 { 3321 vn_vfslocks_entry_t *vpvfsentry; 3322 3323 /* 3324 * vfs_unlock will mimic sema_v behaviour to fix 4748018. 3325 * And these changes should remain for the patch changes as it is. 3326 */ 3327 if (panicstr) 3328 return; 3329 3330 /* 3331 * ve_refcount needs to be dropped twice here. 3332 * 1. To release refernce after a call to vfs_locks_getlock() 3333 * 2. To release the reference from the locking routines like 3334 * vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,. 3335 */ 3336 3337 vpvfsentry = vn_vfslocks_getlock(vfsp); 3338 vn_vfslocks_rele(vpvfsentry); 3339 3340 rwst_exit(&vpvfsentry->ve_lock); 3341 vn_vfslocks_rele(vpvfsentry); 3342 } 3343 3344 /* 3345 * Utility routine that allows a filesystem to construct its 3346 * fsid in "the usual way" - by munging some underlying dev_t and 3347 * the filesystem type number into the 64-bit fsid. Note that 3348 * this implicitly relies on dev_t persistence to make filesystem 3349 * id's persistent. 3350 * 3351 * There's nothing to prevent an individual fs from constructing its 3352 * fsid in a different way, and indeed they should. 3353 * 3354 * Since we want fsids to be 32-bit quantities (so that they can be 3355 * exported identically by either 32-bit or 64-bit APIs, as well as 3356 * the fact that fsid's are "known" to NFS), we compress the device 3357 * number given down to 32-bits, and panic if that isn't possible. 3358 */ 3359 void 3360 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val) 3361 { 3362 if (!cmpldev((dev32_t *)&fsi->val[0], dev)) 3363 panic("device number too big for fsid!"); 3364 fsi->val[1] = val; 3365 } 3366 3367 int 3368 vfs_lock_held(vfs_t *vfsp) 3369 { 3370 int held; 3371 vn_vfslocks_entry_t *vpvfsentry; 3372 3373 /* 3374 * vfs_lock_held will mimic sema_held behaviour 3375 * if panicstr is set. And these changes should remain 3376 * for the patch changes as it is. 3377 */ 3378 if (panicstr) 3379 return (1); 3380 3381 vpvfsentry = vn_vfslocks_getlock(vfsp); 3382 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); 3383 3384 vn_vfslocks_rele(vpvfsentry); 3385 return (held); 3386 } 3387 3388 struct _kthread * 3389 vfs_lock_owner(vfs_t *vfsp) 3390 { 3391 struct _kthread *owner; 3392 vn_vfslocks_entry_t *vpvfsentry; 3393 3394 /* 3395 * vfs_wlock_held will mimic sema_held behaviour 3396 * if panicstr is set. And these changes should remain 3397 * for the patch changes as it is. 3398 */ 3399 if (panicstr) 3400 return (NULL); 3401 3402 vpvfsentry = vn_vfslocks_getlock(vfsp); 3403 owner = rwst_owner(&vpvfsentry->ve_lock); 3404 3405 vn_vfslocks_rele(vpvfsentry); 3406 return (owner); 3407 } 3408 3409 /* 3410 * vfs list locking. 3411 * 3412 * Rather than manipulate the vfslist lock directly, we abstract into lock 3413 * and unlock routines to allow the locking implementation to be changed for 3414 * clustering. 3415 * 3416 * Whenever the vfs list is modified through its hash links, the overall list 3417 * lock must be obtained before locking the relevant hash bucket. But to see 3418 * whether a given vfs is on the list, it suffices to obtain the lock for the 3419 * hash bucket without getting the overall list lock. (See getvfs() below.) 3420 */ 3421 3422 void 3423 vfs_list_lock() 3424 { 3425 rw_enter(&vfslist, RW_WRITER); 3426 } 3427 3428 void 3429 vfs_list_read_lock() 3430 { 3431 rw_enter(&vfslist, RW_READER); 3432 } 3433 3434 void 3435 vfs_list_unlock() 3436 { 3437 rw_exit(&vfslist); 3438 } 3439 3440 /* 3441 * Low level worker routines for adding entries to and removing entries from 3442 * the vfs list. 3443 */ 3444 3445 static void 3446 vfs_hash_add(struct vfs *vfsp, int insert_at_head) 3447 { 3448 int vhno; 3449 struct vfs **hp; 3450 dev_t dev; 3451 3452 ASSERT(RW_WRITE_HELD(&vfslist)); 3453 3454 dev = expldev(vfsp->vfs_fsid.val[0]); 3455 vhno = VFSHASH(getmajor(dev), getminor(dev)); 3456 3457 mutex_enter(&rvfs_list[vhno].rvfs_lock); 3458 3459 /* 3460 * Link into the hash table, inserting it at the end, so that LOFS 3461 * with the same fsid as UFS (or other) file systems will not hide the 3462 * UFS. 3463 */ 3464 if (insert_at_head) { 3465 vfsp->vfs_hash = rvfs_list[vhno].rvfs_head; 3466 rvfs_list[vhno].rvfs_head = vfsp; 3467 } else { 3468 for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL; 3469 hp = &(*hp)->vfs_hash) 3470 continue; 3471 /* 3472 * hp now contains the address of the pointer to update 3473 * to effect the insertion. 3474 */ 3475 vfsp->vfs_hash = NULL; 3476 *hp = vfsp; 3477 } 3478 3479 rvfs_list[vhno].rvfs_len++; 3480 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3481 } 3482 3483 3484 static void 3485 vfs_hash_remove(struct vfs *vfsp) 3486 { 3487 int vhno; 3488 struct vfs *tvfsp; 3489 dev_t dev; 3490 3491 ASSERT(RW_WRITE_HELD(&vfslist)); 3492 3493 dev = expldev(vfsp->vfs_fsid.val[0]); 3494 vhno = VFSHASH(getmajor(dev), getminor(dev)); 3495 3496 mutex_enter(&rvfs_list[vhno].rvfs_lock); 3497 3498 /* 3499 * Remove from hash. 3500 */ 3501 if (rvfs_list[vhno].rvfs_head == vfsp) { 3502 rvfs_list[vhno].rvfs_head = vfsp->vfs_hash; 3503 rvfs_list[vhno].rvfs_len--; 3504 goto foundit; 3505 } 3506 for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL; 3507 tvfsp = tvfsp->vfs_hash) { 3508 if (tvfsp->vfs_hash == vfsp) { 3509 tvfsp->vfs_hash = vfsp->vfs_hash; 3510 rvfs_list[vhno].rvfs_len--; 3511 goto foundit; 3512 } 3513 } 3514 cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash"); 3515 3516 foundit: 3517 3518 mutex_exit(&rvfs_list[vhno].rvfs_lock); 3519 } 3520 3521 3522 void 3523 vfs_list_add(struct vfs *vfsp) 3524 { 3525 zone_t *zone; 3526 3527 /* 3528 * Typically, the vfs_t will have been created on behalf of the file 3529 * system in vfs_init, where it will have been provided with a 3530 * vfs_impl_t. This, however, might be lacking if the vfs_t was created 3531 * by an unbundled file system. We therefore check for such an example 3532 * before stamping the vfs_t with its creation time for the benefit of 3533 * mntfs. 3534 */ 3535 if (vfsp->vfs_implp == NULL) 3536 vfsimpl_setup(vfsp); 3537 vfs_mono_time(&vfsp->vfs_hrctime); 3538 3539 /* 3540 * The zone that owns the mount is the one that performed the mount. 3541 * Note that this isn't necessarily the same as the zone mounted into. 3542 * The corresponding zone_rele_ref() will be done when the vfs_t 3543 * is being free'd. 3544 */ 3545 vfsp->vfs_zone = curproc->p_zone; 3546 zone_init_ref(&vfsp->vfs_implp->vi_zone_ref); 3547 zone_hold_ref(vfsp->vfs_zone, &vfsp->vfs_implp->vi_zone_ref, 3548 ZONE_REF_VFS); 3549 3550 /* 3551 * Find the zone mounted into, and put this mount on its vfs list. 3552 */ 3553 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3554 ASSERT(zone != NULL); 3555 /* 3556 * Special casing for the root vfs. This structure is allocated 3557 * statically and hooked onto rootvfs at link time. During the 3558 * vfs_mountroot call at system startup time, the root file system's 3559 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct 3560 * as argument. The code below must detect and handle this special 3561 * case. The only apparent justification for this special casing is 3562 * to ensure that the root file system appears at the head of the 3563 * list. 3564 * 3565 * XXX: I'm assuming that it's ok to do normal list locking when 3566 * adding the entry for the root file system (this used to be 3567 * done with no locks held). 3568 */ 3569 vfs_list_lock(); 3570 /* 3571 * Link into the vfs list proper. 3572 */ 3573 if (vfsp == &root) { 3574 /* 3575 * Assert: This vfs is already on the list as its first entry. 3576 * Thus, there's nothing to do. 3577 */ 3578 ASSERT(rootvfs == vfsp); 3579 /* 3580 * Add it to the head of the global zone's vfslist. 3581 */ 3582 ASSERT(zone == global_zone); 3583 ASSERT(zone->zone_vfslist == NULL); 3584 zone->zone_vfslist = vfsp; 3585 } else { 3586 /* 3587 * Link to end of list using vfs_prev (as rootvfs is now a 3588 * doubly linked circular list) so list is in mount order for 3589 * mnttab use. 3590 */ 3591 rootvfs->vfs_prev->vfs_next = vfsp; 3592 vfsp->vfs_prev = rootvfs->vfs_prev; 3593 rootvfs->vfs_prev = vfsp; 3594 vfsp->vfs_next = rootvfs; 3595 3596 /* 3597 * Do it again for the zone-private list (which may be NULL). 3598 */ 3599 if (zone->zone_vfslist == NULL) { 3600 ASSERT(zone != global_zone); 3601 zone->zone_vfslist = vfsp; 3602 } else { 3603 zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp; 3604 vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev; 3605 zone->zone_vfslist->vfs_zone_prev = vfsp; 3606 vfsp->vfs_zone_next = zone->zone_vfslist; 3607 } 3608 } 3609 3610 /* 3611 * Link into the hash table, inserting it at the end, so that LOFS 3612 * with the same fsid as UFS (or other) file systems will not hide 3613 * the UFS. 3614 */ 3615 vfs_hash_add(vfsp, 0); 3616 3617 /* 3618 * update the mnttab modification time 3619 */ 3620 vfs_mnttab_modtimeupd(); 3621 vfs_list_unlock(); 3622 zone_rele(zone); 3623 } 3624 3625 void 3626 vfs_list_remove(struct vfs *vfsp) 3627 { 3628 zone_t *zone; 3629 3630 zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); 3631 ASSERT(zone != NULL); 3632 /* 3633 * Callers are responsible for preventing attempts to unmount the 3634 * root. 3635 */ 3636 ASSERT(vfsp != rootvfs); 3637 3638 vfs_list_lock(); 3639 3640 /* 3641 * Remove from hash. 3642 */ 3643 vfs_hash_remove(vfsp); 3644 3645 /* 3646 * Remove from vfs list. 3647 */ 3648 vfsp->vfs_prev->vfs_next = vfsp->vfs_next; 3649 vfsp->vfs_next->vfs_prev = vfsp->vfs_prev; 3650 vfsp->vfs_next = vfsp->vfs_prev = NULL; 3651 3652 /* 3653 * Remove from zone-specific vfs list. 3654 */ 3655 if (zone->zone_vfslist == vfsp) 3656 zone->zone_vfslist = vfsp->vfs_zone_next; 3657 3658 if (vfsp->vfs_zone_next == vfsp) { 3659 ASSERT(vfsp->vfs_zone_prev == vfsp); 3660 ASSERT(zone->zone_vfslist == vfsp); 3661 zone->zone_vfslist = NULL; 3662 } 3663 3664 vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next; 3665 vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev; 3666 vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL; 3667 3668 /* 3669 * update the mnttab modification time 3670 */ 3671 vfs_mnttab_modtimeupd(); 3672 vfs_list_unlock(); 3673 zone_rele(zone); 3674 } 3675 3676 struct vfs * 3677 getvfs(fsid_t *fsid) 3678 { 3679 struct vfs *vfsp; 3680 int val0 = fsid->val[0]; 3681 int val1 = fsid->val[1]; 3682 dev_t dev = expldev(val0); 3683 int vhno = VFSHASH(getmajor(dev), getminor(dev)); 3684 kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock; 3685 3686 mutex_enter(hmp); 3687 for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) { 3688 if (vfsp->vfs_fsid.val[0] == val0 && 3689 vfsp->vfs_fsid.val[1] == val1) { 3690 VFS_HOLD(vfsp); 3691 mutex_exit(hmp); 3692 return (vfsp); 3693 } 3694 } 3695 mutex_exit(hmp); 3696 return (NULL); 3697 } 3698 3699 /* 3700 * Search the vfs mount in progress list for a specified device/vfs entry. 3701 * Returns 0 if the first entry in the list that the device matches has the 3702 * given vfs pointer as well. If the device matches but a different vfs 3703 * pointer is encountered in the list before the given vfs pointer then 3704 * a 1 is returned. 3705 */ 3706 3707 int 3708 vfs_devmounting(dev_t dev, struct vfs *vfsp) 3709 { 3710 int retval = 0; 3711 struct ipmnt *mipp; 3712 3713 mutex_enter(&vfs_miplist_mutex); 3714 for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) { 3715 if (mipp->mip_dev == dev) { 3716 if (mipp->mip_vfsp != vfsp) 3717 retval = 1; 3718 break; 3719 } 3720 } 3721 mutex_exit(&vfs_miplist_mutex); 3722 return (retval); 3723 } 3724 3725 /* 3726 * Search the vfs list for a specified device. Returns 1, if entry is found 3727 * or 0 if no suitable entry is found. 3728 */ 3729 3730 int 3731 vfs_devismounted(dev_t dev) 3732 { 3733 struct vfs *vfsp; 3734 int found; 3735 3736 vfs_list_read_lock(); 3737 vfsp = rootvfs; 3738 found = 0; 3739 do { 3740 if (vfsp->vfs_dev == dev) { 3741 found = 1; 3742 break; 3743 } 3744 vfsp = vfsp->vfs_next; 3745 } while (vfsp != rootvfs); 3746 3747 vfs_list_unlock(); 3748 return (found); 3749 } 3750 3751 /* 3752 * Search the vfs list for a specified device. Returns a pointer to it 3753 * or NULL if no suitable entry is found. The caller of this routine 3754 * is responsible for releasing the returned vfs pointer. 3755 */ 3756 struct vfs * 3757 vfs_dev2vfsp(dev_t dev) 3758 { 3759 struct vfs *vfsp; 3760 int found; 3761 3762 vfs_list_read_lock(); 3763 vfsp = rootvfs; 3764 found = 0; 3765 do { 3766 /* 3767 * The following could be made more efficient by making 3768 * the entire loop use vfs_zone_next if the call is from 3769 * a zone. The only callers, however, ustat(2) and 3770 * umount2(2), don't seem to justify the added 3771 * complexity at present. 3772 */ 3773 if (vfsp->vfs_dev == dev && 3774 ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt), 3775 curproc->p_zone)) { 3776 VFS_HOLD(vfsp); 3777 found = 1; 3778 break; 3779 } 3780 vfsp = vfsp->vfs_next; 3781 } while (vfsp != rootvfs); 3782 vfs_list_unlock(); 3783 return (found ? vfsp: NULL); 3784 } 3785 3786 /* 3787 * Search the vfs list for a specified mntpoint. Returns a pointer to it 3788 * or NULL if no suitable entry is found. The caller of this routine 3789 * is responsible for releasing the returned vfs pointer. 3790 * 3791 * Note that if multiple mntpoints match, the last one matching is 3792 * returned in an attempt to return the "top" mount when overlay 3793 * mounts are covering the same mount point. This is accomplished by starting 3794 * at the end of the list and working our way backwards, stopping at the first 3795 * matching mount. 3796 */ 3797 struct vfs * 3798 vfs_mntpoint2vfsp(const char *mp) 3799 { 3800 struct vfs *vfsp; 3801 struct vfs *retvfsp = NULL; 3802 zone_t *zone = curproc->p_zone; 3803 struct vfs *list; 3804 3805 vfs_list_read_lock(); 3806 if (getzoneid() == GLOBAL_ZONEID) { 3807 /* 3808 * The global zone may see filesystems in any zone. 3809 */ 3810 vfsp = rootvfs->vfs_prev; 3811 do { 3812 if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) { 3813 retvfsp = vfsp; 3814 break; 3815 } 3816 vfsp = vfsp->vfs_prev; 3817 } while (vfsp != rootvfs->vfs_prev); 3818 } else if ((list = zone->zone_vfslist) != NULL) { 3819 const char *mntpt; 3820 3821 vfsp = list->vfs_zone_prev; 3822 do { 3823 mntpt = refstr_value(vfsp->vfs_mntpt); 3824 mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); 3825 if (strcmp(mntpt, mp) == 0) { 3826 retvfsp = vfsp; 3827 break; 3828 } 3829 vfsp = vfsp->vfs_zone_prev; 3830 } while (vfsp != list->vfs_zone_prev); 3831 } 3832 if (retvfsp) 3833 VFS_HOLD(retvfsp); 3834 vfs_list_unlock(); 3835 return (retvfsp); 3836 } 3837 3838 /* 3839 * Search the vfs list for a specified vfsops. 3840 * if vfs entry is found then return 1, else 0. 3841 */ 3842 int 3843 vfs_opsinuse(vfsops_t *ops) 3844 { 3845 struct vfs *vfsp; 3846 int found; 3847 3848 vfs_list_read_lock(); 3849 vfsp = rootvfs; 3850 found = 0; 3851 do { 3852 if (vfs_getops(vfsp) == ops) { 3853 found = 1; 3854 break; 3855 } 3856 vfsp = vfsp->vfs_next; 3857 } while (vfsp != rootvfs); 3858 vfs_list_unlock(); 3859 return (found); 3860 } 3861 3862 /* 3863 * Allocate an entry in vfssw for a file system type 3864 */ 3865 struct vfssw * 3866 allocate_vfssw(const char *type) 3867 { 3868 struct vfssw *vswp; 3869 3870 if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) { 3871 /* 3872 * The vfssw table uses the empty string to identify an 3873 * available entry; we cannot add any type which has 3874 * a leading NUL. The string length is limited to 3875 * the size of the st_fstype array in struct stat. 3876 */ 3877 return (NULL); 3878 } 3879 3880 ASSERT(VFSSW_WRITE_LOCKED()); 3881 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) 3882 if (!ALLOCATED_VFSSW(vswp)) { 3883 vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP); 3884 (void) strcpy(vswp->vsw_name, type); 3885 ASSERT(vswp->vsw_count == 0); 3886 vswp->vsw_count = 1; 3887 mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL); 3888 return (vswp); 3889 } 3890 return (NULL); 3891 } 3892 3893 /* 3894 * Impose additional layer of translation between vfstype names 3895 * and module names in the filesystem. 3896 */ 3897 static const char * 3898 vfs_to_modname(const char *vfstype) 3899 { 3900 if (strcmp(vfstype, "proc") == 0) { 3901 vfstype = "procfs"; 3902 } else if (strcmp(vfstype, "fd") == 0) { 3903 vfstype = "fdfs"; 3904 } else if (strncmp(vfstype, "nfs", 3) == 0) { 3905 vfstype = "nfs"; 3906 } 3907 3908 return (vfstype); 3909 } 3910 3911 /* 3912 * Find a vfssw entry given a file system type name. 3913 * Try to autoload the filesystem if it's not found. 3914 * If it's installed, return the vfssw locked to prevent unloading. 3915 */ 3916 struct vfssw * 3917 vfs_getvfssw(const char *type) 3918 { 3919 struct vfssw *vswp; 3920 const char *modname; 3921 3922 RLOCK_VFSSW(); 3923 vswp = vfs_getvfsswbyname(type); 3924 modname = vfs_to_modname(type); 3925 3926 if (rootdir == NULL) { 3927 /* 3928 * If we haven't yet loaded the root file system, then our 3929 * _init won't be called until later. Allocate vfssw entry, 3930 * because mod_installfs won't be called. 3931 */ 3932 if (vswp == NULL) { 3933 RUNLOCK_VFSSW(); 3934 WLOCK_VFSSW(); 3935 if ((vswp = vfs_getvfsswbyname(type)) == NULL) { 3936 if ((vswp = allocate_vfssw(type)) == NULL) { 3937 WUNLOCK_VFSSW(); 3938 return (NULL); 3939 } 3940 } 3941 WUNLOCK_VFSSW(); 3942 RLOCK_VFSSW(); 3943 } 3944 if (!VFS_INSTALLED(vswp)) { 3945 RUNLOCK_VFSSW(); 3946 (void) modloadonly("fs", modname); 3947 } else 3948 RUNLOCK_VFSSW(); 3949 return (vswp); 3950 } 3951 3952 /* 3953 * Try to load the filesystem. Before calling modload(), we drop 3954 * our lock on the VFS switch table, and pick it up after the 3955 * module is loaded. However, there is a potential race: the 3956 * module could be unloaded after the call to modload() completes 3957 * but before we pick up the lock and drive on. Therefore, 3958 * we keep reloading the module until we've loaded the module 3959 * _and_ we have the lock on the VFS switch table. 3960 */ 3961 while (vswp == NULL || !VFS_INSTALLED(vswp)) { 3962 RUNLOCK_VFSSW(); 3963 if (modload("fs", modname) == -1) 3964 return (NULL); 3965 RLOCK_VFSSW(); 3966 if (vswp == NULL) 3967 if ((vswp = vfs_getvfsswbyname(type)) == NULL) 3968 break; 3969 } 3970 RUNLOCK_VFSSW(); 3971 3972 return (vswp); 3973 } 3974 3975 /* 3976 * Find a vfssw entry given a file system type name. 3977 */ 3978 struct vfssw * 3979 vfs_getvfsswbyname(const char *type) 3980 { 3981 struct vfssw *vswp; 3982 3983 ASSERT(VFSSW_LOCKED()); 3984 if (type == NULL || *type == '\0') 3985 return (NULL); 3986 3987 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 3988 if (strcmp(type, vswp->vsw_name) == 0) { 3989 vfs_refvfssw(vswp); 3990 return (vswp); 3991 } 3992 } 3993 3994 return (NULL); 3995 } 3996 3997 /* 3998 * Find a vfssw entry given a set of vfsops. 3999 */ 4000 struct vfssw * 4001 vfs_getvfsswbyvfsops(vfsops_t *vfsops) 4002 { 4003 struct vfssw *vswp; 4004 4005 RLOCK_VFSSW(); 4006 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 4007 if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) { 4008 vfs_refvfssw(vswp); 4009 RUNLOCK_VFSSW(); 4010 return (vswp); 4011 } 4012 } 4013 RUNLOCK_VFSSW(); 4014 4015 return (NULL); 4016 } 4017 4018 /* 4019 * Reference a vfssw entry. 4020 */ 4021 void 4022 vfs_refvfssw(struct vfssw *vswp) 4023 { 4024 4025 mutex_enter(&vswp->vsw_lock); 4026 vswp->vsw_count++; 4027 mutex_exit(&vswp->vsw_lock); 4028 } 4029 4030 /* 4031 * Unreference a vfssw entry. 4032 */ 4033 void 4034 vfs_unrefvfssw(struct vfssw *vswp) 4035 { 4036 4037 mutex_enter(&vswp->vsw_lock); 4038 vswp->vsw_count--; 4039 mutex_exit(&vswp->vsw_lock); 4040 } 4041 4042 static int sync_retries = 20; /* number of retries when not making progress */ 4043 static int sync_triesleft; /* portion of sync_retries remaining */ 4044 4045 static pgcnt_t old_pgcnt, new_pgcnt; 4046 static int new_bufcnt, old_bufcnt; 4047 4048 /* 4049 * Sync all of the mounted filesystems, and then wait for the actual i/o to 4050 * complete. We wait by counting the number of dirty pages and buffers, 4051 * pushing them out using bio_busy() and page_busy(), and then counting again. 4052 * This routine is used during the uadmin A_SHUTDOWN code. It should only 4053 * be used after some higher-level mechanism has quiesced the system so that 4054 * new writes are not being initiated while we are waiting for completion. 4055 * 4056 * To ensure finite running time, our algorithm uses sync_triesleft (a progress 4057 * counter used by the vfs_syncall() loop below). It is declared above so 4058 * it can be found easily in the debugger. 4059 * 4060 * The sync_triesleft counter is updated by vfs_syncall() itself. If we make 4061 * sync_retries consecutive calls to bio_busy() and page_busy() without 4062 * decreasing either the number of dirty buffers or dirty pages below the 4063 * lowest count we have seen so far, we give up and return from vfs_syncall(). 4064 * 4065 * Each loop iteration ends with a call to delay() one second to allow time for 4066 * i/o completion and to permit the user time to read our progress messages. 4067 */ 4068 void 4069 vfs_syncall(void) 4070 { 4071 if (rootdir == NULL && !modrootloaded) 4072 return; /* no filesystems have been loaded yet */ 4073 4074 printf("syncing file systems..."); 4075 sync(); 4076 4077 sync_triesleft = sync_retries; 4078 4079 old_bufcnt = new_bufcnt = INT_MAX; 4080 old_pgcnt = new_pgcnt = ULONG_MAX; 4081 4082 while (sync_triesleft > 0) { 4083 old_bufcnt = MIN(old_bufcnt, new_bufcnt); 4084 old_pgcnt = MIN(old_pgcnt, new_pgcnt); 4085 4086 new_bufcnt = bio_busy(B_TRUE); 4087 new_pgcnt = page_busy(B_TRUE); 4088 4089 if (new_bufcnt == 0 && new_pgcnt == 0) 4090 break; 4091 4092 if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt) 4093 sync_triesleft = sync_retries; 4094 else 4095 sync_triesleft--; 4096 4097 if (new_bufcnt) 4098 printf(" [%d]", new_bufcnt); 4099 if (new_pgcnt) 4100 printf(" %lu", new_pgcnt); 4101 4102 delay(hz); 4103 } 4104 4105 if (new_bufcnt != 0 || new_pgcnt != 0) 4106 printf(" done (not all i/o completed)\n"); 4107 else 4108 printf(" done\n"); 4109 4110 delay(hz); 4111 } 4112 4113 /* 4114 * Map VFS flags to statvfs flags. These shouldn't really be separate 4115 * flags at all. 4116 */ 4117 uint_t 4118 vf_to_stf(uint_t vf) 4119 { 4120 uint_t stf = 0; 4121 4122 if (vf & VFS_RDONLY) 4123 stf |= ST_RDONLY; 4124 if (vf & VFS_NOSETUID) 4125 stf |= ST_NOSUID; 4126 if (vf & VFS_NOTRUNC) 4127 stf |= ST_NOTRUNC; 4128 4129 return (stf); 4130 } 4131 4132 /* 4133 * Entries for (illegal) fstype 0. 4134 */ 4135 /* ARGSUSED */ 4136 int 4137 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr) 4138 { 4139 cmn_err(CE_PANIC, "stray vfs operation"); 4140 return (0); 4141 } 4142 4143 /* 4144 * Entries for (illegal) fstype 0. 4145 */ 4146 int 4147 vfsstray(void) 4148 { 4149 cmn_err(CE_PANIC, "stray vfs operation"); 4150 return (0); 4151 } 4152 4153 /* 4154 * Support for dealing with forced UFS unmount and its interaction with 4155 * LOFS. Could be used by any filesystem. 4156 * See bug 1203132. 4157 */ 4158 int 4159 vfs_EIO(void) 4160 { 4161 return (EIO); 4162 } 4163 4164 /* 4165 * We've gotta define the op for sync separately, since the compiler gets 4166 * confused if we mix and match ANSI and normal style prototypes when 4167 * a "short" argument is present and spits out a warning. 4168 */ 4169 /*ARGSUSED*/ 4170 int 4171 vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr) 4172 { 4173 return (EIO); 4174 } 4175 4176 vfs_t EIO_vfs; 4177 vfsops_t *EIO_vfsops; 4178 4179 /* 4180 * Called from startup() to initialize all loaded vfs's 4181 */ 4182 void 4183 vfsinit(void) 4184 { 4185 struct vfssw *vswp; 4186 int error; 4187 extern int vopstats_enabled; 4188 extern void vopstats_startup(); 4189 4190 static const fs_operation_def_t EIO_vfsops_template[] = { 4191 VFSNAME_MOUNT, { .error = vfs_EIO }, 4192 VFSNAME_UNMOUNT, { .error = vfs_EIO }, 4193 VFSNAME_ROOT, { .error = vfs_EIO }, 4194 VFSNAME_STATVFS, { .error = vfs_EIO }, 4195 VFSNAME_SYNC, { .vfs_sync = vfs_EIO_sync }, 4196 VFSNAME_VGET, { .error = vfs_EIO }, 4197 VFSNAME_MOUNTROOT, { .error = vfs_EIO }, 4198 VFSNAME_FREEVFS, { .error = vfs_EIO }, 4199 VFSNAME_VNSTATE, { .error = vfs_EIO }, 4200 NULL, NULL 4201 }; 4202 4203 static const fs_operation_def_t stray_vfsops_template[] = { 4204 VFSNAME_MOUNT, { .error = vfsstray }, 4205 VFSNAME_UNMOUNT, { .error = vfsstray }, 4206 VFSNAME_ROOT, { .error = vfsstray }, 4207 VFSNAME_STATVFS, { .error = vfsstray }, 4208 VFSNAME_SYNC, { .vfs_sync = vfsstray_sync }, 4209 VFSNAME_VGET, { .error = vfsstray }, 4210 VFSNAME_MOUNTROOT, { .error = vfsstray }, 4211 VFSNAME_FREEVFS, { .error = vfsstray }, 4212 VFSNAME_VNSTATE, { .error = vfsstray }, 4213 NULL, NULL 4214 }; 4215 4216 /* Create vfs cache */ 4217 vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs), 4218 sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0); 4219 4220 /* Initialize the vnode cache (file systems may use it during init). */ 4221 vn_create_cache(); 4222 4223 /* Setup event monitor framework */ 4224 fem_init(); 4225 4226 /* Initialize the dummy stray file system type. */ 4227 error = vfs_setfsops(0, stray_vfsops_template, NULL); 4228 4229 /* Initialize the dummy EIO file system. */ 4230 error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops); 4231 if (error != 0) { 4232 cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template"); 4233 /* Shouldn't happen, but not bad enough to panic */ 4234 } 4235 4236 VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL); 4237 4238 /* 4239 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup 4240 * on this vfs can immediately notice it's invalid. 4241 */ 4242 EIO_vfs.vfs_flag |= VFS_UNMOUNTED; 4243 4244 /* 4245 * Call the init routines of non-loadable filesystems only. 4246 * Filesystems which are loaded as separate modules will be 4247 * initialized by the module loading code instead. 4248 */ 4249 4250 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) { 4251 RLOCK_VFSSW(); 4252 if (vswp->vsw_init != NULL) 4253 (*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name); 4254 RUNLOCK_VFSSW(); 4255 } 4256 4257 vopstats_startup(); 4258 4259 if (vopstats_enabled) { 4260 /* EIO_vfs can collect stats, but we don't retrieve them */ 4261 initialize_vopstats(&EIO_vfs.vfs_vopstats); 4262 EIO_vfs.vfs_fstypevsp = NULL; 4263 EIO_vfs.vfs_vskap = NULL; 4264 EIO_vfs.vfs_flag |= VFS_STATS; 4265 } 4266 4267 xattr_init(); 4268 4269 reparse_point_init(); 4270 } 4271 4272 vfs_t * 4273 vfs_alloc(int kmflag) 4274 { 4275 vfs_t *vfsp; 4276 4277 vfsp = kmem_cache_alloc(vfs_cache, kmflag); 4278 4279 /* 4280 * Do the simplest initialization here. 4281 * Everything else gets done in vfs_init() 4282 */ 4283 bzero(vfsp, sizeof (vfs_t)); 4284 return (vfsp); 4285 } 4286 4287 void 4288 vfs_free(vfs_t *vfsp) 4289 { 4290 /* 4291 * One would be tempted to assert that "vfsp->vfs_count == 0". 4292 * The problem is that this gets called out of domount() with 4293 * a partially initialized vfs and a vfs_count of 1. This is 4294 * also called from vfs_rele() with a vfs_count of 0. We can't 4295 * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully 4296 * returned. This is because VFS_MOUNT() fully initializes the 4297 * vfs structure and its associated data. VFS_RELE() will call 4298 * VFS_FREEVFS() which may panic the system if the data structures 4299 * aren't fully initialized from a successful VFS_MOUNT()). 4300 */ 4301 4302 /* If FEM was in use, make sure everything gets cleaned up */ 4303 if (vfsp->vfs_femhead) { 4304 ASSERT(vfsp->vfs_femhead->femh_list == NULL); 4305 mutex_destroy(&vfsp->vfs_femhead->femh_lock); 4306 kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead))); 4307 vfsp->vfs_femhead = NULL; 4308 } 4309 4310 if (vfsp->vfs_implp) 4311 vfsimpl_teardown(vfsp); 4312 sema_destroy(&vfsp->vfs_reflock); 4313 kmem_cache_free(vfs_cache, vfsp); 4314 } 4315 4316 /* 4317 * Increments the vfs reference count by one atomically. 4318 */ 4319 void 4320 vfs_hold(vfs_t *vfsp) 4321 { 4322 atomic_inc_32(&vfsp->vfs_count); 4323 ASSERT(vfsp->vfs_count != 0); 4324 } 4325 4326 /* 4327 * Decrements the vfs reference count by one atomically. When 4328 * vfs reference count becomes zero, it calls the file system 4329 * specific vfs_freevfs() to free up the resources. 4330 */ 4331 void 4332 vfs_rele(vfs_t *vfsp) 4333 { 4334 ASSERT(vfsp->vfs_count != 0); 4335 if (atomic_dec_32_nv(&vfsp->vfs_count) == 0) { 4336 VFS_FREEVFS(vfsp); 4337 lofi_remove(vfsp); 4338 if (vfsp->vfs_zone) 4339 zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, 4340 ZONE_REF_VFS); 4341 vfs_freemnttab(vfsp); 4342 vfs_free(vfsp); 4343 } 4344 } 4345 4346 /* 4347 * Generic operations vector support. 4348 * 4349 * This is used to build operations vectors for both the vfs and vnode. 4350 * It's normally called only when a file system is loaded. 4351 * 4352 * There are many possible algorithms for this, including the following: 4353 * 4354 * (1) scan the list of known operations; for each, see if the file system 4355 * includes an entry for it, and fill it in as appropriate. 4356 * 4357 * (2) set up defaults for all known operations. scan the list of ops 4358 * supplied by the file system; for each which is both supplied and 4359 * known, fill it in. 4360 * 4361 * (3) sort the lists of known ops & supplied ops; scan the list, filling 4362 * in entries as we go. 4363 * 4364 * we choose (1) for simplicity, and because performance isn't critical here. 4365 * note that (2) could be sped up using a precomputed hash table on known ops. 4366 * (3) could be faster than either, but only if the lists were very large or 4367 * supplied in sorted order. 4368 * 4369 */ 4370 4371 int 4372 fs_build_vector(void *vector, int *unused_ops, 4373 const fs_operation_trans_def_t *translation, 4374 const fs_operation_def_t *operations) 4375 { 4376 int i, num_trans, num_ops, used; 4377 4378 /* 4379 * Count the number of translations and the number of supplied 4380 * operations. 4381 */ 4382 4383 { 4384 const fs_operation_trans_def_t *p; 4385 4386 for (num_trans = 0, p = translation; 4387 p->name != NULL; 4388 num_trans++, p++) 4389 ; 4390 } 4391 4392 { 4393 const fs_operation_def_t *p; 4394 4395 for (num_ops = 0, p = operations; 4396 p->name != NULL; 4397 num_ops++, p++) 4398 ; 4399 } 4400 4401 /* Walk through each operation known to our caller. There will be */ 4402 /* one entry in the supplied "translation table" for each. */ 4403 4404 used = 0; 4405 4406 for (i = 0; i < num_trans; i++) { 4407 int j, found; 4408 char *curname; 4409 fs_generic_func_p result; 4410 fs_generic_func_p *location; 4411 4412 curname = translation[i].name; 4413 4414 /* Look for a matching operation in the list supplied by the */ 4415 /* file system. */ 4416 4417 found = 0; 4418 4419 for (j = 0; j < num_ops; j++) { 4420 if (strcmp(operations[j].name, curname) == 0) { 4421 used++; 4422 found = 1; 4423 break; 4424 } 4425 } 4426 4427 /* 4428 * If the file system is using a "placeholder" for default 4429 * or error functions, grab the appropriate function out of 4430 * the translation table. If the file system didn't supply 4431 * this operation at all, use the default function. 4432 */ 4433 4434 if (found) { 4435 result = operations[j].func.fs_generic; 4436 if (result == fs_default) { 4437 result = translation[i].defaultFunc; 4438 } else if (result == fs_error) { 4439 result = translation[i].errorFunc; 4440 } else if (result == NULL) { 4441 /* Null values are PROHIBITED */ 4442 return (EINVAL); 4443 } 4444 } else { 4445 result = translation[i].defaultFunc; 4446 } 4447 4448 /* Now store the function into the operations vector. */ 4449 4450 location = (fs_generic_func_p *) 4451 (((char *)vector) + translation[i].offset); 4452 4453 *location = result; 4454 } 4455 4456 *unused_ops = num_ops - used; 4457 4458 return (0); 4459 } 4460 4461 /* Placeholder functions, should never be called. */ 4462 4463 int 4464 fs_error(void) 4465 { 4466 cmn_err(CE_PANIC, "fs_error called"); 4467 return (0); 4468 } 4469 4470 int 4471 fs_default(void) 4472 { 4473 cmn_err(CE_PANIC, "fs_default called"); 4474 return (0); 4475 } 4476 4477 #ifdef __sparc 4478 4479 /* 4480 * Part of the implementation of booting off a mirrored root 4481 * involves a change of dev_t for the root device. To 4482 * accomplish this, first remove the existing hash table 4483 * entry for the root device, convert to the new dev_t, 4484 * then re-insert in the hash table at the head of the list. 4485 */ 4486 void 4487 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype) 4488 { 4489 vfs_list_lock(); 4490 4491 vfs_hash_remove(vfsp); 4492 4493 vfsp->vfs_dev = ndev; 4494 vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype); 4495 4496 vfs_hash_add(vfsp, 1); 4497 4498 vfs_list_unlock(); 4499 } 4500 4501 #else /* x86 NEWBOOT */ 4502 4503 #if defined(__x86) 4504 extern int hvmboot_rootconf(); 4505 #endif /* __x86 */ 4506 4507 extern ib_boot_prop_t *iscsiboot_prop; 4508 4509 int 4510 rootconf() 4511 { 4512 int error; 4513 struct vfssw *vsw; 4514 extern void pm_init(); 4515 char *fstyp, *fsmod; 4516 int ret = -1; 4517 4518 getrootfs(&fstyp, &fsmod); 4519 4520 #if defined(__x86) 4521 /* 4522 * hvmboot_rootconf() is defined in the hvm_bootstrap misc module, 4523 * which lives in /platform/i86hvm, and hence is only available when 4524 * booted in an x86 hvm environment. If the hvm_bootstrap misc module 4525 * is not available then the modstub for this function will return 0. 4526 * If the hvm_bootstrap misc module is available it will be loaded 4527 * and hvmboot_rootconf() will be invoked. 4528 */ 4529 if (error = hvmboot_rootconf()) 4530 return (error); 4531 #endif /* __x86 */ 4532 4533 if (error = clboot_rootconf()) 4534 return (error); 4535 4536 if (modload("fs", fsmod) == -1) 4537 panic("Cannot _init %s module", fsmod); 4538 4539 RLOCK_VFSSW(); 4540 vsw = vfs_getvfsswbyname(fstyp); 4541 RUNLOCK_VFSSW(); 4542 if (vsw == NULL) { 4543 cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp); 4544 return (ENXIO); 4545 } 4546 VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0); 4547 VFS_HOLD(rootvfs); 4548 4549 /* always mount readonly first */ 4550 rootvfs->vfs_flag |= VFS_RDONLY; 4551 4552 pm_init(); 4553 4554 if (netboot && iscsiboot_prop) { 4555 cmn_err(CE_WARN, "NFS boot and iSCSI boot" 4556 " shouldn't happen in the same time"); 4557 return (EINVAL); 4558 } 4559 4560 if (netboot || iscsiboot_prop) { 4561 ret = strplumb(); 4562 if (ret != 0) { 4563 cmn_err(CE_WARN, "Cannot plumb network device %d", ret); 4564 return (EFAULT); 4565 } 4566 } 4567 4568 if ((ret == 0) && iscsiboot_prop) { 4569 ret = modload("drv", "iscsi"); 4570 /* -1 indicates fail */ 4571 if (ret == -1) { 4572 cmn_err(CE_WARN, "Failed to load iscsi module"); 4573 iscsi_boot_prop_free(); 4574 return (EINVAL); 4575 } else { 4576 if (!i_ddi_attach_pseudo_node("iscsi")) { 4577 cmn_err(CE_WARN, 4578 "Failed to attach iscsi driver"); 4579 iscsi_boot_prop_free(); 4580 return (ENODEV); 4581 } 4582 } 4583 } 4584 4585 error = VFS_MOUNTROOT(rootvfs, ROOT_INIT); 4586 vfs_unrefvfssw(vsw); 4587 rootdev = rootvfs->vfs_dev; 4588 4589 if (error) 4590 cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n", 4591 rootfs.bo_name, fstyp); 4592 else 4593 cmn_err(CE_CONT, "?root on %s fstype %s\n", 4594 rootfs.bo_name, fstyp); 4595 return (error); 4596 } 4597 4598 /* 4599 * XXX this is called by nfs only and should probably be removed 4600 * If booted with ASKNAME, prompt on the console for a filesystem 4601 * name and return it. 4602 */ 4603 void 4604 getfsname(char *askfor, char *name, size_t namelen) 4605 { 4606 if (boothowto & RB_ASKNAME) { 4607 printf("%s name: ", askfor); 4608 console_gets(name, namelen); 4609 } 4610 } 4611 4612 /* 4613 * Init the root filesystem type (rootfs.bo_fstype) from the "fstype" 4614 * property. 4615 * 4616 * Filesystem types starting with the prefix "nfs" are diskless clients; 4617 * init the root filename name (rootfs.bo_name), too. 4618 * 4619 * If we are booting via NFS we currently have these options: 4620 * nfs - dynamically choose NFS V2, V3, or V4 (default) 4621 * nfs2 - force NFS V2 4622 * nfs3 - force NFS V3 4623 * nfs4 - force NFS V4 4624 * Because we need to maintain backward compatibility with the naming 4625 * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c) 4626 * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs". The dynamic 4627 * nfs module will map the type back to either "nfs", "nfs3", or "nfs4". 4628 * This is only for root filesystems, all other uses will expect 4629 * that "nfs" == NFS V2. 4630 */ 4631 static void 4632 getrootfs(char **fstypp, char **fsmodp) 4633 { 4634 extern char *strplumb_get_netdev_path(void); 4635 char *propstr = NULL; 4636 4637 /* 4638 * Check fstype property; for diskless it should be one of "nfs", 4639 * "nfs2", "nfs3" or "nfs4". 4640 */ 4641 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4642 DDI_PROP_DONTPASS, "fstype", &propstr) 4643 == DDI_SUCCESS) { 4644 (void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME); 4645 ddi_prop_free(propstr); 4646 4647 /* 4648 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set, 4649 * assume the type of this root filesystem is 'zfs'. 4650 */ 4651 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4652 DDI_PROP_DONTPASS, "zfs-bootfs", &propstr) 4653 == DDI_SUCCESS) { 4654 (void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME); 4655 ddi_prop_free(propstr); 4656 } 4657 4658 if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) { 4659 *fstypp = *fsmodp = rootfs.bo_fstype; 4660 return; 4661 } 4662 4663 ++netboot; 4664 4665 if (strcmp(rootfs.bo_fstype, "nfs2") == 0) 4666 (void) strcpy(rootfs.bo_fstype, "nfs"); 4667 else if (strcmp(rootfs.bo_fstype, "nfs") == 0) 4668 (void) strcpy(rootfs.bo_fstype, "nfsdyn"); 4669 4670 /* 4671 * check if path to network interface is specified in bootpath 4672 * or by a hypervisor domain configuration file. 4673 * XXPV - enable strlumb_get_netdev_path() 4674 */ 4675 if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS, 4676 "xpv-nfsroot")) { 4677 (void) strcpy(rootfs.bo_name, "/xpvd/xnf@0"); 4678 } else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), 4679 DDI_PROP_DONTPASS, "bootpath", &propstr) 4680 == DDI_SUCCESS) { 4681 (void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME); 4682 ddi_prop_free(propstr); 4683 } else { 4684 /* attempt to determine netdev_path via boot_mac address */ 4685 netdev_path = strplumb_get_netdev_path(); 4686 if (netdev_path == NULL) 4687 panic("cannot find boot network interface"); 4688 (void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME); 4689 } 4690 *fstypp = rootfs.bo_fstype; 4691 *fsmodp = "nfs"; 4692 } 4693 #endif 4694 4695 /* 4696 * VFS feature routines 4697 */ 4698 4699 #define VFTINDEX(feature) (((feature) >> 32) & 0xFFFFFFFF) 4700 #define VFTBITS(feature) ((feature) & 0xFFFFFFFFLL) 4701 4702 /* Register a feature in the vfs */ 4703 void 4704 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature) 4705 { 4706 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */ 4707 if (vfsp->vfs_implp == NULL) 4708 return; 4709 4710 vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature); 4711 } 4712 4713 void 4714 vfs_clear_feature(vfs_t *vfsp, vfs_feature_t feature) 4715 { 4716 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */ 4717 if (vfsp->vfs_implp == NULL) 4718 return; 4719 vfsp->vfs_featureset[VFTINDEX(feature)] &= VFTBITS(~feature); 4720 } 4721 4722 /* 4723 * Query a vfs for a feature. 4724 * Returns 1 if feature is present, 0 if not 4725 */ 4726 int 4727 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature) 4728 { 4729 int ret = 0; 4730 4731 /* Note that vfs_featureset[] is found in *vfsp->vfs_implp */ 4732 if (vfsp->vfs_implp == NULL) 4733 return (ret); 4734 4735 if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature)) 4736 ret = 1; 4737 4738 return (ret); 4739 } 4740 4741 /* 4742 * Propagate feature set from one vfs to another 4743 */ 4744 void 4745 vfs_propagate_features(vfs_t *from, vfs_t *to) 4746 { 4747 int i; 4748 4749 if (to->vfs_implp == NULL || from->vfs_implp == NULL) 4750 return; 4751 4752 for (i = 1; i <= to->vfs_featureset[0]; i++) { 4753 to->vfs_featureset[i] = from->vfs_featureset[i]; 4754 } 4755 } 4756 4757 #define LOFINODE_PATH "/dev/lofi/%d" 4758 4759 /* 4760 * Return the vnode for the lofi node if there's a lofi mount in place. 4761 * Returns -1 when there's no lofi node, 0 on success, and > 0 on 4762 * failure. 4763 */ 4764 int 4765 vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp) 4766 { 4767 char *path = NULL; 4768 int strsize; 4769 int err; 4770 4771 if (vfsp->vfs_lofi_id == 0) { 4772 *vpp = NULL; 4773 return (-1); 4774 } 4775 4776 strsize = snprintf(NULL, 0, LOFINODE_PATH, vfsp->vfs_lofi_id); 4777 path = kmem_alloc(strsize + 1, KM_SLEEP); 4778 (void) snprintf(path, strsize + 1, LOFINODE_PATH, vfsp->vfs_lofi_id); 4779 4780 /* 4781 * We may be inside a zone, so we need to use the /dev path, but 4782 * it's created asynchronously, so we wait here. 4783 */ 4784 for (;;) { 4785 err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp); 4786 4787 if (err != ENOENT) 4788 break; 4789 4790 if ((err = delay_sig(hz / 8)) == EINTR) 4791 break; 4792 } 4793 4794 if (err) 4795 *vpp = NULL; 4796 4797 kmem_free(path, strsize + 1); 4798 return (err); 4799 } 4800