xref: /titanic_51/usr/src/uts/common/fs/vfs.c (revision 12ef07e9fce5f59ad851a4e70cb765f3c56c11a5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 #include <sys/types.h>
43 #include <sys/t_lock.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/user.h>
47 #include <sys/fstyp.h>
48 #include <sys/kmem.h>
49 #include <sys/systm.h>
50 #include <sys/proc.h>
51 #include <sys/mount.h>
52 #include <sys/vfs.h>
53 #include <sys/fem.h>
54 #include <sys/mntent.h>
55 #include <sys/stat.h>
56 #include <sys/statvfs.h>
57 #include <sys/statfs.h>
58 #include <sys/cred.h>
59 #include <sys/vnode.h>
60 #include <sys/rwstlock.h>
61 #include <sys/dnlc.h>
62 #include <sys/file.h>
63 #include <sys/time.h>
64 #include <sys/atomic.h>
65 #include <sys/cmn_err.h>
66 #include <sys/buf.h>
67 #include <sys/swap.h>
68 #include <sys/debug.h>
69 #include <sys/vnode.h>
70 #include <sys/modctl.h>
71 #include <sys/ddi.h>
72 #include <sys/pathname.h>
73 #include <sys/bootconf.h>
74 #include <sys/dumphdr.h>
75 #include <sys/dc_ki.h>
76 #include <sys/poll.h>
77 #include <sys/sunddi.h>
78 #include <sys/sysmacros.h>
79 #include <sys/zone.h>
80 #include <sys/policy.h>
81 #include <sys/ctfs.h>
82 #include <sys/objfs.h>
83 #include <sys/console.h>
84 #include <sys/reboot.h>
85 
86 #include <vm/page.h>
87 
88 #include <fs/fs_subr.h>
89 
90 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
91 static void vfs_setmntopt_nolock(mntopts_t *, const char *,
92     const char *, int, int);
93 static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
94 static void vfs_freemnttab(struct vfs *);
95 static void vfs_freeopt(mntopt_t *);
96 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
97 static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
98 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
99 static void vfs_createopttbl_extend(mntopts_t *, const char *,
100     const mntopts_t *);
101 static char **vfs_copycancelopt_extend(char **const, int);
102 static void vfs_freecancelopt(char **);
103 static char *getrootfs(void);
104 static int getmacpath(dev_info_t *, void *);
105 
/*
 * Node of a singly linked list pairing a device number with a vfs.
 * NOTE(review): the "ipmnt"/"mip" naming presumably means "in-progress
 * mount" (domount() talks about a mount in progress table) -- confirm
 * against the routines that manipulate vfs_miplist.
 */
struct ipmnt {
	struct ipmnt	*mip_next;	/* next node on the list */
	dev_t		mip_dev;	/* device number for this node */
	struct vfs	*mip_vfsp;	/* vfs recorded with mip_dev */
};

/*
 * NOTE(review): judging by the names, the mutex protects the list and the
 * two pointers are its head and tail; the users are not visible here.
 */
static kmutex_t		vfs_miplist_mutex;
static struct ipmnt	*vfs_miplist = NULL;
static struct ipmnt	*vfs_miplist_end = NULL;
115 
/*
 * VFS global data.
 */
vnode_t *rootdir;		/* pointer to root inode vnode. */
vnode_t *devicesdir;		/* pointer to inode of devices root */

char *server_rootpath;		/* root path for diskless clients */
char *server_hostname;		/* hostname of diskless server */

static struct vfs root;		/* static vfs that rootvfs starts out pointing at */
static struct vfs devices;	/* vfs used by vfs_mountdevices() for /devices */
struct vfs *rootvfs = &root;	/* pointer to root vfs; head of VFS list. */
rvfs_t *rvfs_list;		/* array of vfs ptrs for vfs hash list */
int vfshsz = 512;		/* # of heads/locks in vfs hash arrays */
				/* must be power of 2!	*/
timespec_t vfs_mnttab_ctime;	/* mnttab created time */
timespec_t vfs_mnttab_mtime;	/* mnttab last modified time */
/* NOTE(review): looks like a placeholder fstype string; users not visible here */
char *vfs_dummyfstype = "\0";
struct pollhead vfs_pollhd;	/* for mnttab pollers */
135 
136 /*
137  * Table for generic options recognized in the VFS layer and acted
138  * on at this level before parsing file system specific options.
139  * The nosuid option is stronger than any of the devices and setuid
140  * options, so those are canceled when nosuid is seen.
141  *
142  * All options which are added here need to be added to the
143  * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
144  */
/*
 * VFS Mount options table
 *
 * Each *_cancel[] array below is a NULL-terminated list of option names
 * and is referenced from the "cancel options" column of the entry for the
 * option it is named after (e.g. ro_cancel belongs to MNTOPT_RO and names
 * MNTOPT_RW).
 */
static char *ro_cancel[] = { MNTOPT_RW, NULL };
static char *rw_cancel[] = { MNTOPT_RO, NULL };
static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
/* nosuid is the strongest: it cancels the suid, devices and setuid families */
static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
    MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };

/*
 * Generic option prototypes.  Every entry has a NULL default argument;
 * only the remount entry carries a flag (MO_NODISPLAY).
 * NOTE(review): the trailing (void *)0 initializes a fifth mntopt_t member
 * that the column header below does not name -- confirm its meaning in the
 * mntopt_t declaration.
 */
static const mntopt_t mntopts[] = {
/*
 *	option name		cancel options		default arg	flags
 */
	{ MNTOPT_REMOUNT,	NULL,			NULL,
		MO_NODISPLAY, (void *)0 },
	{ MNTOPT_RO,		ro_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_RW,		rw_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_SUID,		suid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOSUID,	nosuid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_DEVICES,	devices_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NODEVICES,	nodevices_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_SETUID,	setuid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOSETUID,	nosetuid_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_NBMAND,	nbmand_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NONBMAND,	nonbmand_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_EXEC,		exec_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOEXEC,	noexec_cancel,		NULL,		0,
		(void *)0 },
};

/*
 * Exported view of the table above: entry count followed by a pointer to
 * the first entry (the cast drops the const qualifier of mntopts[]).
 */
const mntopts_t vfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	(mntopt_t *)&mntopts[0]
};
198 
199 /*
200  * File system operation dispatch functions.
201  */
202 
203 int
204 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
205 {
206 	return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
207 }
208 
209 int
210 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
211 {
212 	return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
213 }
214 
215 int
216 fsop_root(vfs_t *vfsp, vnode_t **vpp)
217 {
218 	refstr_t *mntpt;
219 	int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
220 	/*
221 	 * Make sure this root has a path.  With lofs, it is possible to have
222 	 * a NULL mountpoint.
223 	 */
224 	if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
225 		mntpt = vfs_getmntpoint(vfsp);
226 		vn_setpath_str(*vpp, refstr_value(mntpt),
227 		    strlen(refstr_value(mntpt)));
228 		refstr_rele(mntpt);
229 	}
230 
231 	return (ret);
232 }
233 
234 int
235 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
236 {
237 	return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
238 }
239 
240 int
241 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
242 {
243 	return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
244 }
245 
246 int
247 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
248 {
249 	return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
250 }
251 
252 int
253 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
254 {
255 	return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
256 }
257 
258 void
259 fsop_freefs(vfs_t *vfsp)
260 {
261 	(*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
262 }
263 
264 int
265 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
266 {
267 	return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
268 }
269 
270 int
271 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
272 {
273 	ASSERT((fstype >= 0) && (fstype < nfstype));
274 
275 	if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
276 		return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
277 	else
278 		return (ENOTSUP);
279 }
280 
281 /*
282  * File system initialization.  vfs_setfsops() must be called from a file
283  * system's init routine.
284  */
285 
286 static int
287 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
288     int *unused_ops)
289 {
290 	static const fs_operation_trans_def_t vfs_ops_table[] = {
291 		VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
292 			fs_nosys, fs_nosys,
293 
294 		VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
295 			fs_nosys, fs_nosys,
296 
297 		VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
298 			fs_nosys, fs_nosys,
299 
300 		VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
301 			fs_nosys, fs_nosys,
302 
303 		VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
304 			(fs_generic_func_p) fs_sync,
305 			(fs_generic_func_p) fs_sync,	/* No errors allowed */
306 
307 		VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
308 			fs_nosys, fs_nosys,
309 
310 		VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
311 			fs_nosys, fs_nosys,
312 
313 		VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
314 			(fs_generic_func_p)fs_freevfs,
315 			(fs_generic_func_p)fs_freevfs,	/* Shouldn't fail */
316 
317 		VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
318 			(fs_generic_func_p)fs_nosys,
319 			(fs_generic_func_p)fs_nosys,
320 
321 		NULL, 0, NULL, NULL
322 	};
323 
324 	return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
325 }
326 
327 int
328 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
329 {
330 	int error;
331 	int unused_ops;
332 
333 	/* Verify that fstype refers to a loaded fs (and not fsid 0). */
334 
335 	if ((fstype <= 0) || (fstype >= nfstype))
336 		return (EINVAL);
337 
338 	if (!ALLOCATED_VFSSW(&vfssw[fstype]))
339 		return (EINVAL);
340 
341 	/* Set up the operations vector. */
342 
343 	error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
344 
345 	if (error != 0)
346 		return (error);
347 
348 	vfssw[fstype].vsw_flag |= VSW_INSTALLED;
349 
350 	if (actual != NULL)
351 		*actual = &vfssw[fstype].vsw_vfsops;
352 
353 #if DEBUG
354 	if (unused_ops != 0)
355 		cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
356 		    "but not used", vfssw[fstype].vsw_name, unused_ops);
357 #endif
358 
359 	return (0);
360 }
361 
362 int
363 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
364 {
365 	int error;
366 	int unused_ops;
367 
368 	*actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
369 
370 	error = fs_copyfsops(template, *actual, &unused_ops);
371 	if (error != 0) {
372 		kmem_free(*actual, sizeof (vfsops_t));
373 		*actual = NULL;
374 		return (error);
375 	}
376 
377 	return (0);
378 }
379 
380 /*
381  * Free a vfsops structure created as a result of vfs_makefsops().
382  * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
383  * vfs_freevfsops_by_type().
384  */
385 void
386 vfs_freevfsops(vfsops_t *vfsops)
387 {
388 	kmem_free(vfsops, sizeof (vfsops_t));
389 }
390 
391 /*
392  * Since the vfsops structure is part of the vfssw table and wasn't
393  * really allocated, we're not really freeing anything.  We keep
394  * the name for consistency with vfs_freevfsops().  We do, however,
395  * need to take care of a little bookkeeping.
396  * NOTE: For a vfsops structure created by vfs_setfsops(), use
397  * vfs_freevfsops_by_type().
398  */
399 int
400 vfs_freevfsops_by_type(int fstype)
401 {
402 
403 	/* Verify that fstype refers to a loaded fs (and not fsid 0). */
404 	if ((fstype <= 0) || (fstype >= nfstype))
405 		return (EINVAL);
406 
407 	WLOCK_VFSSW();
408 	if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
409 		WUNLOCK_VFSSW();
410 		return (EINVAL);
411 	}
412 
413 	vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
414 	WUNLOCK_VFSSW();
415 
416 	return (0);
417 }
418 
419 /* Support routines used to reference vfs_op */
420 
421 /* Set the operations vector for a vfs */
422 void
423 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
424 {
425 	vfsops_t	*op;
426 
427 	ASSERT(vfsp != NULL);
428 	ASSERT(vfsops != NULL);
429 
430 	op = vfsp->vfs_op;
431 	membar_consumer();
432 	if (vfsp->vfs_femhead == NULL &&
433 	    casptr(&vfsp->vfs_op, op, vfsops) == op) {
434 		return;
435 	}
436 	fsem_setvfsops(vfsp, vfsops);
437 }
438 
439 /* Retrieve the operations vector for a vfs */
440 vfsops_t *
441 vfs_getops(vfs_t *vfsp)
442 {
443 	vfsops_t	*op;
444 
445 	ASSERT(vfsp != NULL);
446 
447 	op = vfsp->vfs_op;
448 	membar_consumer();
449 	if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
450 		return (op);
451 	} else {
452 		return (fsem_getvfsops(vfsp));
453 	}
454 }
455 
456 /*
457  * Returns non-zero (1) if the vfsops matches that of the vfs.
458  * Returns zero (0) if not.
459  */
460 int
461 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
462 {
463 	return (vfs_getops(vfsp) == vfsops);
464 }
465 
466 /*
467  * Returns non-zero (1) if the file system has installed a non-default,
468  * non-error vfs_sync routine.  Returns zero (0) otherwise.
469  */
470 int
471 vfs_can_sync(vfs_t *vfsp)
472 {
473 	/* vfs_sync() routine is not the default/error function */
474 	return (vfs_getops(vfsp)->vfs_sync != fs_sync);
475 }
476 
477 /*
478  * Initialize a vfs structure.
479  */
480 void
481 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
482 {
483 	vfsp->vfs_count = 0;
484 	vfsp->vfs_next = vfsp;
485 	vfsp->vfs_prev = vfsp;
486 	vfsp->vfs_zone_next = vfsp;
487 	vfsp->vfs_zone_prev = vfsp;
488 	vfsp->vfs_flag = 0;
489 	vfsp->vfs_data = (data);
490 	vfsp->vfs_resource = NULL;
491 	vfsp->vfs_mntpt = NULL;
492 	vfsp->vfs_mntopts.mo_count = 0;
493 	vfsp->vfs_mntopts.mo_list = NULL;
494 	vfsp->vfs_femhead = NULL;
495 	vfsp->vfs_zone = NULL;
496 	/*
497 	 * Note: Don't initialize vfs_vskap, vfs_fstypevsp since it
498 	 * could be a problem for unbundled file systems.
499 	 */
500 	vfs_setops((vfsp), (op));
501 	sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
502 }
503 
504 
505 /*
506  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
507  * fstatvfs, and sysfs moved to common/syscall.
508  */
509 
510 /*
511  * Update every mounted file system.  We call the vfs_sync operation of
512  * each file system type, passing it a NULL vfsp to indicate that all
513  * mounted file systems of that type should be updated.
514  */
515 void
516 vfs_sync(int flag)
517 {
518 	struct vfssw *vswp;
519 	RLOCK_VFSSW();
520 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
521 		if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
522 			vfs_refvfssw(vswp);
523 			RUNLOCK_VFSSW();
524 			(void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
525 			    CRED());
526 			vfs_unrefvfssw(vswp);
527 			RLOCK_VFSSW();
528 		}
529 	}
530 	RUNLOCK_VFSSW();
531 }
532 
/*
 * Convenience wrapper: update all mounted file systems via vfs_sync()
 * with no flags set.
 */
void
sync(void)
{
	const int noflags = 0;

	vfs_sync(noflags);
}
538 
539 /*
540  * External routines.
541  */
542 
/* lock accesses to vfssw; initialized in vfs_mountroot() */
krwlock_t vfssw_lock;

/*
 * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
 * but otherwise should be accessed only via vfs_list_lock() and
 * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
 */
static krwlock_t vfslist;
551 
552 /*
553  * Mount devfs on /devices. This is done right after root is mounted
554  * to provide device access support for the system
555  */
556 static void
557 vfs_mountdevices(void)
558 {
559 	struct vfssw *vsw;
560 	struct vnode *mvp;
561 	struct mounta mounta = {	/* fake mounta for devfs_mount() */
562 		NULL,
563 		NULL,
564 		MS_SYSSPACE,
565 		NULL,
566 		NULL,
567 		0,
568 		NULL,
569 		0
570 	};
571 
572 	/*
573 	 * _init devfs module to fill in the vfssw
574 	 */
575 	if (modload("fs", "devfs") == -1)
576 		cmn_err(CE_PANIC, "Cannot _init devfs module\n");
577 
578 	/*
579 	 * Hold vfs
580 	 */
581 	RLOCK_VFSSW();
582 	vsw = vfs_getvfsswbyname("devfs");
583 	VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
584 	VFS_HOLD(&devices);
585 
586 	/*
587 	 * Locate mount point
588 	 */
589 	if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
590 		cmn_err(CE_PANIC, "Cannot find /devices\n");
591 
592 	/*
593 	 * Perform the mount of /devices
594 	 */
595 	if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
596 		cmn_err(CE_PANIC, "Cannot mount /devices\n");
597 
598 	RUNLOCK_VFSSW();
599 
600 	/*
601 	 * Set appropriate members and add to vfs list for mnttab display
602 	 */
603 	vfs_setresource(&devices, "/devices");
604 	vfs_setmntpoint(&devices, "/devices");
605 
606 	/*
607 	 * Hold the root of /devices so it won't go away
608 	 */
609 	if (VFS_ROOT(&devices, &devicesdir))
610 		cmn_err(CE_PANIC, "vfs_mountdevices: not devices root");
611 	VN_HOLD(devicesdir);
612 
613 	if (vfs_lock(&devices) != 0) {
614 		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
615 		return;
616 	}
617 
618 	if (vn_vfswlock(mvp) != 0) {
619 		vfs_unlock(&devices);
620 		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
621 		return;
622 	}
623 
624 	vfs_add(mvp, &devices, 0);
625 	vn_vfsunlock(mvp);
626 	vfs_unlock(&devices);
627 }
628 
629 /*
630  * Mount required filesystem. This is done right after root is mounted.
631  */
632 static void
633 vfs_mountfs(char *module, char *spec, char *path)
634 {
635 	struct vnode *mvp;
636 	struct mounta mounta;
637 	vfs_t *vfsp;
638 
639 	mounta.flags = MS_SYSSPACE | MS_DATA;
640 	mounta.fstype = module;
641 	mounta.spec = spec;
642 	mounta.dir = path;
643 	if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
644 		cmn_err(CE_WARN, "Cannot find %s\n", path);
645 		return;
646 	}
647 	if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
648 		cmn_err(CE_WARN, "Cannot mount %s\n", path);
649 	else
650 		VFS_RELE(vfsp);
651 	VN_RELE(mvp);
652 }
653 
654 /*
655  * vfs_mountroot is called by main() to mount the root filesystem.
656  */
657 void
658 vfs_mountroot(void)
659 {
660 	struct vnode	*rvp = NULL;
661 	char		*path;
662 	size_t		plen;
663 	struct vfssw	*vswp;
664 	extern void setup_vopstats(vfs_t *);
665 
666 	rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
667 	rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
668 
669 	/*
670 	 * Alloc the vfs hash bucket array and locks
671 	 */
672 	rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
673 
674 	/*
675 	 * Call machine-dependent routine "rootconf" to choose a root
676 	 * file system type.
677 	 */
678 	if (rootconf())
679 		cmn_err(CE_PANIC, "vfs_mountroot: cannot mount root");
680 	/*
681 	 * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
682 	 * to point to it.  These are used by lookuppn() so that it
683 	 * knows where to start from ('/' or '.').
684 	 */
685 	vfs_setmntpoint(rootvfs, "/");
686 	if (VFS_ROOT(rootvfs, &rootdir))
687 		cmn_err(CE_PANIC, "vfs_mountroot: no root vnode");
688 	u.u_cdir = rootdir;
689 	VN_HOLD(u.u_cdir);
690 	u.u_rdir = NULL;
691 
692 	/*
693 	 * Setup the global zone's rootvp, now that it exists.
694 	 */
695 	global_zone->zone_rootvp = rootdir;
696 	VN_HOLD(global_zone->zone_rootvp);
697 
698 	/*
699 	 * Notify the module code that it can begin using the
700 	 * root filesystem instead of the boot program's services.
701 	 */
702 	modrootloaded = 1;
703 	/*
704 	 * Set up mnttab information for root
705 	 */
706 	vfs_setresource(rootvfs, rootfs.bo_name);
707 
708 	/*
709 	 * Notify cluster software that the root filesystem is available.
710 	 */
711 	clboot_mountroot();
712 
713 	/* Now that we're all done with the root FS, set up its vopstats */
714 	if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
715 		/* Set flag for statistics collection */
716 		if (vswp->vsw_flag & VSW_STATS) {
717 			rootvfs->vfs_flag |= VFS_STATS;
718 		}
719 		vfs_unrefvfssw(vswp);
720 	}
721 	setup_vopstats(rootvfs);
722 
723 	/*
724 	 * Mount /devices, /system/contract, /etc/mnttab, /etc/svc/volatile,
725 	 * /system/object, and /proc.
726 	 */
727 	vfs_mountdevices();
728 
729 	vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
730 	vfs_mountfs("proc", "/proc", "/proc");
731 	vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
732 	vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
733 	vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
734 
735 #ifdef __sparc
736 	/*
737 	 * This bit of magic can go away when we convert sparc to
738 	 * the new boot architecture based on ramdisk.
739 	 *
740 	 * Booting off a mirrored root volume:
741 	 * At this point, we have booted and mounted root on a
742 	 * single component of the mirror.  Complete the boot
743 	 * by configuring SVM and converting the root to the
744 	 * dev_t of the mirrored root device.  This dev_t conversion
745 	 * only works because the underlying device doesn't change.
746 	 */
747 	if (root_is_svm) {
748 		if (svm_rootconf()) {
749 			cmn_err(CE_PANIC, "vfs_mountroot: cannot remount root");
750 		}
751 
752 		/*
753 		 * mnttab should reflect the new root device
754 		 */
755 		vfs_lock_wait(rootvfs);
756 		vfs_setresource(rootvfs, rootfs.bo_name);
757 		vfs_unlock(rootvfs);
758 	}
759 #endif /* __sparc */
760 
761 	/*
762 	 * Look up the root device via devfs so that a dv_node is
763 	 * created for it. The vnode is never VN_RELE()ed.
764 	 * We allocate more than MAXPATHLEN so that the
765 	 * buffer passed to i_ddi_prompath_to_devfspath() is
766 	 * exactly MAXPATHLEN (the function expects a buffer
767 	 * of that length).
768 	 */
769 	plen = strlen("/devices");
770 	path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
771 	(void) strcpy(path, "/devices");
772 
773 	if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
774 	    != DDI_SUCCESS ||
775 	    lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
776 
777 		/* NUL terminate in case "path" has garbage */
778 		path[plen + MAXPATHLEN - 1] = '\0';
779 #ifdef	DEBUG
780 		cmn_err(CE_WARN, "!Cannot lookup root device: %s", path);
781 #endif
782 	}
783 	kmem_free(path, plen + MAXPATHLEN);
784 }
785 
786 /*
787  * If remount failed and we're in a zone we need to check for the zone
788  * root path and strip it before the call to vfs_setpath().
789  *
790  * If strpath doesn't begin with the zone_rootpath the original
791  * strpath is returned unchanged.
792  */
793 static const char *
794 stripzonepath(const char *strpath)
795 {
796 	char *str1, *str2;
797 	int i;
798 	zone_t *zonep = curproc->p_zone;
799 
800 	if (zonep->zone_rootpath == NULL || strpath == NULL) {
801 		return (NULL);
802 	}
803 
804 	/*
805 	 * we check for the end of the string at one past the
806 	 * current position because the zone_rootpath always
807 	 * ends with "/" but we don't want to strip that off.
808 	 */
809 	str1 = zonep->zone_rootpath;
810 	str2 = (char *)strpath;
811 	ASSERT(str1[0] != '\0');
812 	for (i = 0; str1[i + 1] != '\0'; i++) {
813 		if (str1[i] != str2[i])
814 			return ((char *)strpath);
815 	}
816 	return (&str2[i]);
817 }
818 
819 /*
820  * Common mount code.  Called from the system call entry point, from autofs,
821  * and from pxfs.
822  *
823  * Takes the effective file system type, mount arguments, the mount point
824  * vnode, flags specifying whether the mount is a remount and whether it
825  * should be entered into the vfs list, and credentials.  Fills in its vfspp
826  * parameter with the mounted file system instance's vfs.
827  *
828  * Note that the effective file system type is specified as a string.  It may
829  * be null, in which case it's determined from the mount arguments, and may
830  * differ from the type specified in the mount arguments; this is a hook to
831  * allow interposition when instantiating file system instances.
832  *
833  * The caller is responsible for releasing its own hold on the mount point
834  * vp (this routine does its own hold when necessary).
835  * Also note that for remounts, the mount point vp should be the vnode for
836  * the root of the file system rather than the vnode that the file system
837  * is mounted on top of.
838  */
839 int
840 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
841 	struct vfs **vfspp)
842 {
843 	struct vfssw	*vswp;
844 	vfsops_t	*vfsops;
845 	struct vfs	*vfsp;
846 	struct vnode	*bvp;
847 	dev_t		bdev = 0;
848 	mntopts_t	mnt_mntopts;
849 	int		error = 0;
850 	int		copyout_error = 0;
851 	int		ovflags;
852 	char		*opts = uap->optptr;
853 	char		*inargs = opts;
854 	int		optlen = uap->optlen;
855 	int		remount;
856 	int		rdonly;
857 	int		nbmand = 0;
858 	int		delmip = 0;
859 	int		addmip = 0;
860 	int		splice = ((uap->flags & MS_NOSPLICE) == 0);
861 	int		fromspace = (uap->flags & MS_SYSSPACE) ?
862 				UIO_SYSSPACE : UIO_USERSPACE;
863 	char		*resource = NULL, *mountpt = NULL;
864 	refstr_t	*oldresource, *oldmntpt;
865 	struct pathname	pn, rpn;
866 	extern void setup_vopstats(vfs_t *);
867 
868 	/*
869 	 * The v_flag value for the mount point vp is permanently set
870 	 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
871 	 * for mount point locking.
872 	 */
873 	mutex_enter(&vp->v_lock);
874 	vp->v_flag |= VVFSLOCK;
875 	mutex_exit(&vp->v_lock);
876 
877 	mnt_mntopts.mo_count = 0;
878 	/*
879 	 * Find the ops vector to use to invoke the file system-specific mount
880 	 * method.  If the fsname argument is non-NULL, use it directly.
881 	 * Otherwise, dig the file system type information out of the mount
882 	 * arguments.
883 	 *
884 	 * A side effect is to hold the vfssw entry.
885 	 *
886 	 * Mount arguments can be specified in several ways, which are
887 	 * distinguished by flag bit settings.  The preferred way is to set
888 	 * MS_OPTIONSTR, indicating an 8 argument mount with the file system
889 	 * type supplied as a character string and the last two arguments
890 	 * being a pointer to a character buffer and the size of the buffer.
891 	 * On entry, the buffer holds a null terminated list of options; on
892 	 * return, the string is the list of options the file system
893 	 * recognized. If MS_DATA is set arguments five and six point to a
894 	 * block of binary data which the file system interprets.
895 	 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
896 	 * consistently with these conventions.  To handle them, we check to
897 	 * see whether the pointer to the file system name has a numeric value
898 	 * less than 256.  If so, we treat it as an index.
899 	 */
900 	if (fsname != NULL) {
901 		if ((vswp = vfs_getvfssw(fsname)) == NULL) {
902 			return (EINVAL);
903 		}
904 	} else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
905 		size_t n;
906 		uint_t fstype;
907 		char name[FSTYPSZ];
908 
909 		if ((fstype = (uintptr_t)uap->fstype) < 256) {
910 			RLOCK_VFSSW();
911 			if (fstype == 0 || fstype >= nfstype ||
912 			    !ALLOCATED_VFSSW(&vfssw[fstype])) {
913 				RUNLOCK_VFSSW();
914 				return (EINVAL);
915 			}
916 			(void) strcpy(name, vfssw[fstype].vsw_name);
917 			RUNLOCK_VFSSW();
918 			if ((vswp = vfs_getvfssw(name)) == NULL)
919 				return (EINVAL);
920 		} else {
921 			/*
922 			 * Handle either kernel or user address space.
923 			 */
924 			if (uap->flags & MS_SYSSPACE) {
925 				error = copystr(uap->fstype, name,
926 				    FSTYPSZ, &n);
927 			} else {
928 				error = copyinstr(uap->fstype, name,
929 				    FSTYPSZ, &n);
930 			}
931 			if (error) {
932 				if (error == ENAMETOOLONG)
933 					return (EINVAL);
934 				return (error);
935 			}
936 			if ((vswp = vfs_getvfssw(name)) == NULL)
937 				return (EINVAL);
938 		}
939 	} else {
940 		if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
941 			return (EINVAL);
942 	}
943 	if (!VFS_INSTALLED(vswp))
944 		return (EINVAL);
945 	vfsops = &vswp->vsw_vfsops;
946 
947 	vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
948 	/*
949 	 * Fetch mount options and parse them for generic vfs options
950 	 */
951 	if (uap->flags & MS_OPTIONSTR) {
952 		/*
953 		 * Limit the buffer size
954 		 */
955 		if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
956 			error = EINVAL;
957 			goto errout;
958 		}
959 		if ((uap->flags & MS_SYSSPACE) == 0) {
960 			inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
961 			inargs[0] = '\0';
962 			if (optlen) {
963 				error = copyinstr(opts, inargs, (size_t)optlen,
964 					NULL);
965 				if (error) {
966 					goto errout;
967 				}
968 			}
969 		}
970 		vfs_parsemntopts(&mnt_mntopts, inargs, 0);
971 	}
972 	/*
973 	 * Flag bits override the options string.
974 	 */
975 	if (uap->flags & MS_REMOUNT)
976 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
977 	if (uap->flags & MS_RDONLY)
978 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
979 	if (uap->flags & MS_NOSUID)
980 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
981 
982 	/*
983 	 * Check if this is a remount; must be set in the option string and
984 	 * the file system must support a remount option.
985 	 */
986 	if (remount = vfs_optionisset_nolock(&mnt_mntopts,
987 	    MNTOPT_REMOUNT, NULL)) {
988 		if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
989 			error = ENOTSUP;
990 			goto errout;
991 		}
992 		uap->flags |= MS_REMOUNT;
993 	}
994 
995 	/*
996 	 * uap->flags and vfs_optionisset() should agree.
997 	 */
998 	if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
999 		uap->flags |= MS_RDONLY;
1000 	}
1001 	if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1002 		uap->flags |= MS_NOSUID;
1003 	}
1004 	nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1005 	ASSERT(splice || !remount);
1006 	/*
1007 	 * If we are splicing the fs into the namespace,
1008 	 * perform mount point checks.
1009 	 *
1010 	 * We want to resolve the path for the mount point to eliminate
1011 	 * '.' and ".." and symlinks in mount points; we can't do the
1012 	 * same for the resource string, since it would turn
1013 	 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1014 	 * this before grabbing vn_vfswlock(), because otherwise we
1015 	 * would deadlock with lookuppn().
1016 	 */
1017 	if (splice) {
1018 		ASSERT(vp->v_count > 0);
1019 
1020 		/*
1021 		 * Pick up mount point and device from appropriate space.
1022 		 */
1023 		if (pn_get(uap->spec, fromspace, &pn) == 0) {
1024 			resource = kmem_alloc(pn.pn_pathlen + 1,
1025 			    KM_SLEEP);
1026 			(void) strcpy(resource, pn.pn_path);
1027 			pn_free(&pn);
1028 		}
1029 		/*
1030 		 * Do a lookupname prior to taking the
1031 		 * writelock. Mark this as completed if
1032 		 * successful for later cleanup and addition to
1033 		 * the mount in progress table.
1034 		 */
1035 		if ((uap->flags & MS_GLOBAL) == 0 &&
1036 		    lookupname(uap->spec, fromspace,
1037 			    FOLLOW, NULL, &bvp) == 0) {
1038 			addmip = 1;
1039 		}
1040 
1041 		if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1042 			pathname_t *pnp;
1043 
1044 			if (*pn.pn_path != '/') {
1045 				error = EINVAL;
1046 				pn_free(&pn);
1047 				goto errout;
1048 			}
1049 			pn_alloc(&rpn);
1050 			/*
1051 			 * Kludge to prevent autofs from deadlocking with
1052 			 * itself when it calls domount().
1053 			 *
1054 			 * If autofs is calling, it is because it is doing
1055 			 * (autofs) mounts in the process of an NFS mount.  A
1056 			 * lookuppn() here would cause us to block waiting for
1057 			 * said NFS mount to complete, which can't since this
1058 			 * is the thread that was supposed to doing it.
1059 			 */
1060 			if (fromspace == UIO_USERSPACE) {
1061 				if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1062 				    NULL)) == 0) {
1063 					pnp = &rpn;
1064 				} else {
1065 					/*
1066 					 * The file disappeared or otherwise
1067 					 * became inaccessible since we opened
1068 					 * it; might as well fail the mount
1069 					 * since the mount point is no longer
1070 					 * accessible.
1071 					 */
1072 					pn_free(&rpn);
1073 					pn_free(&pn);
1074 					goto errout;
1075 				}
1076 			} else {
1077 				pnp = &pn;
1078 			}
1079 			mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1080 			(void) strcpy(mountpt, pnp->pn_path);
1081 
1082 			/*
1083 			 * If the addition of the zone's rootpath
1084 			 * would push us over a total path length
1085 			 * of MAXPATHLEN, we fail the mount with
1086 			 * ENAMETOOLONG, which is what we would have
1087 			 * gotten if we were trying to perform the same
1088 			 * mount in the global zone.
1089 			 *
1090 			 * strlen() doesn't count the trailing
1091 			 * '\0', but zone_rootpathlen counts both a
1092 			 * trailing '/' and the terminating '\0'.
1093 			 */
1094 			if ((curproc->p_zone->zone_rootpathlen - 1 +
1095 			    strlen(mountpt)) > MAXPATHLEN ||
1096 			    (resource != NULL &&
1097 			    (curproc->p_zone->zone_rootpathlen - 1 +
1098 			    strlen(resource)) > MAXPATHLEN)) {
1099 				error = ENAMETOOLONG;
1100 			}
1101 
1102 			pn_free(&rpn);
1103 			pn_free(&pn);
1104 		}
1105 
1106 		if (error)
1107 			goto errout;
1108 
1109 		/*
1110 		 * Prevent path name resolution from proceeding past
1111 		 * the mount point.
1112 		 */
1113 		if (vn_vfswlock(vp) != 0) {
1114 			error = EBUSY;
1115 			goto errout;
1116 		}
1117 
1118 		/*
1119 		 * Verify that it's legitimate to establish a mount on
1120 		 * the prospective mount point.
1121 		 */
1122 		if (vn_mountedvfs(vp) != NULL) {
1123 			/*
1124 			 * The mount point lock was obtained after some
1125 			 * other thread raced through and established a mount.
1126 			 */
1127 			vn_vfsunlock(vp);
1128 			error = EBUSY;
1129 			goto errout;
1130 		}
1131 		if (vp->v_flag & VNOMOUNT) {
1132 			vn_vfsunlock(vp);
1133 			error = EINVAL;
1134 			goto errout;
1135 		}
1136 	}
1137 	if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1138 		uap->dataptr = NULL;
1139 		uap->datalen = 0;
1140 	}
1141 
1142 	/*
1143 	 * If this is a remount, we don't want to create a new VFS.
1144 	 * Instead, we pass the existing one with a remount flag.
1145 	 */
1146 	if (remount) {
1147 		/*
1148 		 * Confirm that the mount point is the root vnode of the
1149 		 * file system that is being remounted.
1150 		 * This can happen if the user specifies a different
1151 		 * mount point directory pathname in the (re)mount command.
1152 		 *
1153 		 * Code below can only be reached if splice is true, so it's
1154 		 * safe to do vn_vfsunlock() here.
1155 		 */
1156 		if ((vp->v_flag & VROOT) == 0) {
1157 			vn_vfsunlock(vp);
1158 			error = ENOENT;
1159 			goto errout;
1160 		}
1161 		/*
1162 		 * Disallow making file systems read-only unless file system
1163 		 * explicitly allows it in its vfssw.  Ignore other flags.
1164 		 */
1165 		if (rdonly && vn_is_readonly(vp) == 0 &&
1166 		    (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1167 			vn_vfsunlock(vp);
1168 			error = EINVAL;
1169 			goto errout;
1170 		}
1171 		/*
1172 		 * Changing the NBMAND setting on remounts is permitted
1173 		 * but logged since it can lead to unexpected behavior.
1174 		 * We also counsel against using it for / and /usr.
1175 		 */
1176 		if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1177 		    (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1178 			cmn_err(CE_WARN, "domount: nbmand turned %s via "
1179 			    "remounting %s", nbmand ? "on" : "off",
1180 			    refstr_value(vp->v_vfsp->vfs_mntpt));
1181 		}
1182 		vfsp = vp->v_vfsp;
1183 		ovflags = vfsp->vfs_flag;
1184 		vfsp->vfs_flag |= VFS_REMOUNT;
1185 		vfsp->vfs_flag &= ~VFS_RDONLY;
1186 	} else {
1187 		vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP);
1188 		VFS_INIT(vfsp, vfsops, NULL);
1189 	}
1190 
1191 	VFS_HOLD(vfsp);
1192 
1193 	/*
1194 	 * The vfs_reflock is not used anymore the code below explicitly
1195 	 * holds it preventing others accesing it directly.
1196 	 */
1197 	if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1198 	    !(vfsp->vfs_flag & VFS_REMOUNT))
1199 		cmn_err(CE_WARN,
1200 		    "mount type %s couldn't get vfs_reflock\n", vswp->vsw_name);
1201 
1202 	/*
1203 	 * Lock the vfs. If this is a remount we want to avoid spurious umount
1204 	 * failures that happen as a side-effect of fsflush() and other mount
1205 	 * and unmount operations that might be going on simultaneously and
1206 	 * may have locked the vfs currently. To not return EBUSY immediately
1207 	 * here we use vfs_lock_wait() instead vfs_lock() for the remount case.
1208 	 */
1209 	if (!remount) {
1210 		if (error = vfs_lock(vfsp)) {
1211 			vfsp->vfs_flag = ovflags;
1212 			if (splice)
1213 				vn_vfsunlock(vp);
1214 			kmem_free(vfsp, sizeof (struct vfs));
1215 			goto errout;
1216 		}
1217 	} else {
1218 		vfs_lock_wait(vfsp);
1219 	}
1220 
1221 	/*
1222 	 * Add device to mount in progress table, global mounts require special
1223 	 * handling. It is possible that we have already done the lookupname
1224 	 * on a spliced, non-global fs. If so, we don't want to do it again
1225 	 * since we cannot do a lookupname after taking the
1226 	 * wlock above. This case is for a non-spliced, non-global filesystem.
1227 	 */
1228 	if (!addmip) {
1229 	    if ((uap->flags & MS_GLOBAL) == 0 &&
1230 		lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1231 			addmip = 1;
1232 		}
1233 	}
1234 
1235 	if (addmip) {
1236 		bdev = bvp->v_rdev;
1237 		VN_RELE(bvp);
1238 		vfs_addmip(bdev, vfsp);
1239 		addmip = 0;
1240 		delmip = 1;
1241 	}
1242 	/*
1243 	 * Invalidate cached entry for the mount point.
1244 	 */
1245 	if (splice)
1246 		dnlc_purge_vp(vp);
1247 
1248 	/*
1249 	 * If have an option string but the filesystem doesn't supply a
1250 	 * prototype options table, create a table with the global
1251 	 * options and sufficient room to accept all the options in the
1252 	 * string.  Then parse the passed in option string
1253 	 * accepting all the options in the string.  This gives us an
1254 	 * option table with all the proper cancel properties for the
1255 	 * global options.
1256 	 *
1257 	 * Filesystems that supply a prototype options table are handled
1258 	 * earlier in this function.
1259 	 */
1260 	if (uap->flags & MS_OPTIONSTR) {
1261 		if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1262 			mntopts_t tmp_mntopts;
1263 
1264 			tmp_mntopts.mo_count = 0;
1265 			vfs_createopttbl_extend(&tmp_mntopts, inargs,
1266 			    &mnt_mntopts);
1267 			vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1268 			vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1269 			vfs_freeopttbl(&tmp_mntopts);
1270 		}
1271 	}
1272 
1273 	/*
1274 	 * Serialize with zone creations.
1275 	 */
1276 	mount_in_progress();
1277 	/*
1278 	 * Instantiate (or reinstantiate) the file system.  If appropriate,
1279 	 * splice it into the file system name space.
1280 	 *
1281 	 * We want VFS_MOUNT() to be able to override the vfs_resource
1282 	 * string if necessary (ie, mntfs), and also for a remount to
1283 	 * change the same (necessary when remounting '/' during boot).
1284 	 * So we set up vfs_mntpt and vfs_resource to what we think they
1285 	 * should be, then hand off control to VFS_MOUNT() which can
1286 	 * override this.
1287 	 *
1288 	 * For safety's sake, when changing vfs_resource or vfs_mntpt of
1289 	 * a vfs which is on the vfs list (i.e. during a remount), we must
1290 	 * never set those fields to NULL. Several bits of code make
1291 	 * assumptions that the fields are always valid.
1292 	 */
1293 	vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1294 	if (remount) {
1295 		if ((oldresource = vfsp->vfs_resource) != NULL)
1296 			refstr_hold(oldresource);
1297 		if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1298 			refstr_hold(oldmntpt);
1299 	}
1300 	vfs_setresource(vfsp, resource);
1301 	vfs_setmntpoint(vfsp, mountpt);
1302 
1303 	error = VFS_MOUNT(vfsp, vp, uap, credp);
1304 
1305 	if (uap->flags & MS_RDONLY)
1306 		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1307 	if (uap->flags & MS_NOSUID)
1308 		vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1309 	if (uap->flags & MS_GLOBAL)
1310 		vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1311 
1312 	if (error) {
1313 		if (remount) {
1314 			/* put back pre-remount options */
1315 			vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1316 			vfs_setmntpoint(vfsp, (stripzonepath(
1317 					refstr_value(oldmntpt))));
1318 			if (oldmntpt)
1319 				refstr_rele(oldmntpt);
1320 			vfs_setresource(vfsp, (stripzonepath(
1321 					refstr_value(oldresource))));
1322 			if (oldresource)
1323 				refstr_rele(oldresource);
1324 			vfsp->vfs_flag = ovflags;
1325 			vfs_unlock(vfsp);
1326 			VFS_RELE(vfsp);
1327 		} else {
1328 			vfs_unlock(vfsp);
1329 			vfs_freemnttab(vfsp);
1330 			kmem_free(vfsp, sizeof (struct vfs));
1331 		}
1332 	} else {
1333 		/*
1334 		 * Set the mount time to now
1335 		 */
1336 		vfsp->vfs_mtime = ddi_get_time();
1337 		if (remount) {
1338 			vfsp->vfs_flag &= ~VFS_REMOUNT;
1339 			if (oldresource)
1340 				refstr_rele(oldresource);
1341 			if (oldmntpt)
1342 				refstr_rele(oldmntpt);
1343 		} else if (splice) {
1344 			/*
1345 			 * Link vfsp into the name space at the mount
1346 			 * point. Vfs_add() is responsible for
1347 			 * holding the mount point which will be
1348 			 * released when vfs_remove() is called.
1349 			 */
1350 			vfs_add(vp, vfsp, uap->flags);
1351 		} else {
1352 			/*
1353 			 * Hold the reference to file system which is
1354 			 * not linked into the name space.
1355 			 */
1356 			vfsp->vfs_zone = NULL;
1357 			VFS_HOLD(vfsp);
1358 			vfsp->vfs_vnodecovered = NULL;
1359 		}
1360 		/*
1361 		 * Set flags for global options encountered
1362 		 */
1363 		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1364 			vfsp->vfs_flag |= VFS_RDONLY;
1365 		else
1366 			vfsp->vfs_flag &= ~VFS_RDONLY;
1367 		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1368 			vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1369 		} else {
1370 			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1371 				vfsp->vfs_flag |= VFS_NODEVICES;
1372 			else
1373 				vfsp->vfs_flag &= ~VFS_NODEVICES;
1374 			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1375 				vfsp->vfs_flag |= VFS_NOSETUID;
1376 			else
1377 				vfsp->vfs_flag &= ~VFS_NOSETUID;
1378 		}
1379 		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1380 			vfsp->vfs_flag |= VFS_NBMAND;
1381 		else
1382 			vfsp->vfs_flag &= ~VFS_NBMAND;
1383 
1384 		if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1385 			vfsp->vfs_flag |= VFS_XATTR;
1386 		else
1387 			vfsp->vfs_flag &= ~VFS_XATTR;
1388 
1389 		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1390 			vfsp->vfs_flag |= VFS_NOEXEC;
1391 		else
1392 			vfsp->vfs_flag &= ~VFS_NOEXEC;
1393 
1394 		/*
1395 		 * Now construct the output option string of options
1396 		 * we recognized.
1397 		 */
1398 		if (uap->flags & MS_OPTIONSTR) {
1399 			vfs_list_read_lock();
1400 			copyout_error = vfs_buildoptionstr(
1401 				&vfsp->vfs_mntopts, inargs, optlen);
1402 			vfs_list_unlock();
1403 			if (copyout_error == 0 &&
1404 			    (uap->flags & MS_SYSSPACE) == 0) {
1405 				copyout_error = copyoutstr(inargs, opts,
1406 				    optlen, NULL);
1407 			}
1408 		}
1409 
1410 		/* Set flag for statistics collection */
1411 		if (vswp->vsw_flag & VSW_STATS) {
1412 			vfsp->vfs_flag |= VFS_STATS;
1413 		}
1414 
1415 		vfs_unlock(vfsp);
1416 	}
1417 	mount_completed();
1418 	if (splice)
1419 		vn_vfsunlock(vp);
1420 
1421 	if ((error == 0) && (copyout_error == 0)) {
1422 		/*
1423 		 * If this isn't a remount, set up the vopstats before
1424 		 * anyone can touch this
1425 		 */
1426 		if (!remount)
1427 			setup_vopstats(vfsp);
1428 
1429 		/* Return vfsp to caller. */
1430 		*vfspp = vfsp;
1431 	}
1432 errout:
1433 	vfs_freeopttbl(&mnt_mntopts);
1434 	if (resource != NULL)
1435 		kmem_free(resource, strlen(resource) + 1);
1436 	if (mountpt != NULL)
1437 		kmem_free(mountpt, strlen(mountpt) + 1);
1438 	/*
1439 	 * It is possible we errored prior to adding to mount in progress
1440 	 * table. Must free vnode we acquired with successful lookupname.
1441 	 */
1442 	if (addmip)
1443 		VN_RELE(bvp);
1444 	if (delmip)
1445 		vfs_delmip(vfsp);
1446 	ASSERT(vswp != NULL);
1447 	vfs_unrefvfssw(vswp);
1448 	if (inargs != opts)
1449 		kmem_free(inargs, MAX_MNTOPT_STR);
1450 	if (copyout_error) {
1451 		VFS_RELE(vfsp);
1452 		error = copyout_error;
1453 	}
1454 	return (error);
1455 }
1456 
1457 static void
1458 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath)
1459 {
1460 	size_t len;
1461 	refstr_t *ref;
1462 	zone_t *zone = curproc->p_zone;
1463 	char *sp;
1464 	int have_list_lock = 0;
1465 
1466 	ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1467 
1468 	/*
1469 	 * New path must be less than MAXPATHLEN because mntfs
1470 	 * will only display up to MAXPATHLEN bytes. This is currently
1471 	 * safe, because domount() uses pn_get(), and other callers
1472 	 * similarly cap the size to fewer than MAXPATHLEN bytes.
1473 	 */
1474 
1475 	ASSERT(strlen(newpath) < MAXPATHLEN);
1476 
1477 	/* mntfs requires consistency while vfs list lock is held */
1478 
1479 	if (VFS_ON_LIST(vfsp)) {
1480 		have_list_lock = 1;
1481 		vfs_list_lock();
1482 	}
1483 
1484 	if (*refp != NULL)
1485 		refstr_rele(*refp);
1486 
1487 	/* Do we need to modify the path? */
1488 
1489 	if (zone == global_zone || *newpath != '/') {
1490 		ref = refstr_alloc(newpath);
1491 		goto out;
1492 	}
1493 
1494 	/*
1495 	 * Truncate the trailing '/' in the zoneroot, and merge
1496 	 * in the zone's rootpath with the "newpath" (resource
1497 	 * or mountpoint) passed in.
1498 	 *
1499 	 * The size of the required buffer is thus the size of
1500 	 * the buffer required for the passed-in newpath
1501 	 * (strlen(newpath) + 1), plus the size of the buffer
1502 	 * required to hold zone_rootpath (zone_rootpathlen)
1503 	 * minus one for one of the now-superfluous NUL
1504 	 * terminations, minus one for the trailing '/'.
1505 	 *
1506 	 * That gives us:
1507 	 *
1508 	 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1509 	 *
1510 	 * Which is what we have below.
1511 	 */
1512 
1513 	len = strlen(newpath) + zone->zone_rootpathlen - 1;
1514 	sp = kmem_alloc(len, KM_SLEEP);
1515 
1516 	/*
1517 	 * Copy everything including the trailing slash, which
1518 	 * we then overwrite with the NUL character.
1519 	 */
1520 
1521 	(void) strcpy(sp, zone->zone_rootpath);
1522 	sp[zone->zone_rootpathlen - 2] = '\0';
1523 	(void) strcat(sp, newpath);
1524 
1525 	ref = refstr_alloc(sp);
1526 	kmem_free(sp, len);
1527 out:
1528 	*refp = ref;
1529 
1530 	if (have_list_lock) {
1531 		vfs_mnttab_modtimeupd();
1532 		vfs_list_unlock();
1533 	}
1534 }
1535 
1536 /*
1537  * Record a mounted resource name in a vfs structure.
1538  * If vfsp is already mounted, caller must hold the vfs lock.
1539  */
1540 void
1541 vfs_setresource(struct vfs *vfsp, const char *resource)
1542 {
1543 	if (resource == NULL || resource[0] == '\0')
1544 		resource = VFS_NORESOURCE;
1545 	vfs_setpath(vfsp, &vfsp->vfs_resource, resource);
1546 }
1547 
1548 /*
1549  * Record a mount point name in a vfs structure.
1550  * If vfsp is already mounted, caller must hold the vfs lock.
1551  */
1552 void
1553 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt)
1554 {
1555 	if (mntpt == NULL || mntpt[0] == '\0')
1556 		mntpt = VFS_NOMNTPT;
1557 	vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt);
1558 }
1559 
1560 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1561 
1562 refstr_t *
1563 vfs_getresource(const struct vfs *vfsp)
1564 {
1565 	refstr_t *resource;
1566 
1567 	vfs_list_read_lock();
1568 	resource = vfsp->vfs_resource;
1569 	refstr_hold(resource);
1570 	vfs_list_unlock();
1571 
1572 	return (resource);
1573 }
1574 
1575 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1576 
1577 refstr_t *
1578 vfs_getmntpoint(const struct vfs *vfsp)
1579 {
1580 	refstr_t *mntpt;
1581 
1582 	vfs_list_read_lock();
1583 	mntpt = vfsp->vfs_mntpt;
1584 	refstr_hold(mntpt);
1585 	vfs_list_unlock();
1586 
1587 	return (mntpt);
1588 }
1589 
1590 /*
1591  * Create an empty options table with enough empty slots to hold all
1592  * The options in the options string passed as an argument.
1593  * Potentially prepend another options table.
1594  *
1595  * Note: caller is responsible for locking the vfs list, if needed,
1596  *       to protect mops.
1597  */
1598 static void
1599 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1600     const mntopts_t *mtmpl)
1601 {
1602 	const char *s = opts;
1603 	uint_t count;
1604 
1605 	if (opts == NULL || *opts == '\0') {
1606 		count = 0;
1607 	} else {
1608 		count = 1;
1609 
1610 		/*
1611 		 * Count number of options in the string
1612 		 */
1613 		for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1614 			count++;
1615 			s++;
1616 		}
1617 	}
1618 	vfs_copyopttbl_extend(mtmpl, mops, count);
1619 }
1620 
1621 /*
1622  * Create an empty options table with enough empty slots to hold all
1623  * The options in the options string passed as an argument.
1624  *
1625  * This function is *not* for general use by filesystems.
1626  *
1627  * Note: caller is responsible for locking the vfs list, if needed,
1628  *       to protect mops.
1629  */
1630 void
1631 vfs_createopttbl(mntopts_t *mops, const char *opts)
1632 {
1633 	vfs_createopttbl_extend(mops, opts, NULL);
1634 }
1635 
1636 
1637 /*
1638  * Swap two mount options tables
1639  */
1640 static void
1641 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
1642 {
1643 	uint_t tmpcnt;
1644 	mntopt_t *tmplist;
1645 
1646 	tmpcnt = optbl2->mo_count;
1647 	tmplist = optbl2->mo_list;
1648 	optbl2->mo_count = optbl1->mo_count;
1649 	optbl2->mo_list = optbl1->mo_list;
1650 	optbl1->mo_count = tmpcnt;
1651 	optbl1->mo_list = tmplist;
1652 }
1653 
1654 static void
1655 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
1656 {
1657 	vfs_list_lock();
1658 	vfs_swapopttbl_nolock(optbl1, optbl2);
1659 	vfs_mnttab_modtimeupd();
1660 	vfs_list_unlock();
1661 }
1662 
1663 static char **
1664 vfs_copycancelopt_extend(char **const moc, int extend)
1665 {
1666 	int i = 0;
1667 	int j;
1668 	char **result;
1669 
1670 	if (moc != NULL) {
1671 		for (; moc[i] != NULL; i++)
1672 			/* count number of options to cancel */;
1673 	}
1674 
1675 	if (i + extend == 0)
1676 		return (NULL);
1677 
1678 	result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
1679 
1680 	for (j = 0; j < i; j++) {
1681 		result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
1682 		(void) strcpy(result[j], moc[j]);
1683 	}
1684 	for (; j <= i + extend; j++)
1685 		result[j] = NULL;
1686 
1687 	return (result);
1688 }
1689 
1690 static void
1691 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
1692 {
1693 	char *sp, *dp;
1694 
1695 	d->mo_flags = s->mo_flags;
1696 	d->mo_data = s->mo_data;
1697 	sp = s->mo_name;
1698 	if (sp != NULL) {
1699 		dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
1700 		(void) strcpy(dp, sp);
1701 		d->mo_name = dp;
1702 	} else {
1703 		d->mo_name = NULL; /* should never happen */
1704 	}
1705 
1706 	d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
1707 
1708 	sp = s->mo_arg;
1709 	if (sp != NULL) {
1710 		dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
1711 		(void) strcpy(dp, sp);
1712 		d->mo_arg = dp;
1713 	} else {
1714 		d->mo_arg = NULL;
1715 	}
1716 }
1717 
1718 /*
1719  * Copy a mount options table, possibly allocating some spare
1720  * slots at the end.  It is permissible to copy_extend the NULL table.
1721  */
1722 static void
1723 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
1724 {
1725 	uint_t i, count;
1726 	mntopt_t *motbl;
1727 
1728 	/*
1729 	 * Clear out any existing stuff in the options table being initialized
1730 	 */
1731 	vfs_freeopttbl(dmo);
1732 	count = (smo == NULL) ? 0 : smo->mo_count;
1733 	if ((count + extra) == 0)	/* nothing to do */
1734 		return;
1735 	dmo->mo_count = count + extra;
1736 	motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
1737 	dmo->mo_list = motbl;
1738 	for (i = 0; i < count; i++) {
1739 		vfs_copyopt(&smo->mo_list[i], &motbl[i]);
1740 	}
1741 	for (i = count; i < count + extra; i++) {
1742 		motbl[i].mo_flags = MO_EMPTY;
1743 	}
1744 }
1745 
1746 /*
1747  * Copy a mount options table.
1748  *
1749  * This function is *not* for general use by filesystems.
1750  *
1751  * Note: caller is responsible for locking the vfs list, if needed,
1752  *       to protect smo and dmo.
1753  */
1754 void
1755 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
1756 {
1757 	vfs_copyopttbl_extend(smo, dmo, 0);
1758 }
1759 
1760 static char **
1761 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
1762 {
1763 	int c1 = 0;
1764 	int c2 = 0;
1765 	char **result;
1766 	char **sp1, **sp2, **dp;
1767 
1768 	/*
1769 	 * First we count both lists of cancel options.
1770 	 * If either is NULL or has no elements, we return a copy of
1771 	 * the other.
1772 	 */
1773 	if (mop1->mo_cancel != NULL) {
1774 		for (; mop1->mo_cancel[c1] != NULL; c1++)
1775 			/* count cancel options in mop1 */;
1776 	}
1777 
1778 	if (c1 == 0)
1779 		return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
1780 
1781 	if (mop2->mo_cancel != NULL) {
1782 		for (; mop2->mo_cancel[c2] != NULL; c2++)
1783 			/* count cancel options in mop2 */;
1784 	}
1785 
1786 	result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
1787 
1788 	if (c2 == 0)
1789 		return (result);
1790 
1791 	/*
1792 	 * When we get here, we've got two sets of cancel options;
1793 	 * we need to merge the two sets.  We know that the result
1794 	 * array has "c1+c2+1" entries and in the end we might shrink
1795 	 * it.
1796 	 * Result now has a copy of the c1 entries from mop1; we'll
1797 	 * now lookup all the entries of mop2 in mop1 and copy it if
1798 	 * it is unique.
1799 	 * This operation is O(n^2) but it's only called once per
1800 	 * filesystem per duplicate option.  This is a situation
1801 	 * which doesn't arise with the filesystems in ON and
1802 	 * n is generally 1.
1803 	 */
1804 
1805 	dp = &result[c1];
1806 	for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
1807 		for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
1808 			if (strcmp(*sp1, *sp2) == 0)
1809 				break;
1810 		}
1811 		if (*sp1 == NULL) {
1812 			/*
1813 			 * Option *sp2 not found in mop1, so copy it.
1814 			 * The calls to vfs_copycancelopt_extend()
1815 			 * guarantee that there's enough room.
1816 			 */
1817 			*dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
1818 			(void) strcpy(*dp++, *sp2);
1819 		}
1820 	}
1821 	if (dp != &result[c1+c2]) {
1822 		size_t bytes = (dp - result + 1) * sizeof (char *);
1823 		char **nres = kmem_alloc(bytes, KM_SLEEP);
1824 
1825 		bcopy(result, nres, bytes);
1826 		kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
1827 		result = nres;
1828 	}
1829 	return (result);
1830 }
1831 
1832 /*
1833  * Merge two mount option tables (outer and inner) into one.  This is very
1834  * similar to "merging" global variables and automatic variables in C.
1835  *
1836  * This isn't (and doesn't have to be) fast.
1837  *
1838  * This function is *not* for general use by filesystems.
1839  *
1840  * Note: caller is responsible for locking the vfs list, if needed,
1841  *       to protect omo, imo & dmo.
1842  */
1843 void
1844 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
1845 {
1846 	uint_t i, count;
1847 	mntopt_t *mop, *motbl;
1848 	uint_t freeidx;
1849 
1850 	/*
1851 	 * First determine how much space we need to allocate.
1852 	 */
1853 	count = omo->mo_count;
1854 	for (i = 0; i < imo->mo_count; i++) {
1855 		if (imo->mo_list[i].mo_flags & MO_EMPTY)
1856 			continue;
1857 		if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
1858 			count++;
1859 	}
1860 	ASSERT(count >= omo->mo_count &&
1861 	    count <= omo->mo_count + imo->mo_count);
1862 	motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
1863 	for (i = 0; i < omo->mo_count; i++)
1864 		vfs_copyopt(&omo->mo_list[i], &motbl[i]);
1865 	freeidx = omo->mo_count;
1866 	for (i = 0; i < imo->mo_count; i++) {
1867 		if (imo->mo_list[i].mo_flags & MO_EMPTY)
1868 			continue;
1869 		if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
1870 			char **newcanp;
1871 			uint_t index = mop - omo->mo_list;
1872 
1873 			newcanp = vfs_mergecancelopts(mop, &motbl[index]);
1874 
1875 			vfs_freeopt(&motbl[index]);
1876 			vfs_copyopt(&imo->mo_list[i], &motbl[index]);
1877 
1878 			vfs_freecancelopt(motbl[index].mo_cancel);
1879 			motbl[index].mo_cancel = newcanp;
1880 		} else {
1881 			/*
1882 			 * If it's a new option, just copy it over to the first
1883 			 * free location.
1884 			 */
1885 			vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
1886 		}
1887 	}
1888 	dmo->mo_count = count;
1889 	dmo->mo_list = motbl;
1890 }
1891 
1892 /*
1893  * Functions to set and clear mount options in a mount options table.
1894  */
1895 
1896 /*
1897  * Clear a mount option, if it exists.
1898  *
1899  * The update_mnttab arg indicates whether mops is part of a vfs that is on
1900  * the vfs list.
1901  */
1902 static void
1903 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
1904 {
1905 	struct mntopt *mop;
1906 	uint_t i, count;
1907 
1908 	ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
1909 
1910 	count = mops->mo_count;
1911 	for (i = 0; i < count; i++) {
1912 		mop = &mops->mo_list[i];
1913 
1914 		if (mop->mo_flags & MO_EMPTY)
1915 			continue;
1916 		if (strcmp(opt, mop->mo_name))
1917 			continue;
1918 		mop->mo_flags &= ~MO_SET;
1919 		if (mop->mo_arg != NULL) {
1920 			kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
1921 		}
1922 		mop->mo_arg = NULL;
1923 		if (update_mnttab)
1924 			vfs_mnttab_modtimeupd();
1925 		break;
1926 	}
1927 }
1928 
1929 void
1930 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
1931 {
1932 	int gotlock = 0;
1933 
1934 	if (VFS_ON_LIST(vfsp)) {
1935 		gotlock = 1;
1936 		vfs_list_lock();
1937 	}
1938 	vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
1939 	if (gotlock)
1940 		vfs_list_unlock();
1941 }
1942 
1943 
1944 /*
1945  * Set a mount option on.  If it's not found in the table, it's silently
1946  * ignored.  If the option has MO_IGNORE set, it is still set unless the
1947  * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
1948  * bits can be used to toggle the MO_NODISPLAY bit for the option.
1949  * If the VFS_CREATEOPT flag bit is set then the first option slot with
1950  * MO_EMPTY set is created as the option passed in.
1951  *
1952  * The update_mnttab arg indicates whether mops is part of a vfs that is on
1953  * the vfs list.
1954  */
1955 static void
1956 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
1957     const char *arg, int flags, int update_mnttab)
1958 {
1959 	mntopt_t *mop;
1960 	uint_t i, count;
1961 	char *sp;
1962 
1963 	ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
1964 
1965 	if (flags & VFS_CREATEOPT) {
1966 		if (vfs_hasopt(mops, opt) != NULL) {
1967 			flags &= ~VFS_CREATEOPT;
1968 		}
1969 	}
1970 	count = mops->mo_count;
1971 	for (i = 0; i < count; i++) {
1972 		mop = &mops->mo_list[i];
1973 
1974 		if (mop->mo_flags & MO_EMPTY) {
1975 			if ((flags & VFS_CREATEOPT) == 0)
1976 				continue;
1977 			sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
1978 			(void) strcpy(sp, opt);
1979 			mop->mo_name = sp;
1980 			if (arg != NULL)
1981 				mop->mo_flags = MO_HASVALUE;
1982 			else
1983 				mop->mo_flags = 0;
1984 		} else if (strcmp(opt, mop->mo_name)) {
1985 			continue;
1986 		}
1987 		if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
1988 			break;
1989 		if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
1990 			sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
1991 			(void) strcpy(sp, arg);
1992 		} else {
1993 			sp = NULL;
1994 		}
1995 		if (mop->mo_arg != NULL)
1996 			kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
1997 		mop->mo_arg = sp;
1998 		if (flags & VFS_DISPLAY)
1999 			mop->mo_flags &= ~MO_NODISPLAY;
2000 		if (flags & VFS_NODISPLAY)
2001 			mop->mo_flags |= MO_NODISPLAY;
2002 		mop->mo_flags |= MO_SET;
2003 		if (mop->mo_cancel != NULL) {
2004 			char **cp;
2005 
2006 			for (cp = mop->mo_cancel; *cp != NULL; cp++)
2007 				vfs_clearmntopt_nolock(mops, *cp, 0);
2008 		}
2009 		if (update_mnttab)
2010 			vfs_mnttab_modtimeupd();
2011 		break;
2012 	}
2013 }
2014 
2015 void
2016 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2017 {
2018 	int gotlock = 0;
2019 
2020 	if (VFS_ON_LIST(vfsp)) {
2021 		gotlock = 1;
2022 		vfs_list_lock();
2023 	}
2024 	vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2025 	if (gotlock)
2026 		vfs_list_unlock();
2027 }
2028 
2029 
2030 /*
2031  * Add a "tag" option to a mounted file system's options list.
2032  *
2033  * Note: caller is responsible for locking the vfs list, if needed,
2034  *       to protect mops.
2035  */
2036 static mntopt_t *
2037 vfs_addtag(mntopts_t *mops, const char *tag)
2038 {
2039 	uint_t count;
2040 	mntopt_t *mop, *motbl;
2041 
2042 	count = mops->mo_count + 1;
2043 	motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2044 	if (mops->mo_count) {
2045 		size_t len = (count - 1) * sizeof (mntopt_t);
2046 
2047 		bcopy(mops->mo_list, motbl, len);
2048 		kmem_free(mops->mo_list, len);
2049 	}
2050 	mops->mo_count = count;
2051 	mops->mo_list = motbl;
2052 	mop = &motbl[count - 1];
2053 	mop->mo_flags = MO_TAG;
2054 	mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2055 	(void) strcpy(mop->mo_name, tag);
2056 	return (mop);
2057 }
2058 
2059 /*
2060  * Allow users to set arbitrary "tags" in a vfs's mount options.
2061  * Broader use within the kernel is discouraged.
2062  */
2063 int
2064 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2065     cred_t *cr)
2066 {
2067 	vfs_t *vfsp;
2068 	mntopts_t *mops;
2069 	mntopt_t *mop;
2070 	int found = 0;
2071 	dev_t dev = makedevice(major, minor);
2072 	int err = 0;
2073 	char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2074 
2075 	/*
2076 	 * Find the desired mounted file system
2077 	 */
2078 	vfs_list_lock();
2079 	vfsp = rootvfs;
2080 	do {
2081 		if (vfsp->vfs_dev == dev &&
2082 		    strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2083 			found = 1;
2084 			break;
2085 		}
2086 		vfsp = vfsp->vfs_next;
2087 	} while (vfsp != rootvfs);
2088 
2089 	if (!found) {
2090 		err = EINVAL;
2091 		goto out;
2092 	}
2093 	err = secpolicy_fs_config(cr, vfsp);
2094 	if (err != 0)
2095 		goto out;
2096 
2097 	mops = &vfsp->vfs_mntopts;
2098 	/*
2099 	 * Add tag if it doesn't already exist
2100 	 */
2101 	if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2102 		int len;
2103 
2104 		(void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2105 		len = strlen(buf);
2106 		if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2107 			err = ENAMETOOLONG;
2108 			goto out;
2109 		}
2110 		mop = vfs_addtag(mops, tag);
2111 	}
2112 	if ((mop->mo_flags & MO_TAG) == 0) {
2113 		err = EINVAL;
2114 		goto out;
2115 	}
2116 	vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2117 out:
2118 	vfs_list_unlock();
2119 	kmem_free(buf, MAX_MNTOPT_STR);
2120 	return (err);
2121 }
2122 
2123 /*
2124  * Allow users to remove arbitrary "tags" in a vfs's mount options.
2125  * Broader use within the kernel is discouraged.
2126  */
2127 int
2128 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2129     cred_t *cr)
2130 {
2131 	vfs_t *vfsp;
2132 	mntopt_t *mop;
2133 	int found = 0;
2134 	dev_t dev = makedevice(major, minor);
2135 	int err = 0;
2136 
2137 	/*
2138 	 * Find the desired mounted file system
2139 	 */
2140 	vfs_list_lock();
2141 	vfsp = rootvfs;
2142 	do {
2143 		if (vfsp->vfs_dev == dev &&
2144 		    strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2145 			found = 1;
2146 			break;
2147 		}
2148 		vfsp = vfsp->vfs_next;
2149 	} while (vfsp != rootvfs);
2150 
2151 	if (!found) {
2152 		err = EINVAL;
2153 		goto out;
2154 	}
2155 	err = secpolicy_fs_config(cr, vfsp);
2156 	if (err != 0)
2157 		goto out;
2158 
2159 	if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2160 		err = EINVAL;
2161 		goto out;
2162 	}
2163 	if ((mop->mo_flags & MO_TAG) == 0) {
2164 		err = EINVAL;
2165 		goto out;
2166 	}
2167 	vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2168 out:
2169 	vfs_list_unlock();
2170 	return (err);
2171 }
2172 
2173 /*
2174  * Function to parse an option string and fill in a mount options table.
2175  * Unknown options are silently ignored.  The input option string is modified
2176  * by replacing separators with nulls.  If the create flag is set, options
2177  * not found in the table are just added on the fly.  The table must have
2178  * an option slot marked MO_EMPTY to add an option on the fly.
2179  *
2180  * This function is *not* for general use by filesystems.
2181  *
2182  * Note: caller is responsible for locking the vfs list, if needed,
2183  *       to protect mops..
2184  */
2185 void
2186 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2187 {
2188 	char *s = osp, *p, *nextop, *valp, *cp, *ep;
2189 	int setflg = VFS_NOFORCEOPT;
2190 
2191 	if (osp == NULL)
2192 		return;
2193 	while (*s != '\0') {
2194 		p = strchr(s, ',');	/* find next option */
2195 		if (p == NULL) {
2196 			cp = NULL;
2197 			p = s + strlen(s);
2198 		} else {
2199 			cp = p;		/* save location of comma */
2200 			*p++ = '\0';	/* mark end and point to next option */
2201 		}
2202 		nextop = p;
2203 		p = strchr(s, '=');	/* look for value */
2204 		if (p == NULL) {
2205 			valp = NULL;	/* no value supplied */
2206 		} else {
2207 			ep = p;		/* save location of equals */
2208 			*p++ = '\0';	/* end option and point to value */
2209 			valp = p;
2210 		}
2211 		/*
2212 		 * set option into options table
2213 		 */
2214 		if (create)
2215 			setflg |= VFS_CREATEOPT;
2216 		vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2217 		if (cp != NULL)
2218 			*cp = ',';	/* restore the comma */
2219 		if (valp != NULL)
2220 			*ep = '=';	/* restore the equals */
2221 		s = nextop;
2222 	}
2223 }
2224 
2225 /*
2226  * Function to inquire if an option exists in a mount options table.
2227  * Returns a pointer to the option if it exists, else NULL.
2228  *
2229  * This function is *not* for general use by filesystems.
2230  *
2231  * Note: caller is responsible for locking the vfs list, if needed,
2232  *       to protect mops.
2233  */
2234 struct mntopt *
2235 vfs_hasopt(const mntopts_t *mops, const char *opt)
2236 {
2237 	struct mntopt *mop;
2238 	uint_t i, count;
2239 
2240 	count = mops->mo_count;
2241 	for (i = 0; i < count; i++) {
2242 		mop = &mops->mo_list[i];
2243 
2244 		if (mop->mo_flags & MO_EMPTY)
2245 			continue;
2246 		if (strcmp(opt, mop->mo_name) == 0)
2247 			return (mop);
2248 	}
2249 	return (NULL);
2250 }
2251 
2252 /*
2253  * Function to inquire if an option is set in a mount options table.
2254  * Returns non-zero if set and fills in the arg pointer with a pointer to
2255  * the argument string or NULL if there is no argument string.
2256  */
2257 static int
2258 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2259 {
2260 	struct mntopt *mop;
2261 	uint_t i, count;
2262 
2263 	count = mops->mo_count;
2264 	for (i = 0; i < count; i++) {
2265 		mop = &mops->mo_list[i];
2266 
2267 		if (mop->mo_flags & MO_EMPTY)
2268 			continue;
2269 		if (strcmp(opt, mop->mo_name))
2270 			continue;
2271 		if ((mop->mo_flags & MO_SET) == 0)
2272 			return (0);
2273 		if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2274 			*argp = mop->mo_arg;
2275 		return (1);
2276 	}
2277 	return (0);
2278 }
2279 
2280 
2281 int
2282 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2283 {
2284 	int ret;
2285 
2286 	vfs_list_read_lock();
2287 	ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2288 	vfs_list_unlock();
2289 	return (ret);
2290 }
2291 
2292 
2293 /*
2294  * Construct a comma separated string of the options set in the given
2295  * mount table, return the string in the given buffer.  Return non-zero if
2296  * the buffer would overflow.
2297  *
2298  * This function is *not* for general use by filesystems.
2299  *
2300  * Note: caller is responsible for locking the vfs list, if needed,
2301  *       to protect mp.
2302  */
2303 int
2304 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2305 {
2306 	char *cp;
2307 	uint_t i;
2308 
2309 	buf[0] = '\0';
2310 	cp = buf;
2311 	for (i = 0; i < mp->mo_count; i++) {
2312 		struct mntopt *mop;
2313 
2314 		mop = &mp->mo_list[i];
2315 		if (mop->mo_flags & MO_SET) {
2316 			int optlen, comma = 0;
2317 
2318 			if (buf[0] != '\0')
2319 				comma = 1;
2320 			optlen = strlen(mop->mo_name);
2321 			if (strlen(buf) + comma + optlen + 1 > len)
2322 				goto err;
2323 			if (comma)
2324 				*cp++ = ',';
2325 			(void) strcpy(cp, mop->mo_name);
2326 			cp += optlen;
2327 			/*
2328 			 * Append option value if there is one
2329 			 */
2330 			if (mop->mo_arg != NULL) {
2331 				int arglen;
2332 
2333 				arglen = strlen(mop->mo_arg);
2334 				if (strlen(buf) + arglen + 2 > len)
2335 					goto err;
2336 				*cp++ = '=';
2337 				(void) strcpy(cp, mop->mo_arg);
2338 				cp += arglen;
2339 			}
2340 		}
2341 	}
2342 	return (0);
2343 err:
2344 	return (EOVERFLOW);
2345 }
2346 
/*
 * Free a NULL-terminated array of cancel-option strings: every string in
 * it, then the array itself (including its terminating slot).  A NULL
 * array is ignored.
 */
static void
vfs_freecancelopt(char **moc)
{
	int n = 0;

	if (moc == NULL)
		return;

	while (moc[n] != NULL) {
		kmem_free(moc[n], strlen(moc[n]) + 1);
		n++;
	}
	kmem_free(moc, (n + 1) * sizeof (char *));
}
2361 
2362 static void
2363 vfs_freeopt(mntopt_t *mop)
2364 {
2365 	if (mop->mo_name != NULL)
2366 		kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2367 
2368 	vfs_freecancelopt(mop->mo_cancel);
2369 
2370 	if (mop->mo_arg != NULL)
2371 		kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2372 }
2373 
2374 /*
2375  * Free a mount options table
2376  *
2377  * This function is *not* for general use by filesystems.
2378  *
2379  * Note: caller is responsible for locking the vfs list, if needed,
2380  *       to protect mp.
2381  */
2382 void
2383 vfs_freeopttbl(mntopts_t *mp)
2384 {
2385 	uint_t i, count;
2386 
2387 	count = mp->mo_count;
2388 	for (i = 0; i < count; i++) {
2389 		vfs_freeopt(&mp->mo_list[i]);
2390 	}
2391 	if (count) {
2392 		kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2393 		mp->mo_count = 0;
2394 		mp->mo_list = NULL;
2395 	}
2396 }
2397 
/*
 * Free any mnttab information recorded in the vfs struct: the mount point
 * and resource strings and the mount options table.
 * The vfs must not be on the vfs list.
 */
static void
vfs_freemnttab(struct vfs *vfsp)
{
	ASSERT(!VFS_ON_LIST(vfsp));

	/*
	 * Free device and mount point information.  Each string is
	 * dropped with refstr_rele() and its pointer is cleared
	 * afterwards.
	 */
	if (vfsp->vfs_mntpt != NULL) {
		refstr_rele(vfsp->vfs_mntpt);
		vfsp->vfs_mntpt = NULL;
	}
	if (vfsp->vfs_resource != NULL) {
		refstr_rele(vfsp->vfs_resource);
		vfsp->vfs_resource = NULL;
	}
	/*
	 * Now free mount options information
	 */
	vfs_freeopttbl(&vfsp->vfs_mntopts);
}
2423 
/*
 * Return the last mnttab modification time by copying vfs_mnttab_mtime
 * into *ts.  The caller must hold the vfslist lock.
 */
void
vfs_mnttab_modtime(timespec_t *ts)
{
	ASSERT(RW_LOCK_HELD(&vfslist));
	*ts = vfs_mnttab_mtime;
}
2433 
2434 /*
2435  * See if mnttab is changed
2436  */
2437 void
2438 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2439 {
2440 	int changed;
2441 
2442 	*phpp = (struct pollhead *)NULL;
2443 
2444 	/*
2445 	 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2446 	 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2447 	 * to not grab the vfs list lock because tv_sec is monotonically
2448 	 * increasing.
2449 	 */
2450 
2451 	changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2452 	    (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2453 	if (!changed) {
2454 		*phpp = &vfs_pollhd;
2455 	}
2456 }
2457 
2458 /*
2459  * Update the mnttab modification time and wake up any waiters for
2460  * mnttab changes
2461  */
2462 void
2463 vfs_mnttab_modtimeupd()
2464 {
2465 	hrtime_t oldhrt, newhrt;
2466 
2467 	ASSERT(RW_WRITE_HELD(&vfslist));
2468 	oldhrt = ts2hrt(&vfs_mnttab_mtime);
2469 	gethrestime(&vfs_mnttab_mtime);
2470 	newhrt = ts2hrt(&vfs_mnttab_mtime);
2471 	if (oldhrt == (hrtime_t)0)
2472 		vfs_mnttab_ctime = vfs_mnttab_mtime;
2473 	/*
2474 	 * Attempt to provide unique mtime (like uniqtime but not).
2475 	 */
2476 	if (newhrt == oldhrt) {
2477 		newhrt++;
2478 		hrt2ts(newhrt, &vfs_mnttab_mtime);
2479 	}
2480 	pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
2481 }
2482 
2483 int
2484 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
2485 {
2486 	vnode_t *coveredvp;
2487 	int error;
2488 	extern void teardown_vopstats(vfs_t *);
2489 
2490 	/*
2491 	 * Get covered vnode. This will be NULL if the vfs is not linked
2492 	 * into the file system name space (i.e., domount() with MNT_NOSPICE).
2493 	 */
2494 	coveredvp = vfsp->vfs_vnodecovered;
2495 	ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
2496 
2497 	/*
2498 	 * Purge all dnlc entries for this vfs.
2499 	 */
2500 	(void) dnlc_purge_vfsp(vfsp, 0);
2501 
2502 	/* For forcible umount, skip VFS_SYNC() since it may hang */
2503 	if ((flag & MS_FORCE) == 0)
2504 		(void) VFS_SYNC(vfsp, 0, cr);
2505 
2506 	/*
2507 	 * Lock the vfs to maintain fs status quo during unmount.  This
2508 	 * has to be done after the sync because ufs_update tries to acquire
2509 	 * the vfs_reflock.
2510 	 */
2511 	vfs_lock_wait(vfsp);
2512 
2513 	if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
2514 		vfs_unlock(vfsp);
2515 		if (coveredvp != NULL)
2516 			vn_vfsunlock(coveredvp);
2517 	} else if (coveredvp != NULL) {
2518 		teardown_vopstats(vfsp);
2519 		/*
2520 		 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
2521 		 * when it frees vfsp so we do a VN_HOLD() so we can
2522 		 * continue to use coveredvp afterwards.
2523 		 */
2524 		VN_HOLD(coveredvp);
2525 		vfs_remove(vfsp);
2526 		vn_vfsunlock(coveredvp);
2527 		VN_RELE(coveredvp);
2528 	} else {
2529 		teardown_vopstats(vfsp);
2530 		/*
2531 		 * Release the reference to vfs that is not linked
2532 		 * into the name space.
2533 		 */
2534 		vfs_unlock(vfsp);
2535 		VFS_RELE(vfsp);
2536 	}
2537 	return (error);
2538 }
2539 
2540 
/*
 * Vfs_unmountall() is called by uadmin() to unmount all
 * mounted file systems (except the root file system) during shutdown.
 * It follows the existing locking protocol when traversing the vfs list
 * to sync and unmount vfses. Even though there should be no
 * other thread running while the system is shutting down, it is prudent
 * to still follow the locking protocol.
 *
 * The list is walked backwards from rootvfs->vfs_prev, i.e. starting
 * with the entry at the end of the list.  Errors from individual
 * unmounts are ignored.
 */
void
vfs_unmountall(void)
{
	struct vfs *vfsp;
	struct vfs *prev_vfsp = NULL;
	int error;

	/*
	 * Toss all dnlc entries now so that the per-vfs sync
	 * and unmount operations don't have to slog through
	 * a bunch of uninteresting vnodes over and over again.
	 */
	dnlc_purge();

	vfs_list_lock();
	for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
		/* Remember where to continue before vfsp can go away. */
		prev_vfsp = vfsp->vfs_prev;

		/*
		 * Skip this vfs if its lock, or the vfs write lock on the
		 * vnode it covers, cannot be obtained.  The vfs lock is
		 * only held long enough to take the covered vnode's lock;
		 * dounmount() expects the latter to be held on entry.
		 */
		if (vfs_lock(vfsp) != 0)
			continue;
		error = vn_vfswlock(vfsp->vfs_vnodecovered);
		vfs_unlock(vfsp);
		if (error)
			continue;

		/* Drop the list lock around the sync and the unmount. */
		vfs_list_unlock();

		(void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
		(void) dounmount(vfsp, 0, CRED());

		/*
		 * Since we dropped the vfslist lock above we must
		 * verify that prev_vfsp still exists, else start over
		 * from the end of the list.
		 */
		vfs_list_lock();
		for (vfsp = rootvfs->vfs_prev;
			vfsp != rootvfs; vfsp = vfsp->vfs_prev)
			if (vfsp == prev_vfsp)
				break;
		if (vfsp == rootvfs && prev_vfsp != rootvfs)
			prev_vfsp = rootvfs->vfs_prev;
	}
	vfs_list_unlock();
}
2593 
2594 /*
2595  * Called to add an entry to the end of the vfs mount in progress list
2596  */
2597 void
2598 vfs_addmip(dev_t dev, struct vfs *vfsp)
2599 {
2600 	struct ipmnt *mipp;
2601 
2602 	mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
2603 	mipp->mip_next = NULL;
2604 	mipp->mip_dev = dev;
2605 	mipp->mip_vfsp = vfsp;
2606 	mutex_enter(&vfs_miplist_mutex);
2607 	if (vfs_miplist_end != NULL)
2608 		vfs_miplist_end->mip_next = mipp;
2609 	else
2610 		vfs_miplist = mipp;
2611 	vfs_miplist_end = mipp;
2612 	mutex_exit(&vfs_miplist_mutex);
2613 }
2614 
2615 /*
2616  * Called to remove an entry from the mount in progress list
2617  * Either because the mount completed or it failed.
2618  */
2619 void
2620 vfs_delmip(struct vfs *vfsp)
2621 {
2622 	struct ipmnt *mipp, *mipprev;
2623 
2624 	mutex_enter(&vfs_miplist_mutex);
2625 	mipprev = NULL;
2626 	for (mipp = vfs_miplist;
2627 		mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
2628 		mipprev = mipp;
2629 	}
2630 	if (mipp == NULL)
2631 		return; /* shouldn't happen */
2632 	if (mipp == vfs_miplist_end)
2633 		vfs_miplist_end = mipprev;
2634 	if (mipprev == NULL)
2635 		vfs_miplist = mipp->mip_next;
2636 	else
2637 		mipprev->mip_next = mipp->mip_next;
2638 	mutex_exit(&vfs_miplist_mutex);
2639 	kmem_free(mipp, sizeof (struct ipmnt));
2640 }
2641 
2642 /*
2643  * vfs_add is called by a specific filesystem's mount routine to add
2644  * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
2645  * The vfs should already have been locked by the caller.
2646  *
2647  * coveredvp is NULL if this is the root.
2648  */
2649 void
2650 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
2651 {
2652 	int newflag;
2653 
2654 	ASSERT(vfs_lock_held(vfsp));
2655 	VFS_HOLD(vfsp);
2656 	newflag = vfsp->vfs_flag;
2657 	if (mflag & MS_RDONLY)
2658 		newflag |= VFS_RDONLY;
2659 	else
2660 		newflag &= ~VFS_RDONLY;
2661 	if (mflag & MS_NOSUID)
2662 		newflag |= (VFS_NOSETUID|VFS_NODEVICES);
2663 	else
2664 		newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
2665 	if (mflag & MS_NOMNTTAB)
2666 		newflag |= VFS_NOMNTTAB;
2667 	else
2668 		newflag &= ~VFS_NOMNTTAB;
2669 
2670 	if (coveredvp != NULL) {
2671 		ASSERT(vn_vfswlock_held(coveredvp));
2672 		coveredvp->v_vfsmountedhere = vfsp;
2673 		VN_HOLD(coveredvp);
2674 	}
2675 	vfsp->vfs_vnodecovered = coveredvp;
2676 	vfsp->vfs_flag = newflag;
2677 
2678 	vfs_list_add(vfsp);
2679 }
2680 
2681 /*
2682  * Remove a vfs from the vfs list, null out the pointer from the
2683  * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
2684  * from the vfs to the covered vnode (vfs_vnodecovered). Release the
2685  * reference to the vfs and to the covered vnode.
2686  *
2687  * Called from dounmount after it's confirmed with the file system
2688  * that the unmount is legal.
2689  */
2690 void
2691 vfs_remove(struct vfs *vfsp)
2692 {
2693 	vnode_t *vp;
2694 
2695 	ASSERT(vfs_lock_held(vfsp));
2696 
2697 	/*
2698 	 * Can't unmount root.  Should never happen because fs will
2699 	 * be busy.
2700 	 */
2701 	if (vfsp == rootvfs)
2702 		cmn_err(CE_PANIC, "vfs_remove: unmounting root");
2703 
2704 	vfs_list_remove(vfsp);
2705 
2706 	/*
2707 	 * Unhook from the file system name space.
2708 	 */
2709 	vp = vfsp->vfs_vnodecovered;
2710 	ASSERT(vn_vfswlock_held(vp));
2711 	vp->v_vfsmountedhere = NULL;
2712 	vfsp->vfs_vnodecovered = NULL;
2713 	VN_RELE(vp);
2714 
2715 	/*
2716 	 * Release lock and wakeup anybody waiting.
2717 	 */
2718 	vfs_unlock(vfsp);
2719 	VFS_RELE(vfsp);
2720 }
2721 
2722 /*
2723  * Lock a filesystem to prevent access to it while mounting,
2724  * unmounting and syncing.  Return EBUSY immediately if lock
2725  * can't be acquired.
2726  */
2727 int
2728 vfs_lock(vfs_t *vfsp)
2729 {
2730 	vn_vfslocks_entry_t *vpvfsentry;
2731 
2732 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2733 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2734 		return (0);
2735 
2736 	vn_vfslocks_rele(vpvfsentry);
2737 	return (EBUSY);
2738 }
2739 
2740 int
2741 vfs_rlock(vfs_t *vfsp)
2742 {
2743 	vn_vfslocks_entry_t *vpvfsentry;
2744 
2745 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2746 
2747 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2748 		return (0);
2749 
2750 	vn_vfslocks_rele(vpvfsentry);
2751 	return (EBUSY);
2752 }
2753 
2754 void
2755 vfs_lock_wait(vfs_t *vfsp)
2756 {
2757 	vn_vfslocks_entry_t *vpvfsentry;
2758 
2759 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2760 	rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
2761 }
2762 
2763 void
2764 vfs_rlock_wait(vfs_t *vfsp)
2765 {
2766 	vn_vfslocks_entry_t *vpvfsentry;
2767 
2768 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2769 	rwst_enter(&vpvfsentry->ve_lock, RW_READER);
2770 }
2771 
/*
 * Unlock a locked filesystem.  Does nothing once the system is panicking.
 */
void
vfs_unlock(vfs_t *vfsp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * vfs_unlock will mimic sema_v behaviour to fix 4748018.
	 * And these changes should remain for the patch changes as it is.
	 */
	if (panicstr)
		return;

	/*
	 * ve_refcount needs to be dropped twice here.
	 * 1. To release the reference taken by the call to
	 *    vn_vfslocks_getlock() just below.
	 * 2. To release the reference left behind by the locking routines
	 *    vfs_lock/vfs_rlock/vfs_lock_wait/vfs_rlock_wait.
	 */

	vpvfsentry = vn_vfslocks_getlock(vfsp);
	vn_vfslocks_rele(vpvfsentry);

	/* Drop the lock itself before giving up the last reference. */
	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
2800 
2801 /*
2802  * Utility routine that allows a filesystem to construct its
2803  * fsid in "the usual way" - by munging some underlying dev_t and
2804  * the filesystem type number into the 64-bit fsid.  Note that
2805  * this implicitly relies on dev_t persistence to make filesystem
2806  * id's persistent.
2807  *
2808  * There's nothing to prevent an individual fs from constructing its
2809  * fsid in a different way, and indeed they should.
2810  *
2811  * Since we want fsids to be 32-bit quantities (so that they can be
2812  * exported identically by either 32-bit or 64-bit APIs, as well as
2813  * the fact that fsid's are "known" to NFS), we compress the device
2814  * number given down to 32-bits, and panic if that isn't possible.
2815  */
2816 void
2817 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
2818 {
2819 	if (!cmpldev((dev32_t *)&fsi->val[0], dev))
2820 		panic("device number too big for fsid!");
2821 	fsi->val[1] = val;
2822 }
2823 
2824 int
2825 vfs_lock_held(vfs_t *vfsp)
2826 {
2827 	int held;
2828 	vn_vfslocks_entry_t *vpvfsentry;
2829 
2830 	/*
2831 	 * vfs_lock_held will mimic sema_held behaviour
2832 	 * if panicstr is set. And these changes should remain
2833 	 * for the patch changes as it is.
2834 	 */
2835 	if (panicstr)
2836 		return (1);
2837 
2838 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2839 	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2840 
2841 	vn_vfslocks_rele(vpvfsentry);
2842 	return (held);
2843 }
2844 
2845 struct _kthread *
2846 vfs_lock_owner(vfs_t *vfsp)
2847 {
2848 	struct _kthread *owner;
2849 	vn_vfslocks_entry_t *vpvfsentry;
2850 
2851 	/*
2852 	 * vfs_wlock_held will mimic sema_held behaviour
2853 	 * if panicstr is set. And these changes should remain
2854 	 * for the patch changes as it is.
2855 	 */
2856 	if (panicstr)
2857 		return (NULL);
2858 
2859 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2860 	owner = rwst_owner(&vpvfsentry->ve_lock);
2861 
2862 	vn_vfslocks_rele(vpvfsentry);
2863 	return (owner);
2864 }
2865 
2866 /*
2867  * vfs list locking.
2868  *
2869  * Rather than manipulate the vfslist lock directly, we abstract into lock
2870  * and unlock routines to allow the locking implementation to be changed for
2871  * clustering.
2872  *
2873  * Whenever the vfs list is modified through its hash links, the overall list
2874  * lock must be obtained before locking the relevant hash bucket.  But to see
2875  * whether a given vfs is on the list, it suffices to obtain the lock for the
2876  * hash bucket without getting the overall list lock.  (See getvfs() below.)
2877  */
2878 
/* Acquire the vfs list lock as writer. */
void
vfs_list_lock()
{
	rw_enter(&vfslist, RW_WRITER);
}
2884 
/* Acquire the vfs list lock as reader. */
void
vfs_list_read_lock()
{
	rw_enter(&vfslist, RW_READER);
}
2890 
/* Release the vfs list lock, whether it was taken as reader or writer. */
void
vfs_list_unlock()
{
	rw_exit(&vfslist);
}
2896 
2897 /*
2898  * Low level worker routines for adding entries to and removing entries from
2899  * the vfs list.
2900  */
2901 
2902 static void
2903 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
2904 {
2905 	int vhno;
2906 	struct vfs **hp;
2907 	dev_t dev;
2908 
2909 	ASSERT(RW_WRITE_HELD(&vfslist));
2910 
2911 	dev = expldev(vfsp->vfs_fsid.val[0]);
2912 	vhno = VFSHASH(getmajor(dev), getminor(dev));
2913 
2914 	mutex_enter(&rvfs_list[vhno].rvfs_lock);
2915 
2916 	/*
2917 	 * Link into the hash table, inserting it at the end, so that LOFS
2918 	 * with the same fsid as UFS (or other) file systems will not hide the
2919 	 * UFS.
2920 	 */
2921 	if (insert_at_head) {
2922 		vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
2923 		rvfs_list[vhno].rvfs_head = vfsp;
2924 	} else {
2925 		for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
2926 		    hp = &(*hp)->vfs_hash)
2927 			continue;
2928 		/*
2929 		 * hp now contains the address of the pointer to update
2930 		 * to effect the insertion.
2931 		 */
2932 		vfsp->vfs_hash = NULL;
2933 		*hp = vfsp;
2934 	}
2935 
2936 	rvfs_list[vhno].rvfs_len++;
2937 	mutex_exit(&rvfs_list[vhno].rvfs_lock);
2938 }
2939 
2940 
2941 static void
2942 vfs_hash_remove(struct vfs *vfsp)
2943 {
2944 	int vhno;
2945 	struct vfs *tvfsp;
2946 	dev_t dev;
2947 
2948 	ASSERT(RW_WRITE_HELD(&vfslist));
2949 
2950 	dev = expldev(vfsp->vfs_fsid.val[0]);
2951 	vhno = VFSHASH(getmajor(dev), getminor(dev));
2952 
2953 	mutex_enter(&rvfs_list[vhno].rvfs_lock);
2954 
2955 	/*
2956 	 * Remove from hash.
2957 	 */
2958 	if (rvfs_list[vhno].rvfs_head == vfsp) {
2959 		rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
2960 		rvfs_list[vhno].rvfs_len--;
2961 		goto foundit;
2962 	}
2963 	for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
2964 	    tvfsp = tvfsp->vfs_hash) {
2965 		if (tvfsp->vfs_hash == vfsp) {
2966 			tvfsp->vfs_hash = vfsp->vfs_hash;
2967 			rvfs_list[vhno].rvfs_len--;
2968 			goto foundit;
2969 		}
2970 	}
2971 	cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
2972 
2973 foundit:
2974 
2975 	mutex_exit(&rvfs_list[vhno].rvfs_lock);
2976 }
2977 
2978 
2979 void
2980 vfs_list_add(struct vfs *vfsp)
2981 {
2982 	zone_t *zone;
2983 
2984 	/*
2985 	 * The zone that owns the mount is the one that performed the mount.
2986 	 * Note that this isn't necessarily the same as the zone mounted into.
2987 	 * The corresponding zone_rele() will be done when the vfs_t is
2988 	 * being free'd.
2989 	 */
2990 	vfsp->vfs_zone = curproc->p_zone;
2991 	zone_hold(vfsp->vfs_zone);
2992 
2993 	/*
2994 	 * Find the zone mounted into, and put this mount on its vfs list.
2995 	 */
2996 	zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
2997 	ASSERT(zone != NULL);
2998 	/*
2999 	 * Special casing for the root vfs.  This structure is allocated
3000 	 * statically and hooked onto rootvfs at link time.  During the
3001 	 * vfs_mountroot call at system startup time, the root file system's
3002 	 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3003 	 * as argument.  The code below must detect and handle this special
3004 	 * case.  The only apparent justification for this special casing is
3005 	 * to ensure that the root file system appears at the head of the
3006 	 * list.
3007 	 *
3008 	 * XXX:	I'm assuming that it's ok to do normal list locking when
3009 	 *	adding the entry for the root file system (this used to be
3010 	 *	done with no locks held).
3011 	 */
3012 	vfs_list_lock();
3013 	/*
3014 	 * Link into the vfs list proper.
3015 	 */
3016 	if (vfsp == &root) {
3017 		/*
3018 		 * Assert: This vfs is already on the list as its first entry.
3019 		 * Thus, there's nothing to do.
3020 		 */
3021 		ASSERT(rootvfs == vfsp);
3022 		/*
3023 		 * Add it to the head of the global zone's vfslist.
3024 		 */
3025 		ASSERT(zone == global_zone);
3026 		ASSERT(zone->zone_vfslist == NULL);
3027 		zone->zone_vfslist = vfsp;
3028 	} else {
3029 		/*
3030 		 * Link to end of list using vfs_prev (as rootvfs is now a
3031 		 * doubly linked circular list) so list is in mount order for
3032 		 * mnttab use.
3033 		 */
3034 		rootvfs->vfs_prev->vfs_next = vfsp;
3035 		vfsp->vfs_prev = rootvfs->vfs_prev;
3036 		rootvfs->vfs_prev = vfsp;
3037 		vfsp->vfs_next = rootvfs;
3038 
3039 		/*
3040 		 * Do it again for the zone-private list (which may be NULL).
3041 		 */
3042 		if (zone->zone_vfslist == NULL) {
3043 			ASSERT(zone != global_zone);
3044 			zone->zone_vfslist = vfsp;
3045 		} else {
3046 			zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3047 			vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3048 			zone->zone_vfslist->vfs_zone_prev = vfsp;
3049 			vfsp->vfs_zone_next = zone->zone_vfslist;
3050 		}
3051 	}
3052 
3053 	/*
3054 	 * Link into the hash table, inserting it at the end, so that LOFS
3055 	 * with the same fsid as UFS (or other) file systems will not hide
3056 	 * the UFS.
3057 	 */
3058 	vfs_hash_add(vfsp, 0);
3059 
3060 	/*
3061 	 * update the mnttab modification time
3062 	 */
3063 	vfs_mnttab_modtimeupd();
3064 	vfs_list_unlock();
3065 	zone_rele(zone);
3066 }
3067 
/*
 * Take a vfs off the fsid hash, the global vfs list and the vfs list of
 * the zone it is mounted into, then update the mnttab modification time.
 * The vfs list lock is taken and released here.
 */
void
vfs_list_remove(struct vfs *vfsp)
{
	zone_t *zone;

	/* Zone mounted into; released again at the end of this function. */
	zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
	ASSERT(zone != NULL);
	/*
	 * Callers are responsible for preventing attempts to unmount the
	 * root.
	 */
	ASSERT(vfsp != rootvfs);

	vfs_list_lock();

	/*
	 * Remove from hash.
	 */
	vfs_hash_remove(vfsp);

	/*
	 * Remove from vfs list.
	 */
	vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
	vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
	vfsp->vfs_next = vfsp->vfs_prev = NULL;

	/*
	 * Remove from zone-specific vfs list.  If vfsp is the list head,
	 * advance the head to the next entry first.
	 */
	if (zone->zone_vfslist == vfsp)
		zone->zone_vfslist = vfsp->vfs_zone_next;

	/*
	 * An entry that is its own successor is the only one on the zone
	 * list, so the list becomes empty.
	 */
	if (vfsp->vfs_zone_next == vfsp) {
		ASSERT(vfsp->vfs_zone_prev == vfsp);
		ASSERT(zone->zone_vfslist == vfsp);
		zone->zone_vfslist = NULL;
	}

	vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
	vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
	vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;

	/*
	 * update the mnttab modification time
	 */
	vfs_mnttab_modtimeupd();
	vfs_list_unlock();
	zone_rele(zone);
}
3118 
3119 struct vfs *
3120 getvfs(fsid_t *fsid)
3121 {
3122 	struct vfs *vfsp;
3123 	int val0 = fsid->val[0];
3124 	int val1 = fsid->val[1];
3125 	dev_t dev = expldev(val0);
3126 	int vhno = VFSHASH(getmajor(dev), getminor(dev));
3127 	kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3128 
3129 	mutex_enter(hmp);
3130 	for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3131 		if (vfsp->vfs_fsid.val[0] == val0 &&
3132 		    vfsp->vfs_fsid.val[1] == val1) {
3133 			VFS_HOLD(vfsp);
3134 			mutex_exit(hmp);
3135 			return (vfsp);
3136 		}
3137 	}
3138 	mutex_exit(hmp);
3139 	return (NULL);
3140 }
3141 
3142 /*
3143  * Search the vfs mount in progress list for a specified device/vfs entry.
3144  * Returns 0 if the first entry in the list that the device matches has the
3145  * given vfs pointer as well.  If the device matches but a different vfs
3146  * pointer is encountered in the list before the given vfs pointer then
3147  * a 1 is returned.
3148  */
3149 
3150 int
3151 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3152 {
3153 	int retval = 0;
3154 	struct ipmnt *mipp;
3155 
3156 	mutex_enter(&vfs_miplist_mutex);
3157 	for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3158 		if (mipp->mip_dev == dev) {
3159 			if (mipp->mip_vfsp != vfsp)
3160 				retval = 1;
3161 			break;
3162 		}
3163 	}
3164 	mutex_exit(&vfs_miplist_mutex);
3165 	return (retval);
3166 }
3167 
3168 /*
3169  * Search the vfs list for a specified device.  Returns 1, if entry is found
3170  * or 0 if no suitable entry is found.
3171  */
3172 
3173 int
3174 vfs_devismounted(dev_t dev)
3175 {
3176 	struct vfs *vfsp;
3177 	int found;
3178 
3179 	vfs_list_read_lock();
3180 	vfsp = rootvfs;
3181 	found = 0;
3182 	do {
3183 		if (vfsp->vfs_dev == dev) {
3184 			found = 1;
3185 			break;
3186 		}
3187 		vfsp = vfsp->vfs_next;
3188 	} while (vfsp != rootvfs);
3189 
3190 	vfs_list_unlock();
3191 	return (found);
3192 }
3193 
3194 /*
3195  * Search the vfs list for a specified device.  Returns a pointer to it
3196  * or NULL if no suitable entry is found. The caller of this routine
3197  * is responsible for releasing the returned vfs pointer.
3198  */
3199 struct vfs *
3200 vfs_dev2vfsp(dev_t dev)
3201 {
3202 	struct vfs *vfsp;
3203 	int found;
3204 
3205 	vfs_list_read_lock();
3206 	vfsp = rootvfs;
3207 	found = 0;
3208 	do {
3209 		/*
3210 		 * The following could be made more efficient by making
3211 		 * the entire loop use vfs_zone_next if the call is from
3212 		 * a zone.  The only callers, however, ustat(2) and
3213 		 * umount2(2), don't seem to justify the added
3214 		 * complexity at present.
3215 		 */
3216 		if (vfsp->vfs_dev == dev &&
3217 		    ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3218 		    curproc->p_zone)) {
3219 			VFS_HOLD(vfsp);
3220 			found = 1;
3221 			break;
3222 		}
3223 		vfsp = vfsp->vfs_next;
3224 	} while (vfsp != rootvfs);
3225 	vfs_list_unlock();
3226 	return (found ? vfsp: NULL);
3227 }
3228 
3229 /*
3230  * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3231  * or NULL if no suitable entry is found. The caller of this routine
3232  * is responsible for releasing the returned vfs pointer.
3233  *
3234  * Note that if multiple mntpoints match, the last one matching is
3235  * returned in an attempt to return the "top" mount when overlay
3236  * mounts are covering the same mount point.  This is accomplished by starting
3237  * at the end of the list and working our way backwards, stopping at the first
3238  * matching mount.
3239  */
3240 struct vfs *
3241 vfs_mntpoint2vfsp(const char *mp)
3242 {
3243 	struct vfs *vfsp;
3244 	struct vfs *retvfsp = NULL;
3245 	zone_t *zone = curproc->p_zone;
3246 	struct vfs *list;
3247 
3248 	vfs_list_read_lock();
3249 	if (getzoneid() == GLOBAL_ZONEID) {
3250 		/*
3251 		 * The global zone may see filesystems in any zone.
3252 		 */
3253 		vfsp = rootvfs->vfs_prev;
3254 		do {
3255 			if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3256 				retvfsp = vfsp;
3257 				break;
3258 			}
3259 			vfsp = vfsp->vfs_prev;
3260 		} while (vfsp != rootvfs->vfs_prev);
3261 	} else if ((list = zone->zone_vfslist) != NULL) {
3262 		const char *mntpt;
3263 
3264 		vfsp = list->vfs_zone_prev;
3265 		do {
3266 			mntpt = refstr_value(vfsp->vfs_mntpt);
3267 			mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3268 			if (strcmp(mntpt, mp) == 0) {
3269 				retvfsp = vfsp;
3270 				break;
3271 			}
3272 			vfsp = vfsp->vfs_zone_prev;
3273 		} while (vfsp != list->vfs_zone_prev);
3274 	}
3275 	if (retvfsp)
3276 		VFS_HOLD(retvfsp);
3277 	vfs_list_unlock();
3278 	return (retvfsp);
3279 }
3280 
3281 /*
3282  * Search the vfs list for a specified vfsops.
3283  * if vfs entry is found then return 1, else 0.
3284  */
3285 int
3286 vfs_opsinuse(vfsops_t *ops)
3287 {
3288 	struct vfs *vfsp;
3289 	int found;
3290 
3291 	vfs_list_read_lock();
3292 	vfsp = rootvfs;
3293 	found = 0;
3294 	do {
3295 		if (vfs_getops(vfsp) == ops) {
3296 			found = 1;
3297 			break;
3298 		}
3299 		vfsp = vfsp->vfs_next;
3300 	} while (vfsp != rootvfs);
3301 	vfs_list_unlock();
3302 	return (found);
3303 }
3304 
3305 /*
3306  * Allocate an entry in vfssw for a file system type
3307  */
3308 struct vfssw *
3309 allocate_vfssw(char *type)
3310 {
3311 	struct vfssw *vswp;
3312 
3313 	if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3314 		/*
3315 		 * The vfssw table uses the empty string to identify an
3316 		 * available entry; we cannot add any type which has
3317 		 * a leading NUL. The string length is limited to
3318 		 * the size of the st_fstype array in struct stat.
3319 		 */
3320 		return (NULL);
3321 	}
3322 
3323 	ASSERT(VFSSW_WRITE_LOCKED());
3324 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3325 		if (!ALLOCATED_VFSSW(vswp)) {
3326 			vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3327 			(void) strcpy(vswp->vsw_name, type);
3328 			ASSERT(vswp->vsw_count == 0);
3329 			vswp->vsw_count = 1;
3330 			mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3331 			return (vswp);
3332 		}
3333 	return (NULL);
3334 }
3335 
/*
 * Translate a vfstype name into the name of the module that implements
 * it.  "proc" maps to "procfs", "fd" maps to "fdfs", and any name that
 * begins with "nfs" maps to "nfs".  Every other name is returned as is
 * (the same pointer that was passed in).
 */
static char *
vfs_to_modname(char *vfstype)
{
	if (strcmp(vfstype, "proc") == 0)
		return ("procfs");

	if (strcmp(vfstype, "fd") == 0)
		return ("fdfs");

	/* Prefix match: every "nfs*" type is served by one module. */
	if (strncmp(vfstype, "nfs", 3) == 0)
		return ("nfs");

	return (vfstype);
}
3353 
3354 /*
3355  * Find a vfssw entry given a file system type name.
3356  * Try to autoload the filesystem if it's not found.
3357  * If it's installed, return the vfssw locked to prevent unloading.
3358  */
3359 struct vfssw *
3360 vfs_getvfssw(char *type)
3361 {
3362 	struct vfssw *vswp;
3363 	char	*modname;
3364 
3365 	RLOCK_VFSSW();
3366 	vswp = vfs_getvfsswbyname(type);
3367 	modname = vfs_to_modname(type);
3368 
3369 	if (rootdir == NULL) {
3370 		/*
3371 		 * If we haven't yet loaded the root file system, then our
3372 		 * _init won't be called until later. Allocate vfssw entry,
3373 		 * because mod_installfs won't be called.
3374 		 */
3375 		if (vswp == NULL) {
3376 			RUNLOCK_VFSSW();
3377 			WLOCK_VFSSW();
3378 			if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3379 				if ((vswp = allocate_vfssw(type)) == NULL) {
3380 					WUNLOCK_VFSSW();
3381 					return (NULL);
3382 				}
3383 			}
3384 			WUNLOCK_VFSSW();
3385 			RLOCK_VFSSW();
3386 		}
3387 		if (!VFS_INSTALLED(vswp)) {
3388 			RUNLOCK_VFSSW();
3389 			(void) modloadonly("fs", modname);
3390 		} else
3391 			RUNLOCK_VFSSW();
3392 		return (vswp);
3393 	}
3394 
3395 	/*
3396 	 * Try to load the filesystem.  Before calling modload(), we drop
3397 	 * our lock on the VFS switch table, and pick it up after the
3398 	 * module is loaded.  However, there is a potential race:  the
3399 	 * module could be unloaded after the call to modload() completes
3400 	 * but before we pick up the lock and drive on.  Therefore,
3401 	 * we keep reloading the module until we've loaded the module
3402 	 * _and_ we have the lock on the VFS switch table.
3403 	 */
3404 	while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3405 		RUNLOCK_VFSSW();
3406 		if (modload("fs", modname) == -1)
3407 			return (NULL);
3408 		RLOCK_VFSSW();
3409 		if (vswp == NULL)
3410 			if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3411 				break;
3412 	}
3413 	RUNLOCK_VFSSW();
3414 
3415 	return (vswp);
3416 }
3417 
3418 /*
3419  * Find a vfssw entry given a file system type name.
3420  */
3421 struct vfssw *
3422 vfs_getvfsswbyname(char *type)
3423 {
3424 	struct vfssw *vswp;
3425 
3426 	ASSERT(VFSSW_LOCKED());
3427 	if (type == NULL || *type == '\0')
3428 		return (NULL);
3429 
3430 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3431 		if (strcmp(type, vswp->vsw_name) == 0) {
3432 			vfs_refvfssw(vswp);
3433 			return (vswp);
3434 		}
3435 	}
3436 
3437 	return (NULL);
3438 }
3439 
3440 /*
3441  * Find a vfssw entry given a set of vfsops.
3442  */
3443 struct vfssw *
3444 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3445 {
3446 	struct vfssw *vswp;
3447 
3448 	RLOCK_VFSSW();
3449 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3450 		if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3451 			vfs_refvfssw(vswp);
3452 			RUNLOCK_VFSSW();
3453 			return (vswp);
3454 		}
3455 	}
3456 	RUNLOCK_VFSSW();
3457 
3458 	return (NULL);
3459 }
3460 
3461 /*
3462  * Reference a vfssw entry.
3463  */
3464 void
3465 vfs_refvfssw(struct vfssw *vswp)
3466 {
3467 
3468 	mutex_enter(&vswp->vsw_lock);
3469 	vswp->vsw_count++;
3470 	mutex_exit(&vswp->vsw_lock);
3471 }
3472 
3473 /*
3474  * Unreference a vfssw entry.
3475  */
3476 void
3477 vfs_unrefvfssw(struct vfssw *vswp)
3478 {
3479 
3480 	mutex_enter(&vswp->vsw_lock);
3481 	vswp->vsw_count--;
3482 	mutex_exit(&vswp->vsw_lock);
3483 }
3484 
3485 int sync_timeout = 30;		/* timeout for syncing a page during panic */
3486 int sync_timeleft;		/* portion of sync_timeout remaining */
3487 
3488 static int sync_retries = 20;	/* number of retries when not making progress */
3489 static int sync_triesleft;	/* portion of sync_retries remaining */
3490 
3491 static pgcnt_t old_pgcnt, new_pgcnt;
3492 static int new_bufcnt, old_bufcnt;
3493 
3494 /*
3495  * Sync all of the mounted filesystems, and then wait for the actual i/o to
3496  * complete.  We wait by counting the number of dirty pages and buffers,
3497  * pushing them out using bio_busy() and page_busy(), and then counting again.
3498  * This routine is used during both the uadmin A_SHUTDOWN code as well as
3499  * the SYNC phase of the panic code (see comments in panic.c).  It should only
3500  * be used after some higher-level mechanism has quiesced the system so that
3501  * new writes are not being initiated while we are waiting for completion.
3502  *
3503  * To ensure finite running time, our algorithm uses two timeout mechanisms:
3504  * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
3505  * sync_triesleft (a progress counter used by the vfs_syncall() loop below).
3506  * Together these ensure that syncing completes if our i/o paths are stuck.
3507  * The counters are declared above so they can be found easily in the debugger.
3508  *
3509  * The sync_timeleft counter is reset by bio_busy() and page_busy() using the
3510  * vfs_syncprogress() subroutine whenever we make progress through the lists of
3511  * pages and buffers.  It is decremented and expired by the deadman() cyclic.
3512  * When vfs_syncall() decides it is done, we disable the deadman() counter by
3513  * setting sync_timeleft to zero.  This timer guards against vfs_syncall()
3514  * deadlocking or hanging inside of a broken filesystem or driver routine.
3515  *
3516  * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
3517  * sync_retries consecutive calls to bio_busy() and page_busy() without
3518  * decreasing either the number of dirty buffers or dirty pages below the
3519  * lowest count we have seen so far, we give up and return from vfs_syncall().
3520  *
3521  * Each loop iteration ends with a call to delay() one second to allow time for
3522  * i/o completion and to permit the user time to read our progress messages.
3523  */
3524 void
3525 vfs_syncall(void)
3526 {
3527 	if (rootdir == NULL && !modrootloaded)
3528 		return; /* panic during boot - no filesystems yet */
3529 
3530 	printf("syncing file systems...");
3531 	vfs_syncprogress();
3532 	sync();
3533 
3534 	vfs_syncprogress();
3535 	sync_triesleft = sync_retries;
3536 
3537 	old_bufcnt = new_bufcnt = INT_MAX;
3538 	old_pgcnt = new_pgcnt = ULONG_MAX;
3539 
3540 	while (sync_triesleft > 0) {
3541 		old_bufcnt = MIN(old_bufcnt, new_bufcnt);
3542 		old_pgcnt = MIN(old_pgcnt, new_pgcnt);
3543 
3544 		new_bufcnt = bio_busy(B_TRUE);
3545 		new_pgcnt = page_busy(B_TRUE);
3546 		vfs_syncprogress();
3547 
3548 		if (new_bufcnt == 0 && new_pgcnt == 0)
3549 			break;
3550 
3551 		if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
3552 			sync_triesleft = sync_retries;
3553 		else
3554 			sync_triesleft--;
3555 
3556 		if (new_bufcnt)
3557 			printf(" [%d]", new_bufcnt);
3558 		if (new_pgcnt)
3559 			printf(" %lu", new_pgcnt);
3560 
3561 		delay(hz);
3562 	}
3563 
3564 	if (new_bufcnt != 0 || new_pgcnt != 0)
3565 		printf(" done (not all i/o completed)\n");
3566 	else
3567 		printf(" done\n");
3568 
3569 	sync_timeleft = 0;
3570 	delay(hz);
3571 }
3572 
3573 /*
3574  * If we are in the middle of the sync phase of panic, reset sync_timeleft to
3575  * sync_timeout to indicate that we are making progress and the deadman()
3576  * omnipresent cyclic should not yet time us out.  Note that it is safe to
3577  * store to sync_timeleft here since the deadman() is firing at high-level
3578  * on top of us.  If we are racing with the deadman(), either the deadman()
3579  * will decrement the old value and then we will reset it, or we will
3580  * reset it and then the deadman() will immediately decrement it.  In either
3581  * case, correct behavior results.
3582  */
3583 void
3584 vfs_syncprogress(void)
3585 {
3586 	if (panicstr)
3587 		sync_timeleft = sync_timeout;
3588 }
3589 
3590 /*
3591  * Map VFS flags to statvfs flags.  These shouldn't really be separate
3592  * flags at all.
3593  */
3594 uint_t
3595 vf_to_stf(uint_t vf)
3596 {
3597 	uint_t stf = 0;
3598 
3599 	if (vf & VFS_RDONLY)
3600 		stf |= ST_RDONLY;
3601 	if (vf & VFS_NOSETUID)
3602 		stf |= ST_NOSUID;
3603 	if (vf & VFS_NOTRUNC)
3604 		stf |= ST_NOTRUNC;
3605 
3606 	return (stf);
3607 }
3608 
/*
 * vfsstray() is deliberately declared with an old-style (unprototyped)
 * declaration so that it can be used anywhere in the vfsops structure.
 */
extern int vfsstray();
3614 
3615 /*
3616  * Entries for (illegal) fstype 0.
3617  */
3618 /* ARGSUSED */
3619 int
3620 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
3621 {
3622 	cmn_err(CE_PANIC, "stray vfs operation");
3623 	return (0);
3624 }
3625 
3626 vfsops_t vfs_strayops = {
3627 	vfsstray,
3628 	vfsstray,
3629 	vfsstray,
3630 	vfsstray,
3631 	vfsstray_sync,
3632 	vfsstray,
3633 	vfsstray,
3634 	vfsstray
3635 };
3636 
3637 /*
3638  * Entries for (illegal) fstype 0.
3639  */
3640 int
3641 vfsstray(void)
3642 {
3643 	cmn_err(CE_PANIC, "stray vfs operation");
3644 	return (0);
3645 }
3646 
/*
 * Operation that unconditionally fails with EIO.  It exists to support
 * forced UFS unmount and its interaction with LOFS, but could be used by
 * any filesystem.  See bug 1203132.
 */
int
vfs_EIO(void)
{
	return (EIO);
}
3657 
/*
 * Sync flavour of the always-EIO operation.  It has to be defined
 * separately because the compiler gets confused, and spits out a warning,
 * if ANSI and normal style prototypes are mixed when a "short" argument
 * is present.  All arguments are ignored.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	return (EIO);
}
3669 
3670 vfs_t EIO_vfs;
3671 vfsops_t *EIO_vfsops;
3672 
3673 /*
3674  * Called from startup() to initialize all loaded vfs's
3675  */
3676 void
3677 vfsinit(void)
3678 {
3679 	struct vfssw *vswp;
3680 	int error;
3681 	extern void vopstats_startup();
3682 	extern void setup_vopstats(vfs_t *);
3683 
3684 	static const fs_operation_def_t EIO_vfsops_template[] = {
3685 		VFSNAME_MOUNT,		vfs_EIO,
3686 		VFSNAME_UNMOUNT,	vfs_EIO,
3687 		VFSNAME_ROOT,		vfs_EIO,
3688 		VFSNAME_STATVFS,	vfs_EIO,
3689 		VFSNAME_SYNC, (fs_generic_func_p) vfs_EIO_sync,
3690 		VFSNAME_VGET,		vfs_EIO,
3691 		VFSNAME_MOUNTROOT,	vfs_EIO,
3692 		VFSNAME_FREEVFS,	vfs_EIO,
3693 		VFSNAME_VNSTATE,	vfs_EIO,
3694 		NULL, NULL
3695 	};
3696 
3697 
3698 	/* Initialize the vnode cache (file systems may use it during init). */
3699 
3700 	vn_create_cache();
3701 
3702 	/* Setup event monitor framework */
3703 
3704 	fem_init();
3705 
3706 	/* Initialize the dummy stray file system type. */
3707 
3708 	vfssw[0].vsw_vfsops = vfs_strayops;
3709 
3710 	/* Initialize the dummy EIO file system. */
3711 	error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
3712 	if (error != 0) {
3713 		cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
3714 		/* Shouldn't happen, but not bad enough to panic */
3715 	}
3716 
3717 	VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
3718 
3719 	/*
3720 	 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
3721 	 * on this vfs can immediately notice it's invalid.
3722 	 */
3723 	EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
3724 
3725 	/*
3726 	 * Call the init routines of non-loadable filesystems only.
3727 	 * Filesystems which are loaded as separate modules will be
3728 	 * initialized by the module loading code instead.
3729 	 */
3730 
3731 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3732 		RLOCK_VFSSW();
3733 		if (vswp->vsw_init != NULL)
3734 			(*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
3735 		RUNLOCK_VFSSW();
3736 	}
3737 
3738 	vopstats_startup();
3739 	setup_vopstats(&EIO_vfs);
3740 }
3741 
3742 /*
3743  * Increments the vfs reference count by one atomically.
3744  */
3745 void
3746 vfs_hold(vfs_t *vfsp)
3747 {
3748 	atomic_add_32(&vfsp->vfs_count, 1);
3749 	ASSERT(vfsp->vfs_count != 0);
3750 }
3751 
3752 /*
3753  * Decrements the vfs reference count by one atomically. When
3754  * vfs reference count becomes zero, it calls the file system
3755  * specific vfs_freevfs() to free up the resources.
3756  */
3757 void
3758 vfs_rele(vfs_t *vfsp)
3759 {
3760 	ASSERT(vfsp->vfs_count != 0);
3761 	if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) {
3762 		VFS_FREEVFS(vfsp);
3763 		if (vfsp->vfs_zone)
3764 			zone_rele(vfsp->vfs_zone);
3765 		vfs_freemnttab(vfsp);
3766 		sema_destroy(&vfsp->vfs_reflock);
3767 		kmem_free(vfsp, sizeof (*vfsp));
3768 	}
3769 }
3770 
3771 /*
3772  * Generic operations vector support.
3773  *
3774  * This is used to build operations vectors for both the vfs and vnode.
3775  * It's normally called only when a file system is loaded.
3776  *
3777  * There are many possible algorithms for this, including the following:
3778  *
3779  *   (1) scan the list of known operations; for each, see if the file system
3780  *       includes an entry for it, and fill it in as appropriate.
3781  *
3782  *   (2) set up defaults for all known operations.  scan the list of ops
3783  *       supplied by the file system; for each which is both supplied and
3784  *       known, fill it in.
3785  *
3786  *   (3) sort the lists of known ops & supplied ops; scan the list, filling
3787  *       in entries as we go.
3788  *
3789  * we choose (1) for simplicity, and because performance isn't critical here.
3790  * note that (2) could be sped up using a precomputed hash table on known ops.
3791  * (3) could be faster than either, but only if the lists were very large or
3792  * supplied in sorted order.
3793  *
3794  */
3795 
3796 int
3797 fs_build_vector(void *vector, int *unused_ops,
3798     const fs_operation_trans_def_t *translation,
3799     const fs_operation_def_t *operations)
3800 {
3801 	int i, num_trans, num_ops, used;
3802 
3803 	/* Count the number of translations and the number of supplied */
3804 	/* operations. */
3805 
3806 	{
3807 		const fs_operation_trans_def_t *p;
3808 
3809 		for (num_trans = 0, p = translation;
3810 		    p->name != NULL;
3811 		    num_trans++, p++)
3812 			;
3813 	}
3814 
3815 	{
3816 		const fs_operation_def_t *p;
3817 
3818 		for (num_ops = 0, p = operations;
3819 		    p->name != NULL;
3820 		    num_ops++, p++)
3821 			;
3822 	}
3823 
3824 	/* Walk through each operation known to our caller.  There will be */
3825 	/* one entry in the supplied "translation table" for each. */
3826 
3827 	used = 0;
3828 
3829 	for (i = 0; i < num_trans; i++) {
3830 		int j, found;
3831 		char *curname;
3832 		fs_generic_func_p result;
3833 		fs_generic_func_p *location;
3834 
3835 		curname = translation[i].name;
3836 
3837 		/* Look for a matching operation in the list supplied by the */
3838 		/* file system. */
3839 
3840 		found = 0;
3841 
3842 		for (j = 0; j < num_ops; j++) {
3843 			if (strcmp(operations[j].name, curname) == 0) {
3844 				used++;
3845 				found = 1;
3846 				break;
3847 			}
3848 		}
3849 
3850 		/* If the file system is using a "placeholder" for default */
3851 		/* or error functions, grab the appropriate function out of */
3852 		/* the translation table.  If the file system didn't supply */
3853 		/* this operation at all, use the default function. */
3854 
3855 		if (found) {
3856 			result = operations[j].func;
3857 			if (result == fs_default) {
3858 				result = translation[i].defaultFunc;
3859 			} else if (result == fs_error) {
3860 				result = translation[i].errorFunc;
3861 			} else if (result == NULL) {
3862 				/* Null values are PROHIBITED */
3863 				return (EINVAL);
3864 			}
3865 		} else {
3866 			result = translation[i].defaultFunc;
3867 		}
3868 
3869 		/* Now store the function into the operations vector. */
3870 
3871 		location = (fs_generic_func_p *)
3872 		    (((char *)vector) + translation[i].offset);
3873 
3874 		*location = result;
3875 	}
3876 
3877 	*unused_ops = num_ops - used;
3878 
3879 	return (0);
3880 }
3881 
3882 /* Placeholder functions, should never be called. */
3883 
3884 int
3885 fs_error(void)
3886 {
3887 	cmn_err(CE_PANIC, "fs_error called");
3888 	return (0);
3889 }
3890 
3891 int
3892 fs_default(void)
3893 {
3894 	cmn_err(CE_PANIC, "fs_default called");
3895 	return (0);
3896 }
3897 
3898 #ifdef __sparc
3899 
3900 /*
3901  * Part of the implementation of booting off a mirrored root
3902  * involves a change of dev_t for the root device.  To
3903  * accomplish this, first remove the existing hash table
3904  * entry for the root device, convert to the new dev_t,
3905  * then re-insert in the hash table at the head of the list.
3906  */
3907 void
3908 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
3909 {
3910 	vfs_list_lock();
3911 
3912 	vfs_hash_remove(vfsp);
3913 
3914 	vfsp->vfs_dev = ndev;
3915 	vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
3916 
3917 	vfs_hash_add(vfsp, 1);
3918 
3919 	vfs_list_unlock();
3920 }
3921 
3922 #else /* x86 NEWBOOT */
3923 
3924 int
3925 rootconf()
3926 {
3927 	int error;
3928 	struct vfssw *vsw;
3929 	extern void pm_init();
3930 	char *fstyp;
3931 
3932 	fstyp = getrootfs();
3933 
3934 	if (error = clboot_rootconf())
3935 		return (error);
3936 
3937 	if (modload("fs", fstyp) == -1)
3938 		cmn_err(CE_PANIC, "Cannot _init %s module\n", fstyp);
3939 
3940 	RLOCK_VFSSW();
3941 	vsw = vfs_getvfsswbyname(fstyp);
3942 	RUNLOCK_VFSSW();
3943 	VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
3944 	VFS_HOLD(rootvfs);
3945 
3946 	/* always mount readonly first */
3947 	rootvfs->vfs_flag |= VFS_RDONLY;
3948 
3949 	pm_init();
3950 
3951 	if (netboot)
3952 		(void) strplumb();
3953 
3954 	error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
3955 	vfs_unrefvfssw(vsw);
3956 	rootdev = rootvfs->vfs_dev;
3957 
3958 	if (error)
3959 		cmn_err(CE_PANIC, "cannot mount root path %s", svm_bootpath);
3960 	return (error);
3961 }
3962 
3963 /*
3964  * XXX this is called by nfs only and should probably be removed
3965  * If booted with ASKNAME, prompt on the console for a filesystem
3966  * name and return it.
3967  */
3968 void
3969 getfsname(char *askfor, char *name, size_t namelen)
3970 {
3971 	if (boothowto & RB_ASKNAME) {
3972 		printf("%s name: ", askfor);
3973 		console_gets(name, namelen);
3974 	}
3975 }
3976 
3977 /*
3978  * If server_path exists, then we are booting a diskless
3979  * client. Otherwise, we default to ufs. Zfs should perhaps be
3980  * another property.
3981  */
3982 static char *
3983 getrootfs(void)
3984 {
3985 	extern char *strplumb_get_netdev_path(void);
3986 	char *propstr = NULL;
3987 
3988 	/* check fstype property; it should be nfsdyn for diskless */
3989 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
3990 	    DDI_PROP_DONTPASS, "fstype", &propstr)
3991 	    == DDI_SUCCESS) {
3992 		(void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
3993 		ddi_prop_free(propstr);
3994 	}
3995 
3996 	if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0)
3997 		return (rootfs.bo_fstype);
3998 
3999 	++netboot;
4000 	/* check if path to network interface is specified in bootpath */
4001 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4002 	    DDI_PROP_DONTPASS, "bootpath", &propstr)
4003 	    == DDI_SUCCESS) {
4004 		(void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4005 		ddi_prop_free(propstr);
4006 	} else {
4007 		/* attempt to determine netdev_path via boot_mac address */
4008 		netdev_path = strplumb_get_netdev_path();
4009 		if (netdev_path == NULL)
4010 			cmn_err(CE_PANIC,
4011 			    "Cannot find boot network interface\n");
4012 		(void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME);
4013 	}
4014 	return ("nfs");
4015 }
4016 #endif
4017