xref: /titanic_51/usr/src/uts/common/fs/vfs.c (revision c793af95640863cd29868fc7c419c5d2496b207b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 #include <sys/types.h>
43 #include <sys/t_lock.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/user.h>
47 #include <sys/fstyp.h>
48 #include <sys/kmem.h>
49 #include <sys/systm.h>
50 #include <sys/proc.h>
51 #include <sys/mount.h>
52 #include <sys/vfs.h>
53 #include <sys/fem.h>
54 #include <sys/mntent.h>
55 #include <sys/stat.h>
56 #include <sys/statvfs.h>
57 #include <sys/statfs.h>
58 #include <sys/cred.h>
59 #include <sys/vnode.h>
60 #include <sys/rwstlock.h>
61 #include <sys/dnlc.h>
62 #include <sys/file.h>
63 #include <sys/time.h>
64 #include <sys/atomic.h>
65 #include <sys/cmn_err.h>
66 #include <sys/buf.h>
67 #include <sys/swap.h>
68 #include <sys/debug.h>
69 #include <sys/vnode.h>
70 #include <sys/modctl.h>
71 #include <sys/ddi.h>
72 #include <sys/pathname.h>
73 #include <sys/bootconf.h>
74 #include <sys/dumphdr.h>
75 #include <sys/dc_ki.h>
76 #include <sys/poll.h>
77 #include <sys/sunddi.h>
78 #include <sys/sysmacros.h>
79 #include <sys/zone.h>
80 #include <sys/policy.h>
81 #include <sys/ctfs.h>
82 #include <sys/objfs.h>
83 #include <sys/console.h>
84 #include <sys/reboot.h>
85 
86 #include <vm/page.h>
87 
88 #include <fs/fs_subr.h>
89 
90 /* Private interfaces to create vopstats-related data structures */
91 extern void		initialize_vopstats(vopstats_t *);
92 extern vopstats_t	*get_fstype_vopstats(struct vfs *, struct vfssw *);
93 extern vsk_anchor_t	*get_vskstat_anchor(struct vfs *);
94 
95 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
96 static void vfs_setmntopt_nolock(mntopts_t *, const char *,
97     const char *, int, int);
98 static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
99 static void vfs_freemnttab(struct vfs *);
100 static void vfs_freeopt(mntopt_t *);
101 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
102 static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
103 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
104 static void vfs_createopttbl_extend(mntopts_t *, const char *,
105     const mntopts_t *);
106 static char **vfs_copycancelopt_extend(char **const, int);
107 static void vfs_freecancelopt(char **);
108 static char *getrootfs(void);
109 static int getmacpath(dev_info_t *, void *);
110 
/*
 * List node pairing a device number with a vfs.  Nodes are chained
 * through mip_next; the list head and tail are vfs_miplist and
 * vfs_miplist_end below.
 * NOTE(review): the name and the "mount in progress table" wording in
 * domount() suggest this tracks in-progress mounts -- confirm against
 * the list manipulation routines, which are not in this part of the file.
 */
struct ipmnt {
	struct ipmnt	*mip_next;	/* next node on the list */
	dev_t		mip_dev;	/* device number for this entry */
	struct vfs	*mip_vfsp;	/* vfs associated with mip_dev */
};

/* presumably protects vfs_miplist and vfs_miplist_end -- TODO confirm */
static kmutex_t		vfs_miplist_mutex;
static struct ipmnt	*vfs_miplist = NULL;		/* list head */
static struct ipmnt	*vfs_miplist_end = NULL;	/* list tail */
120 
/*
 * VFS global data.
 */
vnode_t *rootdir;		/* pointer to root inode vnode. */
vnode_t *devicesdir;		/* pointer to inode of devices root */

char *server_rootpath;		/* root path for diskless clients */
char *server_hostname;		/* hostname of diskless server */

static struct vfs root;		/* storage that rootvfs initially points at */
static struct vfs devices;	/* vfs used by vfs_mountdevices() */
struct vfs *rootvfs = &root;	/* pointer to root vfs; head of VFS list. */
rvfs_t *rvfs_list;		/* array of vfs ptrs for vfs hash list */
int vfshsz = 512;		/* # of heads/locks in vfs hash arrays */
				/* must be power of 2!	*/
timespec_t vfs_mnttab_ctime;	/* mnttab created time */
timespec_t vfs_mnttab_mtime;	/* mnttab last modified time */
/* An empty string (the literal holds only NUL bytes). */
char *vfs_dummyfstype = "\0";
struct pollhead vfs_pollhd;	/* for mnttab pollers */
140 
141 /*
142  * Table for generic options recognized in the VFS layer and acted
143  * on at this level before parsing file system specific options.
144  * The nosuid option is stronger than any of the devices and setuid
145  * options, so those are canceled when nosuid is seen.
146  *
147  * All options which are added here need to be added to the
148  * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
149  */
150 /*
151  * VFS Mount options table
152  */
/*
 * Cancel lists.  Each array is a NULL-terminated list of option names
 * and is referenced from the "cancel options" column of the matching
 * entry in mntopts[] below: <name>_cancel belongs to option <name>.
 */
static char *ro_cancel[] = { MNTOPT_RW, NULL };
static char *rw_cancel[] = { MNTOPT_RO, NULL };
static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
/* nosuid cancels the suid, devices and setuid families of options */
static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
    MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
166 
/*
 * Generic mount options.  Only "remount" carries a flag (MO_NODISPLAY);
 * every other entry has flags 0 and no default argument.
 * NOTE(review): each entry supplies a fifth initializer, (void *)0,
 * that the column header below does not name -- confirm its meaning
 * against the mntopt_t definition.
 */
static const mntopt_t mntopts[] = {
/*
 *	option name		cancel options		default arg	flags
 */
	{ MNTOPT_REMOUNT,	NULL,			NULL,
		MO_NODISPLAY, (void *)0 },
	{ MNTOPT_RO,		ro_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_RW,		rw_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_SUID,		suid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOSUID,	nosuid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_DEVICES,	devices_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NODEVICES,	nodevices_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_SETUID,	setuid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOSETUID,	nosetuid_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_NBMAND,	nbmand_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NONBMAND,	nonbmand_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_EXEC,		exec_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOEXEC,	noexec_cancel,		NULL,		0,
		(void *)0 },
};
198 
/*
 * Exported descriptor for the generic option table: the number of
 * entries in mntopts[] followed by a pointer to its first entry.
 */
const mntopts_t vfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	(mntopt_t *)&mntopts[0]		/* cast drops the table's const */
};
203 
204 /*
205  * File system operation dispatch functions.
206  */
207 
208 int
209 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
210 {
211 	return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
212 }
213 
214 int
215 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
216 {
217 	return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
218 }
219 
220 int
221 fsop_root(vfs_t *vfsp, vnode_t **vpp)
222 {
223 	refstr_t *mntpt;
224 	int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
225 	/*
226 	 * Make sure this root has a path.  With lofs, it is possible to have
227 	 * a NULL mountpoint.
228 	 */
229 	if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
230 		mntpt = vfs_getmntpoint(vfsp);
231 		vn_setpath_str(*vpp, refstr_value(mntpt),
232 		    strlen(refstr_value(mntpt)));
233 		refstr_rele(mntpt);
234 	}
235 
236 	return (ret);
237 }
238 
239 int
240 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
241 {
242 	return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
243 }
244 
245 int
246 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
247 {
248 	return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
249 }
250 
251 int
252 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
253 {
254 	return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
255 }
256 
257 int
258 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
259 {
260 	return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
261 }
262 
263 void
264 fsop_freefs(vfs_t *vfsp)
265 {
266 	(*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
267 }
268 
269 int
270 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
271 {
272 	return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
273 }
274 
275 int
276 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
277 {
278 	ASSERT((fstype >= 0) && (fstype < nfstype));
279 
280 	if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
281 		return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
282 	else
283 		return (ENOTSUP);
284 }
285 
286 /*
287  * File system initialization.  vfs_setfsops() must be called from a file
288  * system's init routine.
289  */
290 
291 static int
292 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
293     int *unused_ops)
294 {
295 	static const fs_operation_trans_def_t vfs_ops_table[] = {
296 		VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
297 			fs_nosys, fs_nosys,
298 
299 		VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
300 			fs_nosys, fs_nosys,
301 
302 		VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
303 			fs_nosys, fs_nosys,
304 
305 		VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
306 			fs_nosys, fs_nosys,
307 
308 		VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
309 			(fs_generic_func_p) fs_sync,
310 			(fs_generic_func_p) fs_sync,	/* No errors allowed */
311 
312 		VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
313 			fs_nosys, fs_nosys,
314 
315 		VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
316 			fs_nosys, fs_nosys,
317 
318 		VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
319 			(fs_generic_func_p)fs_freevfs,
320 			(fs_generic_func_p)fs_freevfs,	/* Shouldn't fail */
321 
322 		VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
323 			(fs_generic_func_p)fs_nosys,
324 			(fs_generic_func_p)fs_nosys,
325 
326 		NULL, 0, NULL, NULL
327 	};
328 
329 	return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
330 }
331 
332 int
333 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
334 {
335 	int error;
336 	int unused_ops;
337 
338 	/* Verify that fstype refers to a loaded fs (and not fsid 0). */
339 
340 	if ((fstype <= 0) || (fstype >= nfstype))
341 		return (EINVAL);
342 
343 	if (!ALLOCATED_VFSSW(&vfssw[fstype]))
344 		return (EINVAL);
345 
346 	/* Set up the operations vector. */
347 
348 	error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
349 
350 	if (error != 0)
351 		return (error);
352 
353 	vfssw[fstype].vsw_flag |= VSW_INSTALLED;
354 
355 	if (actual != NULL)
356 		*actual = &vfssw[fstype].vsw_vfsops;
357 
358 #if DEBUG
359 	if (unused_ops != 0)
360 		cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
361 		    "but not used", vfssw[fstype].vsw_name, unused_ops);
362 #endif
363 
364 	return (0);
365 }
366 
367 int
368 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
369 {
370 	int error;
371 	int unused_ops;
372 
373 	*actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
374 
375 	error = fs_copyfsops(template, *actual, &unused_ops);
376 	if (error != 0) {
377 		kmem_free(*actual, sizeof (vfsops_t));
378 		*actual = NULL;
379 		return (error);
380 	}
381 
382 	return (0);
383 }
384 
385 /*
386  * Free a vfsops structure created as a result of vfs_makefsops().
387  * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
388  * vfs_freevfsops_by_type().
389  */
390 void
391 vfs_freevfsops(vfsops_t *vfsops)
392 {
393 	kmem_free(vfsops, sizeof (vfsops_t));
394 }
395 
/*
 * Since the vfsops structure is part of the vfssw table and wasn't
 * really allocated, we're not really freeing anything.  We keep
 * the name for consistency with vfs_freevfsops().  We do, however,
 * need to take care of a little bookkeeping.
 * NOTE: For a vfsops structure created by vfs_makefsops(), use
 * vfs_freevfsops().
 */
404 int
405 vfs_freevfsops_by_type(int fstype)
406 {
407 
408 	/* Verify that fstype refers to a loaded fs (and not fsid 0). */
409 	if ((fstype <= 0) || (fstype >= nfstype))
410 		return (EINVAL);
411 
412 	WLOCK_VFSSW();
413 	if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
414 		WUNLOCK_VFSSW();
415 		return (EINVAL);
416 	}
417 
418 	vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
419 	WUNLOCK_VFSSW();
420 
421 	return (0);
422 }
423 
424 /* Support routines used to reference vfs_op */
425 
426 /* Set the operations vector for a vfs */
427 void
428 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
429 {
430 	vfsops_t	*op;
431 
432 	ASSERT(vfsp != NULL);
433 	ASSERT(vfsops != NULL);
434 
435 	op = vfsp->vfs_op;
436 	membar_consumer();
437 	if ((vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) &&
438 	    casptr(&vfsp->vfs_op, op, vfsops) == op) {
439 		return;
440 	}
441 	fsem_setvfsops(vfsp, vfsops);
442 }
443 
444 /* Retrieve the operations vector for a vfs */
445 vfsops_t *
446 vfs_getops(vfs_t *vfsp)
447 {
448 	vfsops_t	*op;
449 
450 	ASSERT(vfsp != NULL);
451 
452 	op = vfsp->vfs_op;
453 	membar_consumer();
454 	if ((vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) &&
455 	    op == vfsp->vfs_op) {
456 		return (op);
457 	} else {
458 		return (fsem_getvfsops(vfsp));
459 	}
460 }
461 
462 /*
463  * Returns non-zero (1) if the vfsops matches that of the vfs.
464  * Returns zero (0) if not.
465  */
466 int
467 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
468 {
469 	return (vfs_getops(vfsp) == vfsops);
470 }
471 
472 /*
473  * Returns non-zero (1) if the file system has installed a non-default,
474  * non-error vfs_sync routine.  Returns zero (0) otherwise.
475  */
476 int
477 vfs_can_sync(vfs_t *vfsp)
478 {
479 	/* vfs_sync() routine is not the default/error function */
480 	return (vfs_getops(vfsp)->vfs_sync != fs_sync);
481 }
482 
483 /*
484  * Initialize a vfs structure.
485  */
486 void
487 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
488 {
489 	vfsp->vfs_count = 0;
490 	vfsp->vfs_next = vfsp;
491 	vfsp->vfs_prev = vfsp;
492 	vfsp->vfs_zone_next = vfsp;
493 	vfsp->vfs_zone_prev = vfsp;
494 	vfsp->vfs_flag = 0;
495 	vfsp->vfs_data = (data);
496 	vfsp->vfs_resource = NULL;
497 	vfsp->vfs_mntpt = NULL;
498 	vfsp->vfs_mntopts.mo_count = 0;
499 	vfsp->vfs_mntopts.mo_list = NULL;
500 	vfsp->vfs_implp = NULL;
501 	vfsp->vfs_zone = NULL;
502 	/*
503 	 * Note: Don't initialize any member of the vfs_impl_t structure
504 	 * here as it could be a problem for unbundled file systems.
505 	 */
506 	vfs_setops((vfsp), (op));
507 	sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
508 }
509 
510 /*
511  * Allocate and initialize the vfs implementation private data
512  * structure, vfs_impl_t.
513  */
514 void
515 vfsimpl_setup(vfs_t *vfsp)
516 {
517 	vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
518 	/* Note that this are #define'd in vfs.h */
519 	vfsp->vfs_femhead = NULL;
520 	vfsp->vfs_vskap = NULL;
521 	vfsp->vfs_fstypevsp = NULL;
522 }
523 
524 /*
525  * Release the vfs_impl_t structure, if it exists. Some unbundled
526  * filesystems may not use the newer version of vfs and thus
527  * would not contain this implementation private data structure.
528  */
529 void
530 vfsimpl_teardown(vfs_t *vfsp)
531 {
532 	vfs_impl_t	*vip = vfsp->vfs_implp;
533 
534 	if (vip == NULL)
535 		return;
536 
537 	if (vip->vi_femhead) {
538 		ASSERT(vip->vi_femhead->femh_list == NULL);
539 		mutex_destroy(&vip->vi_femhead->femh_lock);
540 		kmem_free(vip->vi_femhead, sizeof (*(vip->vi_femhead)));
541 		vip->vi_femhead = NULL;
542 	}
543 
544 	kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
545 	vfsp->vfs_implp = NULL;
546 }
547 
548 /*
549  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
550  * fstatvfs, and sysfs moved to common/syscall.
551  */
552 
553 /*
554  * Update every mounted file system.  We call the vfs_sync operation of
555  * each file system type, passing it a NULL vfsp to indicate that all
556  * mounted file systems of that type should be updated.
557  */
558 void
559 vfs_sync(int flag)
560 {
561 	struct vfssw *vswp;
562 	RLOCK_VFSSW();
563 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
564 		if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
565 			vfs_refvfssw(vswp);
566 			RUNLOCK_VFSSW();
567 			(void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
568 			    CRED());
569 			vfs_unrefvfssw(vswp);
570 			RLOCK_VFSSW();
571 		}
572 	}
573 	RUNLOCK_VFSSW();
574 }
575 
/*
 * Sync all file systems: calls vfs_sync() with a flag value of 0.
 */
void
sync(void)
{
	vfs_sync(0);
}
581 
582 /*
583  * External routines.
584  */
585 
586 krwlock_t vfssw_lock;	/* lock accesses to vfssw */
587 
588 /*
589  * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
590  * but otherwise should be accessed only via vfs_list_lock() and
591  * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
592  */
593 static krwlock_t vfslist;
594 
595 /*
596  * Mount devfs on /devices. This is done right after root is mounted
597  * to provide device access support for the system
598  */
599 static void
600 vfs_mountdevices(void)
601 {
602 	struct vfssw *vsw;
603 	struct vnode *mvp;
604 	struct mounta mounta = {	/* fake mounta for devfs_mount() */
605 		NULL,
606 		NULL,
607 		MS_SYSSPACE,
608 		NULL,
609 		NULL,
610 		0,
611 		NULL,
612 		0
613 	};
614 
615 	/*
616 	 * _init devfs module to fill in the vfssw
617 	 */
618 	if (modload("fs", "devfs") == -1)
619 		cmn_err(CE_PANIC, "Cannot _init devfs module\n");
620 
621 	/*
622 	 * Hold vfs
623 	 */
624 	RLOCK_VFSSW();
625 	vsw = vfs_getvfsswbyname("devfs");
626 	VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
627 	VFS_HOLD(&devices);
628 
629 	/*
630 	 * Locate mount point
631 	 */
632 	if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
633 		cmn_err(CE_PANIC, "Cannot find /devices\n");
634 
635 	/*
636 	 * Perform the mount of /devices
637 	 */
638 	if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
639 		cmn_err(CE_PANIC, "Cannot mount /devices\n");
640 
641 	RUNLOCK_VFSSW();
642 
643 	/*
644 	 * Set appropriate members and add to vfs list for mnttab display
645 	 */
646 	vfs_setresource(&devices, "/devices");
647 	vfs_setmntpoint(&devices, "/devices");
648 
649 	/*
650 	 * Hold the root of /devices so it won't go away
651 	 */
652 	if (VFS_ROOT(&devices, &devicesdir))
653 		cmn_err(CE_PANIC, "vfs_mountdevices: not devices root");
654 	VN_HOLD(devicesdir);
655 
656 	if (vfs_lock(&devices) != 0) {
657 		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
658 		return;
659 	}
660 
661 	if (vn_vfswlock(mvp) != 0) {
662 		vfs_unlock(&devices);
663 		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
664 		return;
665 	}
666 
667 	vfs_add(mvp, &devices, 0);
668 	vn_vfsunlock(mvp);
669 	vfs_unlock(&devices);
670 }
671 
672 /*
673  * Mount required filesystem. This is done right after root is mounted.
674  */
675 static void
676 vfs_mountfs(char *module, char *spec, char *path)
677 {
678 	struct vnode *mvp;
679 	struct mounta mounta;
680 	vfs_t *vfsp;
681 
682 	mounta.flags = MS_SYSSPACE | MS_DATA;
683 	mounta.fstype = module;
684 	mounta.spec = spec;
685 	mounta.dir = path;
686 	if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
687 		cmn_err(CE_WARN, "Cannot find %s\n", path);
688 		return;
689 	}
690 	if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
691 		cmn_err(CE_WARN, "Cannot mount %s\n", path);
692 	else
693 		VFS_RELE(vfsp);
694 	VN_RELE(mvp);
695 }
696 
697 /*
698  * vfs_mountroot is called by main() to mount the root filesystem.
699  */
700 void
701 vfs_mountroot(void)
702 {
703 	struct vnode	*rvp = NULL;
704 	char		*path;
705 	size_t		plen;
706 	struct vfssw	*vswp;
707 
708 	rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
709 	rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
710 
711 	/*
712 	 * Alloc the vfs hash bucket array and locks
713 	 */
714 	rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
715 
716 	/*
717 	 * Call machine-dependent routine "rootconf" to choose a root
718 	 * file system type.
719 	 */
720 	if (rootconf())
721 		cmn_err(CE_PANIC, "vfs_mountroot: cannot mount root");
722 	/*
723 	 * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
724 	 * to point to it.  These are used by lookuppn() so that it
725 	 * knows where to start from ('/' or '.').
726 	 */
727 	vfs_setmntpoint(rootvfs, "/");
728 	if (VFS_ROOT(rootvfs, &rootdir))
729 		cmn_err(CE_PANIC, "vfs_mountroot: no root vnode");
730 	u.u_cdir = rootdir;
731 	VN_HOLD(u.u_cdir);
732 	u.u_rdir = NULL;
733 
734 	/*
735 	 * Setup the global zone's rootvp, now that it exists.
736 	 */
737 	global_zone->zone_rootvp = rootdir;
738 	VN_HOLD(global_zone->zone_rootvp);
739 
740 	/*
741 	 * Notify the module code that it can begin using the
742 	 * root filesystem instead of the boot program's services.
743 	 */
744 	modrootloaded = 1;
745 	/*
746 	 * Set up mnttab information for root
747 	 */
748 	vfs_setresource(rootvfs, rootfs.bo_name);
749 
750 	/*
751 	 * Notify cluster software that the root filesystem is available.
752 	 */
753 	clboot_mountroot();
754 
755 	/* Now that we're all done with the root FS, set up its vopstats */
756 	if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
757 		/* Set flag for statistics collection */
758 		if (vswp->vsw_flag & VSW_STATS) {
759 			initialize_vopstats(&rootvfs->vfs_vopstats);
760 			rootvfs->vfs_flag |= VFS_STATS;
761 			rootvfs->vfs_fstypevsp =
762 			    get_fstype_vopstats(rootvfs, vswp);
763 			rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
764 		}
765 		vfs_unrefvfssw(vswp);
766 	}
767 
768 	/*
769 	 * Mount /devices, /system/contract, /etc/mnttab, /etc/svc/volatile,
770 	 * /system/object, and /proc.
771 	 */
772 	vfs_mountdevices();
773 
774 	vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
775 	vfs_mountfs("proc", "/proc", "/proc");
776 	vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
777 	vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
778 	vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
779 
780 #ifdef __sparc
781 	/*
782 	 * This bit of magic can go away when we convert sparc to
783 	 * the new boot architecture based on ramdisk.
784 	 *
785 	 * Booting off a mirrored root volume:
786 	 * At this point, we have booted and mounted root on a
787 	 * single component of the mirror.  Complete the boot
788 	 * by configuring SVM and converting the root to the
789 	 * dev_t of the mirrored root device.  This dev_t conversion
790 	 * only works because the underlying device doesn't change.
791 	 */
792 	if (root_is_svm) {
793 		if (svm_rootconf()) {
794 			cmn_err(CE_PANIC, "vfs_mountroot: cannot remount root");
795 		}
796 
797 		/*
798 		 * mnttab should reflect the new root device
799 		 */
800 		vfs_lock_wait(rootvfs);
801 		vfs_setresource(rootvfs, rootfs.bo_name);
802 		vfs_unlock(rootvfs);
803 	}
804 #endif /* __sparc */
805 
806 	/*
807 	 * Look up the root device via devfs so that a dv_node is
808 	 * created for it. The vnode is never VN_RELE()ed.
809 	 * We allocate more than MAXPATHLEN so that the
810 	 * buffer passed to i_ddi_prompath_to_devfspath() is
811 	 * exactly MAXPATHLEN (the function expects a buffer
812 	 * of that length).
813 	 */
814 	plen = strlen("/devices");
815 	path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
816 	(void) strcpy(path, "/devices");
817 
818 	if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
819 	    != DDI_SUCCESS ||
820 	    lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
821 
822 		/* NUL terminate in case "path" has garbage */
823 		path[plen + MAXPATHLEN - 1] = '\0';
824 #ifdef	DEBUG
825 		cmn_err(CE_WARN, "!Cannot lookup root device: %s", path);
826 #endif
827 	}
828 	kmem_free(path, plen + MAXPATHLEN);
829 }
830 
831 /*
832  * If remount failed and we're in a zone we need to check for the zone
833  * root path and strip it before the call to vfs_setpath().
834  *
835  * If strpath doesn't begin with the zone_rootpath the original
836  * strpath is returned unchanged.
837  */
838 static const char *
839 stripzonepath(const char *strpath)
840 {
841 	char *str1, *str2;
842 	int i;
843 	zone_t *zonep = curproc->p_zone;
844 
845 	if (zonep->zone_rootpath == NULL || strpath == NULL) {
846 		return (NULL);
847 	}
848 
849 	/*
850 	 * we check for the end of the string at one past the
851 	 * current position because the zone_rootpath always
852 	 * ends with "/" but we don't want to strip that off.
853 	 */
854 	str1 = zonep->zone_rootpath;
855 	str2 = (char *)strpath;
856 	ASSERT(str1[0] != '\0');
857 	for (i = 0; str1[i + 1] != '\0'; i++) {
858 		if (str1[i] != str2[i])
859 			return ((char *)strpath);
860 	}
861 	return (&str2[i]);
862 }
863 
864 /*
865  * Common mount code.  Called from the system call entry point, from autofs,
866  * and from pxfs.
867  *
868  * Takes the effective file system type, mount arguments, the mount point
869  * vnode, flags specifying whether the mount is a remount and whether it
870  * should be entered into the vfs list, and credentials.  Fills in its vfspp
871  * parameter with the mounted file system instance's vfs.
872  *
873  * Note that the effective file system type is specified as a string.  It may
874  * be null, in which case it's determined from the mount arguments, and may
875  * differ from the type specified in the mount arguments; this is a hook to
876  * allow interposition when instantiating file system instances.
877  *
878  * The caller is responsible for releasing its own hold on the mount point
879  * vp (this routine does its own hold when necessary).
880  * Also note that for remounts, the mount point vp should be the vnode for
881  * the root of the file system rather than the vnode that the file system
882  * is mounted on top of.
883  */
884 int
885 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
886 	struct vfs **vfspp)
887 {
888 	struct vfssw	*vswp;
889 	vfsops_t	*vfsops;
890 	struct vfs	*vfsp;
891 	struct vnode	*bvp;
892 	dev_t		bdev = 0;
893 	mntopts_t	mnt_mntopts;
894 	int		error = 0;
895 	int		copyout_error = 0;
896 	int		ovflags;
897 	char		*opts = uap->optptr;
898 	char		*inargs = opts;
899 	int		optlen = uap->optlen;
900 	int		remount;
901 	int		rdonly;
902 	int		nbmand = 0;
903 	int		delmip = 0;
904 	int		addmip = 0;
905 	int		splice = ((uap->flags & MS_NOSPLICE) == 0);
906 	int		fromspace = (uap->flags & MS_SYSSPACE) ?
907 				UIO_SYSSPACE : UIO_USERSPACE;
908 	char		*resource = NULL, *mountpt = NULL;
909 	refstr_t	*oldresource, *oldmntpt;
910 	struct pathname	pn, rpn;
911 	vsk_anchor_t	*vskap;
912 
913 	/*
914 	 * The v_flag value for the mount point vp is permanently set
915 	 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
916 	 * for mount point locking.
917 	 */
918 	mutex_enter(&vp->v_lock);
919 	vp->v_flag |= VVFSLOCK;
920 	mutex_exit(&vp->v_lock);
921 
922 	mnt_mntopts.mo_count = 0;
923 	/*
924 	 * Find the ops vector to use to invoke the file system-specific mount
925 	 * method.  If the fsname argument is non-NULL, use it directly.
926 	 * Otherwise, dig the file system type information out of the mount
927 	 * arguments.
928 	 *
929 	 * A side effect is to hold the vfssw entry.
930 	 *
931 	 * Mount arguments can be specified in several ways, which are
932 	 * distinguished by flag bit settings.  The preferred way is to set
933 	 * MS_OPTIONSTR, indicating an 8 argument mount with the file system
934 	 * type supplied as a character string and the last two arguments
935 	 * being a pointer to a character buffer and the size of the buffer.
936 	 * On entry, the buffer holds a null terminated list of options; on
937 	 * return, the string is the list of options the file system
938 	 * recognized. If MS_DATA is set arguments five and six point to a
939 	 * block of binary data which the file system interprets.
940 	 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
941 	 * consistently with these conventions.  To handle them, we check to
942 	 * see whether the pointer to the file system name has a numeric value
943 	 * less than 256.  If so, we treat it as an index.
944 	 */
945 	if (fsname != NULL) {
946 		if ((vswp = vfs_getvfssw(fsname)) == NULL) {
947 			return (EINVAL);
948 		}
949 	} else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
950 		size_t n;
951 		uint_t fstype;
952 		char name[FSTYPSZ];
953 
954 		if ((fstype = (uintptr_t)uap->fstype) < 256) {
955 			RLOCK_VFSSW();
956 			if (fstype == 0 || fstype >= nfstype ||
957 			    !ALLOCATED_VFSSW(&vfssw[fstype])) {
958 				RUNLOCK_VFSSW();
959 				return (EINVAL);
960 			}
961 			(void) strcpy(name, vfssw[fstype].vsw_name);
962 			RUNLOCK_VFSSW();
963 			if ((vswp = vfs_getvfssw(name)) == NULL)
964 				return (EINVAL);
965 		} else {
966 			/*
967 			 * Handle either kernel or user address space.
968 			 */
969 			if (uap->flags & MS_SYSSPACE) {
970 				error = copystr(uap->fstype, name,
971 				    FSTYPSZ, &n);
972 			} else {
973 				error = copyinstr(uap->fstype, name,
974 				    FSTYPSZ, &n);
975 			}
976 			if (error) {
977 				if (error == ENAMETOOLONG)
978 					return (EINVAL);
979 				return (error);
980 			}
981 			if ((vswp = vfs_getvfssw(name)) == NULL)
982 				return (EINVAL);
983 		}
984 	} else {
985 		if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
986 			return (EINVAL);
987 	}
988 	if (!VFS_INSTALLED(vswp))
989 		return (EINVAL);
990 	vfsops = &vswp->vsw_vfsops;
991 
992 	vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
993 	/*
994 	 * Fetch mount options and parse them for generic vfs options
995 	 */
996 	if (uap->flags & MS_OPTIONSTR) {
997 		/*
998 		 * Limit the buffer size
999 		 */
1000 		if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1001 			error = EINVAL;
1002 			goto errout;
1003 		}
1004 		if ((uap->flags & MS_SYSSPACE) == 0) {
1005 			inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1006 			inargs[0] = '\0';
1007 			if (optlen) {
1008 				error = copyinstr(opts, inargs, (size_t)optlen,
1009 					NULL);
1010 				if (error) {
1011 					goto errout;
1012 				}
1013 			}
1014 		}
1015 		vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1016 	}
1017 	/*
1018 	 * Flag bits override the options string.
1019 	 */
1020 	if (uap->flags & MS_REMOUNT)
1021 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1022 	if (uap->flags & MS_RDONLY)
1023 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1024 	if (uap->flags & MS_NOSUID)
1025 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1026 
1027 	/*
1028 	 * Check if this is a remount; must be set in the option string and
1029 	 * the file system must support a remount option.
1030 	 */
1031 	if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1032 	    MNTOPT_REMOUNT, NULL)) {
1033 		if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1034 			error = ENOTSUP;
1035 			goto errout;
1036 		}
1037 		uap->flags |= MS_REMOUNT;
1038 	}
1039 
1040 	/*
1041 	 * uap->flags and vfs_optionisset() should agree.
1042 	 */
1043 	if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1044 		uap->flags |= MS_RDONLY;
1045 	}
1046 	if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1047 		uap->flags |= MS_NOSUID;
1048 	}
1049 	nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1050 	ASSERT(splice || !remount);
1051 	/*
1052 	 * If we are splicing the fs into the namespace,
1053 	 * perform mount point checks.
1054 	 *
1055 	 * We want to resolve the path for the mount point to eliminate
1056 	 * '.' and ".." and symlinks in mount points; we can't do the
1057 	 * same for the resource string, since it would turn
1058 	 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1059 	 * this before grabbing vn_vfswlock(), because otherwise we
1060 	 * would deadlock with lookuppn().
1061 	 */
1062 	if (splice) {
1063 		ASSERT(vp->v_count > 0);
1064 
1065 		/*
1066 		 * Pick up mount point and device from appropriate space.
1067 		 */
1068 		if (pn_get(uap->spec, fromspace, &pn) == 0) {
1069 			resource = kmem_alloc(pn.pn_pathlen + 1,
1070 			    KM_SLEEP);
1071 			(void) strcpy(resource, pn.pn_path);
1072 			pn_free(&pn);
1073 		}
1074 		/*
1075 		 * Do a lookupname prior to taking the
1076 		 * writelock. Mark this as completed if
1077 		 * successful for later cleanup and addition to
1078 		 * the mount in progress table.
1079 		 */
1080 		if ((uap->flags & MS_GLOBAL) == 0 &&
1081 		    lookupname(uap->spec, fromspace,
1082 			    FOLLOW, NULL, &bvp) == 0) {
1083 			addmip = 1;
1084 		}
1085 
1086 		if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1087 			pathname_t *pnp;
1088 
1089 			if (*pn.pn_path != '/') {
1090 				error = EINVAL;
1091 				pn_free(&pn);
1092 				goto errout;
1093 			}
1094 			pn_alloc(&rpn);
1095 			/*
1096 			 * Kludge to prevent autofs from deadlocking with
1097 			 * itself when it calls domount().
1098 			 *
1099 			 * If autofs is calling, it is because it is doing
1100 			 * (autofs) mounts in the process of an NFS mount.  A
1101 			 * lookuppn() here would cause us to block waiting for
1102 			 * said NFS mount to complete, which can't since this
1103 			 * is the thread that was supposed to doing it.
1104 			 */
1105 			if (fromspace == UIO_USERSPACE) {
1106 				if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1107 				    NULL)) == 0) {
1108 					pnp = &rpn;
1109 				} else {
1110 					/*
1111 					 * The file disappeared or otherwise
1112 					 * became inaccessible since we opened
1113 					 * it; might as well fail the mount
1114 					 * since the mount point is no longer
1115 					 * accessible.
1116 					 */
1117 					pn_free(&rpn);
1118 					pn_free(&pn);
1119 					goto errout;
1120 				}
1121 			} else {
1122 				pnp = &pn;
1123 			}
1124 			mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1125 			(void) strcpy(mountpt, pnp->pn_path);
1126 
1127 			/*
1128 			 * If the addition of the zone's rootpath
1129 			 * would push us over a total path length
1130 			 * of MAXPATHLEN, we fail the mount with
1131 			 * ENAMETOOLONG, which is what we would have
1132 			 * gotten if we were trying to perform the same
1133 			 * mount in the global zone.
1134 			 *
1135 			 * strlen() doesn't count the trailing
1136 			 * '\0', but zone_rootpathlen counts both a
1137 			 * trailing '/' and the terminating '\0'.
1138 			 */
1139 			if ((curproc->p_zone->zone_rootpathlen - 1 +
1140 			    strlen(mountpt)) > MAXPATHLEN ||
1141 			    (resource != NULL &&
1142 			    (curproc->p_zone->zone_rootpathlen - 1 +
1143 			    strlen(resource)) > MAXPATHLEN)) {
1144 				error = ENAMETOOLONG;
1145 			}
1146 
1147 			pn_free(&rpn);
1148 			pn_free(&pn);
1149 		}
1150 
1151 		if (error)
1152 			goto errout;
1153 
1154 		/*
1155 		 * Prevent path name resolution from proceeding past
1156 		 * the mount point.
1157 		 */
1158 		if (vn_vfswlock(vp) != 0) {
1159 			error = EBUSY;
1160 			goto errout;
1161 		}
1162 
1163 		/*
1164 		 * Verify that it's legitimate to establish a mount on
1165 		 * the prospective mount point.
1166 		 */
1167 		if (vn_mountedvfs(vp) != NULL) {
1168 			/*
1169 			 * The mount point lock was obtained after some
1170 			 * other thread raced through and established a mount.
1171 			 */
1172 			vn_vfsunlock(vp);
1173 			error = EBUSY;
1174 			goto errout;
1175 		}
1176 		if (vp->v_flag & VNOMOUNT) {
1177 			vn_vfsunlock(vp);
1178 			error = EINVAL;
1179 			goto errout;
1180 		}
1181 	}
1182 	if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1183 		uap->dataptr = NULL;
1184 		uap->datalen = 0;
1185 	}
1186 
1187 	/*
1188 	 * If this is a remount, we don't want to create a new VFS.
1189 	 * Instead, we pass the existing one with a remount flag.
1190 	 */
1191 	if (remount) {
1192 		/*
1193 		 * Confirm that the mount point is the root vnode of the
1194 		 * file system that is being remounted.
1195 		 * This can happen if the user specifies a different
1196 		 * mount point directory pathname in the (re)mount command.
1197 		 *
1198 		 * Code below can only be reached if splice is true, so it's
1199 		 * safe to do vn_vfsunlock() here.
1200 		 */
1201 		if ((vp->v_flag & VROOT) == 0) {
1202 			vn_vfsunlock(vp);
1203 			error = ENOENT;
1204 			goto errout;
1205 		}
1206 		/*
1207 		 * Disallow making file systems read-only unless file system
1208 		 * explicitly allows it in its vfssw.  Ignore other flags.
1209 		 */
1210 		if (rdonly && vn_is_readonly(vp) == 0 &&
1211 		    (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1212 			vn_vfsunlock(vp);
1213 			error = EINVAL;
1214 			goto errout;
1215 		}
1216 		/*
1217 		 * Changing the NBMAND setting on remounts is permitted
1218 		 * but logged since it can lead to unexpected behavior.
1219 		 * We also counsel against using it for / and /usr.
1220 		 */
1221 		if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1222 		    (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1223 			cmn_err(CE_WARN, "domount: nbmand turned %s via "
1224 			    "remounting %s", nbmand ? "on" : "off",
1225 			    refstr_value(vp->v_vfsp->vfs_mntpt));
1226 		}
1227 		vfsp = vp->v_vfsp;
1228 		ovflags = vfsp->vfs_flag;
1229 		vfsp->vfs_flag |= VFS_REMOUNT;
1230 		vfsp->vfs_flag &= ~VFS_RDONLY;
1231 	} else {
1232 		vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP);
1233 		VFS_INIT(vfsp, vfsops, NULL);
1234 	}
1235 
1236 	VFS_HOLD(vfsp);
1237 
1238 	/*
1239 	 * The vfs_reflock is not used anymore the code below explicitly
1240 	 * holds it preventing others accesing it directly.
1241 	 */
1242 	if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1243 	    !(vfsp->vfs_flag & VFS_REMOUNT))
1244 		cmn_err(CE_WARN,
1245 		    "mount type %s couldn't get vfs_reflock\n", vswp->vsw_name);
1246 
1247 	/*
1248 	 * Lock the vfs. If this is a remount we want to avoid spurious umount
1249 	 * failures that happen as a side-effect of fsflush() and other mount
1250 	 * and unmount operations that might be going on simultaneously and
1251 	 * may have locked the vfs currently. To not return EBUSY immediately
1252 	 * here we use vfs_lock_wait() instead vfs_lock() for the remount case.
1253 	 */
1254 	if (!remount) {
1255 		if (error = vfs_lock(vfsp)) {
1256 			vfsp->vfs_flag = ovflags;
1257 			if (splice)
1258 				vn_vfsunlock(vp);
1259 			if (vfsp->vfs_implp)
1260 				vfsimpl_teardown(vfsp);
1261 			kmem_free(vfsp, sizeof (struct vfs));
1262 			goto errout;
1263 		}
1264 	} else {
1265 		vfs_lock_wait(vfsp);
1266 	}
1267 
1268 	/*
1269 	 * Add device to mount in progress table, global mounts require special
1270 	 * handling. It is possible that we have already done the lookupname
1271 	 * on a spliced, non-global fs. If so, we don't want to do it again
1272 	 * since we cannot do a lookupname after taking the
1273 	 * wlock above. This case is for a non-spliced, non-global filesystem.
1274 	 */
1275 	if (!addmip) {
1276 	    if ((uap->flags & MS_GLOBAL) == 0 &&
1277 		lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1278 			addmip = 1;
1279 		}
1280 	}
1281 
1282 	if (addmip) {
1283 		bdev = bvp->v_rdev;
1284 		VN_RELE(bvp);
1285 		vfs_addmip(bdev, vfsp);
1286 		addmip = 0;
1287 		delmip = 1;
1288 	}
1289 	/*
1290 	 * Invalidate cached entry for the mount point.
1291 	 */
1292 	if (splice)
1293 		dnlc_purge_vp(vp);
1294 
1295 	/*
1296 	 * If have an option string but the filesystem doesn't supply a
1297 	 * prototype options table, create a table with the global
1298 	 * options and sufficient room to accept all the options in the
1299 	 * string.  Then parse the passed in option string
1300 	 * accepting all the options in the string.  This gives us an
1301 	 * option table with all the proper cancel properties for the
1302 	 * global options.
1303 	 *
1304 	 * Filesystems that supply a prototype options table are handled
1305 	 * earlier in this function.
1306 	 */
1307 	if (uap->flags & MS_OPTIONSTR) {
1308 		if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1309 			mntopts_t tmp_mntopts;
1310 
1311 			tmp_mntopts.mo_count = 0;
1312 			vfs_createopttbl_extend(&tmp_mntopts, inargs,
1313 			    &mnt_mntopts);
1314 			vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1315 			vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1316 			vfs_freeopttbl(&tmp_mntopts);
1317 		}
1318 	}
1319 
1320 	/*
1321 	 * Serialize with zone creations.
1322 	 */
1323 	mount_in_progress();
1324 	/*
1325 	 * Instantiate (or reinstantiate) the file system.  If appropriate,
1326 	 * splice it into the file system name space.
1327 	 *
1328 	 * We want VFS_MOUNT() to be able to override the vfs_resource
1329 	 * string if necessary (ie, mntfs), and also for a remount to
1330 	 * change the same (necessary when remounting '/' during boot).
1331 	 * So we set up vfs_mntpt and vfs_resource to what we think they
1332 	 * should be, then hand off control to VFS_MOUNT() which can
1333 	 * override this.
1334 	 *
1335 	 * For safety's sake, when changing vfs_resource or vfs_mntpt of
1336 	 * a vfs which is on the vfs list (i.e. during a remount), we must
1337 	 * never set those fields to NULL. Several bits of code make
1338 	 * assumptions that the fields are always valid.
1339 	 */
1340 	vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1341 	if (remount) {
1342 		if ((oldresource = vfsp->vfs_resource) != NULL)
1343 			refstr_hold(oldresource);
1344 		if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1345 			refstr_hold(oldmntpt);
1346 	}
1347 	vfs_setresource(vfsp, resource);
1348 	vfs_setmntpoint(vfsp, mountpt);
1349 
1350 	error = VFS_MOUNT(vfsp, vp, uap, credp);
1351 
1352 	if (uap->flags & MS_RDONLY)
1353 		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1354 	if (uap->flags & MS_NOSUID)
1355 		vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1356 	if (uap->flags & MS_GLOBAL)
1357 		vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1358 
1359 	if (error) {
1360 		if (remount) {
1361 			/* put back pre-remount options */
1362 			vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1363 			vfs_setmntpoint(vfsp, (stripzonepath(
1364 					refstr_value(oldmntpt))));
1365 			if (oldmntpt)
1366 				refstr_rele(oldmntpt);
1367 			vfs_setresource(vfsp, (stripzonepath(
1368 					refstr_value(oldresource))));
1369 			if (oldresource)
1370 				refstr_rele(oldresource);
1371 			vfsp->vfs_flag = ovflags;
1372 			vfs_unlock(vfsp);
1373 			VFS_RELE(vfsp);
1374 		} else {
1375 			vfs_unlock(vfsp);
1376 			vfs_freemnttab(vfsp);
1377 			if (vfsp->vfs_implp)
1378 				vfsimpl_teardown(vfsp);
1379 			kmem_free(vfsp, sizeof (struct vfs));
1380 		}
1381 	} else {
1382 		/*
1383 		 * Set the mount time to now
1384 		 */
1385 		vfsp->vfs_mtime = ddi_get_time();
1386 		if (remount) {
1387 			vfsp->vfs_flag &= ~VFS_REMOUNT;
1388 			if (oldresource)
1389 				refstr_rele(oldresource);
1390 			if (oldmntpt)
1391 				refstr_rele(oldmntpt);
1392 		} else if (splice) {
1393 			/*
1394 			 * Link vfsp into the name space at the mount
1395 			 * point. Vfs_add() is responsible for
1396 			 * holding the mount point which will be
1397 			 * released when vfs_remove() is called.
1398 			 */
1399 			vfs_add(vp, vfsp, uap->flags);
1400 		} else {
1401 			/*
1402 			 * Hold the reference to file system which is
1403 			 * not linked into the name space.
1404 			 */
1405 			vfsp->vfs_zone = NULL;
1406 			VFS_HOLD(vfsp);
1407 			vfsp->vfs_vnodecovered = NULL;
1408 		}
1409 		/*
1410 		 * Set flags for global options encountered
1411 		 */
1412 		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1413 			vfsp->vfs_flag |= VFS_RDONLY;
1414 		else
1415 			vfsp->vfs_flag &= ~VFS_RDONLY;
1416 		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1417 			vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1418 		} else {
1419 			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1420 				vfsp->vfs_flag |= VFS_NODEVICES;
1421 			else
1422 				vfsp->vfs_flag &= ~VFS_NODEVICES;
1423 			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1424 				vfsp->vfs_flag |= VFS_NOSETUID;
1425 			else
1426 				vfsp->vfs_flag &= ~VFS_NOSETUID;
1427 		}
1428 		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1429 			vfsp->vfs_flag |= VFS_NBMAND;
1430 		else
1431 			vfsp->vfs_flag &= ~VFS_NBMAND;
1432 
1433 		if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1434 			vfsp->vfs_flag |= VFS_XATTR;
1435 		else
1436 			vfsp->vfs_flag &= ~VFS_XATTR;
1437 
1438 		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1439 			vfsp->vfs_flag |= VFS_NOEXEC;
1440 		else
1441 			vfsp->vfs_flag &= ~VFS_NOEXEC;
1442 
1443 		/*
1444 		 * Now construct the output option string of options
1445 		 * we recognized.
1446 		 */
1447 		if (uap->flags & MS_OPTIONSTR) {
1448 			vfs_list_read_lock();
1449 			copyout_error = vfs_buildoptionstr(
1450 				&vfsp->vfs_mntopts, inargs, optlen);
1451 			vfs_list_unlock();
1452 			if (copyout_error == 0 &&
1453 			    (uap->flags & MS_SYSSPACE) == 0) {
1454 				copyout_error = copyoutstr(inargs, opts,
1455 				    optlen, NULL);
1456 			}
1457 		}
1458 
1459 		/*
1460 		 * If this isn't a remount, set up the vopstats before
1461 		 * anyone can touch this. We only allow spliced file
1462 		 * systems (file systems which are in the namespace) to
1463 		 * have the VFS_STATS flag set.
1464 		 * NOTE: PxFS mounts the underlying file system with
1465 		 * MS_NOSPLICE set and copies those vfs_flags to its private
1466 		 * vfs structure. As a result, PxFS should never have
1467 		 * the VFS_STATS flag or else we might access the vfs
1468 		 * statistics-related fields prior to them being
1469 		 * properly initialized.
1470 		 */
1471 		if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1472 			initialize_vopstats(&vfsp->vfs_vopstats);
1473 			/*
1474 			 * We need to set vfs_vskap to NULL because there's
1475 			 * a chance it won't be set below.  This is checked
1476 			 * in teardown_vopstats() so we can't have garbage.
1477 			 */
1478 			vfsp->vfs_vskap = NULL;
1479 			vfsp->vfs_flag |= VFS_STATS;
1480 			vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1481 		}
1482 
1483 		vfs_unlock(vfsp);
1484 	}
1485 	mount_completed();
1486 	if (splice)
1487 		vn_vfsunlock(vp);
1488 
1489 	if ((error == 0) && (copyout_error == 0)) {
1490 		if (!remount) {
1491 			/*
1492 			 * Don't call get_vskstat_anchor() while holding
1493 			 * locks since it allocates memory and calls
1494 			 * VFS_STATVFS().  For NFS, the latter can generate
1495 			 * an over-the-wire call.
1496 			 */
1497 			vskap = get_vskstat_anchor(vfsp);
1498 			/* Only take the lock if we have something to do */
1499 			if (vskap != NULL) {
1500 				vfs_lock_wait(vfsp);
1501 				if (vfsp->vfs_flag & VFS_STATS) {
1502 					vfsp->vfs_vskap = vskap;
1503 				}
1504 				vfs_unlock(vfsp);
1505 			}
1506 		}
1507 		/* Return vfsp to caller. */
1508 		*vfspp = vfsp;
1509 	}
1510 errout:
1511 	vfs_freeopttbl(&mnt_mntopts);
1512 	if (resource != NULL)
1513 		kmem_free(resource, strlen(resource) + 1);
1514 	if (mountpt != NULL)
1515 		kmem_free(mountpt, strlen(mountpt) + 1);
1516 	/*
1517 	 * It is possible we errored prior to adding to mount in progress
1518 	 * table. Must free vnode we acquired with successful lookupname.
1519 	 */
1520 	if (addmip)
1521 		VN_RELE(bvp);
1522 	if (delmip)
1523 		vfs_delmip(vfsp);
1524 	ASSERT(vswp != NULL);
1525 	vfs_unrefvfssw(vswp);
1526 	if (inargs != opts)
1527 		kmem_free(inargs, MAX_MNTOPT_STR);
1528 	if (copyout_error) {
1529 		VFS_RELE(vfsp);
1530 		error = copyout_error;
1531 	}
1532 	return (error);
1533 }
1534 
1535 static void
1536 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath)
1537 {
1538 	size_t len;
1539 	refstr_t *ref;
1540 	zone_t *zone = curproc->p_zone;
1541 	char *sp;
1542 	int have_list_lock = 0;
1543 
1544 	ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1545 
1546 	/*
1547 	 * New path must be less than MAXPATHLEN because mntfs
1548 	 * will only display up to MAXPATHLEN bytes. This is currently
1549 	 * safe, because domount() uses pn_get(), and other callers
1550 	 * similarly cap the size to fewer than MAXPATHLEN bytes.
1551 	 */
1552 
1553 	ASSERT(strlen(newpath) < MAXPATHLEN);
1554 
1555 	/* mntfs requires consistency while vfs list lock is held */
1556 
1557 	if (VFS_ON_LIST(vfsp)) {
1558 		have_list_lock = 1;
1559 		vfs_list_lock();
1560 	}
1561 
1562 	if (*refp != NULL)
1563 		refstr_rele(*refp);
1564 
1565 	/* Do we need to modify the path? */
1566 
1567 	if (zone == global_zone || *newpath != '/') {
1568 		ref = refstr_alloc(newpath);
1569 		goto out;
1570 	}
1571 
1572 	/*
1573 	 * Truncate the trailing '/' in the zoneroot, and merge
1574 	 * in the zone's rootpath with the "newpath" (resource
1575 	 * or mountpoint) passed in.
1576 	 *
1577 	 * The size of the required buffer is thus the size of
1578 	 * the buffer required for the passed-in newpath
1579 	 * (strlen(newpath) + 1), plus the size of the buffer
1580 	 * required to hold zone_rootpath (zone_rootpathlen)
1581 	 * minus one for one of the now-superfluous NUL
1582 	 * terminations, minus one for the trailing '/'.
1583 	 *
1584 	 * That gives us:
1585 	 *
1586 	 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1587 	 *
1588 	 * Which is what we have below.
1589 	 */
1590 
1591 	len = strlen(newpath) + zone->zone_rootpathlen - 1;
1592 	sp = kmem_alloc(len, KM_SLEEP);
1593 
1594 	/*
1595 	 * Copy everything including the trailing slash, which
1596 	 * we then overwrite with the NUL character.
1597 	 */
1598 
1599 	(void) strcpy(sp, zone->zone_rootpath);
1600 	sp[zone->zone_rootpathlen - 2] = '\0';
1601 	(void) strcat(sp, newpath);
1602 
1603 	ref = refstr_alloc(sp);
1604 	kmem_free(sp, len);
1605 out:
1606 	*refp = ref;
1607 
1608 	if (have_list_lock) {
1609 		vfs_mnttab_modtimeupd();
1610 		vfs_list_unlock();
1611 	}
1612 }
1613 
1614 /*
1615  * Record a mounted resource name in a vfs structure.
1616  * If vfsp is already mounted, caller must hold the vfs lock.
1617  */
1618 void
1619 vfs_setresource(struct vfs *vfsp, const char *resource)
1620 {
1621 	if (resource == NULL || resource[0] == '\0')
1622 		resource = VFS_NORESOURCE;
1623 	vfs_setpath(vfsp, &vfsp->vfs_resource, resource);
1624 }
1625 
1626 /*
1627  * Record a mount point name in a vfs structure.
1628  * If vfsp is already mounted, caller must hold the vfs lock.
1629  */
1630 void
1631 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt)
1632 {
1633 	if (mntpt == NULL || mntpt[0] == '\0')
1634 		mntpt = VFS_NOMNTPT;
1635 	vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt);
1636 }
1637 
1638 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1639 
1640 refstr_t *
1641 vfs_getresource(const struct vfs *vfsp)
1642 {
1643 	refstr_t *resource;
1644 
1645 	vfs_list_read_lock();
1646 	resource = vfsp->vfs_resource;
1647 	refstr_hold(resource);
1648 	vfs_list_unlock();
1649 
1650 	return (resource);
1651 }
1652 
1653 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1654 
1655 refstr_t *
1656 vfs_getmntpoint(const struct vfs *vfsp)
1657 {
1658 	refstr_t *mntpt;
1659 
1660 	vfs_list_read_lock();
1661 	mntpt = vfsp->vfs_mntpt;
1662 	refstr_hold(mntpt);
1663 	vfs_list_unlock();
1664 
1665 	return (mntpt);
1666 }
1667 
1668 /*
1669  * Create an empty options table with enough empty slots to hold all
1670  * The options in the options string passed as an argument.
1671  * Potentially prepend another options table.
1672  *
1673  * Note: caller is responsible for locking the vfs list, if needed,
1674  *       to protect mops.
1675  */
1676 static void
1677 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1678     const mntopts_t *mtmpl)
1679 {
1680 	const char *s = opts;
1681 	uint_t count;
1682 
1683 	if (opts == NULL || *opts == '\0') {
1684 		count = 0;
1685 	} else {
1686 		count = 1;
1687 
1688 		/*
1689 		 * Count number of options in the string
1690 		 */
1691 		for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1692 			count++;
1693 			s++;
1694 		}
1695 	}
1696 	vfs_copyopttbl_extend(mtmpl, mops, count);
1697 }
1698 
1699 /*
1700  * Create an empty options table with enough empty slots to hold all
1701  * The options in the options string passed as an argument.
1702  *
1703  * This function is *not* for general use by filesystems.
1704  *
1705  * Note: caller is responsible for locking the vfs list, if needed,
1706  *       to protect mops.
1707  */
1708 void
1709 vfs_createopttbl(mntopts_t *mops, const char *opts)
1710 {
1711 	vfs_createopttbl_extend(mops, opts, NULL);
1712 }
1713 
1714 
1715 /*
1716  * Swap two mount options tables
1717  */
1718 static void
1719 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
1720 {
1721 	uint_t tmpcnt;
1722 	mntopt_t *tmplist;
1723 
1724 	tmpcnt = optbl2->mo_count;
1725 	tmplist = optbl2->mo_list;
1726 	optbl2->mo_count = optbl1->mo_count;
1727 	optbl2->mo_list = optbl1->mo_list;
1728 	optbl1->mo_count = tmpcnt;
1729 	optbl1->mo_list = tmplist;
1730 }
1731 
1732 static void
1733 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
1734 {
1735 	vfs_list_lock();
1736 	vfs_swapopttbl_nolock(optbl1, optbl2);
1737 	vfs_mnttab_modtimeupd();
1738 	vfs_list_unlock();
1739 }
1740 
1741 static char **
1742 vfs_copycancelopt_extend(char **const moc, int extend)
1743 {
1744 	int i = 0;
1745 	int j;
1746 	char **result;
1747 
1748 	if (moc != NULL) {
1749 		for (; moc[i] != NULL; i++)
1750 			/* count number of options to cancel */;
1751 	}
1752 
1753 	if (i + extend == 0)
1754 		return (NULL);
1755 
1756 	result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
1757 
1758 	for (j = 0; j < i; j++) {
1759 		result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
1760 		(void) strcpy(result[j], moc[j]);
1761 	}
1762 	for (; j <= i + extend; j++)
1763 		result[j] = NULL;
1764 
1765 	return (result);
1766 }
1767 
1768 static void
1769 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
1770 {
1771 	char *sp, *dp;
1772 
1773 	d->mo_flags = s->mo_flags;
1774 	d->mo_data = s->mo_data;
1775 	sp = s->mo_name;
1776 	if (sp != NULL) {
1777 		dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
1778 		(void) strcpy(dp, sp);
1779 		d->mo_name = dp;
1780 	} else {
1781 		d->mo_name = NULL; /* should never happen */
1782 	}
1783 
1784 	d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
1785 
1786 	sp = s->mo_arg;
1787 	if (sp != NULL) {
1788 		dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
1789 		(void) strcpy(dp, sp);
1790 		d->mo_arg = dp;
1791 	} else {
1792 		d->mo_arg = NULL;
1793 	}
1794 }
1795 
1796 /*
1797  * Copy a mount options table, possibly allocating some spare
1798  * slots at the end.  It is permissible to copy_extend the NULL table.
1799  */
1800 static void
1801 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
1802 {
1803 	uint_t i, count;
1804 	mntopt_t *motbl;
1805 
1806 	/*
1807 	 * Clear out any existing stuff in the options table being initialized
1808 	 */
1809 	vfs_freeopttbl(dmo);
1810 	count = (smo == NULL) ? 0 : smo->mo_count;
1811 	if ((count + extra) == 0)	/* nothing to do */
1812 		return;
1813 	dmo->mo_count = count + extra;
1814 	motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
1815 	dmo->mo_list = motbl;
1816 	for (i = 0; i < count; i++) {
1817 		vfs_copyopt(&smo->mo_list[i], &motbl[i]);
1818 	}
1819 	for (i = count; i < count + extra; i++) {
1820 		motbl[i].mo_flags = MO_EMPTY;
1821 	}
1822 }
1823 
1824 /*
1825  * Copy a mount options table.
1826  *
1827  * This function is *not* for general use by filesystems.
1828  *
1829  * Note: caller is responsible for locking the vfs list, if needed,
1830  *       to protect smo and dmo.
1831  */
1832 void
1833 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
1834 {
1835 	vfs_copyopttbl_extend(smo, dmo, 0);
1836 }
1837 
1838 static char **
1839 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
1840 {
1841 	int c1 = 0;
1842 	int c2 = 0;
1843 	char **result;
1844 	char **sp1, **sp2, **dp;
1845 
1846 	/*
1847 	 * First we count both lists of cancel options.
1848 	 * If either is NULL or has no elements, we return a copy of
1849 	 * the other.
1850 	 */
1851 	if (mop1->mo_cancel != NULL) {
1852 		for (; mop1->mo_cancel[c1] != NULL; c1++)
1853 			/* count cancel options in mop1 */;
1854 	}
1855 
1856 	if (c1 == 0)
1857 		return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
1858 
1859 	if (mop2->mo_cancel != NULL) {
1860 		for (; mop2->mo_cancel[c2] != NULL; c2++)
1861 			/* count cancel options in mop2 */;
1862 	}
1863 
1864 	result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
1865 
1866 	if (c2 == 0)
1867 		return (result);
1868 
1869 	/*
1870 	 * When we get here, we've got two sets of cancel options;
1871 	 * we need to merge the two sets.  We know that the result
1872 	 * array has "c1+c2+1" entries and in the end we might shrink
1873 	 * it.
1874 	 * Result now has a copy of the c1 entries from mop1; we'll
1875 	 * now lookup all the entries of mop2 in mop1 and copy it if
1876 	 * it is unique.
1877 	 * This operation is O(n^2) but it's only called once per
1878 	 * filesystem per duplicate option.  This is a situation
1879 	 * which doesn't arise with the filesystems in ON and
1880 	 * n is generally 1.
1881 	 */
1882 
1883 	dp = &result[c1];
1884 	for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
1885 		for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
1886 			if (strcmp(*sp1, *sp2) == 0)
1887 				break;
1888 		}
1889 		if (*sp1 == NULL) {
1890 			/*
1891 			 * Option *sp2 not found in mop1, so copy it.
1892 			 * The calls to vfs_copycancelopt_extend()
1893 			 * guarantee that there's enough room.
1894 			 */
1895 			*dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
1896 			(void) strcpy(*dp++, *sp2);
1897 		}
1898 	}
1899 	if (dp != &result[c1+c2]) {
1900 		size_t bytes = (dp - result + 1) * sizeof (char *);
1901 		char **nres = kmem_alloc(bytes, KM_SLEEP);
1902 
1903 		bcopy(result, nres, bytes);
1904 		kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
1905 		result = nres;
1906 	}
1907 	return (result);
1908 }
1909 
1910 /*
1911  * Merge two mount option tables (outer and inner) into one.  This is very
1912  * similar to "merging" global variables and automatic variables in C.
1913  *
1914  * This isn't (and doesn't have to be) fast.
1915  *
1916  * This function is *not* for general use by filesystems.
1917  *
1918  * Note: caller is responsible for locking the vfs list, if needed,
1919  *       to protect omo, imo & dmo.
1920  */
1921 void
1922 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
1923 {
1924 	uint_t i, count;
1925 	mntopt_t *mop, *motbl;
1926 	uint_t freeidx;
1927 
1928 	/*
1929 	 * First determine how much space we need to allocate.
1930 	 */
1931 	count = omo->mo_count;
1932 	for (i = 0; i < imo->mo_count; i++) {
1933 		if (imo->mo_list[i].mo_flags & MO_EMPTY)
1934 			continue;
1935 		if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
1936 			count++;
1937 	}
1938 	ASSERT(count >= omo->mo_count &&
1939 	    count <= omo->mo_count + imo->mo_count);
1940 	motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
1941 	for (i = 0; i < omo->mo_count; i++)
1942 		vfs_copyopt(&omo->mo_list[i], &motbl[i]);
1943 	freeidx = omo->mo_count;
1944 	for (i = 0; i < imo->mo_count; i++) {
1945 		if (imo->mo_list[i].mo_flags & MO_EMPTY)
1946 			continue;
1947 		if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
1948 			char **newcanp;
1949 			uint_t index = mop - omo->mo_list;
1950 
1951 			newcanp = vfs_mergecancelopts(mop, &motbl[index]);
1952 
1953 			vfs_freeopt(&motbl[index]);
1954 			vfs_copyopt(&imo->mo_list[i], &motbl[index]);
1955 
1956 			vfs_freecancelopt(motbl[index].mo_cancel);
1957 			motbl[index].mo_cancel = newcanp;
1958 		} else {
1959 			/*
1960 			 * If it's a new option, just copy it over to the first
1961 			 * free location.
1962 			 */
1963 			vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
1964 		}
1965 	}
1966 	dmo->mo_count = count;
1967 	dmo->mo_list = motbl;
1968 }
1969 
1970 /*
1971  * Functions to set and clear mount options in a mount options table.
1972  */
1973 
1974 /*
1975  * Clear a mount option, if it exists.
1976  *
1977  * The update_mnttab arg indicates whether mops is part of a vfs that is on
1978  * the vfs list.
1979  */
1980 static void
1981 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
1982 {
1983 	struct mntopt *mop;
1984 	uint_t i, count;
1985 
1986 	ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
1987 
1988 	count = mops->mo_count;
1989 	for (i = 0; i < count; i++) {
1990 		mop = &mops->mo_list[i];
1991 
1992 		if (mop->mo_flags & MO_EMPTY)
1993 			continue;
1994 		if (strcmp(opt, mop->mo_name))
1995 			continue;
1996 		mop->mo_flags &= ~MO_SET;
1997 		if (mop->mo_arg != NULL) {
1998 			kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
1999 		}
2000 		mop->mo_arg = NULL;
2001 		if (update_mnttab)
2002 			vfs_mnttab_modtimeupd();
2003 		break;
2004 	}
2005 }
2006 
2007 void
2008 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2009 {
2010 	int gotlock = 0;
2011 
2012 	if (VFS_ON_LIST(vfsp)) {
2013 		gotlock = 1;
2014 		vfs_list_lock();
2015 	}
2016 	vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2017 	if (gotlock)
2018 		vfs_list_unlock();
2019 }
2020 
2021 
2022 /*
2023  * Set a mount option on.  If it's not found in the table, it's silently
2024  * ignored.  If the option has MO_IGNORE set, it is still set unless the
2025  * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2026  * bits can be used to toggle the MO_NODISPLAY bit for the option.
2027  * If the VFS_CREATEOPT flag bit is set then the first option slot with
2028  * MO_EMPTY set is created as the option passed in.
2029  *
2030  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2031  * the vfs list.
2032  */
2033 static void
2034 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2035     const char *arg, int flags, int update_mnttab)
2036 {
2037 	mntopt_t *mop;
2038 	uint_t i, count;
2039 	char *sp;
2040 
2041 	ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2042 
2043 	if (flags & VFS_CREATEOPT) {
2044 		if (vfs_hasopt(mops, opt) != NULL) {
2045 			flags &= ~VFS_CREATEOPT;
2046 		}
2047 	}
2048 	count = mops->mo_count;
2049 	for (i = 0; i < count; i++) {
2050 		mop = &mops->mo_list[i];
2051 
2052 		if (mop->mo_flags & MO_EMPTY) {
2053 			if ((flags & VFS_CREATEOPT) == 0)
2054 				continue;
2055 			sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2056 			(void) strcpy(sp, opt);
2057 			mop->mo_name = sp;
2058 			if (arg != NULL)
2059 				mop->mo_flags = MO_HASVALUE;
2060 			else
2061 				mop->mo_flags = 0;
2062 		} else if (strcmp(opt, mop->mo_name)) {
2063 			continue;
2064 		}
2065 		if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2066 			break;
2067 		if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2068 			sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2069 			(void) strcpy(sp, arg);
2070 		} else {
2071 			sp = NULL;
2072 		}
2073 		if (mop->mo_arg != NULL)
2074 			kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2075 		mop->mo_arg = sp;
2076 		if (flags & VFS_DISPLAY)
2077 			mop->mo_flags &= ~MO_NODISPLAY;
2078 		if (flags & VFS_NODISPLAY)
2079 			mop->mo_flags |= MO_NODISPLAY;
2080 		mop->mo_flags |= MO_SET;
2081 		if (mop->mo_cancel != NULL) {
2082 			char **cp;
2083 
2084 			for (cp = mop->mo_cancel; *cp != NULL; cp++)
2085 				vfs_clearmntopt_nolock(mops, *cp, 0);
2086 		}
2087 		if (update_mnttab)
2088 			vfs_mnttab_modtimeupd();
2089 		break;
2090 	}
2091 }
2092 
2093 void
2094 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2095 {
2096 	int gotlock = 0;
2097 
2098 	if (VFS_ON_LIST(vfsp)) {
2099 		gotlock = 1;
2100 		vfs_list_lock();
2101 	}
2102 	vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2103 	if (gotlock)
2104 		vfs_list_unlock();
2105 }
2106 
2107 
2108 /*
2109  * Add a "tag" option to a mounted file system's options list.
2110  *
2111  * Note: caller is responsible for locking the vfs list, if needed,
2112  *       to protect mops.
2113  */
2114 static mntopt_t *
2115 vfs_addtag(mntopts_t *mops, const char *tag)
2116 {
2117 	uint_t count;
2118 	mntopt_t *mop, *motbl;
2119 
2120 	count = mops->mo_count + 1;
2121 	motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2122 	if (mops->mo_count) {
2123 		size_t len = (count - 1) * sizeof (mntopt_t);
2124 
2125 		bcopy(mops->mo_list, motbl, len);
2126 		kmem_free(mops->mo_list, len);
2127 	}
2128 	mops->mo_count = count;
2129 	mops->mo_list = motbl;
2130 	mop = &motbl[count - 1];
2131 	mop->mo_flags = MO_TAG;
2132 	mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2133 	(void) strcpy(mop->mo_name, tag);
2134 	return (mop);
2135 }
2136 
2137 /*
2138  * Allow users to set arbitrary "tags" in a vfs's mount options.
2139  * Broader use within the kernel is discouraged.
2140  */
2141 int
2142 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2143     cred_t *cr)
2144 {
2145 	vfs_t *vfsp;
2146 	mntopts_t *mops;
2147 	mntopt_t *mop;
2148 	int found = 0;
2149 	dev_t dev = makedevice(major, minor);
2150 	int err = 0;
2151 	char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2152 
2153 	/*
2154 	 * Find the desired mounted file system
2155 	 */
2156 	vfs_list_lock();
2157 	vfsp = rootvfs;
2158 	do {
2159 		if (vfsp->vfs_dev == dev &&
2160 		    strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2161 			found = 1;
2162 			break;
2163 		}
2164 		vfsp = vfsp->vfs_next;
2165 	} while (vfsp != rootvfs);
2166 
2167 	if (!found) {
2168 		err = EINVAL;
2169 		goto out;
2170 	}
2171 	err = secpolicy_fs_config(cr, vfsp);
2172 	if (err != 0)
2173 		goto out;
2174 
2175 	mops = &vfsp->vfs_mntopts;
2176 	/*
2177 	 * Add tag if it doesn't already exist
2178 	 */
2179 	if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2180 		int len;
2181 
2182 		(void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2183 		len = strlen(buf);
2184 		if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2185 			err = ENAMETOOLONG;
2186 			goto out;
2187 		}
2188 		mop = vfs_addtag(mops, tag);
2189 	}
2190 	if ((mop->mo_flags & MO_TAG) == 0) {
2191 		err = EINVAL;
2192 		goto out;
2193 	}
2194 	vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2195 out:
2196 	vfs_list_unlock();
2197 	kmem_free(buf, MAX_MNTOPT_STR);
2198 	return (err);
2199 }
2200 
2201 /*
2202  * Allow users to remove arbitrary "tags" in a vfs's mount options.
2203  * Broader use within the kernel is discouraged.
2204  */
2205 int
2206 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2207     cred_t *cr)
2208 {
2209 	vfs_t *vfsp;
2210 	mntopt_t *mop;
2211 	int found = 0;
2212 	dev_t dev = makedevice(major, minor);
2213 	int err = 0;
2214 
2215 	/*
2216 	 * Find the desired mounted file system
2217 	 */
2218 	vfs_list_lock();
2219 	vfsp = rootvfs;
2220 	do {
2221 		if (vfsp->vfs_dev == dev &&
2222 		    strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2223 			found = 1;
2224 			break;
2225 		}
2226 		vfsp = vfsp->vfs_next;
2227 	} while (vfsp != rootvfs);
2228 
2229 	if (!found) {
2230 		err = EINVAL;
2231 		goto out;
2232 	}
2233 	err = secpolicy_fs_config(cr, vfsp);
2234 	if (err != 0)
2235 		goto out;
2236 
2237 	if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2238 		err = EINVAL;
2239 		goto out;
2240 	}
2241 	if ((mop->mo_flags & MO_TAG) == 0) {
2242 		err = EINVAL;
2243 		goto out;
2244 	}
2245 	vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2246 out:
2247 	vfs_list_unlock();
2248 	return (err);
2249 }
2250 
2251 /*
2252  * Function to parse an option string and fill in a mount options table.
2253  * Unknown options are silently ignored.  The input option string is modified
2254  * by replacing separators with nulls.  If the create flag is set, options
2255  * not found in the table are just added on the fly.  The table must have
2256  * an option slot marked MO_EMPTY to add an option on the fly.
2257  *
2258  * This function is *not* for general use by filesystems.
2259  *
2260  * Note: caller is responsible for locking the vfs list, if needed,
2261  *       to protect mops..
2262  */
2263 void
2264 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2265 {
2266 	char *s = osp, *p, *nextop, *valp, *cp, *ep;
2267 	int setflg = VFS_NOFORCEOPT;
2268 
2269 	if (osp == NULL)
2270 		return;
2271 	while (*s != '\0') {
2272 		p = strchr(s, ',');	/* find next option */
2273 		if (p == NULL) {
2274 			cp = NULL;
2275 			p = s + strlen(s);
2276 		} else {
2277 			cp = p;		/* save location of comma */
2278 			*p++ = '\0';	/* mark end and point to next option */
2279 		}
2280 		nextop = p;
2281 		p = strchr(s, '=');	/* look for value */
2282 		if (p == NULL) {
2283 			valp = NULL;	/* no value supplied */
2284 		} else {
2285 			ep = p;		/* save location of equals */
2286 			*p++ = '\0';	/* end option and point to value */
2287 			valp = p;
2288 		}
2289 		/*
2290 		 * set option into options table
2291 		 */
2292 		if (create)
2293 			setflg |= VFS_CREATEOPT;
2294 		vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2295 		if (cp != NULL)
2296 			*cp = ',';	/* restore the comma */
2297 		if (valp != NULL)
2298 			*ep = '=';	/* restore the equals */
2299 		s = nextop;
2300 	}
2301 }
2302 
2303 /*
2304  * Function to inquire if an option exists in a mount options table.
2305  * Returns a pointer to the option if it exists, else NULL.
2306  *
2307  * This function is *not* for general use by filesystems.
2308  *
2309  * Note: caller is responsible for locking the vfs list, if needed,
2310  *       to protect mops.
2311  */
2312 struct mntopt *
2313 vfs_hasopt(const mntopts_t *mops, const char *opt)
2314 {
2315 	struct mntopt *mop;
2316 	uint_t i, count;
2317 
2318 	count = mops->mo_count;
2319 	for (i = 0; i < count; i++) {
2320 		mop = &mops->mo_list[i];
2321 
2322 		if (mop->mo_flags & MO_EMPTY)
2323 			continue;
2324 		if (strcmp(opt, mop->mo_name) == 0)
2325 			return (mop);
2326 	}
2327 	return (NULL);
2328 }
2329 
2330 /*
2331  * Function to inquire if an option is set in a mount options table.
2332  * Returns non-zero if set and fills in the arg pointer with a pointer to
2333  * the argument string or NULL if there is no argument string.
2334  */
2335 static int
2336 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2337 {
2338 	struct mntopt *mop;
2339 	uint_t i, count;
2340 
2341 	count = mops->mo_count;
2342 	for (i = 0; i < count; i++) {
2343 		mop = &mops->mo_list[i];
2344 
2345 		if (mop->mo_flags & MO_EMPTY)
2346 			continue;
2347 		if (strcmp(opt, mop->mo_name))
2348 			continue;
2349 		if ((mop->mo_flags & MO_SET) == 0)
2350 			return (0);
2351 		if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2352 			*argp = mop->mo_arg;
2353 		return (1);
2354 	}
2355 	return (0);
2356 }
2357 
2358 
2359 int
2360 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2361 {
2362 	int ret;
2363 
2364 	vfs_list_read_lock();
2365 	ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2366 	vfs_list_unlock();
2367 	return (ret);
2368 }
2369 
2370 
2371 /*
2372  * Construct a comma separated string of the options set in the given
2373  * mount table, return the string in the given buffer.  Return non-zero if
2374  * the buffer would overflow.
2375  *
2376  * This function is *not* for general use by filesystems.
2377  *
2378  * Note: caller is responsible for locking the vfs list, if needed,
2379  *       to protect mp.
2380  */
2381 int
2382 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2383 {
2384 	char *cp;
2385 	uint_t i;
2386 
2387 	buf[0] = '\0';
2388 	cp = buf;
2389 	for (i = 0; i < mp->mo_count; i++) {
2390 		struct mntopt *mop;
2391 
2392 		mop = &mp->mo_list[i];
2393 		if (mop->mo_flags & MO_SET) {
2394 			int optlen, comma = 0;
2395 
2396 			if (buf[0] != '\0')
2397 				comma = 1;
2398 			optlen = strlen(mop->mo_name);
2399 			if (strlen(buf) + comma + optlen + 1 > len)
2400 				goto err;
2401 			if (comma)
2402 				*cp++ = ',';
2403 			(void) strcpy(cp, mop->mo_name);
2404 			cp += optlen;
2405 			/*
2406 			 * Append option value if there is one
2407 			 */
2408 			if (mop->mo_arg != NULL) {
2409 				int arglen;
2410 
2411 				arglen = strlen(mop->mo_arg);
2412 				if (strlen(buf) + arglen + 2 > len)
2413 					goto err;
2414 				*cp++ = '=';
2415 				(void) strcpy(cp, mop->mo_arg);
2416 				cp += arglen;
2417 			}
2418 		}
2419 	}
2420 	return (0);
2421 err:
2422 	return (EOVERFLOW);
2423 }
2424 
/*
 * Free a NULL-terminated array of cancel-option strings: each string first,
 * then the array itself (including its terminating NULL slot).  A NULL
 * array is ignored.
 */
static void
vfs_freecancelopt(char **moc)
{
	char **ent;

	if (moc == NULL)
		return;

	for (ent = moc; *ent != NULL; ent++)
		kmem_free(*ent, strlen(*ent) + 1);

	/* ent now points at the terminator, so (ent - moc) strings precede it */
	kmem_free(moc, ((ent - moc) + 1) * sizeof (char *));
}
2439 
2440 static void
2441 vfs_freeopt(mntopt_t *mop)
2442 {
2443 	if (mop->mo_name != NULL)
2444 		kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2445 
2446 	vfs_freecancelopt(mop->mo_cancel);
2447 
2448 	if (mop->mo_arg != NULL)
2449 		kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2450 }
2451 
2452 /*
2453  * Free a mount options table
2454  *
2455  * This function is *not* for general use by filesystems.
2456  *
2457  * Note: caller is responsible for locking the vfs list, if needed,
2458  *       to protect mp.
2459  */
2460 void
2461 vfs_freeopttbl(mntopts_t *mp)
2462 {
2463 	uint_t i, count;
2464 
2465 	count = mp->mo_count;
2466 	for (i = 0; i < count; i++) {
2467 		vfs_freeopt(&mp->mo_list[i]);
2468 	}
2469 	if (count) {
2470 		kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2471 		mp->mo_count = 0;
2472 		mp->mo_list = NULL;
2473 	}
2474 }
2475 
2476 /*
2477  * Free any mnttab information recorded in the vfs struct.
2478  * The vfs must not be on the vfs list.
2479  */
2480 static void
2481 vfs_freemnttab(struct vfs *vfsp)
2482 {
2483 	ASSERT(!VFS_ON_LIST(vfsp));
2484 
2485 	/*
2486 	 * Free device and mount point information
2487 	 */
2488 	if (vfsp->vfs_mntpt != NULL) {
2489 		refstr_rele(vfsp->vfs_mntpt);
2490 		vfsp->vfs_mntpt = NULL;
2491 	}
2492 	if (vfsp->vfs_resource != NULL) {
2493 		refstr_rele(vfsp->vfs_resource);
2494 		vfsp->vfs_resource = NULL;
2495 	}
2496 	/*
2497 	 * Now free mount options information
2498 	 */
2499 	vfs_freeopttbl(&vfsp->vfs_mntopts);
2500 }
2501 
2502 /*
2503  * Return the last mnttab modification time
2504  */
2505 void
2506 vfs_mnttab_modtime(timespec_t *ts)
2507 {
2508 	ASSERT(RW_LOCK_HELD(&vfslist));
2509 	*ts = vfs_mnttab_mtime;
2510 }
2511 
2512 /*
2513  * See if mnttab is changed
2514  */
2515 void
2516 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2517 {
2518 	int changed;
2519 
2520 	*phpp = (struct pollhead *)NULL;
2521 
2522 	/*
2523 	 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2524 	 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2525 	 * to not grab the vfs list lock because tv_sec is monotonically
2526 	 * increasing.
2527 	 */
2528 
2529 	changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2530 	    (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2531 	if (!changed) {
2532 		*phpp = &vfs_pollhd;
2533 	}
2534 }
2535 
2536 /*
2537  * Update the mnttab modification time and wake up any waiters for
2538  * mnttab changes
2539  */
2540 void
2541 vfs_mnttab_modtimeupd()
2542 {
2543 	hrtime_t oldhrt, newhrt;
2544 
2545 	ASSERT(RW_WRITE_HELD(&vfslist));
2546 	oldhrt = ts2hrt(&vfs_mnttab_mtime);
2547 	gethrestime(&vfs_mnttab_mtime);
2548 	newhrt = ts2hrt(&vfs_mnttab_mtime);
2549 	if (oldhrt == (hrtime_t)0)
2550 		vfs_mnttab_ctime = vfs_mnttab_mtime;
2551 	/*
2552 	 * Attempt to provide unique mtime (like uniqtime but not).
2553 	 */
2554 	if (newhrt == oldhrt) {
2555 		newhrt++;
2556 		hrt2ts(newhrt, &vfs_mnttab_mtime);
2557 	}
2558 	pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
2559 }
2560 
2561 int
2562 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
2563 {
2564 	vnode_t *coveredvp;
2565 	int error;
2566 	extern void teardown_vopstats(vfs_t *);
2567 
2568 	/*
2569 	 * Get covered vnode. This will be NULL if the vfs is not linked
2570 	 * into the file system name space (i.e., domount() with MNT_NOSPICE).
2571 	 */
2572 	coveredvp = vfsp->vfs_vnodecovered;
2573 	ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
2574 
2575 	/*
2576 	 * Purge all dnlc entries for this vfs.
2577 	 */
2578 	(void) dnlc_purge_vfsp(vfsp, 0);
2579 
2580 	/* For forcible umount, skip VFS_SYNC() since it may hang */
2581 	if ((flag & MS_FORCE) == 0)
2582 		(void) VFS_SYNC(vfsp, 0, cr);
2583 
2584 	/*
2585 	 * Lock the vfs to maintain fs status quo during unmount.  This
2586 	 * has to be done after the sync because ufs_update tries to acquire
2587 	 * the vfs_reflock.
2588 	 */
2589 	vfs_lock_wait(vfsp);
2590 
2591 	if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
2592 		vfs_unlock(vfsp);
2593 		if (coveredvp != NULL)
2594 			vn_vfsunlock(coveredvp);
2595 	} else if (coveredvp != NULL) {
2596 		teardown_vopstats(vfsp);
2597 		/*
2598 		 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
2599 		 * when it frees vfsp so we do a VN_HOLD() so we can
2600 		 * continue to use coveredvp afterwards.
2601 		 */
2602 		VN_HOLD(coveredvp);
2603 		vfs_remove(vfsp);
2604 		vn_vfsunlock(coveredvp);
2605 		VN_RELE(coveredvp);
2606 	} else {
2607 		teardown_vopstats(vfsp);
2608 		/*
2609 		 * Release the reference to vfs that is not linked
2610 		 * into the name space.
2611 		 */
2612 		vfs_unlock(vfsp);
2613 		VFS_RELE(vfsp);
2614 	}
2615 	return (error);
2616 }
2617 
2618 
2619 /*
2620  * Vfs_unmountall() is called by uadmin() to unmount all
2621  * mounted file systems (except the root file system) during shutdown.
2622  * It follows the existing locking protocol when traversing the vfs list
2623  * to sync and unmount vfses. Even though there should be no
2624  * other thread running while the system is shutting down, it is prudent
2625  * to still follow the locking protocol.
2626  */
2627 void
2628 vfs_unmountall(void)
2629 {
2630 	struct vfs *vfsp;
2631 	struct vfs *prev_vfsp = NULL;
2632 	int error;
2633 
2634 	/*
2635 	 * Toss all dnlc entries now so that the per-vfs sync
2636 	 * and unmount operations don't have to slog through
2637 	 * a bunch of uninteresting vnodes over and over again.
2638 	 */
2639 	dnlc_purge();
2640 
2641 	vfs_list_lock();
2642 	for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
2643 		prev_vfsp = vfsp->vfs_prev;
2644 
2645 		if (vfs_lock(vfsp) != 0)
2646 			continue;
2647 		error = vn_vfswlock(vfsp->vfs_vnodecovered);
2648 		vfs_unlock(vfsp);
2649 		if (error)
2650 			continue;
2651 
2652 		vfs_list_unlock();
2653 
2654 		(void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
2655 		(void) dounmount(vfsp, 0, CRED());
2656 
2657 		/*
2658 		 * Since we dropped the vfslist lock above we must
2659 		 * verify that next_vfsp still exists, else start over.
2660 		 */
2661 		vfs_list_lock();
2662 		for (vfsp = rootvfs->vfs_prev;
2663 			vfsp != rootvfs; vfsp = vfsp->vfs_prev)
2664 			if (vfsp == prev_vfsp)
2665 				break;
2666 		if (vfsp == rootvfs && prev_vfsp != rootvfs)
2667 			prev_vfsp = rootvfs->vfs_prev;
2668 	}
2669 	vfs_list_unlock();
2670 }
2671 
2672 /*
2673  * Called to add an entry to the end of the vfs mount in progress list
2674  */
2675 void
2676 vfs_addmip(dev_t dev, struct vfs *vfsp)
2677 {
2678 	struct ipmnt *mipp;
2679 
2680 	mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
2681 	mipp->mip_next = NULL;
2682 	mipp->mip_dev = dev;
2683 	mipp->mip_vfsp = vfsp;
2684 	mutex_enter(&vfs_miplist_mutex);
2685 	if (vfs_miplist_end != NULL)
2686 		vfs_miplist_end->mip_next = mipp;
2687 	else
2688 		vfs_miplist = mipp;
2689 	vfs_miplist_end = mipp;
2690 	mutex_exit(&vfs_miplist_mutex);
2691 }
2692 
2693 /*
2694  * Called to remove an entry from the mount in progress list
2695  * Either because the mount completed or it failed.
2696  */
2697 void
2698 vfs_delmip(struct vfs *vfsp)
2699 {
2700 	struct ipmnt *mipp, *mipprev;
2701 
2702 	mutex_enter(&vfs_miplist_mutex);
2703 	mipprev = NULL;
2704 	for (mipp = vfs_miplist;
2705 		mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
2706 		mipprev = mipp;
2707 	}
2708 	if (mipp == NULL)
2709 		return; /* shouldn't happen */
2710 	if (mipp == vfs_miplist_end)
2711 		vfs_miplist_end = mipprev;
2712 	if (mipprev == NULL)
2713 		vfs_miplist = mipp->mip_next;
2714 	else
2715 		mipprev->mip_next = mipp->mip_next;
2716 	mutex_exit(&vfs_miplist_mutex);
2717 	kmem_free(mipp, sizeof (struct ipmnt));
2718 }
2719 
2720 /*
2721  * vfs_add is called by a specific filesystem's mount routine to add
2722  * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
2723  * The vfs should already have been locked by the caller.
2724  *
2725  * coveredvp is NULL if this is the root.
2726  */
2727 void
2728 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
2729 {
2730 	int newflag;
2731 
2732 	ASSERT(vfs_lock_held(vfsp));
2733 	VFS_HOLD(vfsp);
2734 	newflag = vfsp->vfs_flag;
2735 	if (mflag & MS_RDONLY)
2736 		newflag |= VFS_RDONLY;
2737 	else
2738 		newflag &= ~VFS_RDONLY;
2739 	if (mflag & MS_NOSUID)
2740 		newflag |= (VFS_NOSETUID|VFS_NODEVICES);
2741 	else
2742 		newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
2743 	if (mflag & MS_NOMNTTAB)
2744 		newflag |= VFS_NOMNTTAB;
2745 	else
2746 		newflag &= ~VFS_NOMNTTAB;
2747 
2748 	if (coveredvp != NULL) {
2749 		ASSERT(vn_vfswlock_held(coveredvp));
2750 		coveredvp->v_vfsmountedhere = vfsp;
2751 		VN_HOLD(coveredvp);
2752 	}
2753 	vfsp->vfs_vnodecovered = coveredvp;
2754 	vfsp->vfs_flag = newflag;
2755 
2756 	vfs_list_add(vfsp);
2757 }
2758 
2759 /*
2760  * Remove a vfs from the vfs list, null out the pointer from the
2761  * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
2762  * from the vfs to the covered vnode (vfs_vnodecovered). Release the
2763  * reference to the vfs and to the covered vnode.
2764  *
2765  * Called from dounmount after it's confirmed with the file system
2766  * that the unmount is legal.
2767  */
2768 void
2769 vfs_remove(struct vfs *vfsp)
2770 {
2771 	vnode_t *vp;
2772 
2773 	ASSERT(vfs_lock_held(vfsp));
2774 
2775 	/*
2776 	 * Can't unmount root.  Should never happen because fs will
2777 	 * be busy.
2778 	 */
2779 	if (vfsp == rootvfs)
2780 		cmn_err(CE_PANIC, "vfs_remove: unmounting root");
2781 
2782 	vfs_list_remove(vfsp);
2783 
2784 	/*
2785 	 * Unhook from the file system name space.
2786 	 */
2787 	vp = vfsp->vfs_vnodecovered;
2788 	ASSERT(vn_vfswlock_held(vp));
2789 	vp->v_vfsmountedhere = NULL;
2790 	vfsp->vfs_vnodecovered = NULL;
2791 	VN_RELE(vp);
2792 
2793 	/*
2794 	 * Release lock and wakeup anybody waiting.
2795 	 */
2796 	vfs_unlock(vfsp);
2797 	VFS_RELE(vfsp);
2798 }
2799 
2800 /*
2801  * Lock a filesystem to prevent access to it while mounting,
2802  * unmounting and syncing.  Return EBUSY immediately if lock
2803  * can't be acquired.
2804  */
2805 int
2806 vfs_lock(vfs_t *vfsp)
2807 {
2808 	vn_vfslocks_entry_t *vpvfsentry;
2809 
2810 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2811 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2812 		return (0);
2813 
2814 	vn_vfslocks_rele(vpvfsentry);
2815 	return (EBUSY);
2816 }
2817 
2818 int
2819 vfs_rlock(vfs_t *vfsp)
2820 {
2821 	vn_vfslocks_entry_t *vpvfsentry;
2822 
2823 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2824 
2825 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2826 		return (0);
2827 
2828 	vn_vfslocks_rele(vpvfsentry);
2829 	return (EBUSY);
2830 }
2831 
2832 void
2833 vfs_lock_wait(vfs_t *vfsp)
2834 {
2835 	vn_vfslocks_entry_t *vpvfsentry;
2836 
2837 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2838 	rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
2839 }
2840 
2841 void
2842 vfs_rlock_wait(vfs_t *vfsp)
2843 {
2844 	vn_vfslocks_entry_t *vpvfsentry;
2845 
2846 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2847 	rwst_enter(&vpvfsentry->ve_lock, RW_READER);
2848 }
2849 
2850 /*
2851  * Unlock a locked filesystem.
2852  */
2853 void
2854 vfs_unlock(vfs_t *vfsp)
2855 {
2856 	vn_vfslocks_entry_t *vpvfsentry;
2857 
2858 	/*
2859 	 * vfs_unlock will mimic sema_v behaviour to fix 4748018.
2860 	 * And these changes should remain for the patch changes as it is.
2861 	 */
2862 	if (panicstr)
2863 		return;
2864 
2865 	/*
2866 	 * ve_refcount needs to be dropped twice here.
2867 	 * 1. To release refernce after a call to vfs_locks_getlock()
2868 	 * 2. To release the reference from the locking routines like
2869 	 *    vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,.
2870 	 */
2871 
2872 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2873 	vn_vfslocks_rele(vpvfsentry);
2874 
2875 	rwst_exit(&vpvfsentry->ve_lock);
2876 	vn_vfslocks_rele(vpvfsentry);
2877 }
2878 
2879 /*
2880  * Utility routine that allows a filesystem to construct its
2881  * fsid in "the usual way" - by munging some underlying dev_t and
2882  * the filesystem type number into the 64-bit fsid.  Note that
2883  * this implicitly relies on dev_t persistence to make filesystem
2884  * id's persistent.
2885  *
2886  * There's nothing to prevent an individual fs from constructing its
2887  * fsid in a different way, and indeed they should.
2888  *
2889  * Since we want fsids to be 32-bit quantities (so that they can be
2890  * exported identically by either 32-bit or 64-bit APIs, as well as
2891  * the fact that fsid's are "known" to NFS), we compress the device
2892  * number given down to 32-bits, and panic if that isn't possible.
2893  */
2894 void
2895 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
2896 {
2897 	if (!cmpldev((dev32_t *)&fsi->val[0], dev))
2898 		panic("device number too big for fsid!");
2899 	fsi->val[1] = val;
2900 }
2901 
2902 int
2903 vfs_lock_held(vfs_t *vfsp)
2904 {
2905 	int held;
2906 	vn_vfslocks_entry_t *vpvfsentry;
2907 
2908 	/*
2909 	 * vfs_lock_held will mimic sema_held behaviour
2910 	 * if panicstr is set. And these changes should remain
2911 	 * for the patch changes as it is.
2912 	 */
2913 	if (panicstr)
2914 		return (1);
2915 
2916 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2917 	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2918 
2919 	vn_vfslocks_rele(vpvfsentry);
2920 	return (held);
2921 }
2922 
2923 struct _kthread *
2924 vfs_lock_owner(vfs_t *vfsp)
2925 {
2926 	struct _kthread *owner;
2927 	vn_vfslocks_entry_t *vpvfsentry;
2928 
2929 	/*
2930 	 * vfs_wlock_held will mimic sema_held behaviour
2931 	 * if panicstr is set. And these changes should remain
2932 	 * for the patch changes as it is.
2933 	 */
2934 	if (panicstr)
2935 		return (NULL);
2936 
2937 	vpvfsentry = vn_vfslocks_getlock(vfsp);
2938 	owner = rwst_owner(&vpvfsentry->ve_lock);
2939 
2940 	vn_vfslocks_rele(vpvfsentry);
2941 	return (owner);
2942 }
2943 
2944 /*
2945  * vfs list locking.
2946  *
2947  * Rather than manipulate the vfslist lock directly, we abstract into lock
2948  * and unlock routines to allow the locking implementation to be changed for
2949  * clustering.
2950  *
2951  * Whenever the vfs list is modified through its hash links, the overall list
2952  * lock must be obtained before locking the relevant hash bucket.  But to see
2953  * whether a given vfs is on the list, it suffices to obtain the lock for the
2954  * hash bucket without getting the overall list lock.  (See getvfs() below.)
2955  */
2956 
2957 void
2958 vfs_list_lock()
2959 {
2960 	rw_enter(&vfslist, RW_WRITER);
2961 }
2962 
2963 void
2964 vfs_list_read_lock()
2965 {
2966 	rw_enter(&vfslist, RW_READER);
2967 }
2968 
2969 void
2970 vfs_list_unlock()
2971 {
2972 	rw_exit(&vfslist);
2973 }
2974 
2975 /*
2976  * Low level worker routines for adding entries to and removing entries from
2977  * the vfs list.
2978  */
2979 
2980 static void
2981 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
2982 {
2983 	int vhno;
2984 	struct vfs **hp;
2985 	dev_t dev;
2986 
2987 	ASSERT(RW_WRITE_HELD(&vfslist));
2988 
2989 	dev = expldev(vfsp->vfs_fsid.val[0]);
2990 	vhno = VFSHASH(getmajor(dev), getminor(dev));
2991 
2992 	mutex_enter(&rvfs_list[vhno].rvfs_lock);
2993 
2994 	/*
2995 	 * Link into the hash table, inserting it at the end, so that LOFS
2996 	 * with the same fsid as UFS (or other) file systems will not hide the
2997 	 * UFS.
2998 	 */
2999 	if (insert_at_head) {
3000 		vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3001 		rvfs_list[vhno].rvfs_head = vfsp;
3002 	} else {
3003 		for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3004 		    hp = &(*hp)->vfs_hash)
3005 			continue;
3006 		/*
3007 		 * hp now contains the address of the pointer to update
3008 		 * to effect the insertion.
3009 		 */
3010 		vfsp->vfs_hash = NULL;
3011 		*hp = vfsp;
3012 	}
3013 
3014 	rvfs_list[vhno].rvfs_len++;
3015 	mutex_exit(&rvfs_list[vhno].rvfs_lock);
3016 }
3017 
3018 
3019 static void
3020 vfs_hash_remove(struct vfs *vfsp)
3021 {
3022 	int vhno;
3023 	struct vfs *tvfsp;
3024 	dev_t dev;
3025 
3026 	ASSERT(RW_WRITE_HELD(&vfslist));
3027 
3028 	dev = expldev(vfsp->vfs_fsid.val[0]);
3029 	vhno = VFSHASH(getmajor(dev), getminor(dev));
3030 
3031 	mutex_enter(&rvfs_list[vhno].rvfs_lock);
3032 
3033 	/*
3034 	 * Remove from hash.
3035 	 */
3036 	if (rvfs_list[vhno].rvfs_head == vfsp) {
3037 		rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3038 		rvfs_list[vhno].rvfs_len--;
3039 		goto foundit;
3040 	}
3041 	for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3042 	    tvfsp = tvfsp->vfs_hash) {
3043 		if (tvfsp->vfs_hash == vfsp) {
3044 			tvfsp->vfs_hash = vfsp->vfs_hash;
3045 			rvfs_list[vhno].rvfs_len--;
3046 			goto foundit;
3047 		}
3048 	}
3049 	cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3050 
3051 foundit:
3052 
3053 	mutex_exit(&rvfs_list[vhno].rvfs_lock);
3054 }
3055 
3056 
3057 void
3058 vfs_list_add(struct vfs *vfsp)
3059 {
3060 	zone_t *zone;
3061 
3062 	/*
3063 	 * The zone that owns the mount is the one that performed the mount.
3064 	 * Note that this isn't necessarily the same as the zone mounted into.
3065 	 * The corresponding zone_rele() will be done when the vfs_t is
3066 	 * being free'd.
3067 	 */
3068 	vfsp->vfs_zone = curproc->p_zone;
3069 	zone_hold(vfsp->vfs_zone);
3070 
3071 	/*
3072 	 * Find the zone mounted into, and put this mount on its vfs list.
3073 	 */
3074 	zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3075 	ASSERT(zone != NULL);
3076 	/*
3077 	 * Special casing for the root vfs.  This structure is allocated
3078 	 * statically and hooked onto rootvfs at link time.  During the
3079 	 * vfs_mountroot call at system startup time, the root file system's
3080 	 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3081 	 * as argument.  The code below must detect and handle this special
3082 	 * case.  The only apparent justification for this special casing is
3083 	 * to ensure that the root file system appears at the head of the
3084 	 * list.
3085 	 *
3086 	 * XXX:	I'm assuming that it's ok to do normal list locking when
3087 	 *	adding the entry for the root file system (this used to be
3088 	 *	done with no locks held).
3089 	 */
3090 	vfs_list_lock();
3091 	/*
3092 	 * Link into the vfs list proper.
3093 	 */
3094 	if (vfsp == &root) {
3095 		/*
3096 		 * Assert: This vfs is already on the list as its first entry.
3097 		 * Thus, there's nothing to do.
3098 		 */
3099 		ASSERT(rootvfs == vfsp);
3100 		/*
3101 		 * Add it to the head of the global zone's vfslist.
3102 		 */
3103 		ASSERT(zone == global_zone);
3104 		ASSERT(zone->zone_vfslist == NULL);
3105 		zone->zone_vfslist = vfsp;
3106 	} else {
3107 		/*
3108 		 * Link to end of list using vfs_prev (as rootvfs is now a
3109 		 * doubly linked circular list) so list is in mount order for
3110 		 * mnttab use.
3111 		 */
3112 		rootvfs->vfs_prev->vfs_next = vfsp;
3113 		vfsp->vfs_prev = rootvfs->vfs_prev;
3114 		rootvfs->vfs_prev = vfsp;
3115 		vfsp->vfs_next = rootvfs;
3116 
3117 		/*
3118 		 * Do it again for the zone-private list (which may be NULL).
3119 		 */
3120 		if (zone->zone_vfslist == NULL) {
3121 			ASSERT(zone != global_zone);
3122 			zone->zone_vfslist = vfsp;
3123 		} else {
3124 			zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3125 			vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3126 			zone->zone_vfslist->vfs_zone_prev = vfsp;
3127 			vfsp->vfs_zone_next = zone->zone_vfslist;
3128 		}
3129 	}
3130 
3131 	/*
3132 	 * Link into the hash table, inserting it at the end, so that LOFS
3133 	 * with the same fsid as UFS (or other) file systems will not hide
3134 	 * the UFS.
3135 	 */
3136 	vfs_hash_add(vfsp, 0);
3137 
3138 	/*
3139 	 * update the mnttab modification time
3140 	 */
3141 	vfs_mnttab_modtimeupd();
3142 	vfs_list_unlock();
3143 	zone_rele(zone);
3144 }
3145 
3146 void
3147 vfs_list_remove(struct vfs *vfsp)
3148 {
3149 	zone_t *zone;
3150 
3151 	zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3152 	ASSERT(zone != NULL);
3153 	/*
3154 	 * Callers are responsible for preventing attempts to unmount the
3155 	 * root.
3156 	 */
3157 	ASSERT(vfsp != rootvfs);
3158 
3159 	vfs_list_lock();
3160 
3161 	/*
3162 	 * Remove from hash.
3163 	 */
3164 	vfs_hash_remove(vfsp);
3165 
3166 	/*
3167 	 * Remove from vfs list.
3168 	 */
3169 	vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3170 	vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3171 	vfsp->vfs_next = vfsp->vfs_prev = NULL;
3172 
3173 	/*
3174 	 * Remove from zone-specific vfs list.
3175 	 */
3176 	if (zone->zone_vfslist == vfsp)
3177 		zone->zone_vfslist = vfsp->vfs_zone_next;
3178 
3179 	if (vfsp->vfs_zone_next == vfsp) {
3180 		ASSERT(vfsp->vfs_zone_prev == vfsp);
3181 		ASSERT(zone->zone_vfslist == vfsp);
3182 		zone->zone_vfslist = NULL;
3183 	}
3184 
3185 	vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3186 	vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3187 	vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3188 
3189 	/*
3190 	 * update the mnttab modification time
3191 	 */
3192 	vfs_mnttab_modtimeupd();
3193 	vfs_list_unlock();
3194 	zone_rele(zone);
3195 }
3196 
3197 struct vfs *
3198 getvfs(fsid_t *fsid)
3199 {
3200 	struct vfs *vfsp;
3201 	int val0 = fsid->val[0];
3202 	int val1 = fsid->val[1];
3203 	dev_t dev = expldev(val0);
3204 	int vhno = VFSHASH(getmajor(dev), getminor(dev));
3205 	kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3206 
3207 	mutex_enter(hmp);
3208 	for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3209 		if (vfsp->vfs_fsid.val[0] == val0 &&
3210 		    vfsp->vfs_fsid.val[1] == val1) {
3211 			VFS_HOLD(vfsp);
3212 			mutex_exit(hmp);
3213 			return (vfsp);
3214 		}
3215 	}
3216 	mutex_exit(hmp);
3217 	return (NULL);
3218 }
3219 
3220 /*
3221  * Search the vfs mount in progress list for a specified device/vfs entry.
3222  * Returns 0 if the first entry in the list that the device matches has the
3223  * given vfs pointer as well.  If the device matches but a different vfs
3224  * pointer is encountered in the list before the given vfs pointer then
3225  * a 1 is returned.
3226  */
3227 
3228 int
3229 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3230 {
3231 	int retval = 0;
3232 	struct ipmnt *mipp;
3233 
3234 	mutex_enter(&vfs_miplist_mutex);
3235 	for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3236 		if (mipp->mip_dev == dev) {
3237 			if (mipp->mip_vfsp != vfsp)
3238 				retval = 1;
3239 			break;
3240 		}
3241 	}
3242 	mutex_exit(&vfs_miplist_mutex);
3243 	return (retval);
3244 }
3245 
3246 /*
3247  * Search the vfs list for a specified device.  Returns 1, if entry is found
3248  * or 0 if no suitable entry is found.
3249  */
3250 
3251 int
3252 vfs_devismounted(dev_t dev)
3253 {
3254 	struct vfs *vfsp;
3255 	int found;
3256 
3257 	vfs_list_read_lock();
3258 	vfsp = rootvfs;
3259 	found = 0;
3260 	do {
3261 		if (vfsp->vfs_dev == dev) {
3262 			found = 1;
3263 			break;
3264 		}
3265 		vfsp = vfsp->vfs_next;
3266 	} while (vfsp != rootvfs);
3267 
3268 	vfs_list_unlock();
3269 	return (found);
3270 }
3271 
3272 /*
3273  * Search the vfs list for a specified device.  Returns a pointer to it
3274  * or NULL if no suitable entry is found. The caller of this routine
3275  * is responsible for releasing the returned vfs pointer.
3276  */
3277 struct vfs *
3278 vfs_dev2vfsp(dev_t dev)
3279 {
3280 	struct vfs *vfsp;
3281 	int found;
3282 
3283 	vfs_list_read_lock();
3284 	vfsp = rootvfs;
3285 	found = 0;
3286 	do {
3287 		/*
3288 		 * The following could be made more efficient by making
3289 		 * the entire loop use vfs_zone_next if the call is from
3290 		 * a zone.  The only callers, however, ustat(2) and
3291 		 * umount2(2), don't seem to justify the added
3292 		 * complexity at present.
3293 		 */
3294 		if (vfsp->vfs_dev == dev &&
3295 		    ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3296 		    curproc->p_zone)) {
3297 			VFS_HOLD(vfsp);
3298 			found = 1;
3299 			break;
3300 		}
3301 		vfsp = vfsp->vfs_next;
3302 	} while (vfsp != rootvfs);
3303 	vfs_list_unlock();
3304 	return (found ? vfsp: NULL);
3305 }
3306 
3307 /*
3308  * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3309  * or NULL if no suitable entry is found. The caller of this routine
3310  * is responsible for releasing the returned vfs pointer.
3311  *
3312  * Note that if multiple mntpoints match, the last one matching is
3313  * returned in an attempt to return the "top" mount when overlay
3314  * mounts are covering the same mount point.  This is accomplished by starting
3315  * at the end of the list and working our way backwards, stopping at the first
3316  * matching mount.
3317  */
3318 struct vfs *
3319 vfs_mntpoint2vfsp(const char *mp)
3320 {
3321 	struct vfs *vfsp;
3322 	struct vfs *retvfsp = NULL;
3323 	zone_t *zone = curproc->p_zone;
3324 	struct vfs *list;
3325 
3326 	vfs_list_read_lock();
3327 	if (getzoneid() == GLOBAL_ZONEID) {
3328 		/*
3329 		 * The global zone may see filesystems in any zone.
3330 		 */
3331 		vfsp = rootvfs->vfs_prev;
3332 		do {
3333 			if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3334 				retvfsp = vfsp;
3335 				break;
3336 			}
3337 			vfsp = vfsp->vfs_prev;
3338 		} while (vfsp != rootvfs->vfs_prev);
3339 	} else if ((list = zone->zone_vfslist) != NULL) {
3340 		const char *mntpt;
3341 
3342 		vfsp = list->vfs_zone_prev;
3343 		do {
3344 			mntpt = refstr_value(vfsp->vfs_mntpt);
3345 			mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3346 			if (strcmp(mntpt, mp) == 0) {
3347 				retvfsp = vfsp;
3348 				break;
3349 			}
3350 			vfsp = vfsp->vfs_zone_prev;
3351 		} while (vfsp != list->vfs_zone_prev);
3352 	}
3353 	if (retvfsp)
3354 		VFS_HOLD(retvfsp);
3355 	vfs_list_unlock();
3356 	return (retvfsp);
3357 }
3358 
3359 /*
3360  * Search the vfs list for a specified vfsops.
3361  * if vfs entry is found then return 1, else 0.
3362  */
3363 int
3364 vfs_opsinuse(vfsops_t *ops)
3365 {
3366 	struct vfs *vfsp;
3367 	int found;
3368 
3369 	vfs_list_read_lock();
3370 	vfsp = rootvfs;
3371 	found = 0;
3372 	do {
3373 		if (vfs_getops(vfsp) == ops) {
3374 			found = 1;
3375 			break;
3376 		}
3377 		vfsp = vfsp->vfs_next;
3378 	} while (vfsp != rootvfs);
3379 	vfs_list_unlock();
3380 	return (found);
3381 }
3382 
3383 /*
3384  * Allocate an entry in vfssw for a file system type
3385  */
3386 struct vfssw *
3387 allocate_vfssw(char *type)
3388 {
3389 	struct vfssw *vswp;
3390 
3391 	if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3392 		/*
3393 		 * The vfssw table uses the empty string to identify an
3394 		 * available entry; we cannot add any type which has
3395 		 * a leading NUL. The string length is limited to
3396 		 * the size of the st_fstype array in struct stat.
3397 		 */
3398 		return (NULL);
3399 	}
3400 
3401 	ASSERT(VFSSW_WRITE_LOCKED());
3402 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3403 		if (!ALLOCATED_VFSSW(vswp)) {
3404 			vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3405 			(void) strcpy(vswp->vsw_name, type);
3406 			ASSERT(vswp->vsw_count == 0);
3407 			vswp->vsw_count = 1;
3408 			mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3409 			return (vswp);
3410 		}
3411 	return (NULL);
3412 }
3413 
/*
 * Translate a vfs type name into the name of the module implementing it.
 *
 *	"proc"			-> "procfs"
 *	"fd"			-> "fdfs"
 *	anything starting "nfs"	-> "nfs"
 *
 * Any other type name is returned unchanged (the same pointer that was
 * passed in).
 */
static char *
vfs_to_modname(char *vfstype)
{
	if (strcmp(vfstype, "proc") == 0)
		return ("procfs");

	if (strcmp(vfstype, "fd") == 0)
		return ("fdfs");

	if (strncmp(vfstype, "nfs", 3) == 0)
		return ("nfs");

	return (vfstype);
}
3431 
/*
 * Find a vfssw entry given a file system type name.
 * Try to autoload the filesystem if it's not found.
 * If it's installed, return the vfssw locked to prevent unloading.
 *
 * The entry is looked up with vfs_getvfsswbyname(), which takes a
 * reference on any entry it returns.  Returns NULL if the entry cannot
 * be found or allocated, or if loading the module fails.
 *
 * NOTE(review): in the autoload loop below, if an entry was found (and so
 * referenced) but is not installed and modload() then fails, NULL is
 * returned without an explicit vfs_unrefvfssw() -- confirm whether that
 * reference is intentionally left in place.
 */
struct vfssw *
vfs_getvfssw(char *type)
{
	struct vfssw *vswp;
	char	*modname;

	RLOCK_VFSSW();
	vswp = vfs_getvfsswbyname(type);
	/* The module name may differ from the type name (e.g. proc/procfs). */
	modname = vfs_to_modname(type);

	if (rootdir == NULL) {
		/*
		 * If we haven't yet loaded the root file system, then our
		 * _init won't be called until later. Allocate vfssw entry,
		 * because mod_installfs won't be called.
		 */
		if (vswp == NULL) {
			/*
			 * Trade the read lock for the write lock.  The
			 * table is unlocked in between, so look the name
			 * up again before allocating a new entry.
			 */
			RUNLOCK_VFSSW();
			WLOCK_VFSSW();
			if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
				if ((vswp = allocate_vfssw(type)) == NULL) {
					WUNLOCK_VFSSW();
					return (NULL);
				}
			}
			WUNLOCK_VFSSW();
			RLOCK_VFSSW();
		}
		/* The read lock is dropped before calling into modload. */
		if (!VFS_INSTALLED(vswp)) {
			RUNLOCK_VFSSW();
			(void) modloadonly("fs", modname);
		} else
			RUNLOCK_VFSSW();
		return (vswp);
	}

	/*
	 * Try to load the filesystem.  Before calling modload(), we drop
	 * our lock on the VFS switch table, and pick it up after the
	 * module is loaded.  However, there is a potential race:  the
	 * module could be unloaded after the call to modload() completes
	 * but before we pick up the lock and drive on.  Therefore,
	 * we keep reloading the module until we've loaded the module
	 * _and_ we have the lock on the VFS switch table.
	 */
	while (vswp == NULL || !VFS_INSTALLED(vswp)) {
		RUNLOCK_VFSSW();
		if (modload("fs", modname) == -1)
			return (NULL);
		RLOCK_VFSSW();
		/*
		 * Only look the name up again if we have no entry yet;
		 * give up if the load did not produce one.
		 */
		if (vswp == NULL)
			if ((vswp = vfs_getvfsswbyname(type)) == NULL)
				break;
	}
	RUNLOCK_VFSSW();

	return (vswp);
}
3495 
3496 /*
3497  * Find a vfssw entry given a file system type name.
3498  */
3499 struct vfssw *
3500 vfs_getvfsswbyname(char *type)
3501 {
3502 	struct vfssw *vswp;
3503 
3504 	ASSERT(VFSSW_LOCKED());
3505 	if (type == NULL || *type == '\0')
3506 		return (NULL);
3507 
3508 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3509 		if (strcmp(type, vswp->vsw_name) == 0) {
3510 			vfs_refvfssw(vswp);
3511 			return (vswp);
3512 		}
3513 	}
3514 
3515 	return (NULL);
3516 }
3517 
3518 /*
3519  * Find a vfssw entry given a set of vfsops.
3520  */
3521 struct vfssw *
3522 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3523 {
3524 	struct vfssw *vswp;
3525 
3526 	RLOCK_VFSSW();
3527 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3528 		if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3529 			vfs_refvfssw(vswp);
3530 			RUNLOCK_VFSSW();
3531 			return (vswp);
3532 		}
3533 	}
3534 	RUNLOCK_VFSSW();
3535 
3536 	return (NULL);
3537 }
3538 
/*
 * Reference a vfssw entry: increment its vsw_count while holding the
 * entry's vsw_lock.
 */
void
vfs_refvfssw(struct vfssw *vswp)
{

	mutex_enter(&vswp->vsw_lock);
	vswp->vsw_count++;
	mutex_exit(&vswp->vsw_lock);
}
3550 
/*
 * Unreference a vfssw entry: decrement its vsw_count while holding the
 * entry's vsw_lock.  Nothing is freed or torn down here, whatever the
 * resulting count.
 */
void
vfs_unrefvfssw(struct vfssw *vswp)
{

	mutex_enter(&vswp->vsw_lock);
	vswp->vsw_count--;
	mutex_exit(&vswp->vsw_lock);
}
3562 
/*
 * State used by vfs_syncall() and vfs_syncprogress().
 */
int sync_timeout = 30;		/* timeout for syncing a page during panic */
int sync_timeleft;		/* portion of sync_timeout remaining */

static int sync_retries = 20;	/* number of retries when not making progress */
static int sync_triesleft;	/* portion of sync_retries remaining */

/*
 * Dirty page and buffer counts used by vfs_syncall(): new_* hold the most
 * recent counts, old_* the lowest counts seen before the current pass.
 */
static pgcnt_t old_pgcnt, new_pgcnt;
static int new_bufcnt, old_bufcnt;
3571 
/*
 * Sync all of the mounted filesystems, and then wait for the actual i/o to
 * complete.  We wait by counting the number of dirty pages and buffers,
 * pushing them out using bio_busy() and page_busy(), and then counting again.
 * This routine is used during both the uadmin A_SHUTDOWN code as well as
 * the SYNC phase of the panic code (see comments in panic.c).  It should only
 * be used after some higher-level mechanism has quiesced the system so that
 * new writes are not being initiated while we are waiting for completion.
 *
 * To ensure finite running time, our algorithm uses two timeout mechanisms:
 * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
 * sync_triesleft (a progress counter used by the vfs_syncall() loop below).
 * Together these ensure that syncing completes if our i/o paths are stuck.
 * The counters are declared above so they can be found easily in the debugger.
 *
 * The sync_timeleft counter is reset by bio_busy() and page_busy() using the
 * vfs_syncprogress() subroutine whenever we make progress through the lists of
 * pages and buffers.  It is decremented and expired by the deadman() cyclic.
 * When vfs_syncall() decides it is done, we disable the deadman() counter by
 * setting sync_timeleft to zero.  This timer guards against vfs_syncall()
 * deadlocking or hanging inside of a broken filesystem or driver routine.
 *
 * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
 * sync_retries consecutive calls to bio_busy() and page_busy() without
 * decreasing either the number of dirty buffers or dirty pages below the
 * lowest count we have seen so far, we give up and return from vfs_syncall().
 *
 * Each loop iteration ends with a call to delay() one second to allow time for
 * i/o completion and to permit the user time to read our progress messages.
 */
void
vfs_syncall(void)
{
	if (rootdir == NULL && !modrootloaded)
		return; /* panic during boot - no filesystems yet */

	printf("syncing file systems...");
	vfs_syncprogress();
	sync();

	vfs_syncprogress();
	sync_triesleft = sync_retries;

	/*
	 * Start every count at its maximum so that any smaller count
	 * observed on the first pass registers as progress.
	 */
	old_bufcnt = new_bufcnt = INT_MAX;
	old_pgcnt = new_pgcnt = ULONG_MAX;

	while (sync_triesleft > 0) {
		/* old_* track the lowest counts seen on earlier passes. */
		old_bufcnt = MIN(old_bufcnt, new_bufcnt);
		old_pgcnt = MIN(old_pgcnt, new_pgcnt);

		new_bufcnt = bio_busy(B_TRUE);
		new_pgcnt = page_busy(B_TRUE);
		vfs_syncprogress();

		/* Nothing left outstanding: we are done. */
		if (new_bufcnt == 0 && new_pgcnt == 0)
			break;

		/*
		 * A new low in either count restores the full retry
		 * budget; otherwise this pass uses up one retry.
		 */
		if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
			sync_triesleft = sync_retries;
		else
			sync_triesleft--;

		/* Progress output: buffer counts are bracketed, pages bare. */
		if (new_bufcnt)
			printf(" [%d]", new_bufcnt);
		if (new_pgcnt)
			printf(" %lu", new_pgcnt);

		delay(hz);
	}

	if (new_bufcnt != 0 || new_pgcnt != 0)
		printf(" done (not all i/o completed)\n");
	else
		printf(" done\n");

	/* Zero disables the deadman() sync timer (see above). */
	sync_timeleft = 0;
	delay(hz);
}
3650 
/*
 * If we are in the middle of the sync phase of panic, reset sync_timeleft to
 * sync_timeout to indicate that we are making progress and the deadman()
 * omnipresent cyclic should not yet time us out.  Note that it is safe to
 * store to sync_timeleft here since the deadman() is firing at high-level
 * on top of us.  If we are racing with the deadman(), either the deadman()
 * will decrement the old value and then we will reset it, or we will
 * reset it and then the deadman() will immediately decrement it.  In either
 * case, correct behavior results.
 *
 * When panicstr is not set this routine does nothing.
 */
void
vfs_syncprogress(void)
{
	if (panicstr)
		sync_timeleft = sync_timeout;
}
3667 
3668 /*
3669  * Map VFS flags to statvfs flags.  These shouldn't really be separate
3670  * flags at all.
3671  */
3672 uint_t
3673 vf_to_stf(uint_t vf)
3674 {
3675 	uint_t stf = 0;
3676 
3677 	if (vf & VFS_RDONLY)
3678 		stf |= ST_RDONLY;
3679 	if (vf & VFS_NOSETUID)
3680 		stf |= ST_NOSUID;
3681 	if (vf & VFS_NOTRUNC)
3682 		stf |= ST_NOTRUNC;
3683 
3684 	return (stf);
3685 }
3686 
/*
 * Use old-style function prototype for vfsstray() so
 * that we can use it anywhere in the vfsops structure.
 * The empty parameter list leaves the argument types unspecified, so
 * one function can fill slots with differing signatures.
 */
int vfsstray();
3692 
/*
 * Entries for (illegal) fstype 0.
 *
 * Variant of vfsstray() with a full prototype, used by vfs_strayops.
 * Panics with "stray vfs operation"; the arguments are ignored and the
 * return statement only satisfies the int return type.
 */
/* ARGSUSED */
int
vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	cmn_err(CE_PANIC, "stray vfs operation");
	return (0);
}
3703 
/*
 * Operations vector for the dummy "stray" file system type; vfsinit()
 * installs it as vfssw[0].vsw_vfsops.  Every slot panics.  The fifth
 * slot uses vfsstray_sync() rather than vfsstray(); presumably that is
 * the sync operation, whose prototype has a short argument -- confirm
 * against the vfsops_t layout.
 */
vfsops_t vfs_strayops = {
	vfsstray,
	vfsstray,
	vfsstray,
	vfsstray,
	vfsstray_sync,
	vfsstray,
	vfsstray,
	vfsstray
};
3714 
/*
 * Entries for (illegal) fstype 0.
 *
 * Generic stray-operation handler: panics with "stray vfs operation".
 * The return statement only satisfies the int return type.
 */
int
vfsstray(void)
{
	cmn_err(CE_PANIC, "stray vfs operation");
	return (0);
}
3724 
/*
 * Support for dealing with forced UFS unmount and its interaction with
 * LOFS. Could be used by any filesystem.
 * See bug 1203132.
 *
 * Generic operation that does nothing and returns EIO; vfsinit() uses it
 * for all but the sync entry of the EIO_vfsops template.
 */
int
vfs_EIO(void)
{
	return (EIO);
}
3735 
/*
 * We've gotta define the op for sync separately, since the compiler gets
 * confused if we mix and match ANSI and normal style prototypes when
 * a "short" argument is present and spits out a warning.
 *
 * Like vfs_EIO(): ignores its arguments and returns EIO.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	return (EIO);
}
3747 
/*
 * The dummy EIO file system and its operations vector.  Both are set up
 * by vfsinit(): EIO_vfsops is built from a template whose operations all
 * return EIO, and EIO_vfs is initialized with it and flagged
 * VFS_UNMOUNTED.
 */
vfs_t EIO_vfs;
vfsops_t *EIO_vfsops;
3750 
/*
 * Called from startup() to initialize all loaded vfs's
 *
 * In order: creates the vnode cache, initializes the event monitor
 * framework, installs the stray operations for fstype 0, builds the
 * dummy EIO file system, runs the init routine of every vfssw entry
 * that has one, and finally starts vopstats.
 */
void
vfsinit(void)
{
	struct vfssw *vswp;
	int error;
	extern int vopstats_enabled;
	extern void vopstats_startup();

	/*
	 * Template for EIO_vfsops: every named operation returns EIO.
	 * Sync has its own function (vfs_EIO_sync) because of its short
	 * argument.
	 */
	static const fs_operation_def_t EIO_vfsops_template[] = {
		VFSNAME_MOUNT,		vfs_EIO,
		VFSNAME_UNMOUNT,	vfs_EIO,
		VFSNAME_ROOT,		vfs_EIO,
		VFSNAME_STATVFS,	vfs_EIO,
		VFSNAME_SYNC, (fs_generic_func_p) vfs_EIO_sync,
		VFSNAME_VGET,		vfs_EIO,
		VFSNAME_MOUNTROOT,	vfs_EIO,
		VFSNAME_FREEVFS,	vfs_EIO,
		VFSNAME_VNSTATE,	vfs_EIO,
		NULL, NULL
	};


	/* Initialize the vnode cache (file systems may use it during init). */

	vn_create_cache();

	/* Setup event monitor framework */

	fem_init();

	/* Initialize the dummy stray file system type. */

	vfssw[0].vsw_vfsops = vfs_strayops;

	/* Initialize the dummy EIO file system. */
	error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
	if (error != 0) {
		cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
		/* Shouldn't happen, but not bad enough to panic */
	}

	/* Note: this runs even if building EIO_vfsops failed above. */
	VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);

	/*
	 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
	 * on this vfs can immediately notice it's invalid.
	 */
	EIO_vfs.vfs_flag |= VFS_UNMOUNTED;

	/*
	 * Call the init routines of non-loadable filesystems only.
	 * Filesystems which are loaded as separate modules will be
	 * initialized by the module loading code instead.
	 */

	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
		/* The read lock is taken and dropped around each entry. */
		RLOCK_VFSSW();
		/* The init routine gets the entry's table index and name. */
		if (vswp->vsw_init != NULL)
			(*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
		RUNLOCK_VFSSW();
	}

	vopstats_startup();

	if (vopstats_enabled) {
		/* EIO_vfs can collect stats, but we don't retrieve them */
		initialize_vopstats(&EIO_vfs.vfs_vopstats);
		EIO_vfs.vfs_fstypevsp = NULL;
		EIO_vfs.vfs_vskap = NULL;
		EIO_vfs.vfs_flag |= VFS_STATS;
	}
}
3826 
/*
 * Increments the vfs reference count by one atomically.
 *
 * The assertion checks that the count is non-zero after the increment.
 */
void
vfs_hold(vfs_t *vfsp)
{
	atomic_add_32(&vfsp->vfs_count, 1);
	ASSERT(vfsp->vfs_count != 0);
}
3836 
3837 /*
3838  * Decrements the vfs reference count by one atomically. When
3839  * vfs reference count becomes zero, it calls the file system
3840  * specific vfs_freevfs() to free up the resources.
3841  */
3842 void
3843 vfs_rele(vfs_t *vfsp)
3844 {
3845 	ASSERT(vfsp->vfs_count != 0);
3846 	if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) {
3847 		VFS_FREEVFS(vfsp);
3848 		if (vfsp->vfs_zone)
3849 			zone_rele(vfsp->vfs_zone);
3850 		vfs_freemnttab(vfsp);
3851 		if (vfsp->vfs_implp)
3852 			vfsimpl_teardown(vfsp);
3853 		sema_destroy(&vfsp->vfs_reflock);
3854 		kmem_free(vfsp, sizeof (*vfsp));
3855 	}
3856 }
3857 
3858 /*
3859  * Generic operations vector support.
3860  *
3861  * This is used to build operations vectors for both the vfs and vnode.
3862  * It's normally called only when a file system is loaded.
3863  *
3864  * There are many possible algorithms for this, including the following:
3865  *
3866  *   (1) scan the list of known operations; for each, see if the file system
3867  *       includes an entry for it, and fill it in as appropriate.
3868  *
3869  *   (2) set up defaults for all known operations.  scan the list of ops
3870  *       supplied by the file system; for each which is both supplied and
3871  *       known, fill it in.
3872  *
3873  *   (3) sort the lists of known ops & supplied ops; scan the list, filling
3874  *       in entries as we go.
3875  *
3876  * we choose (1) for simplicity, and because performance isn't critical here.
3877  * note that (2) could be sped up using a precomputed hash table on known ops.
3878  * (3) could be faster than either, but only if the lists were very large or
3879  * supplied in sorted order.
3880  *
3881  */
3882 
3883 int
3884 fs_build_vector(void *vector, int *unused_ops,
3885     const fs_operation_trans_def_t *translation,
3886     const fs_operation_def_t *operations)
3887 {
3888 	int i, num_trans, num_ops, used;
3889 
3890 	/* Count the number of translations and the number of supplied */
3891 	/* operations. */
3892 
3893 	{
3894 		const fs_operation_trans_def_t *p;
3895 
3896 		for (num_trans = 0, p = translation;
3897 		    p->name != NULL;
3898 		    num_trans++, p++)
3899 			;
3900 	}
3901 
3902 	{
3903 		const fs_operation_def_t *p;
3904 
3905 		for (num_ops = 0, p = operations;
3906 		    p->name != NULL;
3907 		    num_ops++, p++)
3908 			;
3909 	}
3910 
3911 	/* Walk through each operation known to our caller.  There will be */
3912 	/* one entry in the supplied "translation table" for each. */
3913 
3914 	used = 0;
3915 
3916 	for (i = 0; i < num_trans; i++) {
3917 		int j, found;
3918 		char *curname;
3919 		fs_generic_func_p result;
3920 		fs_generic_func_p *location;
3921 
3922 		curname = translation[i].name;
3923 
3924 		/* Look for a matching operation in the list supplied by the */
3925 		/* file system. */
3926 
3927 		found = 0;
3928 
3929 		for (j = 0; j < num_ops; j++) {
3930 			if (strcmp(operations[j].name, curname) == 0) {
3931 				used++;
3932 				found = 1;
3933 				break;
3934 			}
3935 		}
3936 
3937 		/* If the file system is using a "placeholder" for default */
3938 		/* or error functions, grab the appropriate function out of */
3939 		/* the translation table.  If the file system didn't supply */
3940 		/* this operation at all, use the default function. */
3941 
3942 		if (found) {
3943 			result = operations[j].func;
3944 			if (result == fs_default) {
3945 				result = translation[i].defaultFunc;
3946 			} else if (result == fs_error) {
3947 				result = translation[i].errorFunc;
3948 			} else if (result == NULL) {
3949 				/* Null values are PROHIBITED */
3950 				return (EINVAL);
3951 			}
3952 		} else {
3953 			result = translation[i].defaultFunc;
3954 		}
3955 
3956 		/* Now store the function into the operations vector. */
3957 
3958 		location = (fs_generic_func_p *)
3959 		    (((char *)vector) + translation[i].offset);
3960 
3961 		*location = result;
3962 	}
3963 
3964 	*unused_ops = num_ops - used;
3965 
3966 	return (0);
3967 }
3968 
/* Placeholder functions, should never be called. */

/*
 * fs_build_vector() compares a supplied function against fs_error and,
 * on a match, substitutes the operation's error function.  Calling
 * fs_error itself panics.
 */
int
fs_error(void)
{
	cmn_err(CE_PANIC, "fs_error called");
	return (0);
}
3977 
/*
 * Placeholder: fs_build_vector() compares a supplied function against
 * fs_default and, on a match, substitutes the operation's default
 * function.  Calling fs_default itself panics.
 */
int
fs_default(void)
{
	cmn_err(CE_PANIC, "fs_default called");
	return (0);
}
3984 
3985 #ifdef __sparc
3986 
/*
 * Part of the implementation of booting off a mirrored root
 * involves a change of dev_t for the root device.  To
 * accomplish this, first remove the existing hash table
 * entry for the root device, convert to the new dev_t,
 * then re-insert in the hash table at the head of the list.
 *
 * vfsp is the vfs to re-key, ndev its new device, and fstype the file
 * system type used to build the new fsid.  The whole sequence runs with
 * the vfs list lock held.
 */
void
vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
{
	vfs_list_lock();

	/* Remove the entry while it is still keyed by the old dev_t. */
	vfs_hash_remove(vfsp);

	vfsp->vfs_dev = ndev;
	vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);

	vfs_hash_add(vfsp, 1);

	vfs_list_unlock();
}
4008 
4009 #else /* x86 NEWBOOT */
4010 
4011 int
4012 rootconf()
4013 {
4014 	int error;
4015 	struct vfssw *vsw;
4016 	extern void pm_init();
4017 	char *fstyp;
4018 
4019 	fstyp = getrootfs();
4020 
4021 	if (error = clboot_rootconf())
4022 		return (error);
4023 
4024 	if (modload("fs", fstyp) == -1)
4025 		cmn_err(CE_PANIC, "Cannot _init %s module\n", fstyp);
4026 
4027 	RLOCK_VFSSW();
4028 	vsw = vfs_getvfsswbyname(fstyp);
4029 	RUNLOCK_VFSSW();
4030 	VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4031 	VFS_HOLD(rootvfs);
4032 
4033 	/* always mount readonly first */
4034 	rootvfs->vfs_flag |= VFS_RDONLY;
4035 
4036 	pm_init();
4037 
4038 	if (netboot)
4039 		(void) strplumb();
4040 
4041 	error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4042 	vfs_unrefvfssw(vsw);
4043 	rootdev = rootvfs->vfs_dev;
4044 
4045 	if (error)
4046 		cmn_err(CE_PANIC, "cannot mount root path %s", svm_bootpath);
4047 	return (error);
4048 }
4049 
4050 /*
4051  * XXX this is called by nfs only and should probably be removed
4052  * If booted with ASKNAME, prompt on the console for a filesystem
4053  * name and return it.
4054  */
4055 void
4056 getfsname(char *askfor, char *name, size_t namelen)
4057 {
4058 	if (boothowto & RB_ASKNAME) {
4059 		printf("%s name: ", askfor);
4060 		console_gets(name, namelen);
4061 	}
4062 }
4063 
4064 /*
4065  * If server_path exists, then we are booting a diskless
4066  * client. Otherwise, we default to ufs. Zfs should perhaps be
4067  * another property.
4068  */
4069 static char *
4070 getrootfs(void)
4071 {
4072 	extern char *strplumb_get_netdev_path(void);
4073 	char *propstr = NULL;
4074 
4075 	/* check fstype property; it should be nfsdyn for diskless */
4076 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4077 	    DDI_PROP_DONTPASS, "fstype", &propstr)
4078 	    == DDI_SUCCESS) {
4079 		(void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4080 		ddi_prop_free(propstr);
4081 	}
4082 
4083 	if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0)
4084 		return (rootfs.bo_fstype);
4085 
4086 	++netboot;
4087 	/* check if path to network interface is specified in bootpath */
4088 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4089 	    DDI_PROP_DONTPASS, "bootpath", &propstr)
4090 	    == DDI_SUCCESS) {
4091 		(void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4092 		ddi_prop_free(propstr);
4093 	} else {
4094 		/* attempt to determine netdev_path via boot_mac address */
4095 		netdev_path = strplumb_get_netdev_path();
4096 		if (netdev_path == NULL)
4097 			cmn_err(CE_PANIC,
4098 			    "Cannot find boot network interface\n");
4099 		(void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME);
4100 	}
4101 	return ("nfs");
4102 }
4103 #endif
4104