xref: /titanic_50/usr/src/uts/common/fs/vfs.c (revision 8696d418011068e5cedf3a229f7a6613e7798e92)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 #include <sys/types.h>
43 #include <sys/t_lock.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/user.h>
47 #include <sys/fstyp.h>
48 #include <sys/kmem.h>
49 #include <sys/systm.h>
50 #include <sys/proc.h>
51 #include <sys/mount.h>
52 #include <sys/vfs.h>
53 #include <sys/vfs_opreg.h>
54 #include <sys/fem.h>
55 #include <sys/mntent.h>
56 #include <sys/stat.h>
57 #include <sys/statvfs.h>
58 #include <sys/statfs.h>
59 #include <sys/cred.h>
60 #include <sys/vnode.h>
61 #include <sys/rwstlock.h>
62 #include <sys/dnlc.h>
63 #include <sys/file.h>
64 #include <sys/time.h>
65 #include <sys/atomic.h>
66 #include <sys/cmn_err.h>
67 #include <sys/buf.h>
68 #include <sys/swap.h>
69 #include <sys/debug.h>
70 #include <sys/vnode.h>
71 #include <sys/modctl.h>
72 #include <sys/ddi.h>
73 #include <sys/pathname.h>
74 #include <sys/bootconf.h>
75 #include <sys/dumphdr.h>
76 #include <sys/dc_ki.h>
77 #include <sys/poll.h>
78 #include <sys/sunddi.h>
79 #include <sys/sysmacros.h>
80 #include <sys/zone.h>
81 #include <sys/policy.h>
82 #include <sys/ctfs.h>
83 #include <sys/objfs.h>
84 #include <sys/console.h>
85 #include <sys/reboot.h>
86 
87 #include <vm/page.h>
88 
89 #include <fs/fs_subr.h>
90 
/* Private interfaces to create vopstats-related data structures */
extern void		initialize_vopstats(vopstats_t *);
extern vopstats_t	*get_fstype_vopstats(struct vfs *, struct vfssw *);
extern vsk_anchor_t	*get_vskstat_anchor(struct vfs *);

/*
 * Forward declarations of file-local (static) helpers, so that they can
 * be called before the point in this file where they are defined.
 */
static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
static void vfs_setmntopt_nolock(mntopts_t *, const char *,
    const char *, int, int);
static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
static void vfs_freemnttab(struct vfs *);
static void vfs_freeopt(mntopt_t *);
static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
static void vfs_createopttbl_extend(mntopts_t *, const char *,
    const mntopts_t *);
static char **vfs_copycancelopt_extend(char **const, int);
static void vfs_freecancelopt(char **);
static void getrootfs(char **, char **);
static int getmacpath(dev_info_t *, void *);
static void vfs_mnttabvp_setup(void);
112 
/*
 * Singly linked list node that pairs a device number with a vfs.
 * NOTE(review): "ipmnt" presumably stands for "in-progress mount"; the
 * code that walks this list is outside this view -- confirm there.
 */
struct ipmnt {
	struct ipmnt	*mip_next;	/* next node in the list */
	dev_t		mip_dev;	/* device number of this entry */
	struct vfs	*mip_vfsp;	/* vfs recorded for mip_dev */
};

/*
 * List of ipmnt nodes: first node, last node, and a mutex that presumably
 * guards both pointers (TODO confirm against the list users).
 */
static kmutex_t		vfs_miplist_mutex;
static struct ipmnt	*vfs_miplist = NULL;
static struct ipmnt	*vfs_miplist_end = NULL;
122 
/*
 * VFS global data.
 */
vnode_t *rootdir;		/* pointer to root inode vnode. */
vnode_t *devicesdir;		/* pointer to inode of devices root */
vnode_t	*devdir;		/* pointer to inode of dev root */

char *server_rootpath;		/* root path for diskless clients */
char *server_hostname;		/* hostname of diskless server */

/*
 * Statically allocated vfs structures.  "root" backs rootvfs below;
 * "devices" and "dev" are initialized and mounted by vfs_mountdevices()
 * and vfs_mountdev1() respectively.
 */
static struct vfs root;
static struct vfs devices;
static struct vfs dev;
struct vfs *rootvfs = &root;	/* pointer to root vfs; head of VFS list. */
rvfs_t *rvfs_list;		/* array of vfs ptrs for vfs hash list */
int vfshsz = 512;		/* # of heads/locks in vfs hash arrays */
				/* must be power of 2!	*/
timespec_t vfs_mnttab_ctime;	/* mnttab created time */
timespec_t vfs_mnttab_mtime;	/* mnttab last modified time */
char *vfs_dummyfstype = "\0";	/* empty placeholder fstype string */
struct pollhead vfs_pollhd;	/* for mnttab pollers */
struct vnode *vfs_mntdummyvp;	/* to fake mnttab read/write for file events */
int	mntfstype;		/* will be set once mnt fs is mounted */
146 
147 /*
148  * Table for generic options recognized in the VFS layer and acted
149  * on at this level before parsing file system specific options.
150  * The nosuid option is stronger than any of the devices and setuid
151  * options, so those are canceled when nosuid is seen.
152  *
153  * All options which are added here need to be added to the
154  * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
155  */
/*
 * VFS Mount options table
 *
 * Each <option>_cancel array is a NULL-terminated list of option names;
 * it is plugged into the "cancel options" column of the mntopts[] entry
 * for <option>, i.e. it names the options that are canceled when
 * <option> is seen.
 */
static char *ro_cancel[] = { MNTOPT_RW, NULL };
static char *rw_cancel[] = { MNTOPT_RO, NULL };
static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
    MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };

/*
 * Generic option prototypes.  Besides the four columns named below, every
 * entry carries a trailing (void *)0 initializer -- presumably a
 * per-option private data pointer; TODO confirm against mntopt_t.
 */
static const mntopt_t mntopts[] = {
/*
 *	option name		cancel options		default arg	flags
 */
	{ MNTOPT_REMOUNT,	NULL,			NULL,
		MO_NODISPLAY, (void *)0 },
	{ MNTOPT_RO,		ro_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_RW,		rw_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_SUID,		suid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOSUID,	nosuid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_DEVICES,	devices_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NODEVICES,	nodevices_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_SETUID,	setuid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOSETUID,	nosetuid_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_NBMAND,	nbmand_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NONBMAND,	nonbmand_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_EXEC,		exec_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOEXEC,	noexec_cancel,		NULL,		0,
		(void *)0 },
};

/*
 * Exported descriptor of the generic option table: the number of entries
 * in mntopts[] followed by a pointer to its first entry.  The cast drops
 * the const qualifier that mntopts[] is declared with.
 */
const mntopts_t vfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	(mntopt_t *)&mntopts[0]
};
209 
210 /*
211  * File system operation dispatch functions.
212  */
213 
214 int
215 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
216 {
217 	return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
218 }
219 
220 int
221 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
222 {
223 	return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
224 }
225 
226 int
227 fsop_root(vfs_t *vfsp, vnode_t **vpp)
228 {
229 	refstr_t *mntpt;
230 	int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
231 	/*
232 	 * Make sure this root has a path.  With lofs, it is possible to have
233 	 * a NULL mountpoint.
234 	 */
235 	if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
236 		mntpt = vfs_getmntpoint(vfsp);
237 		vn_setpath_str(*vpp, refstr_value(mntpt),
238 		    strlen(refstr_value(mntpt)));
239 		refstr_rele(mntpt);
240 	}
241 
242 	return (ret);
243 }
244 
245 int
246 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
247 {
248 	return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
249 }
250 
251 int
252 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
253 {
254 	return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
255 }
256 
257 int
258 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
259 {
260 	return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
261 }
262 
263 int
264 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
265 {
266 	return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
267 }
268 
269 void
270 fsop_freefs(vfs_t *vfsp)
271 {
272 	(*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
273 }
274 
275 int
276 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
277 {
278 	return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
279 }
280 
281 int
282 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
283 {
284 	ASSERT((fstype >= 0) && (fstype < nfstype));
285 
286 	if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
287 		return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
288 	else
289 		return (ENOTSUP);
290 }
291 
292 /*
293  * File system initialization.  vfs_setfsops() must be called from a file
294  * system's init routine.
295  */
296 
297 static int
298 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
299     int *unused_ops)
300 {
301 	static const fs_operation_trans_def_t vfs_ops_table[] = {
302 		VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
303 			fs_nosys, fs_nosys,
304 
305 		VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
306 			fs_nosys, fs_nosys,
307 
308 		VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
309 			fs_nosys, fs_nosys,
310 
311 		VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
312 			fs_nosys, fs_nosys,
313 
314 		VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
315 			(fs_generic_func_p) fs_sync,
316 			(fs_generic_func_p) fs_sync,	/* No errors allowed */
317 
318 		VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
319 			fs_nosys, fs_nosys,
320 
321 		VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
322 			fs_nosys, fs_nosys,
323 
324 		VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
325 			(fs_generic_func_p)fs_freevfs,
326 			(fs_generic_func_p)fs_freevfs,	/* Shouldn't fail */
327 
328 		VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
329 			(fs_generic_func_p)fs_nosys,
330 			(fs_generic_func_p)fs_nosys,
331 
332 		NULL, 0, NULL, NULL
333 	};
334 
335 	return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
336 }
337 
338 int
339 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
340 {
341 	int error;
342 	int unused_ops;
343 
344 	/*
345 	 * Verify that fstype refers to a valid fs.  Note that
346 	 * 0 is valid since it's used to set "stray" ops.
347 	 */
348 	if ((fstype < 0) || (fstype >= nfstype))
349 		return (EINVAL);
350 
351 	if (!ALLOCATED_VFSSW(&vfssw[fstype]))
352 		return (EINVAL);
353 
354 	/* Set up the operations vector. */
355 
356 	error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
357 
358 	if (error != 0)
359 		return (error);
360 
361 	vfssw[fstype].vsw_flag |= VSW_INSTALLED;
362 
363 	if (actual != NULL)
364 		*actual = &vfssw[fstype].vsw_vfsops;
365 
366 #if DEBUG
367 	if (unused_ops != 0)
368 		cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
369 		    "but not used", vfssw[fstype].vsw_name, unused_ops);
370 #endif
371 
372 	return (0);
373 }
374 
375 int
376 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
377 {
378 	int error;
379 	int unused_ops;
380 
381 	*actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
382 
383 	error = fs_copyfsops(template, *actual, &unused_ops);
384 	if (error != 0) {
385 		kmem_free(*actual, sizeof (vfsops_t));
386 		*actual = NULL;
387 		return (error);
388 	}
389 
390 	return (0);
391 }
392 
393 /*
394  * Free a vfsops structure created as a result of vfs_makefsops().
395  * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
396  * vfs_freevfsops_by_type().
397  */
398 void
399 vfs_freevfsops(vfsops_t *vfsops)
400 {
401 	kmem_free(vfsops, sizeof (vfsops_t));
402 }
403 
404 /*
405  * Since the vfsops structure is part of the vfssw table and wasn't
406  * really allocated, we're not really freeing anything.  We keep
407  * the name for consistency with vfs_freevfsops().  We do, however,
408  * need to take care of a little bookkeeping.
409  * NOTE: For a vfsops structure created by vfs_setfsops(), use
410  * vfs_freevfsops_by_type().
411  */
412 int
413 vfs_freevfsops_by_type(int fstype)
414 {
415 
416 	/* Verify that fstype refers to a loaded fs (and not fsid 0). */
417 	if ((fstype <= 0) || (fstype >= nfstype))
418 		return (EINVAL);
419 
420 	WLOCK_VFSSW();
421 	if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
422 		WUNLOCK_VFSSW();
423 		return (EINVAL);
424 	}
425 
426 	vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
427 	WUNLOCK_VFSSW();
428 
429 	return (0);
430 }
431 
432 /* Support routines used to reference vfs_op */
433 
/*
 * Set the operations vector for a vfs.
 *
 * vfsp   - vfs whose vector is replaced; must not be NULL (asserted)
 * vfsops - new operations vector; must not be NULL (asserted)
 *
 * When the vfs has no vfs_impl_t or a NULL vfs_femhead, the vector is
 * replaced directly with a compare-and-swap against the value read on
 * entry.  In every other case -- including a compare-and-swap that does
 * not return that value -- the update is handed to fsem_setvfsops().
 */
void
vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
{
	vfsops_t	*op;

	ASSERT(vfsp != NULL);
	ASSERT(vfsops != NULL);

	/* Snapshot the current vector before examining the fem state. */
	op = vfsp->vfs_op;
	membar_consumer();
	/*
	 * vfs_implp is tested before vfs_femhead, presumably because
	 * vfs_femhead is a macro that reaches through vfs_implp.
	 * casptr() handing back the snapshot means the swap took place.
	 */
	if ((vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) &&
	    casptr(&vfsp->vfs_op, op, vfsops) == op) {
		return;
	}
	fsem_setvfsops(vfsp, vfsops);
}
451 
/*
 * Retrieve the operations vector for a vfs.
 *
 * vfsp - vfs to query; must not be NULL (asserted)
 *
 * Returns vfsp->vfs_op directly when the vfs has no vfs_impl_t or a NULL
 * vfs_femhead and vfs_op still holds the value read on entry; otherwise
 * returns the result of fsem_getvfsops().
 */
vfsops_t *
vfs_getops(vfs_t *vfsp)
{
	vfsops_t	*op;

	ASSERT(vfsp != NULL);

	/* Snapshot the vector, then re-check it after the fem test below. */
	op = vfsp->vfs_op;
	membar_consumer();
	if ((vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) &&
	    op == vfsp->vfs_op) {
		return (op);
	} else {
		return (fsem_getvfsops(vfsp));
	}
}
469 
470 /*
471  * Returns non-zero (1) if the vfsops matches that of the vfs.
472  * Returns zero (0) if not.
473  */
474 int
475 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
476 {
477 	return (vfs_getops(vfsp) == vfsops);
478 }
479 
480 /*
481  * Returns non-zero (1) if the file system has installed a non-default,
482  * non-error vfs_sync routine.  Returns zero (0) otherwise.
483  */
484 int
485 vfs_can_sync(vfs_t *vfsp)
486 {
487 	/* vfs_sync() routine is not the default/error function */
488 	return (vfs_getops(vfsp)->vfs_sync != fs_sync);
489 }
490 
491 /*
492  * Initialize a vfs structure.
493  */
494 void
495 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
496 {
497 	vfsp->vfs_count = 0;
498 	vfsp->vfs_next = vfsp;
499 	vfsp->vfs_prev = vfsp;
500 	vfsp->vfs_zone_next = vfsp;
501 	vfsp->vfs_zone_prev = vfsp;
502 	vfsp->vfs_flag = 0;
503 	vfsp->vfs_data = (data);
504 	vfsp->vfs_resource = NULL;
505 	vfsp->vfs_mntpt = NULL;
506 	vfsp->vfs_mntopts.mo_count = 0;
507 	vfsp->vfs_mntopts.mo_list = NULL;
508 	vfsp->vfs_implp = NULL;
509 	vfsp->vfs_zone = NULL;
510 	/*
511 	 * Note: Don't initialize any member of the vfs_impl_t structure
512 	 * here as it could be a problem for unbundled file systems.
513 	 */
514 	vfs_setops((vfsp), (op));
515 	sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
516 }
517 
518 /*
519  * Allocate and initialize the vfs implementation private data
520  * structure, vfs_impl_t.
521  */
522 void
523 vfsimpl_setup(vfs_t *vfsp)
524 {
525 	vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
526 	/* Note that this are #define'd in vfs.h */
527 	vfsp->vfs_femhead = NULL;
528 	vfsp->vfs_vskap = NULL;
529 	vfsp->vfs_fstypevsp = NULL;
530 }
531 
532 /*
533  * Release the vfs_impl_t structure, if it exists. Some unbundled
534  * filesystems may not use the newer version of vfs and thus
535  * would not contain this implementation private data structure.
536  */
537 void
538 vfsimpl_teardown(vfs_t *vfsp)
539 {
540 	vfs_impl_t	*vip = vfsp->vfs_implp;
541 
542 	if (vip == NULL)
543 		return;
544 
545 	if (vip->vi_femhead) {
546 		ASSERT(vip->vi_femhead->femh_list == NULL);
547 		mutex_destroy(&vip->vi_femhead->femh_lock);
548 		kmem_free(vip->vi_femhead, sizeof (*(vip->vi_femhead)));
549 		vip->vi_femhead = NULL;
550 	}
551 
552 	kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
553 	vfsp->vfs_implp = NULL;
554 }
555 
556 /*
557  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
558  * fstatvfs, and sysfs moved to common/syscall.
559  */
560 
561 /*
562  * Update every mounted file system.  We call the vfs_sync operation of
563  * each file system type, passing it a NULL vfsp to indicate that all
564  * mounted file systems of that type should be updated.
565  */
566 void
567 vfs_sync(int flag)
568 {
569 	struct vfssw *vswp;
570 	RLOCK_VFSSW();
571 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
572 		if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
573 			vfs_refvfssw(vswp);
574 			RUNLOCK_VFSSW();
575 			(void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
576 			    CRED());
577 			vfs_unrefvfssw(vswp);
578 			RLOCK_VFSSW();
579 		}
580 	}
581 	RUNLOCK_VFSSW();
582 }
583 
/*
 * Convenience wrapper: update every mounted file system by calling
 * vfs_sync() with a flag value of 0.
 */
void
sync(void)
{
	vfs_sync(0);
}
589 
590 /*
591  * External routines.
592  */
593 
594 krwlock_t vfssw_lock;	/* lock accesses to vfssw */
595 
596 /*
597  * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
598  * but otherwise should be accessed only via vfs_list_lock() and
599  * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
600  */
601 static krwlock_t vfslist;
602 
/*
 * Mount devfs on /devices. This is done right after root is mounted
 * to provide device access support for the system
 *
 * Failing to load devfs, to find /devices, to mount it or to obtain its
 * root vnode panics the system.  Failing to take either lock needed for
 * vfs_add() only logs a CE_NOTE and returns without adding the vfs to
 * the vfs list.
 */
static void
vfs_mountdevices(void)
{
	struct vfssw *vsw;
	struct vnode *mvp;
	struct mounta mounta = {	/* fake mounta for devfs_mount() */
		NULL,
		NULL,
		MS_SYSSPACE,
		NULL,
		NULL,
		0,
		NULL,
		0
	};

	/*
	 * _init devfs module to fill in the vfssw
	 */
	if (modload("fs", "devfs") == -1)
		panic("Cannot _init devfs module");

	/*
	 * Hold vfs.  The vfssw read lock taken here is kept across the
	 * mount below.
	 * NOTE(review): vsw is used without a NULL check; this relies on
	 * the modload() above having registered "devfs" -- confirm.
	 */
	RLOCK_VFSSW();
	vsw = vfs_getvfsswbyname("devfs");
	VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
	VFS_HOLD(&devices);

	/*
	 * Locate mount point
	 */
	if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
		panic("Cannot find /devices");

	/*
	 * Perform the mount of /devices
	 */
	if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
		panic("Cannot mount /devices");

	RUNLOCK_VFSSW();

	/*
	 * Set appropriate members and add to vfs list for mnttab display
	 */
	vfs_setresource(&devices, "/devices");
	vfs_setmntpoint(&devices, "/devices");

	/*
	 * Hold the root of /devices so it won't go away
	 */
	if (VFS_ROOT(&devices, &devicesdir))
		panic("vfs_mountdevices: not devices root");

	/*
	 * NOTE(review): neither early return below releases mvp, the vnode
	 * obtained from lookupname() -- confirm whether that is intended.
	 */
	if (vfs_lock(&devices) != 0) {
		VN_RELE(devicesdir);
		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
		return;
	}

	if (vn_vfswlock(mvp) != 0) {
		vfs_unlock(&devices);
		VN_RELE(devicesdir);
		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
		return;
	}

	/* Both locks are held: link the vfs in, then unwind in reverse. */
	vfs_add(mvp, &devices, 0);
	vn_vfsunlock(mvp);
	vfs_unlock(&devices);
	VN_RELE(devicesdir);
}
681 
/*
 * mount the first instance of /dev  to root and remain mounted
 *
 * Failing to load the dev module, to find /dev, to mount it or to obtain
 * its root vnode panics the system (via cmn_err(CE_PANIC, ...)).  Failing
 * to take either lock needed for vfs_add() only logs a CE_NOTE and
 * returns without adding the vfs to the vfs list.
 */
static void
vfs_mountdev1(void)
{
	struct vfssw *vsw;
	struct vnode *mvp;
	struct mounta mounta = {	/* fake mounta for sdev_mount() */
		NULL,
		NULL,
		MS_SYSSPACE | MS_OVERLAY,
		NULL,
		NULL,
		0,
		NULL,
		0
	};

	/*
	 * _init dev module to fill in the vfssw
	 */
	if (modload("fs", "dev") == -1)
		cmn_err(CE_PANIC, "Cannot _init dev module\n");

	/*
	 * Hold vfs.  The vfssw read lock taken here is kept across the
	 * mount below.
	 * NOTE(review): vsw is used without a NULL check; this relies on
	 * the modload() above having registered "dev" -- confirm.
	 */
	RLOCK_VFSSW();
	vsw = vfs_getvfsswbyname("dev");
	VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
	VFS_HOLD(&dev);

	/*
	 * Locate mount point
	 */
	if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
		cmn_err(CE_PANIC, "Cannot find /dev\n");

	/*
	 * Perform the mount of /dev
	 */
	if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
		cmn_err(CE_PANIC, "Cannot mount /dev 1\n");

	RUNLOCK_VFSSW();

	/*
	 * Set appropriate members and add to vfs list for mnttab display
	 */
	vfs_setresource(&dev, "/dev");
	vfs_setmntpoint(&dev, "/dev");

	/*
	 * Hold the root of /dev so it won't go away
	 */
	if (VFS_ROOT(&dev, &devdir))
		cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");

	/*
	 * NOTE(review): neither early return below releases mvp, the vnode
	 * obtained from lookupname() -- confirm whether that is intended.
	 */
	if (vfs_lock(&dev) != 0) {
		VN_RELE(devdir);
		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
		return;
	}

	if (vn_vfswlock(mvp) != 0) {
		vfs_unlock(&dev);
		VN_RELE(devdir);
		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
		return;
	}

	/* Both locks are held: link the vfs in, then unwind in reverse. */
	vfs_add(mvp, &dev, 0);
	vn_vfsunlock(mvp);
	vfs_unlock(&dev);
	VN_RELE(devdir);
}
759 
760 /*
761  * Mount required filesystem. This is done right after root is mounted.
762  */
763 static void
764 vfs_mountfs(char *module, char *spec, char *path)
765 {
766 	struct vnode *mvp;
767 	struct mounta mounta;
768 	vfs_t *vfsp;
769 
770 	mounta.flags = MS_SYSSPACE | MS_DATA;
771 	mounta.fstype = module;
772 	mounta.spec = spec;
773 	mounta.dir = path;
774 	if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
775 		cmn_err(CE_WARN, "Cannot find %s", path);
776 		return;
777 	}
778 	if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
779 		cmn_err(CE_WARN, "Cannot mount %s", path);
780 	else
781 		VFS_RELE(vfsp);
782 	VN_RELE(mvp);
783 }
784 
/*
 * vfs_mountroot is called by main() to mount the root filesystem.
 *
 * In order, it: initializes the vfssw and vfs list locks, allocates the
 * vfs hash array, calls rootconf(), publishes the root vnode (rootdir,
 * the current process' directory and the global zone's root), records
 * the root resource for mnttab, sets up vopstats for the root file
 * system when its type asks for statistics, mounts the standard pseudo
 * file systems, and finally looks up the root device under /devices.
 *
 * A failing rootconf() or VFS_ROOT() panics the system.
 */
void
vfs_mountroot(void)
{
	struct vnode	*rvp = NULL;	/* root device vnode; never released */
	char		*path;		/* "/devices" + devfs path of root */
	size_t		plen;		/* strlen("/devices") */
	struct vfssw	*vswp;		/* vfssw entry of the root fs type */

	rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&vfslist, NULL, RW_DEFAULT, NULL);

	/*
	 * Alloc the vfs hash bucket array and locks
	 */
	rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);

	/*
	 * Call machine-dependent routine "rootconf" to choose a root
	 * file system type.
	 */
	if (rootconf())
		panic("vfs_mountroot: cannot mount root");
	/*
	 * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
	 * to point to it.  These are used by lookuppn() so that it
	 * knows where to start from ('/' or '.').
	 */
	vfs_setmntpoint(rootvfs, "/");
	if (VFS_ROOT(rootvfs, &rootdir))
		panic("vfs_mountroot: no root vnode");
	PTOU(curproc)->u_cdir = rootdir;
	VN_HOLD(PTOU(curproc)->u_cdir);
	PTOU(curproc)->u_rdir = NULL;

	/*
	 * Setup the global zone's rootvp, now that it exists.
	 */
	global_zone->zone_rootvp = rootdir;
	VN_HOLD(global_zone->zone_rootvp);

	/*
	 * Notify the module code that it can begin using the
	 * root filesystem instead of the boot program's services.
	 */
	modrootloaded = 1;
	/*
	 * Set up mnttab information for root
	 */
	vfs_setresource(rootvfs, rootfs.bo_name);

	/*
	 * Notify cluster software that the root filesystem is available.
	 */
	clboot_mountroot();

	/* Now that we're all done with the root FS, set up its vopstats */
	if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
		/* Set flag for statistics collection */
		if (vswp->vsw_flag & VSW_STATS) {
			initialize_vopstats(&rootvfs->vfs_vopstats);
			rootvfs->vfs_flag |= VFS_STATS;
			rootvfs->vfs_fstypevsp =
			    get_fstype_vopstats(rootvfs, vswp);
			rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
		}
		/* Drop the reference taken by the lookup above. */
		vfs_unrefvfssw(vswp);
	}

	/*
	 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
	 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
	 */
	vfs_mountdevices();
	vfs_mountdev1();

	vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
	vfs_mountfs("proc", "/proc", "/proc");
	vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
	vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
	vfs_mountfs("objfs", "objfs", OBJFS_ROOT);

	/* sharefs is only mounted when running in the global zone. */
	if (getzoneid() == GLOBAL_ZONEID) {
		vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
	}

#ifdef __sparc
	/*
	 * This bit of magic can go away when we convert sparc to
	 * the new boot architecture based on ramdisk.
	 *
	 * Booting off a mirrored root volume:
	 * At this point, we have booted and mounted root on a
	 * single component of the mirror.  Complete the boot
	 * by configuring SVM and converting the root to the
	 * dev_t of the mirrored root device.  This dev_t conversion
	 * only works because the underlying device doesn't change.
	 */
	if (root_is_svm) {
		if (svm_rootconf()) {
			panic("vfs_mountroot: cannot remount root");
		}

		/*
		 * mnttab should reflect the new root device
		 */
		vfs_lock_wait(rootvfs);
		vfs_setresource(rootvfs, rootfs.bo_name);
		vfs_unlock(rootvfs);
	}
#endif /* __sparc */

	/*
	 * Look up the root device via devfs so that a dv_node is
	 * created for it. The vnode is never VN_RELE()ed.
	 * We allocate more than MAXPATHLEN so that the
	 * buffer passed to i_ddi_prompath_to_devfspath() is
	 * exactly MAXPATHLEN (the function expects a buffer
	 * of that length).
	 */
	plen = strlen("/devices");
	path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
	(void) strcpy(path, "/devices");

	/* A failure here is not fatal; it is only reported on DEBUG kernels. */
	if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
	    != DDI_SUCCESS ||
	    lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {

		/* NUL terminate in case "path" has garbage */
		path[plen + MAXPATHLEN - 1] = '\0';
#ifdef	DEBUG
		cmn_err(CE_WARN, "!Cannot lookup root device: %s", path);
#endif
	}
	kmem_free(path, plen + MAXPATHLEN);
	vfs_mnttabvp_setup();
}
924 
925 /*
926  * If remount failed and we're in a zone we need to check for the zone
927  * root path and strip it before the call to vfs_setpath().
928  *
929  * If strpath doesn't begin with the zone_rootpath the original
930  * strpath is returned unchanged.
931  */
932 static const char *
933 stripzonepath(const char *strpath)
934 {
935 	char *str1, *str2;
936 	int i;
937 	zone_t *zonep = curproc->p_zone;
938 
939 	if (zonep->zone_rootpath == NULL || strpath == NULL) {
940 		return (NULL);
941 	}
942 
943 	/*
944 	 * we check for the end of the string at one past the
945 	 * current position because the zone_rootpath always
946 	 * ends with "/" but we don't want to strip that off.
947 	 */
948 	str1 = zonep->zone_rootpath;
949 	str2 = (char *)strpath;
950 	ASSERT(str1[0] != '\0');
951 	for (i = 0; str1[i + 1] != '\0'; i++) {
952 		if (str1[i] != str2[i])
953 			return ((char *)strpath);
954 	}
955 	return (&str2[i]);
956 }
957 
958 /*
959  * Common mount code.  Called from the system call entry point, from autofs,
960  * nfsv4 trigger mounts, and from pxfs.
961  *
962  * Takes the effective file system type, mount arguments, the mount point
963  * vnode, flags specifying whether the mount is a remount and whether it
964  * should be entered into the vfs list, and credentials.  Fills in its vfspp
965  * parameter with the mounted file system instance's vfs.
966  *
967  * Note that the effective file system type is specified as a string.  It may
968  * be null, in which case it's determined from the mount arguments, and may
969  * differ from the type specified in the mount arguments; this is a hook to
970  * allow interposition when instantiating file system instances.
971  *
972  * The caller is responsible for releasing its own hold on the mount point
973  * vp (this routine does its own hold when necessary).
974  * Also note that for remounts, the mount point vp should be the vnode for
975  * the root of the file system rather than the vnode that the file system
976  * is mounted on top of.
977  */
978 int
979 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
980 	struct vfs **vfspp)
981 {
982 	struct vfssw	*vswp;
983 	vfsops_t	*vfsops;
984 	struct vfs	*vfsp;
985 	struct vnode	*bvp;
986 	dev_t		bdev = 0;
987 	mntopts_t	mnt_mntopts;
988 	int		error = 0;
989 	int		copyout_error = 0;
990 	int		ovflags;
991 	char		*opts = uap->optptr;
992 	char		*inargs = opts;
993 	int		optlen = uap->optlen;
994 	int		remount;
995 	int		rdonly;
996 	int		nbmand = 0;
997 	int		delmip = 0;
998 	int		addmip = 0;
999 	int		splice = ((uap->flags & MS_NOSPLICE) == 0);
1000 	int		fromspace = (uap->flags & MS_SYSSPACE) ?
1001 	    UIO_SYSSPACE : UIO_USERSPACE;
1002 	char		*resource = NULL, *mountpt = NULL;
1003 	refstr_t	*oldresource, *oldmntpt;
1004 	struct pathname	pn, rpn;
1005 	vsk_anchor_t	*vskap;
1006 
1007 	/*
1008 	 * The v_flag value for the mount point vp is permanently set
1009 	 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1010 	 * for mount point locking.
1011 	 */
1012 	mutex_enter(&vp->v_lock);
1013 	vp->v_flag |= VVFSLOCK;
1014 	mutex_exit(&vp->v_lock);
1015 
1016 	mnt_mntopts.mo_count = 0;
1017 	/*
1018 	 * Find the ops vector to use to invoke the file system-specific mount
1019 	 * method.  If the fsname argument is non-NULL, use it directly.
1020 	 * Otherwise, dig the file system type information out of the mount
1021 	 * arguments.
1022 	 *
1023 	 * A side effect is to hold the vfssw entry.
1024 	 *
1025 	 * Mount arguments can be specified in several ways, which are
1026 	 * distinguished by flag bit settings.  The preferred way is to set
1027 	 * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1028 	 * type supplied as a character string and the last two arguments
1029 	 * being a pointer to a character buffer and the size of the buffer.
1030 	 * On entry, the buffer holds a null terminated list of options; on
1031 	 * return, the string is the list of options the file system
1032 	 * recognized. If MS_DATA is set arguments five and six point to a
1033 	 * block of binary data which the file system interprets.
1034 	 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1035 	 * consistently with these conventions.  To handle them, we check to
1036 	 * see whether the pointer to the file system name has a numeric value
1037 	 * less than 256.  If so, we treat it as an index.
1038 	 */
1039 	if (fsname != NULL) {
1040 		if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1041 			return (EINVAL);
1042 		}
1043 	} else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1044 		size_t n;
1045 		uint_t fstype;
1046 		char name[FSTYPSZ];
1047 
1048 		if ((fstype = (uintptr_t)uap->fstype) < 256) {
1049 			RLOCK_VFSSW();
1050 			if (fstype == 0 || fstype >= nfstype ||
1051 			    !ALLOCATED_VFSSW(&vfssw[fstype])) {
1052 				RUNLOCK_VFSSW();
1053 				return (EINVAL);
1054 			}
1055 			(void) strcpy(name, vfssw[fstype].vsw_name);
1056 			RUNLOCK_VFSSW();
1057 			if ((vswp = vfs_getvfssw(name)) == NULL)
1058 				return (EINVAL);
1059 		} else {
1060 			/*
1061 			 * Handle either kernel or user address space.
1062 			 */
1063 			if (uap->flags & MS_SYSSPACE) {
1064 				error = copystr(uap->fstype, name,
1065 				    FSTYPSZ, &n);
1066 			} else {
1067 				error = copyinstr(uap->fstype, name,
1068 				    FSTYPSZ, &n);
1069 			}
1070 			if (error) {
1071 				if (error == ENAMETOOLONG)
1072 					return (EINVAL);
1073 				return (error);
1074 			}
1075 			if ((vswp = vfs_getvfssw(name)) == NULL)
1076 				return (EINVAL);
1077 		}
1078 	} else {
1079 		if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1080 			return (EINVAL);
1081 	}
1082 	if (!VFS_INSTALLED(vswp))
1083 		return (EINVAL);
1084 	vfsops = &vswp->vsw_vfsops;
1085 
1086 	vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1087 	/*
1088 	 * Fetch mount options and parse them for generic vfs options
1089 	 */
1090 	if (uap->flags & MS_OPTIONSTR) {
1091 		/*
1092 		 * Limit the buffer size
1093 		 */
1094 		if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1095 			error = EINVAL;
1096 			goto errout;
1097 		}
1098 		if ((uap->flags & MS_SYSSPACE) == 0) {
1099 			inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1100 			inargs[0] = '\0';
1101 			if (optlen) {
1102 				error = copyinstr(opts, inargs, (size_t)optlen,
1103 				    NULL);
1104 				if (error) {
1105 					goto errout;
1106 				}
1107 			}
1108 		}
1109 		vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1110 	}
1111 	/*
1112 	 * Flag bits override the options string.
1113 	 */
1114 	if (uap->flags & MS_REMOUNT)
1115 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1116 	if (uap->flags & MS_RDONLY)
1117 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1118 	if (uap->flags & MS_NOSUID)
1119 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1120 
1121 	/*
1122 	 * Check if this is a remount; must be set in the option string and
1123 	 * the file system must support a remount option.
1124 	 */
1125 	if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1126 	    MNTOPT_REMOUNT, NULL)) {
1127 		if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1128 			error = ENOTSUP;
1129 			goto errout;
1130 		}
1131 		uap->flags |= MS_REMOUNT;
1132 	}
1133 
1134 	/*
1135 	 * uap->flags and vfs_optionisset() should agree.
1136 	 */
1137 	if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1138 		uap->flags |= MS_RDONLY;
1139 	}
1140 	if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1141 		uap->flags |= MS_NOSUID;
1142 	}
1143 	nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1144 	ASSERT(splice || !remount);
1145 	/*
1146 	 * If we are splicing the fs into the namespace,
1147 	 * perform mount point checks.
1148 	 *
1149 	 * We want to resolve the path for the mount point to eliminate
1150 	 * '.' and ".." and symlinks in mount points; we can't do the
1151 	 * same for the resource string, since it would turn
1152 	 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1153 	 * this before grabbing vn_vfswlock(), because otherwise we
1154 	 * would deadlock with lookuppn().
1155 	 */
1156 	if (splice) {
1157 		ASSERT(vp->v_count > 0);
1158 
1159 		/*
1160 		 * Pick up mount point and device from appropriate space.
1161 		 */
1162 		if (pn_get(uap->spec, fromspace, &pn) == 0) {
1163 			resource = kmem_alloc(pn.pn_pathlen + 1,
1164 			    KM_SLEEP);
1165 			(void) strcpy(resource, pn.pn_path);
1166 			pn_free(&pn);
1167 		}
1168 		/*
1169 		 * Do a lookupname prior to taking the
1170 		 * writelock. Mark this as completed if
1171 		 * successful for later cleanup and addition to
1172 		 * the mount in progress table.
1173 		 */
1174 		if ((uap->flags & MS_GLOBAL) == 0 &&
1175 		    lookupname(uap->spec, fromspace,
1176 		    FOLLOW, NULL, &bvp) == 0) {
1177 			addmip = 1;
1178 		}
1179 
1180 		if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1181 			pathname_t *pnp;
1182 
1183 			if (*pn.pn_path != '/') {
1184 				error = EINVAL;
1185 				pn_free(&pn);
1186 				goto errout;
1187 			}
1188 			pn_alloc(&rpn);
1189 			/*
1190 			 * Kludge to prevent autofs from deadlocking with
1191 			 * itself when it calls domount().
1192 			 *
1193 			 * If autofs is calling, it is because it is doing
1194 			 * (autofs) mounts in the process of an NFS mount.  A
1195 			 * lookuppn() here would cause us to block waiting for
1196 			 * said NFS mount to complete, which can't since this
1197 			 * is the thread that was supposed to doing it.
1198 			 */
1199 			if (fromspace == UIO_USERSPACE) {
1200 				if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1201 				    NULL)) == 0) {
1202 					pnp = &rpn;
1203 				} else {
1204 					/*
1205 					 * The file disappeared or otherwise
1206 					 * became inaccessible since we opened
1207 					 * it; might as well fail the mount
1208 					 * since the mount point is no longer
1209 					 * accessible.
1210 					 */
1211 					pn_free(&rpn);
1212 					pn_free(&pn);
1213 					goto errout;
1214 				}
1215 			} else {
1216 				pnp = &pn;
1217 			}
1218 			mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1219 			(void) strcpy(mountpt, pnp->pn_path);
1220 
1221 			/*
1222 			 * If the addition of the zone's rootpath
1223 			 * would push us over a total path length
1224 			 * of MAXPATHLEN, we fail the mount with
1225 			 * ENAMETOOLONG, which is what we would have
1226 			 * gotten if we were trying to perform the same
1227 			 * mount in the global zone.
1228 			 *
1229 			 * strlen() doesn't count the trailing
1230 			 * '\0', but zone_rootpathlen counts both a
1231 			 * trailing '/' and the terminating '\0'.
1232 			 */
1233 			if ((curproc->p_zone->zone_rootpathlen - 1 +
1234 			    strlen(mountpt)) > MAXPATHLEN ||
1235 			    (resource != NULL &&
1236 			    (curproc->p_zone->zone_rootpathlen - 1 +
1237 			    strlen(resource)) > MAXPATHLEN)) {
1238 				error = ENAMETOOLONG;
1239 			}
1240 
1241 			pn_free(&rpn);
1242 			pn_free(&pn);
1243 		}
1244 
1245 		if (error)
1246 			goto errout;
1247 
1248 		/*
1249 		 * Prevent path name resolution from proceeding past
1250 		 * the mount point.
1251 		 */
1252 		if (vn_vfswlock(vp) != 0) {
1253 			error = EBUSY;
1254 			goto errout;
1255 		}
1256 
1257 		/*
1258 		 * Verify that it's legitimate to establish a mount on
1259 		 * the prospective mount point.
1260 		 */
1261 		if (vn_mountedvfs(vp) != NULL) {
1262 			/*
1263 			 * The mount point lock was obtained after some
1264 			 * other thread raced through and established a mount.
1265 			 */
1266 			vn_vfsunlock(vp);
1267 			error = EBUSY;
1268 			goto errout;
1269 		}
1270 		if (vp->v_flag & VNOMOUNT) {
1271 			vn_vfsunlock(vp);
1272 			error = EINVAL;
1273 			goto errout;
1274 		}
1275 	}
1276 	if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1277 		uap->dataptr = NULL;
1278 		uap->datalen = 0;
1279 	}
1280 
1281 	/*
1282 	 * If this is a remount, we don't want to create a new VFS.
1283 	 * Instead, we pass the existing one with a remount flag.
1284 	 */
1285 	if (remount) {
1286 		/*
1287 		 * Confirm that the mount point is the root vnode of the
1288 		 * file system that is being remounted.
1289 		 * This can happen if the user specifies a different
1290 		 * mount point directory pathname in the (re)mount command.
1291 		 *
1292 		 * Code below can only be reached if splice is true, so it's
1293 		 * safe to do vn_vfsunlock() here.
1294 		 */
1295 		if ((vp->v_flag & VROOT) == 0) {
1296 			vn_vfsunlock(vp);
1297 			error = ENOENT;
1298 			goto errout;
1299 		}
1300 		/*
1301 		 * Disallow making file systems read-only unless file system
1302 		 * explicitly allows it in its vfssw.  Ignore other flags.
1303 		 */
1304 		if (rdonly && vn_is_readonly(vp) == 0 &&
1305 		    (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1306 			vn_vfsunlock(vp);
1307 			error = EINVAL;
1308 			goto errout;
1309 		}
1310 		/*
1311 		 * Changing the NBMAND setting on remounts is permitted
1312 		 * but logged since it can lead to unexpected behavior.
1313 		 * We also counsel against using it for / and /usr.
1314 		 */
1315 		if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1316 		    (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1317 			cmn_err(CE_WARN, "domount: nbmand turned %s via "
1318 			    "remounting %s", nbmand ? "on" : "off",
1319 			    refstr_value(vp->v_vfsp->vfs_mntpt));
1320 		}
1321 		vfsp = vp->v_vfsp;
1322 		ovflags = vfsp->vfs_flag;
1323 		vfsp->vfs_flag |= VFS_REMOUNT;
1324 		vfsp->vfs_flag &= ~VFS_RDONLY;
1325 	} else {
1326 		vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP);
1327 		VFS_INIT(vfsp, vfsops, NULL);
1328 	}
1329 
1330 	VFS_HOLD(vfsp);
1331 
1332 	/*
1333 	 * The vfs_reflock is not used anymore the code below explicitly
1334 	 * holds it preventing others accesing it directly.
1335 	 */
1336 	if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1337 	    !(vfsp->vfs_flag & VFS_REMOUNT))
1338 		cmn_err(CE_WARN,
1339 		    "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1340 
1341 	/*
1342 	 * Lock the vfs. If this is a remount we want to avoid spurious umount
1343 	 * failures that happen as a side-effect of fsflush() and other mount
1344 	 * and unmount operations that might be going on simultaneously and
1345 	 * may have locked the vfs currently. To not return EBUSY immediately
1346 	 * here we use vfs_lock_wait() instead vfs_lock() for the remount case.
1347 	 */
1348 	if (!remount) {
1349 		if (error = vfs_lock(vfsp)) {
1350 			vfsp->vfs_flag = ovflags;
1351 			if (splice)
1352 				vn_vfsunlock(vp);
1353 			if (vfsp->vfs_implp)
1354 				vfsimpl_teardown(vfsp);
1355 			kmem_free(vfsp, sizeof (struct vfs));
1356 			goto errout;
1357 		}
1358 	} else {
1359 		vfs_lock_wait(vfsp);
1360 	}
1361 
1362 	/*
1363 	 * Add device to mount in progress table, global mounts require special
1364 	 * handling. It is possible that we have already done the lookupname
1365 	 * on a spliced, non-global fs. If so, we don't want to do it again
1366 	 * since we cannot do a lookupname after taking the
1367 	 * wlock above. This case is for a non-spliced, non-global filesystem.
1368 	 */
1369 	if (!addmip) {
1370 		if ((uap->flags & MS_GLOBAL) == 0 &&
1371 		    lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1372 			addmip = 1;
1373 		}
1374 	}
1375 
1376 	if (addmip) {
1377 		bdev = bvp->v_rdev;
1378 		VN_RELE(bvp);
1379 		vfs_addmip(bdev, vfsp);
1380 		addmip = 0;
1381 		delmip = 1;
1382 	}
1383 	/*
1384 	 * Invalidate cached entry for the mount point.
1385 	 */
1386 	if (splice)
1387 		dnlc_purge_vp(vp);
1388 
1389 	/*
1390 	 * If have an option string but the filesystem doesn't supply a
1391 	 * prototype options table, create a table with the global
1392 	 * options and sufficient room to accept all the options in the
1393 	 * string.  Then parse the passed in option string
1394 	 * accepting all the options in the string.  This gives us an
1395 	 * option table with all the proper cancel properties for the
1396 	 * global options.
1397 	 *
1398 	 * Filesystems that supply a prototype options table are handled
1399 	 * earlier in this function.
1400 	 */
1401 	if (uap->flags & MS_OPTIONSTR) {
1402 		if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1403 			mntopts_t tmp_mntopts;
1404 
1405 			tmp_mntopts.mo_count = 0;
1406 			vfs_createopttbl_extend(&tmp_mntopts, inargs,
1407 			    &mnt_mntopts);
1408 			vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1409 			vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1410 			vfs_freeopttbl(&tmp_mntopts);
1411 		}
1412 	}
1413 
1414 	/*
1415 	 * Serialize with zone creations.
1416 	 */
1417 	mount_in_progress();
1418 	/*
1419 	 * Instantiate (or reinstantiate) the file system.  If appropriate,
1420 	 * splice it into the file system name space.
1421 	 *
1422 	 * We want VFS_MOUNT() to be able to override the vfs_resource
1423 	 * string if necessary (ie, mntfs), and also for a remount to
1424 	 * change the same (necessary when remounting '/' during boot).
1425 	 * So we set up vfs_mntpt and vfs_resource to what we think they
1426 	 * should be, then hand off control to VFS_MOUNT() which can
1427 	 * override this.
1428 	 *
1429 	 * For safety's sake, when changing vfs_resource or vfs_mntpt of
1430 	 * a vfs which is on the vfs list (i.e. during a remount), we must
1431 	 * never set those fields to NULL. Several bits of code make
1432 	 * assumptions that the fields are always valid.
1433 	 */
1434 	vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1435 	if (remount) {
1436 		if ((oldresource = vfsp->vfs_resource) != NULL)
1437 			refstr_hold(oldresource);
1438 		if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1439 			refstr_hold(oldmntpt);
1440 	}
1441 	vfs_setresource(vfsp, resource);
1442 	vfs_setmntpoint(vfsp, mountpt);
1443 
1444 	/*
1445 	 * going to mount on this vnode, so notify.
1446 	 */
1447 	vnevent_mountedover(vp);
1448 	error = VFS_MOUNT(vfsp, vp, uap, credp);
1449 
1450 	if (uap->flags & MS_RDONLY)
1451 		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1452 	if (uap->flags & MS_NOSUID)
1453 		vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1454 	if (uap->flags & MS_GLOBAL)
1455 		vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1456 
1457 	if (error) {
1458 		if (remount) {
1459 			/* put back pre-remount options */
1460 			vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1461 			vfs_setmntpoint(vfsp, (stripzonepath(
1462 			    refstr_value(oldmntpt))));
1463 			if (oldmntpt)
1464 				refstr_rele(oldmntpt);
1465 			vfs_setresource(vfsp, (stripzonepath(
1466 			    refstr_value(oldresource))));
1467 			if (oldresource)
1468 				refstr_rele(oldresource);
1469 			vfsp->vfs_flag = ovflags;
1470 			vfs_unlock(vfsp);
1471 			VFS_RELE(vfsp);
1472 		} else {
1473 			vfs_unlock(vfsp);
1474 			vfs_freemnttab(vfsp);
1475 			if (vfsp->vfs_implp)
1476 				vfsimpl_teardown(vfsp);
1477 			kmem_free(vfsp, sizeof (struct vfs));
1478 		}
1479 	} else {
1480 		/*
1481 		 * Set the mount time to now
1482 		 */
1483 		vfsp->vfs_mtime = ddi_get_time();
1484 		if (remount) {
1485 			vfsp->vfs_flag &= ~VFS_REMOUNT;
1486 			if (oldresource)
1487 				refstr_rele(oldresource);
1488 			if (oldmntpt)
1489 				refstr_rele(oldmntpt);
1490 		} else if (splice) {
1491 			/*
1492 			 * Link vfsp into the name space at the mount
1493 			 * point. Vfs_add() is responsible for
1494 			 * holding the mount point which will be
1495 			 * released when vfs_remove() is called.
1496 			 */
1497 			vfs_add(vp, vfsp, uap->flags);
1498 		} else {
1499 			/*
1500 			 * Hold the reference to file system which is
1501 			 * not linked into the name space.
1502 			 */
1503 			vfsp->vfs_zone = NULL;
1504 			VFS_HOLD(vfsp);
1505 			vfsp->vfs_vnodecovered = NULL;
1506 		}
1507 		/*
1508 		 * Set flags for global options encountered
1509 		 */
1510 		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1511 			vfsp->vfs_flag |= VFS_RDONLY;
1512 		else
1513 			vfsp->vfs_flag &= ~VFS_RDONLY;
1514 		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1515 			vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1516 		} else {
1517 			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1518 				vfsp->vfs_flag |= VFS_NODEVICES;
1519 			else
1520 				vfsp->vfs_flag &= ~VFS_NODEVICES;
1521 			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1522 				vfsp->vfs_flag |= VFS_NOSETUID;
1523 			else
1524 				vfsp->vfs_flag &= ~VFS_NOSETUID;
1525 		}
1526 		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1527 			vfsp->vfs_flag |= VFS_NBMAND;
1528 		else
1529 			vfsp->vfs_flag &= ~VFS_NBMAND;
1530 
1531 		if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1532 			vfsp->vfs_flag |= VFS_XATTR;
1533 		else
1534 			vfsp->vfs_flag &= ~VFS_XATTR;
1535 
1536 		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1537 			vfsp->vfs_flag |= VFS_NOEXEC;
1538 		else
1539 			vfsp->vfs_flag &= ~VFS_NOEXEC;
1540 
1541 		/*
1542 		 * Now construct the output option string of options
1543 		 * we recognized.
1544 		 */
1545 		if (uap->flags & MS_OPTIONSTR) {
1546 			vfs_list_read_lock();
1547 			copyout_error = vfs_buildoptionstr(
1548 			    &vfsp->vfs_mntopts, inargs, optlen);
1549 			vfs_list_unlock();
1550 			if (copyout_error == 0 &&
1551 			    (uap->flags & MS_SYSSPACE) == 0) {
1552 				copyout_error = copyoutstr(inargs, opts,
1553 				    optlen, NULL);
1554 			}
1555 		}
1556 
1557 		/*
1558 		 * If this isn't a remount, set up the vopstats before
1559 		 * anyone can touch this. We only allow spliced file
1560 		 * systems (file systems which are in the namespace) to
1561 		 * have the VFS_STATS flag set.
1562 		 * NOTE: PxFS mounts the underlying file system with
1563 		 * MS_NOSPLICE set and copies those vfs_flags to its private
1564 		 * vfs structure. As a result, PxFS should never have
1565 		 * the VFS_STATS flag or else we might access the vfs
1566 		 * statistics-related fields prior to them being
1567 		 * properly initialized.
1568 		 */
1569 		if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1570 			initialize_vopstats(&vfsp->vfs_vopstats);
1571 			/*
1572 			 * We need to set vfs_vskap to NULL because there's
1573 			 * a chance it won't be set below.  This is checked
1574 			 * in teardown_vopstats() so we can't have garbage.
1575 			 */
1576 			vfsp->vfs_vskap = NULL;
1577 			vfsp->vfs_flag |= VFS_STATS;
1578 			vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1579 		}
1580 
1581 		if (vswp->vsw_flag & VSW_XID)
1582 			vfsp->vfs_flag |= VFS_XID;
1583 
1584 		vfs_unlock(vfsp);
1585 	}
1586 	mount_completed();
1587 	if (splice)
1588 		vn_vfsunlock(vp);
1589 
1590 	if ((error == 0) && (copyout_error == 0)) {
1591 		if (!remount) {
1592 			/*
1593 			 * Don't call get_vskstat_anchor() while holding
1594 			 * locks since it allocates memory and calls
1595 			 * VFS_STATVFS().  For NFS, the latter can generate
1596 			 * an over-the-wire call.
1597 			 */
1598 			vskap = get_vskstat_anchor(vfsp);
1599 			/* Only take the lock if we have something to do */
1600 			if (vskap != NULL) {
1601 				vfs_lock_wait(vfsp);
1602 				if (vfsp->vfs_flag & VFS_STATS) {
1603 					vfsp->vfs_vskap = vskap;
1604 				}
1605 				vfs_unlock(vfsp);
1606 			}
1607 		}
1608 		/* Return vfsp to caller. */
1609 		*vfspp = vfsp;
1610 	}
1611 errout:
1612 	vfs_freeopttbl(&mnt_mntopts);
1613 	if (resource != NULL)
1614 		kmem_free(resource, strlen(resource) + 1);
1615 	if (mountpt != NULL)
1616 		kmem_free(mountpt, strlen(mountpt) + 1);
1617 	/*
1618 	 * It is possible we errored prior to adding to mount in progress
1619 	 * table. Must free vnode we acquired with successful lookupname.
1620 	 */
1621 	if (addmip)
1622 		VN_RELE(bvp);
1623 	if (delmip)
1624 		vfs_delmip(vfsp);
1625 	ASSERT(vswp != NULL);
1626 	vfs_unrefvfssw(vswp);
1627 	if (inargs != opts)
1628 		kmem_free(inargs, MAX_MNTOPT_STR);
1629 	if (copyout_error) {
1630 		VFS_RELE(vfsp);
1631 		error = copyout_error;
1632 	}
1633 	return (error);
1634 }
1635 
1636 static void
1637 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath)
1638 {
1639 	size_t len;
1640 	refstr_t *ref;
1641 	zone_t *zone = curproc->p_zone;
1642 	char *sp;
1643 	int have_list_lock = 0;
1644 
1645 	ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1646 
1647 	/*
1648 	 * New path must be less than MAXPATHLEN because mntfs
1649 	 * will only display up to MAXPATHLEN bytes. This is currently
1650 	 * safe, because domount() uses pn_get(), and other callers
1651 	 * similarly cap the size to fewer than MAXPATHLEN bytes.
1652 	 */
1653 
1654 	ASSERT(strlen(newpath) < MAXPATHLEN);
1655 
1656 	/* mntfs requires consistency while vfs list lock is held */
1657 
1658 	if (VFS_ON_LIST(vfsp)) {
1659 		have_list_lock = 1;
1660 		vfs_list_lock();
1661 	}
1662 
1663 	if (*refp != NULL)
1664 		refstr_rele(*refp);
1665 
1666 	/* Do we need to modify the path? */
1667 
1668 	if (zone == global_zone || *newpath != '/') {
1669 		ref = refstr_alloc(newpath);
1670 		goto out;
1671 	}
1672 
1673 	/*
1674 	 * Truncate the trailing '/' in the zoneroot, and merge
1675 	 * in the zone's rootpath with the "newpath" (resource
1676 	 * or mountpoint) passed in.
1677 	 *
1678 	 * The size of the required buffer is thus the size of
1679 	 * the buffer required for the passed-in newpath
1680 	 * (strlen(newpath) + 1), plus the size of the buffer
1681 	 * required to hold zone_rootpath (zone_rootpathlen)
1682 	 * minus one for one of the now-superfluous NUL
1683 	 * terminations, minus one for the trailing '/'.
1684 	 *
1685 	 * That gives us:
1686 	 *
1687 	 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1688 	 *
1689 	 * Which is what we have below.
1690 	 */
1691 
1692 	len = strlen(newpath) + zone->zone_rootpathlen - 1;
1693 	sp = kmem_alloc(len, KM_SLEEP);
1694 
1695 	/*
1696 	 * Copy everything including the trailing slash, which
1697 	 * we then overwrite with the NUL character.
1698 	 */
1699 
1700 	(void) strcpy(sp, zone->zone_rootpath);
1701 	sp[zone->zone_rootpathlen - 2] = '\0';
1702 	(void) strcat(sp, newpath);
1703 
1704 	ref = refstr_alloc(sp);
1705 	kmem_free(sp, len);
1706 out:
1707 	*refp = ref;
1708 
1709 	if (have_list_lock) {
1710 		vfs_mnttab_modtimeupd();
1711 		vfs_list_unlock();
1712 	}
1713 }
1714 
1715 /*
1716  * Record a mounted resource name in a vfs structure.
1717  * If vfsp is already mounted, caller must hold the vfs lock.
1718  */
1719 void
1720 vfs_setresource(struct vfs *vfsp, const char *resource)
1721 {
1722 	if (resource == NULL || resource[0] == '\0')
1723 		resource = VFS_NORESOURCE;
1724 	vfs_setpath(vfsp, &vfsp->vfs_resource, resource);
1725 }
1726 
1727 /*
1728  * Record a mount point name in a vfs structure.
1729  * If vfsp is already mounted, caller must hold the vfs lock.
1730  */
1731 void
1732 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt)
1733 {
1734 	if (mntpt == NULL || mntpt[0] == '\0')
1735 		mntpt = VFS_NOMNTPT;
1736 	vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt);
1737 }
1738 
1739 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1740 
1741 refstr_t *
1742 vfs_getresource(const struct vfs *vfsp)
1743 {
1744 	refstr_t *resource;
1745 
1746 	vfs_list_read_lock();
1747 	resource = vfsp->vfs_resource;
1748 	refstr_hold(resource);
1749 	vfs_list_unlock();
1750 
1751 	return (resource);
1752 }
1753 
1754 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1755 
1756 refstr_t *
1757 vfs_getmntpoint(const struct vfs *vfsp)
1758 {
1759 	refstr_t *mntpt;
1760 
1761 	vfs_list_read_lock();
1762 	mntpt = vfsp->vfs_mntpt;
1763 	refstr_hold(mntpt);
1764 	vfs_list_unlock();
1765 
1766 	return (mntpt);
1767 }
1768 
1769 /*
1770  * Create an empty options table with enough empty slots to hold all
1771  * The options in the options string passed as an argument.
1772  * Potentially prepend another options table.
1773  *
1774  * Note: caller is responsible for locking the vfs list, if needed,
1775  *       to protect mops.
1776  */
1777 static void
1778 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1779     const mntopts_t *mtmpl)
1780 {
1781 	const char *s = opts;
1782 	uint_t count;
1783 
1784 	if (opts == NULL || *opts == '\0') {
1785 		count = 0;
1786 	} else {
1787 		count = 1;
1788 
1789 		/*
1790 		 * Count number of options in the string
1791 		 */
1792 		for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1793 			count++;
1794 			s++;
1795 		}
1796 	}
1797 	vfs_copyopttbl_extend(mtmpl, mops, count);
1798 }
1799 
1800 /*
1801  * Create an empty options table with enough empty slots to hold all
1802  * The options in the options string passed as an argument.
1803  *
1804  * This function is *not* for general use by filesystems.
1805  *
1806  * Note: caller is responsible for locking the vfs list, if needed,
1807  *       to protect mops.
1808  */
1809 void
1810 vfs_createopttbl(mntopts_t *mops, const char *opts)
1811 {
1812 	vfs_createopttbl_extend(mops, opts, NULL);
1813 }
1814 
1815 
1816 /*
1817  * Swap two mount options tables
1818  */
1819 static void
1820 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
1821 {
1822 	uint_t tmpcnt;
1823 	mntopt_t *tmplist;
1824 
1825 	tmpcnt = optbl2->mo_count;
1826 	tmplist = optbl2->mo_list;
1827 	optbl2->mo_count = optbl1->mo_count;
1828 	optbl2->mo_list = optbl1->mo_list;
1829 	optbl1->mo_count = tmpcnt;
1830 	optbl1->mo_list = tmplist;
1831 }
1832 
1833 static void
1834 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
1835 {
1836 	vfs_list_lock();
1837 	vfs_swapopttbl_nolock(optbl1, optbl2);
1838 	vfs_mnttab_modtimeupd();
1839 	vfs_list_unlock();
1840 }
1841 
1842 static char **
1843 vfs_copycancelopt_extend(char **const moc, int extend)
1844 {
1845 	int i = 0;
1846 	int j;
1847 	char **result;
1848 
1849 	if (moc != NULL) {
1850 		for (; moc[i] != NULL; i++)
1851 			/* count number of options to cancel */;
1852 	}
1853 
1854 	if (i + extend == 0)
1855 		return (NULL);
1856 
1857 	result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
1858 
1859 	for (j = 0; j < i; j++) {
1860 		result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
1861 		(void) strcpy(result[j], moc[j]);
1862 	}
1863 	for (; j <= i + extend; j++)
1864 		result[j] = NULL;
1865 
1866 	return (result);
1867 }
1868 
1869 static void
1870 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
1871 {
1872 	char *sp, *dp;
1873 
1874 	d->mo_flags = s->mo_flags;
1875 	d->mo_data = s->mo_data;
1876 	sp = s->mo_name;
1877 	if (sp != NULL) {
1878 		dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
1879 		(void) strcpy(dp, sp);
1880 		d->mo_name = dp;
1881 	} else {
1882 		d->mo_name = NULL; /* should never happen */
1883 	}
1884 
1885 	d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
1886 
1887 	sp = s->mo_arg;
1888 	if (sp != NULL) {
1889 		dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
1890 		(void) strcpy(dp, sp);
1891 		d->mo_arg = dp;
1892 	} else {
1893 		d->mo_arg = NULL;
1894 	}
1895 }
1896 
1897 /*
1898  * Copy a mount options table, possibly allocating some spare
1899  * slots at the end.  It is permissible to copy_extend the NULL table.
1900  */
1901 static void
1902 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
1903 {
1904 	uint_t i, count;
1905 	mntopt_t *motbl;
1906 
1907 	/*
1908 	 * Clear out any existing stuff in the options table being initialized
1909 	 */
1910 	vfs_freeopttbl(dmo);
1911 	count = (smo == NULL) ? 0 : smo->mo_count;
1912 	if ((count + extra) == 0)	/* nothing to do */
1913 		return;
1914 	dmo->mo_count = count + extra;
1915 	motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
1916 	dmo->mo_list = motbl;
1917 	for (i = 0; i < count; i++) {
1918 		vfs_copyopt(&smo->mo_list[i], &motbl[i]);
1919 	}
1920 	for (i = count; i < count + extra; i++) {
1921 		motbl[i].mo_flags = MO_EMPTY;
1922 	}
1923 }
1924 
1925 /*
1926  * Copy a mount options table.
1927  *
1928  * This function is *not* for general use by filesystems.
1929  *
1930  * Note: caller is responsible for locking the vfs list, if needed,
1931  *       to protect smo and dmo.
1932  */
1933 void
1934 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
1935 {
1936 	vfs_copyopttbl_extend(smo, dmo, 0);
1937 }
1938 
1939 static char **
1940 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
1941 {
1942 	int c1 = 0;
1943 	int c2 = 0;
1944 	char **result;
1945 	char **sp1, **sp2, **dp;
1946 
1947 	/*
1948 	 * First we count both lists of cancel options.
1949 	 * If either is NULL or has no elements, we return a copy of
1950 	 * the other.
1951 	 */
1952 	if (mop1->mo_cancel != NULL) {
1953 		for (; mop1->mo_cancel[c1] != NULL; c1++)
1954 			/* count cancel options in mop1 */;
1955 	}
1956 
1957 	if (c1 == 0)
1958 		return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
1959 
1960 	if (mop2->mo_cancel != NULL) {
1961 		for (; mop2->mo_cancel[c2] != NULL; c2++)
1962 			/* count cancel options in mop2 */;
1963 	}
1964 
1965 	result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
1966 
1967 	if (c2 == 0)
1968 		return (result);
1969 
1970 	/*
1971 	 * When we get here, we've got two sets of cancel options;
1972 	 * we need to merge the two sets.  We know that the result
1973 	 * array has "c1+c2+1" entries and in the end we might shrink
1974 	 * it.
1975 	 * Result now has a copy of the c1 entries from mop1; we'll
1976 	 * now lookup all the entries of mop2 in mop1 and copy it if
1977 	 * it is unique.
1978 	 * This operation is O(n^2) but it's only called once per
1979 	 * filesystem per duplicate option.  This is a situation
1980 	 * which doesn't arise with the filesystems in ON and
1981 	 * n is generally 1.
1982 	 */
1983 
1984 	dp = &result[c1];
1985 	for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
1986 		for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
1987 			if (strcmp(*sp1, *sp2) == 0)
1988 				break;
1989 		}
1990 		if (*sp1 == NULL) {
1991 			/*
1992 			 * Option *sp2 not found in mop1, so copy it.
1993 			 * The calls to vfs_copycancelopt_extend()
1994 			 * guarantee that there's enough room.
1995 			 */
1996 			*dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
1997 			(void) strcpy(*dp++, *sp2);
1998 		}
1999 	}
2000 	if (dp != &result[c1+c2]) {
2001 		size_t bytes = (dp - result + 1) * sizeof (char *);
2002 		char **nres = kmem_alloc(bytes, KM_SLEEP);
2003 
2004 		bcopy(result, nres, bytes);
2005 		kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2006 		result = nres;
2007 	}
2008 	return (result);
2009 }
2010 
2011 /*
2012  * Merge two mount option tables (outer and inner) into one.  This is very
2013  * similar to "merging" global variables and automatic variables in C.
2014  *
2015  * This isn't (and doesn't have to be) fast.
2016  *
2017  * This function is *not* for general use by filesystems.
2018  *
2019  * Note: caller is responsible for locking the vfs list, if needed,
2020  *       to protect omo, imo & dmo.
2021  */
2022 void
2023 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2024 {
2025 	uint_t i, count;
2026 	mntopt_t *mop, *motbl;
2027 	uint_t freeidx;
2028 
2029 	/*
2030 	 * First determine how much space we need to allocate.
2031 	 */
2032 	count = omo->mo_count;
2033 	for (i = 0; i < imo->mo_count; i++) {
2034 		if (imo->mo_list[i].mo_flags & MO_EMPTY)
2035 			continue;
2036 		if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2037 			count++;
2038 	}
2039 	ASSERT(count >= omo->mo_count &&
2040 	    count <= omo->mo_count + imo->mo_count);
2041 	motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2042 	for (i = 0; i < omo->mo_count; i++)
2043 		vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2044 	freeidx = omo->mo_count;
2045 	for (i = 0; i < imo->mo_count; i++) {
2046 		if (imo->mo_list[i].mo_flags & MO_EMPTY)
2047 			continue;
2048 		if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2049 			char **newcanp;
2050 			uint_t index = mop - omo->mo_list;
2051 
2052 			newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2053 
2054 			vfs_freeopt(&motbl[index]);
2055 			vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2056 
2057 			vfs_freecancelopt(motbl[index].mo_cancel);
2058 			motbl[index].mo_cancel = newcanp;
2059 		} else {
2060 			/*
2061 			 * If it's a new option, just copy it over to the first
2062 			 * free location.
2063 			 */
2064 			vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2065 		}
2066 	}
2067 	dmo->mo_count = count;
2068 	dmo->mo_list = motbl;
2069 }
2070 
2071 /*
2072  * Functions to set and clear mount options in a mount options table.
2073  */
2074 
2075 /*
2076  * Clear a mount option, if it exists.
2077  *
2078  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2079  * the vfs list.
2080  */
2081 static void
2082 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2083 {
2084 	struct mntopt *mop;
2085 	uint_t i, count;
2086 
2087 	ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2088 
2089 	count = mops->mo_count;
2090 	for (i = 0; i < count; i++) {
2091 		mop = &mops->mo_list[i];
2092 
2093 		if (mop->mo_flags & MO_EMPTY)
2094 			continue;
2095 		if (strcmp(opt, mop->mo_name))
2096 			continue;
2097 		mop->mo_flags &= ~MO_SET;
2098 		if (mop->mo_arg != NULL) {
2099 			kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2100 		}
2101 		mop->mo_arg = NULL;
2102 		if (update_mnttab)
2103 			vfs_mnttab_modtimeupd();
2104 		break;
2105 	}
2106 }
2107 
2108 void
2109 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2110 {
2111 	int gotlock = 0;
2112 
2113 	if (VFS_ON_LIST(vfsp)) {
2114 		gotlock = 1;
2115 		vfs_list_lock();
2116 	}
2117 	vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2118 	if (gotlock)
2119 		vfs_list_unlock();
2120 }
2121 
2122 
2123 /*
2124  * Set a mount option on.  If it's not found in the table, it's silently
2125  * ignored.  If the option has MO_IGNORE set, it is still set unless the
2126  * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2127  * bits can be used to toggle the MO_NODISPLAY bit for the option.
2128  * If the VFS_CREATEOPT flag bit is set then the first option slot with
2129  * MO_EMPTY set is created as the option passed in.
2130  *
2131  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2132  * the vfs list.
2133  */
2134 static void
2135 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2136     const char *arg, int flags, int update_mnttab)
2137 {
2138 	mntopt_t *mop;
2139 	uint_t i, count;
2140 	char *sp;
2141 
2142 	ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2143 
2144 	if (flags & VFS_CREATEOPT) {
2145 		if (vfs_hasopt(mops, opt) != NULL) {
2146 			flags &= ~VFS_CREATEOPT;
2147 		}
2148 	}
2149 	count = mops->mo_count;
2150 	for (i = 0; i < count; i++) {
2151 		mop = &mops->mo_list[i];
2152 
2153 		if (mop->mo_flags & MO_EMPTY) {
2154 			if ((flags & VFS_CREATEOPT) == 0)
2155 				continue;
2156 			sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2157 			(void) strcpy(sp, opt);
2158 			mop->mo_name = sp;
2159 			if (arg != NULL)
2160 				mop->mo_flags = MO_HASVALUE;
2161 			else
2162 				mop->mo_flags = 0;
2163 		} else if (strcmp(opt, mop->mo_name)) {
2164 			continue;
2165 		}
2166 		if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2167 			break;
2168 		if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2169 			sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2170 			(void) strcpy(sp, arg);
2171 		} else {
2172 			sp = NULL;
2173 		}
2174 		if (mop->mo_arg != NULL)
2175 			kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2176 		mop->mo_arg = sp;
2177 		if (flags & VFS_DISPLAY)
2178 			mop->mo_flags &= ~MO_NODISPLAY;
2179 		if (flags & VFS_NODISPLAY)
2180 			mop->mo_flags |= MO_NODISPLAY;
2181 		mop->mo_flags |= MO_SET;
2182 		if (mop->mo_cancel != NULL) {
2183 			char **cp;
2184 
2185 			for (cp = mop->mo_cancel; *cp != NULL; cp++)
2186 				vfs_clearmntopt_nolock(mops, *cp, 0);
2187 		}
2188 		if (update_mnttab)
2189 			vfs_mnttab_modtimeupd();
2190 		break;
2191 	}
2192 }
2193 
2194 void
2195 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2196 {
2197 	int gotlock = 0;
2198 
2199 	if (VFS_ON_LIST(vfsp)) {
2200 		gotlock = 1;
2201 		vfs_list_lock();
2202 	}
2203 	vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2204 	if (gotlock)
2205 		vfs_list_unlock();
2206 }
2207 
2208 
2209 /*
2210  * Add a "tag" option to a mounted file system's options list.
2211  *
2212  * Note: caller is responsible for locking the vfs list, if needed,
2213  *       to protect mops.
2214  */
2215 static mntopt_t *
2216 vfs_addtag(mntopts_t *mops, const char *tag)
2217 {
2218 	uint_t count;
2219 	mntopt_t *mop, *motbl;
2220 
2221 	count = mops->mo_count + 1;
2222 	motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2223 	if (mops->mo_count) {
2224 		size_t len = (count - 1) * sizeof (mntopt_t);
2225 
2226 		bcopy(mops->mo_list, motbl, len);
2227 		kmem_free(mops->mo_list, len);
2228 	}
2229 	mops->mo_count = count;
2230 	mops->mo_list = motbl;
2231 	mop = &motbl[count - 1];
2232 	mop->mo_flags = MO_TAG;
2233 	mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2234 	(void) strcpy(mop->mo_name, tag);
2235 	return (mop);
2236 }
2237 
2238 /*
2239  * Allow users to set arbitrary "tags" in a vfs's mount options.
2240  * Broader use within the kernel is discouraged.
2241  */
2242 int
2243 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2244     cred_t *cr)
2245 {
2246 	vfs_t *vfsp;
2247 	mntopts_t *mops;
2248 	mntopt_t *mop;
2249 	int found = 0;
2250 	dev_t dev = makedevice(major, minor);
2251 	int err = 0;
2252 	char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2253 
2254 	/*
2255 	 * Find the desired mounted file system
2256 	 */
2257 	vfs_list_lock();
2258 	vfsp = rootvfs;
2259 	do {
2260 		if (vfsp->vfs_dev == dev &&
2261 		    strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2262 			found = 1;
2263 			break;
2264 		}
2265 		vfsp = vfsp->vfs_next;
2266 	} while (vfsp != rootvfs);
2267 
2268 	if (!found) {
2269 		err = EINVAL;
2270 		goto out;
2271 	}
2272 	err = secpolicy_fs_config(cr, vfsp);
2273 	if (err != 0)
2274 		goto out;
2275 
2276 	mops = &vfsp->vfs_mntopts;
2277 	/*
2278 	 * Add tag if it doesn't already exist
2279 	 */
2280 	if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2281 		int len;
2282 
2283 		(void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2284 		len = strlen(buf);
2285 		if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2286 			err = ENAMETOOLONG;
2287 			goto out;
2288 		}
2289 		mop = vfs_addtag(mops, tag);
2290 	}
2291 	if ((mop->mo_flags & MO_TAG) == 0) {
2292 		err = EINVAL;
2293 		goto out;
2294 	}
2295 	vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2296 out:
2297 	vfs_list_unlock();
2298 	kmem_free(buf, MAX_MNTOPT_STR);
2299 	return (err);
2300 }
2301 
2302 /*
2303  * Allow users to remove arbitrary "tags" in a vfs's mount options.
2304  * Broader use within the kernel is discouraged.
2305  */
2306 int
2307 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2308     cred_t *cr)
2309 {
2310 	vfs_t *vfsp;
2311 	mntopt_t *mop;
2312 	int found = 0;
2313 	dev_t dev = makedevice(major, minor);
2314 	int err = 0;
2315 
2316 	/*
2317 	 * Find the desired mounted file system
2318 	 */
2319 	vfs_list_lock();
2320 	vfsp = rootvfs;
2321 	do {
2322 		if (vfsp->vfs_dev == dev &&
2323 		    strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2324 			found = 1;
2325 			break;
2326 		}
2327 		vfsp = vfsp->vfs_next;
2328 	} while (vfsp != rootvfs);
2329 
2330 	if (!found) {
2331 		err = EINVAL;
2332 		goto out;
2333 	}
2334 	err = secpolicy_fs_config(cr, vfsp);
2335 	if (err != 0)
2336 		goto out;
2337 
2338 	if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2339 		err = EINVAL;
2340 		goto out;
2341 	}
2342 	if ((mop->mo_flags & MO_TAG) == 0) {
2343 		err = EINVAL;
2344 		goto out;
2345 	}
2346 	vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2347 out:
2348 	vfs_list_unlock();
2349 	return (err);
2350 }
2351 
2352 /*
2353  * Function to parse an option string and fill in a mount options table.
2354  * Unknown options are silently ignored.  The input option string is modified
2355  * by replacing separators with nulls.  If the create flag is set, options
2356  * not found in the table are just added on the fly.  The table must have
2357  * an option slot marked MO_EMPTY to add an option on the fly.
2358  *
2359  * This function is *not* for general use by filesystems.
2360  *
2361  * Note: caller is responsible for locking the vfs list, if needed,
2362  *       to protect mops..
2363  */
2364 void
2365 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2366 {
2367 	char *s = osp, *p, *nextop, *valp, *cp, *ep;
2368 	int setflg = VFS_NOFORCEOPT;
2369 
2370 	if (osp == NULL)
2371 		return;
2372 	while (*s != '\0') {
2373 		p = strchr(s, ',');	/* find next option */
2374 		if (p == NULL) {
2375 			cp = NULL;
2376 			p = s + strlen(s);
2377 		} else {
2378 			cp = p;		/* save location of comma */
2379 			*p++ = '\0';	/* mark end and point to next option */
2380 		}
2381 		nextop = p;
2382 		p = strchr(s, '=');	/* look for value */
2383 		if (p == NULL) {
2384 			valp = NULL;	/* no value supplied */
2385 		} else {
2386 			ep = p;		/* save location of equals */
2387 			*p++ = '\0';	/* end option and point to value */
2388 			valp = p;
2389 		}
2390 		/*
2391 		 * set option into options table
2392 		 */
2393 		if (create)
2394 			setflg |= VFS_CREATEOPT;
2395 		vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2396 		if (cp != NULL)
2397 			*cp = ',';	/* restore the comma */
2398 		if (valp != NULL)
2399 			*ep = '=';	/* restore the equals */
2400 		s = nextop;
2401 	}
2402 }
2403 
2404 /*
2405  * Function to inquire if an option exists in a mount options table.
2406  * Returns a pointer to the option if it exists, else NULL.
2407  *
2408  * This function is *not* for general use by filesystems.
2409  *
2410  * Note: caller is responsible for locking the vfs list, if needed,
2411  *       to protect mops.
2412  */
2413 struct mntopt *
2414 vfs_hasopt(const mntopts_t *mops, const char *opt)
2415 {
2416 	struct mntopt *mop;
2417 	uint_t i, count;
2418 
2419 	count = mops->mo_count;
2420 	for (i = 0; i < count; i++) {
2421 		mop = &mops->mo_list[i];
2422 
2423 		if (mop->mo_flags & MO_EMPTY)
2424 			continue;
2425 		if (strcmp(opt, mop->mo_name) == 0)
2426 			return (mop);
2427 	}
2428 	return (NULL);
2429 }
2430 
2431 /*
2432  * Function to inquire if an option is set in a mount options table.
2433  * Returns non-zero if set and fills in the arg pointer with a pointer to
2434  * the argument string or NULL if there is no argument string.
2435  */
2436 static int
2437 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2438 {
2439 	struct mntopt *mop;
2440 	uint_t i, count;
2441 
2442 	count = mops->mo_count;
2443 	for (i = 0; i < count; i++) {
2444 		mop = &mops->mo_list[i];
2445 
2446 		if (mop->mo_flags & MO_EMPTY)
2447 			continue;
2448 		if (strcmp(opt, mop->mo_name))
2449 			continue;
2450 		if ((mop->mo_flags & MO_SET) == 0)
2451 			return (0);
2452 		if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2453 			*argp = mop->mo_arg;
2454 		return (1);
2455 	}
2456 	return (0);
2457 }
2458 
2459 
2460 int
2461 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2462 {
2463 	int ret;
2464 
2465 	vfs_list_read_lock();
2466 	ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2467 	vfs_list_unlock();
2468 	return (ret);
2469 }
2470 
2471 
2472 /*
2473  * Construct a comma separated string of the options set in the given
2474  * mount table, return the string in the given buffer.  Return non-zero if
2475  * the buffer would overflow.
2476  *
2477  * This function is *not* for general use by filesystems.
2478  *
2479  * Note: caller is responsible for locking the vfs list, if needed,
2480  *       to protect mp.
2481  */
2482 int
2483 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2484 {
2485 	char *cp;
2486 	uint_t i;
2487 
2488 	buf[0] = '\0';
2489 	cp = buf;
2490 	for (i = 0; i < mp->mo_count; i++) {
2491 		struct mntopt *mop;
2492 
2493 		mop = &mp->mo_list[i];
2494 		if (mop->mo_flags & MO_SET) {
2495 			int optlen, comma = 0;
2496 
2497 			if (buf[0] != '\0')
2498 				comma = 1;
2499 			optlen = strlen(mop->mo_name);
2500 			if (strlen(buf) + comma + optlen + 1 > len)
2501 				goto err;
2502 			if (comma)
2503 				*cp++ = ',';
2504 			(void) strcpy(cp, mop->mo_name);
2505 			cp += optlen;
2506 			/*
2507 			 * Append option value if there is one
2508 			 */
2509 			if (mop->mo_arg != NULL) {
2510 				int arglen;
2511 
2512 				arglen = strlen(mop->mo_arg);
2513 				if (strlen(buf) + arglen + 2 > len)
2514 					goto err;
2515 				*cp++ = '=';
2516 				(void) strcpy(cp, mop->mo_arg);
2517 				cp += arglen;
2518 			}
2519 		}
2520 	}
2521 	return (0);
2522 err:
2523 	return (EOVERFLOW);
2524 }
2525 
2526 static void
2527 vfs_freecancelopt(char **moc)
2528 {
2529 	if (moc != NULL) {
2530 		int ccnt = 0;
2531 		char **cp;
2532 
2533 		for (cp = moc; *cp != NULL; cp++) {
2534 			kmem_free(*cp, strlen(*cp) + 1);
2535 			ccnt++;
2536 		}
2537 		kmem_free(moc, (ccnt + 1) * sizeof (char *));
2538 	}
2539 }
2540 
2541 static void
2542 vfs_freeopt(mntopt_t *mop)
2543 {
2544 	if (mop->mo_name != NULL)
2545 		kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2546 
2547 	vfs_freecancelopt(mop->mo_cancel);
2548 
2549 	if (mop->mo_arg != NULL)
2550 		kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2551 }
2552 
2553 /*
2554  * Free a mount options table
2555  *
2556  * This function is *not* for general use by filesystems.
2557  *
2558  * Note: caller is responsible for locking the vfs list, if needed,
2559  *       to protect mp.
2560  */
2561 void
2562 vfs_freeopttbl(mntopts_t *mp)
2563 {
2564 	uint_t i, count;
2565 
2566 	count = mp->mo_count;
2567 	for (i = 0; i < count; i++) {
2568 		vfs_freeopt(&mp->mo_list[i]);
2569 	}
2570 	if (count) {
2571 		kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2572 		mp->mo_count = 0;
2573 		mp->mo_list = NULL;
2574 	}
2575 }
2576 
2577 
2578 /* ARGSUSED */
2579 static int
2580 vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2581 	caller_context_t *ct)
2582 {
2583 	return (0);
2584 }
2585 
2586 /* ARGSUSED */
2587 static int
2588 vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2589 	caller_context_t *ct)
2590 {
2591 	return (0);
2592 }
2593 
2594 /*
2595  * The dummy vnode is currently used only by file events notification
2596  * module which is just interested in the timestamps.
2597  */
2598 /* ARGSUSED */
2599 static int
2600 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
2601 {
2602 	bzero(vap, sizeof (vattr_t));
2603 	vap->va_type = VREG;
2604 	vap->va_nlink = 1;
2605 	vap->va_ctime = vfs_mnttab_ctime;
2606 	/*
2607 	 * it is ok to just copy mtime as the time will be monotonically
2608 	 * increasing.
2609 	 */
2610 	vap->va_mtime = vfs_mnttab_mtime;
2611 	vap->va_atime = vap->va_mtime;
2612 	return (0);
2613 }
2614 
2615 static void
2616 vfs_mnttabvp_setup(void)
2617 {
2618 	vnode_t *tvp;
2619 	vnodeops_t *vfs_mntdummyvnops;
2620 	const fs_operation_def_t mnt_dummyvnodeops_template[] = {
2621 		VOPNAME_READ, 		{ .vop_read = vfs_mntdummyread },
2622 		VOPNAME_WRITE, 		{ .vop_write = vfs_mntdummywrite },
2623 		VOPNAME_GETATTR,	{ .vop_getattr = vfs_mntdummygetattr },
2624 		VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
2625 		NULL,			NULL
2626 	};
2627 
2628 	if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
2629 	    &vfs_mntdummyvnops) != 0) {
2630 		cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
2631 		/* Shouldn't happen, but not bad enough to panic */
2632 		return;
2633 	}
2634 
2635 	/*
2636 	 * A global dummy vnode is allocated to represent mntfs files.
2637 	 * The mntfs file (/etc/mnttab) can be monitored for file events
2638 	 * and receive an event when mnttab changes. Dummy VOP calls
2639 	 * will be made on this vnode. The file events notification module
2640 	 * intercepts this vnode and delivers relevant events.
2641 	 */
2642 	tvp = vn_alloc(KM_SLEEP);
2643 	tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2644 	vn_setops(tvp, vfs_mntdummyvnops);
2645 	tvp->v_type = VREG;
2646 	/*
2647 	 * The mnt dummy ops do not reference v_data.
2648 	 * No other module intercepting this vnode should either.
2649 	 * Just set it to point to itself.
2650 	 */
2651 	tvp->v_data = (caddr_t)tvp;
2652 	tvp->v_vfsp = rootvfs;
2653 	vfs_mntdummyvp = tvp;
2654 }
2655 
2656 /*
2657  * performs fake read/write ops
2658  */
2659 static void
2660 vfs_mnttab_rwop(int rw)
2661 {
2662 	struct uio	uio;
2663 	struct iovec	iov;
2664 	char	buf[1];
2665 
2666 	if (vfs_mntdummyvp == NULL)
2667 		return;
2668 
2669 	bzero(&uio, sizeof (uio));
2670 	bzero(&iov, sizeof (iov));
2671 	iov.iov_base = buf;
2672 	iov.iov_len = 0;
2673 	uio.uio_iov = &iov;
2674 	uio.uio_iovcnt = 1;
2675 	uio.uio_loffset = 0;
2676 	uio.uio_segflg = UIO_SYSSPACE;
2677 	uio.uio_resid = 0;
2678 	if (rw) {
2679 		(void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2680 	} else {
2681 		(void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2682 	}
2683 }
2684 
/*
 * Generate a fake write operation on the mnttab dummy vnode.
 */
void
vfs_mnttab_writeop(void)
{
	const int do_write = 1;

	vfs_mnttab_rwop(do_write);
}
2693 
/*
 * Generate a fake read operation on the mnttab dummy vnode.
 */
void
vfs_mnttab_readop(void)
{
	const int do_write = 0;

	vfs_mnttab_rwop(do_write);
}
2702 
2703 /*
2704  * Free any mnttab information recorded in the vfs struct.
2705  * The vfs must not be on the vfs list.
2706  */
2707 static void
2708 vfs_freemnttab(struct vfs *vfsp)
2709 {
2710 	ASSERT(!VFS_ON_LIST(vfsp));
2711 
2712 	/*
2713 	 * Free device and mount point information
2714 	 */
2715 	if (vfsp->vfs_mntpt != NULL) {
2716 		refstr_rele(vfsp->vfs_mntpt);
2717 		vfsp->vfs_mntpt = NULL;
2718 	}
2719 	if (vfsp->vfs_resource != NULL) {
2720 		refstr_rele(vfsp->vfs_resource);
2721 		vfsp->vfs_resource = NULL;
2722 	}
2723 	/*
2724 	 * Now free mount options information
2725 	 */
2726 	vfs_freeopttbl(&vfsp->vfs_mntopts);
2727 }
2728 
2729 /*
2730  * Return the last mnttab modification time
2731  */
2732 void
2733 vfs_mnttab_modtime(timespec_t *ts)
2734 {
2735 	ASSERT(RW_LOCK_HELD(&vfslist));
2736 	*ts = vfs_mnttab_mtime;
2737 }
2738 
2739 /*
2740  * See if mnttab is changed
2741  */
2742 void
2743 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2744 {
2745 	int changed;
2746 
2747 	*phpp = (struct pollhead *)NULL;
2748 
2749 	/*
2750 	 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2751 	 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2752 	 * to not grab the vfs list lock because tv_sec is monotonically
2753 	 * increasing.
2754 	 */
2755 
2756 	changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2757 	    (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2758 	if (!changed) {
2759 		*phpp = &vfs_pollhd;
2760 	}
2761 }
2762 
2763 /*
2764  * Update the mnttab modification time and wake up any waiters for
2765  * mnttab changes
2766  */
2767 void
2768 vfs_mnttab_modtimeupd()
2769 {
2770 	hrtime_t oldhrt, newhrt;
2771 
2772 	ASSERT(RW_WRITE_HELD(&vfslist));
2773 	oldhrt = ts2hrt(&vfs_mnttab_mtime);
2774 	gethrestime(&vfs_mnttab_mtime);
2775 	newhrt = ts2hrt(&vfs_mnttab_mtime);
2776 	if (oldhrt == (hrtime_t)0)
2777 		vfs_mnttab_ctime = vfs_mnttab_mtime;
2778 	/*
2779 	 * Attempt to provide unique mtime (like uniqtime but not).
2780 	 */
2781 	if (newhrt == oldhrt) {
2782 		newhrt++;
2783 		hrt2ts(newhrt, &vfs_mnttab_mtime);
2784 	}
2785 	pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
2786 	vfs_mnttab_writeop();
2787 }
2788 
2789 int
2790 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
2791 {
2792 	vnode_t *coveredvp;
2793 	int error;
2794 	extern void teardown_vopstats(vfs_t *);
2795 
2796 	/*
2797 	 * Get covered vnode. This will be NULL if the vfs is not linked
2798 	 * into the file system name space (i.e., domount() with MNT_NOSPICE).
2799 	 */
2800 	coveredvp = vfsp->vfs_vnodecovered;
2801 	ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
2802 
2803 	/*
2804 	 * Purge all dnlc entries for this vfs.
2805 	 */
2806 	(void) dnlc_purge_vfsp(vfsp, 0);
2807 
2808 	/* For forcible umount, skip VFS_SYNC() since it may hang */
2809 	if ((flag & MS_FORCE) == 0)
2810 		(void) VFS_SYNC(vfsp, 0, cr);
2811 
2812 	/*
2813 	 * Lock the vfs to maintain fs status quo during unmount.  This
2814 	 * has to be done after the sync because ufs_update tries to acquire
2815 	 * the vfs_reflock.
2816 	 */
2817 	vfs_lock_wait(vfsp);
2818 
2819 	if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
2820 		vfs_unlock(vfsp);
2821 		if (coveredvp != NULL)
2822 			vn_vfsunlock(coveredvp);
2823 	} else if (coveredvp != NULL) {
2824 		teardown_vopstats(vfsp);
2825 		/*
2826 		 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
2827 		 * when it frees vfsp so we do a VN_HOLD() so we can
2828 		 * continue to use coveredvp afterwards.
2829 		 */
2830 		VN_HOLD(coveredvp);
2831 		vfs_remove(vfsp);
2832 		vn_vfsunlock(coveredvp);
2833 		VN_RELE(coveredvp);
2834 	} else {
2835 		teardown_vopstats(vfsp);
2836 		/*
2837 		 * Release the reference to vfs that is not linked
2838 		 * into the name space.
2839 		 */
2840 		vfs_unlock(vfsp);
2841 		VFS_RELE(vfsp);
2842 	}
2843 	return (error);
2844 }
2845 
2846 
2847 /*
2848  * Vfs_unmountall() is called by uadmin() to unmount all
2849  * mounted file systems (except the root file system) during shutdown.
2850  * It follows the existing locking protocol when traversing the vfs list
2851  * to sync and unmount vfses. Even though there should be no
2852  * other thread running while the system is shutting down, it is prudent
2853  * to still follow the locking protocol.
2854  */
2855 void
2856 vfs_unmountall(void)
2857 {
2858 	struct vfs *vfsp;
2859 	struct vfs *prev_vfsp = NULL;
2860 	int error;
2861 
2862 	/*
2863 	 * Toss all dnlc entries now so that the per-vfs sync
2864 	 * and unmount operations don't have to slog through
2865 	 * a bunch of uninteresting vnodes over and over again.
2866 	 */
2867 	dnlc_purge();
2868 
2869 	vfs_list_lock();
2870 	for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
2871 		prev_vfsp = vfsp->vfs_prev;
2872 
2873 		if (vfs_lock(vfsp) != 0)
2874 			continue;
2875 		error = vn_vfswlock(vfsp->vfs_vnodecovered);
2876 		vfs_unlock(vfsp);
2877 		if (error)
2878 			continue;
2879 
2880 		vfs_list_unlock();
2881 
2882 		(void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
2883 		(void) dounmount(vfsp, 0, CRED());
2884 
2885 		/*
2886 		 * Since we dropped the vfslist lock above we must
2887 		 * verify that next_vfsp still exists, else start over.
2888 		 */
2889 		vfs_list_lock();
2890 		for (vfsp = rootvfs->vfs_prev;
2891 		    vfsp != rootvfs; vfsp = vfsp->vfs_prev)
2892 			if (vfsp == prev_vfsp)
2893 				break;
2894 		if (vfsp == rootvfs && prev_vfsp != rootvfs)
2895 			prev_vfsp = rootvfs->vfs_prev;
2896 	}
2897 	vfs_list_unlock();
2898 }
2899 
2900 /*
2901  * Called to add an entry to the end of the vfs mount in progress list
2902  */
2903 void
2904 vfs_addmip(dev_t dev, struct vfs *vfsp)
2905 {
2906 	struct ipmnt *mipp;
2907 
2908 	mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
2909 	mipp->mip_next = NULL;
2910 	mipp->mip_dev = dev;
2911 	mipp->mip_vfsp = vfsp;
2912 	mutex_enter(&vfs_miplist_mutex);
2913 	if (vfs_miplist_end != NULL)
2914 		vfs_miplist_end->mip_next = mipp;
2915 	else
2916 		vfs_miplist = mipp;
2917 	vfs_miplist_end = mipp;
2918 	mutex_exit(&vfs_miplist_mutex);
2919 }
2920 
2921 /*
2922  * Called to remove an entry from the mount in progress list
2923  * Either because the mount completed or it failed.
2924  */
2925 void
2926 vfs_delmip(struct vfs *vfsp)
2927 {
2928 	struct ipmnt *mipp, *mipprev;
2929 
2930 	mutex_enter(&vfs_miplist_mutex);
2931 	mipprev = NULL;
2932 	for (mipp = vfs_miplist;
2933 	    mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
2934 		mipprev = mipp;
2935 	}
2936 	if (mipp == NULL)
2937 		return; /* shouldn't happen */
2938 	if (mipp == vfs_miplist_end)
2939 		vfs_miplist_end = mipprev;
2940 	if (mipprev == NULL)
2941 		vfs_miplist = mipp->mip_next;
2942 	else
2943 		mipprev->mip_next = mipp->mip_next;
2944 	mutex_exit(&vfs_miplist_mutex);
2945 	kmem_free(mipp, sizeof (struct ipmnt));
2946 }
2947 
2948 /*
2949  * vfs_add is called by a specific filesystem's mount routine to add
2950  * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
2951  * The vfs should already have been locked by the caller.
2952  *
2953  * coveredvp is NULL if this is the root.
2954  */
2955 void
2956 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
2957 {
2958 	int newflag;
2959 
2960 	ASSERT(vfs_lock_held(vfsp));
2961 	VFS_HOLD(vfsp);
2962 	newflag = vfsp->vfs_flag;
2963 	if (mflag & MS_RDONLY)
2964 		newflag |= VFS_RDONLY;
2965 	else
2966 		newflag &= ~VFS_RDONLY;
2967 	if (mflag & MS_NOSUID)
2968 		newflag |= (VFS_NOSETUID|VFS_NODEVICES);
2969 	else
2970 		newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
2971 	if (mflag & MS_NOMNTTAB)
2972 		newflag |= VFS_NOMNTTAB;
2973 	else
2974 		newflag &= ~VFS_NOMNTTAB;
2975 
2976 	if (coveredvp != NULL) {
2977 		ASSERT(vn_vfswlock_held(coveredvp));
2978 		coveredvp->v_vfsmountedhere = vfsp;
2979 		VN_HOLD(coveredvp);
2980 	}
2981 	vfsp->vfs_vnodecovered = coveredvp;
2982 	vfsp->vfs_flag = newflag;
2983 
2984 	vfs_list_add(vfsp);
2985 }
2986 
2987 /*
2988  * Remove a vfs from the vfs list, null out the pointer from the
2989  * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
2990  * from the vfs to the covered vnode (vfs_vnodecovered). Release the
2991  * reference to the vfs and to the covered vnode.
2992  *
2993  * Called from dounmount after it's confirmed with the file system
2994  * that the unmount is legal.
2995  */
2996 void
2997 vfs_remove(struct vfs *vfsp)
2998 {
2999 	vnode_t *vp;
3000 
3001 	ASSERT(vfs_lock_held(vfsp));
3002 
3003 	/*
3004 	 * Can't unmount root.  Should never happen because fs will
3005 	 * be busy.
3006 	 */
3007 	if (vfsp == rootvfs)
3008 		panic("vfs_remove: unmounting root");
3009 
3010 	vfs_list_remove(vfsp);
3011 
3012 	/*
3013 	 * Unhook from the file system name space.
3014 	 */
3015 	vp = vfsp->vfs_vnodecovered;
3016 	ASSERT(vn_vfswlock_held(vp));
3017 	vp->v_vfsmountedhere = NULL;
3018 	vfsp->vfs_vnodecovered = NULL;
3019 	VN_RELE(vp);
3020 
3021 	/*
3022 	 * Release lock and wakeup anybody waiting.
3023 	 */
3024 	vfs_unlock(vfsp);
3025 	VFS_RELE(vfsp);
3026 }
3027 
3028 /*
3029  * Lock a filesystem to prevent access to it while mounting,
3030  * unmounting and syncing.  Return EBUSY immediately if lock
3031  * can't be acquired.
3032  */
3033 int
3034 vfs_lock(vfs_t *vfsp)
3035 {
3036 	vn_vfslocks_entry_t *vpvfsentry;
3037 
3038 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3039 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3040 		return (0);
3041 
3042 	vn_vfslocks_rele(vpvfsentry);
3043 	return (EBUSY);
3044 }
3045 
3046 int
3047 vfs_rlock(vfs_t *vfsp)
3048 {
3049 	vn_vfslocks_entry_t *vpvfsentry;
3050 
3051 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3052 
3053 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3054 		return (0);
3055 
3056 	vn_vfslocks_rele(vpvfsentry);
3057 	return (EBUSY);
3058 }
3059 
3060 void
3061 vfs_lock_wait(vfs_t *vfsp)
3062 {
3063 	vn_vfslocks_entry_t *vpvfsentry;
3064 
3065 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3066 	rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3067 }
3068 
3069 void
3070 vfs_rlock_wait(vfs_t *vfsp)
3071 {
3072 	vn_vfslocks_entry_t *vpvfsentry;
3073 
3074 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3075 	rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3076 }
3077 
3078 /*
3079  * Unlock a locked filesystem.
3080  */
3081 void
3082 vfs_unlock(vfs_t *vfsp)
3083 {
3084 	vn_vfslocks_entry_t *vpvfsentry;
3085 
3086 	/*
3087 	 * vfs_unlock will mimic sema_v behaviour to fix 4748018.
3088 	 * And these changes should remain for the patch changes as it is.
3089 	 */
3090 	if (panicstr)
3091 		return;
3092 
3093 	/*
3094 	 * ve_refcount needs to be dropped twice here.
3095 	 * 1. To release refernce after a call to vfs_locks_getlock()
3096 	 * 2. To release the reference from the locking routines like
3097 	 *    vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,.
3098 	 */
3099 
3100 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3101 	vn_vfslocks_rele(vpvfsentry);
3102 
3103 	rwst_exit(&vpvfsentry->ve_lock);
3104 	vn_vfslocks_rele(vpvfsentry);
3105 }
3106 
3107 /*
3108  * Utility routine that allows a filesystem to construct its
3109  * fsid in "the usual way" - by munging some underlying dev_t and
3110  * the filesystem type number into the 64-bit fsid.  Note that
3111  * this implicitly relies on dev_t persistence to make filesystem
3112  * id's persistent.
3113  *
3114  * There's nothing to prevent an individual fs from constructing its
3115  * fsid in a different way, and indeed they should.
3116  *
3117  * Since we want fsids to be 32-bit quantities (so that they can be
3118  * exported identically by either 32-bit or 64-bit APIs, as well as
3119  * the fact that fsid's are "known" to NFS), we compress the device
3120  * number given down to 32-bits, and panic if that isn't possible.
3121  */
3122 void
3123 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3124 {
3125 	if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3126 		panic("device number too big for fsid!");
3127 	fsi->val[1] = val;
3128 }
3129 
3130 int
3131 vfs_lock_held(vfs_t *vfsp)
3132 {
3133 	int held;
3134 	vn_vfslocks_entry_t *vpvfsentry;
3135 
3136 	/*
3137 	 * vfs_lock_held will mimic sema_held behaviour
3138 	 * if panicstr is set. And these changes should remain
3139 	 * for the patch changes as it is.
3140 	 */
3141 	if (panicstr)
3142 		return (1);
3143 
3144 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3145 	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3146 
3147 	vn_vfslocks_rele(vpvfsentry);
3148 	return (held);
3149 }
3150 
3151 struct _kthread *
3152 vfs_lock_owner(vfs_t *vfsp)
3153 {
3154 	struct _kthread *owner;
3155 	vn_vfslocks_entry_t *vpvfsentry;
3156 
3157 	/*
3158 	 * vfs_wlock_held will mimic sema_held behaviour
3159 	 * if panicstr is set. And these changes should remain
3160 	 * for the patch changes as it is.
3161 	 */
3162 	if (panicstr)
3163 		return (NULL);
3164 
3165 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3166 	owner = rwst_owner(&vpvfsentry->ve_lock);
3167 
3168 	vn_vfslocks_rele(vpvfsentry);
3169 	return (owner);
3170 }
3171 
3172 /*
3173  * vfs list locking.
3174  *
3175  * Rather than manipulate the vfslist lock directly, we abstract into lock
3176  * and unlock routines to allow the locking implementation to be changed for
3177  * clustering.
3178  *
3179  * Whenever the vfs list is modified through its hash links, the overall list
3180  * lock must be obtained before locking the relevant hash bucket.  But to see
3181  * whether a given vfs is on the list, it suffices to obtain the lock for the
3182  * hash bucket without getting the overall list lock.  (See getvfs() below.)
3183  */
3184 
3185 void
3186 vfs_list_lock()
3187 {
3188 	rw_enter(&vfslist, RW_WRITER);
3189 }
3190 
3191 void
3192 vfs_list_read_lock()
3193 {
3194 	rw_enter(&vfslist, RW_READER);
3195 }
3196 
3197 void
3198 vfs_list_unlock()
3199 {
3200 	rw_exit(&vfslist);
3201 }
3202 
3203 /*
3204  * Low level worker routines for adding entries to and removing entries from
3205  * the vfs list.
3206  */
3207 
3208 static void
3209 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3210 {
3211 	int vhno;
3212 	struct vfs **hp;
3213 	dev_t dev;
3214 
3215 	ASSERT(RW_WRITE_HELD(&vfslist));
3216 
3217 	dev = expldev(vfsp->vfs_fsid.val[0]);
3218 	vhno = VFSHASH(getmajor(dev), getminor(dev));
3219 
3220 	mutex_enter(&rvfs_list[vhno].rvfs_lock);
3221 
3222 	/*
3223 	 * Link into the hash table, inserting it at the end, so that LOFS
3224 	 * with the same fsid as UFS (or other) file systems will not hide the
3225 	 * UFS.
3226 	 */
3227 	if (insert_at_head) {
3228 		vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3229 		rvfs_list[vhno].rvfs_head = vfsp;
3230 	} else {
3231 		for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3232 		    hp = &(*hp)->vfs_hash)
3233 			continue;
3234 		/*
3235 		 * hp now contains the address of the pointer to update
3236 		 * to effect the insertion.
3237 		 */
3238 		vfsp->vfs_hash = NULL;
3239 		*hp = vfsp;
3240 	}
3241 
3242 	rvfs_list[vhno].rvfs_len++;
3243 	mutex_exit(&rvfs_list[vhno].rvfs_lock);
3244 }
3245 
3246 
3247 static void
3248 vfs_hash_remove(struct vfs *vfsp)
3249 {
3250 	int vhno;
3251 	struct vfs *tvfsp;
3252 	dev_t dev;
3253 
3254 	ASSERT(RW_WRITE_HELD(&vfslist));
3255 
3256 	dev = expldev(vfsp->vfs_fsid.val[0]);
3257 	vhno = VFSHASH(getmajor(dev), getminor(dev));
3258 
3259 	mutex_enter(&rvfs_list[vhno].rvfs_lock);
3260 
3261 	/*
3262 	 * Remove from hash.
3263 	 */
3264 	if (rvfs_list[vhno].rvfs_head == vfsp) {
3265 		rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3266 		rvfs_list[vhno].rvfs_len--;
3267 		goto foundit;
3268 	}
3269 	for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3270 	    tvfsp = tvfsp->vfs_hash) {
3271 		if (tvfsp->vfs_hash == vfsp) {
3272 			tvfsp->vfs_hash = vfsp->vfs_hash;
3273 			rvfs_list[vhno].rvfs_len--;
3274 			goto foundit;
3275 		}
3276 	}
3277 	cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3278 
3279 foundit:
3280 
3281 	mutex_exit(&rvfs_list[vhno].rvfs_lock);
3282 }
3283 
3284 
3285 void
3286 vfs_list_add(struct vfs *vfsp)
3287 {
3288 	zone_t *zone;
3289 
3290 	/*
3291 	 * The zone that owns the mount is the one that performed the mount.
3292 	 * Note that this isn't necessarily the same as the zone mounted into.
3293 	 * The corresponding zone_rele() will be done when the vfs_t is
3294 	 * being free'd.
3295 	 */
3296 	vfsp->vfs_zone = curproc->p_zone;
3297 	zone_hold(vfsp->vfs_zone);
3298 
3299 	/*
3300 	 * Find the zone mounted into, and put this mount on its vfs list.
3301 	 */
3302 	zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3303 	ASSERT(zone != NULL);
3304 	/*
3305 	 * Special casing for the root vfs.  This structure is allocated
3306 	 * statically and hooked onto rootvfs at link time.  During the
3307 	 * vfs_mountroot call at system startup time, the root file system's
3308 	 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3309 	 * as argument.  The code below must detect and handle this special
3310 	 * case.  The only apparent justification for this special casing is
3311 	 * to ensure that the root file system appears at the head of the
3312 	 * list.
3313 	 *
3314 	 * XXX:	I'm assuming that it's ok to do normal list locking when
3315 	 *	adding the entry for the root file system (this used to be
3316 	 *	done with no locks held).
3317 	 */
3318 	vfs_list_lock();
3319 	/*
3320 	 * Link into the vfs list proper.
3321 	 */
3322 	if (vfsp == &root) {
3323 		/*
3324 		 * Assert: This vfs is already on the list as its first entry.
3325 		 * Thus, there's nothing to do.
3326 		 */
3327 		ASSERT(rootvfs == vfsp);
3328 		/*
3329 		 * Add it to the head of the global zone's vfslist.
3330 		 */
3331 		ASSERT(zone == global_zone);
3332 		ASSERT(zone->zone_vfslist == NULL);
3333 		zone->zone_vfslist = vfsp;
3334 	} else {
3335 		/*
3336 		 * Link to end of list using vfs_prev (as rootvfs is now a
3337 		 * doubly linked circular list) so list is in mount order for
3338 		 * mnttab use.
3339 		 */
3340 		rootvfs->vfs_prev->vfs_next = vfsp;
3341 		vfsp->vfs_prev = rootvfs->vfs_prev;
3342 		rootvfs->vfs_prev = vfsp;
3343 		vfsp->vfs_next = rootvfs;
3344 
3345 		/*
3346 		 * Do it again for the zone-private list (which may be NULL).
3347 		 */
3348 		if (zone->zone_vfslist == NULL) {
3349 			ASSERT(zone != global_zone);
3350 			zone->zone_vfslist = vfsp;
3351 		} else {
3352 			zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3353 			vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3354 			zone->zone_vfslist->vfs_zone_prev = vfsp;
3355 			vfsp->vfs_zone_next = zone->zone_vfslist;
3356 		}
3357 	}
3358 
3359 	/*
3360 	 * Link into the hash table, inserting it at the end, so that LOFS
3361 	 * with the same fsid as UFS (or other) file systems will not hide
3362 	 * the UFS.
3363 	 */
3364 	vfs_hash_add(vfsp, 0);
3365 
3366 	/*
3367 	 * update the mnttab modification time
3368 	 */
3369 	vfs_mnttab_modtimeupd();
3370 	vfs_list_unlock();
3371 	zone_rele(zone);
3372 }
3373 
3374 void
3375 vfs_list_remove(struct vfs *vfsp)
3376 {
3377 	zone_t *zone;
3378 
3379 	zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3380 	ASSERT(zone != NULL);
3381 	/*
3382 	 * Callers are responsible for preventing attempts to unmount the
3383 	 * root.
3384 	 */
3385 	ASSERT(vfsp != rootvfs);
3386 
3387 	vfs_list_lock();
3388 
3389 	/*
3390 	 * Remove from hash.
3391 	 */
3392 	vfs_hash_remove(vfsp);
3393 
3394 	/*
3395 	 * Remove from vfs list.
3396 	 */
3397 	vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3398 	vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3399 	vfsp->vfs_next = vfsp->vfs_prev = NULL;
3400 
3401 	/*
3402 	 * Remove from zone-specific vfs list.
3403 	 */
3404 	if (zone->zone_vfslist == vfsp)
3405 		zone->zone_vfslist = vfsp->vfs_zone_next;
3406 
3407 	if (vfsp->vfs_zone_next == vfsp) {
3408 		ASSERT(vfsp->vfs_zone_prev == vfsp);
3409 		ASSERT(zone->zone_vfslist == vfsp);
3410 		zone->zone_vfslist = NULL;
3411 	}
3412 
3413 	vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3414 	vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3415 	vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3416 
3417 	/*
3418 	 * update the mnttab modification time
3419 	 */
3420 	vfs_mnttab_modtimeupd();
3421 	vfs_list_unlock();
3422 	zone_rele(zone);
3423 }
3424 
3425 struct vfs *
3426 getvfs(fsid_t *fsid)
3427 {
3428 	struct vfs *vfsp;
3429 	int val0 = fsid->val[0];
3430 	int val1 = fsid->val[1];
3431 	dev_t dev = expldev(val0);
3432 	int vhno = VFSHASH(getmajor(dev), getminor(dev));
3433 	kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3434 
3435 	mutex_enter(hmp);
3436 	for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3437 		if (vfsp->vfs_fsid.val[0] == val0 &&
3438 		    vfsp->vfs_fsid.val[1] == val1) {
3439 			VFS_HOLD(vfsp);
3440 			mutex_exit(hmp);
3441 			return (vfsp);
3442 		}
3443 	}
3444 	mutex_exit(hmp);
3445 	return (NULL);
3446 }
3447 
3448 /*
3449  * Search the vfs mount in progress list for a specified device/vfs entry.
3450  * Returns 0 if the first entry in the list that the device matches has the
3451  * given vfs pointer as well.  If the device matches but a different vfs
3452  * pointer is encountered in the list before the given vfs pointer then
3453  * a 1 is returned.
3454  */
3455 
3456 int
3457 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3458 {
3459 	int retval = 0;
3460 	struct ipmnt *mipp;
3461 
3462 	mutex_enter(&vfs_miplist_mutex);
3463 	for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3464 		if (mipp->mip_dev == dev) {
3465 			if (mipp->mip_vfsp != vfsp)
3466 				retval = 1;
3467 			break;
3468 		}
3469 	}
3470 	mutex_exit(&vfs_miplist_mutex);
3471 	return (retval);
3472 }
3473 
3474 /*
3475  * Search the vfs list for a specified device.  Returns 1, if entry is found
3476  * or 0 if no suitable entry is found.
3477  */
3478 
3479 int
3480 vfs_devismounted(dev_t dev)
3481 {
3482 	struct vfs *vfsp;
3483 	int found;
3484 
3485 	vfs_list_read_lock();
3486 	vfsp = rootvfs;
3487 	found = 0;
3488 	do {
3489 		if (vfsp->vfs_dev == dev) {
3490 			found = 1;
3491 			break;
3492 		}
3493 		vfsp = vfsp->vfs_next;
3494 	} while (vfsp != rootvfs);
3495 
3496 	vfs_list_unlock();
3497 	return (found);
3498 }
3499 
3500 /*
3501  * Search the vfs list for a specified device.  Returns a pointer to it
3502  * or NULL if no suitable entry is found. The caller of this routine
3503  * is responsible for releasing the returned vfs pointer.
3504  */
3505 struct vfs *
3506 vfs_dev2vfsp(dev_t dev)
3507 {
3508 	struct vfs *vfsp;
3509 	int found;
3510 
3511 	vfs_list_read_lock();
3512 	vfsp = rootvfs;
3513 	found = 0;
3514 	do {
3515 		/*
3516 		 * The following could be made more efficient by making
3517 		 * the entire loop use vfs_zone_next if the call is from
3518 		 * a zone.  The only callers, however, ustat(2) and
3519 		 * umount2(2), don't seem to justify the added
3520 		 * complexity at present.
3521 		 */
3522 		if (vfsp->vfs_dev == dev &&
3523 		    ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3524 		    curproc->p_zone)) {
3525 			VFS_HOLD(vfsp);
3526 			found = 1;
3527 			break;
3528 		}
3529 		vfsp = vfsp->vfs_next;
3530 	} while (vfsp != rootvfs);
3531 	vfs_list_unlock();
3532 	return (found ? vfsp: NULL);
3533 }
3534 
3535 /*
3536  * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3537  * or NULL if no suitable entry is found. The caller of this routine
3538  * is responsible for releasing the returned vfs pointer.
3539  *
3540  * Note that if multiple mntpoints match, the last one matching is
3541  * returned in an attempt to return the "top" mount when overlay
3542  * mounts are covering the same mount point.  This is accomplished by starting
3543  * at the end of the list and working our way backwards, stopping at the first
3544  * matching mount.
3545  */
3546 struct vfs *
3547 vfs_mntpoint2vfsp(const char *mp)
3548 {
3549 	struct vfs *vfsp;
3550 	struct vfs *retvfsp = NULL;
3551 	zone_t *zone = curproc->p_zone;
3552 	struct vfs *list;
3553 
3554 	vfs_list_read_lock();
3555 	if (getzoneid() == GLOBAL_ZONEID) {
3556 		/*
3557 		 * The global zone may see filesystems in any zone.
3558 		 */
3559 		vfsp = rootvfs->vfs_prev;
3560 		do {
3561 			if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3562 				retvfsp = vfsp;
3563 				break;
3564 			}
3565 			vfsp = vfsp->vfs_prev;
3566 		} while (vfsp != rootvfs->vfs_prev);
3567 	} else if ((list = zone->zone_vfslist) != NULL) {
3568 		const char *mntpt;
3569 
3570 		vfsp = list->vfs_zone_prev;
3571 		do {
3572 			mntpt = refstr_value(vfsp->vfs_mntpt);
3573 			mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3574 			if (strcmp(mntpt, mp) == 0) {
3575 				retvfsp = vfsp;
3576 				break;
3577 			}
3578 			vfsp = vfsp->vfs_zone_prev;
3579 		} while (vfsp != list->vfs_zone_prev);
3580 	}
3581 	if (retvfsp)
3582 		VFS_HOLD(retvfsp);
3583 	vfs_list_unlock();
3584 	return (retvfsp);
3585 }
3586 
3587 /*
3588  * Search the vfs list for a specified vfsops.
3589  * if vfs entry is found then return 1, else 0.
3590  */
3591 int
3592 vfs_opsinuse(vfsops_t *ops)
3593 {
3594 	struct vfs *vfsp;
3595 	int found;
3596 
3597 	vfs_list_read_lock();
3598 	vfsp = rootvfs;
3599 	found = 0;
3600 	do {
3601 		if (vfs_getops(vfsp) == ops) {
3602 			found = 1;
3603 			break;
3604 		}
3605 		vfsp = vfsp->vfs_next;
3606 	} while (vfsp != rootvfs);
3607 	vfs_list_unlock();
3608 	return (found);
3609 }
3610 
3611 /*
3612  * Allocate an entry in vfssw for a file system type
3613  */
3614 struct vfssw *
3615 allocate_vfssw(char *type)
3616 {
3617 	struct vfssw *vswp;
3618 
3619 	if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3620 		/*
3621 		 * The vfssw table uses the empty string to identify an
3622 		 * available entry; we cannot add any type which has
3623 		 * a leading NUL. The string length is limited to
3624 		 * the size of the st_fstype array in struct stat.
3625 		 */
3626 		return (NULL);
3627 	}
3628 
3629 	ASSERT(VFSSW_WRITE_LOCKED());
3630 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3631 		if (!ALLOCATED_VFSSW(vswp)) {
3632 			vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3633 			(void) strcpy(vswp->vsw_name, type);
3634 			ASSERT(vswp->vsw_count == 0);
3635 			vswp->vsw_count = 1;
3636 			mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3637 			return (vswp);
3638 		}
3639 	return (NULL);
3640 }
3641 
/*
 * Map a vfs type name to the name of the module that implements it:
 * "proc" maps to "procfs", "fd" maps to "fdfs", and any name beginning with
 * "nfs" maps to "nfs".  Every other name is returned unchanged (the same
 * pointer that was passed in).
 */
static char *
vfs_to_modname(char *vfstype)
{
	if (strcmp(vfstype, "proc") == 0)
		return ("procfs");

	if (strcmp(vfstype, "fd") == 0)
		return ("fdfs");

	/* Prefix match only. */
	if (strncmp(vfstype, "nfs", 3) == 0)
		return ("nfs");

	return (vfstype);
}
3659 
3660 /*
3661  * Find a vfssw entry given a file system type name.
3662  * Try to autoload the filesystem if it's not found.
3663  * If it's installed, return the vfssw locked to prevent unloading.
3664  */
3665 struct vfssw *
3666 vfs_getvfssw(char *type)
3667 {
3668 	struct vfssw *vswp;
3669 	char	*modname;
3670 
3671 	RLOCK_VFSSW();
3672 	vswp = vfs_getvfsswbyname(type);
3673 	modname = vfs_to_modname(type);
3674 
3675 	if (rootdir == NULL) {
3676 		/*
3677 		 * If we haven't yet loaded the root file system, then our
3678 		 * _init won't be called until later. Allocate vfssw entry,
3679 		 * because mod_installfs won't be called.
3680 		 */
3681 		if (vswp == NULL) {
3682 			RUNLOCK_VFSSW();
3683 			WLOCK_VFSSW();
3684 			if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3685 				if ((vswp = allocate_vfssw(type)) == NULL) {
3686 					WUNLOCK_VFSSW();
3687 					return (NULL);
3688 				}
3689 			}
3690 			WUNLOCK_VFSSW();
3691 			RLOCK_VFSSW();
3692 		}
3693 		if (!VFS_INSTALLED(vswp)) {
3694 			RUNLOCK_VFSSW();
3695 			(void) modloadonly("fs", modname);
3696 		} else
3697 			RUNLOCK_VFSSW();
3698 		return (vswp);
3699 	}
3700 
3701 	/*
3702 	 * Try to load the filesystem.  Before calling modload(), we drop
3703 	 * our lock on the VFS switch table, and pick it up after the
3704 	 * module is loaded.  However, there is a potential race:  the
3705 	 * module could be unloaded after the call to modload() completes
3706 	 * but before we pick up the lock and drive on.  Therefore,
3707 	 * we keep reloading the module until we've loaded the module
3708 	 * _and_ we have the lock on the VFS switch table.
3709 	 */
3710 	while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3711 		RUNLOCK_VFSSW();
3712 		if (modload("fs", modname) == -1)
3713 			return (NULL);
3714 		RLOCK_VFSSW();
3715 		if (vswp == NULL)
3716 			if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3717 				break;
3718 	}
3719 	RUNLOCK_VFSSW();
3720 
3721 	return (vswp);
3722 }
3723 
3724 /*
3725  * Find a vfssw entry given a file system type name.
3726  */
3727 struct vfssw *
3728 vfs_getvfsswbyname(char *type)
3729 {
3730 	struct vfssw *vswp;
3731 
3732 	ASSERT(VFSSW_LOCKED());
3733 	if (type == NULL || *type == '\0')
3734 		return (NULL);
3735 
3736 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3737 		if (strcmp(type, vswp->vsw_name) == 0) {
3738 			vfs_refvfssw(vswp);
3739 			return (vswp);
3740 		}
3741 	}
3742 
3743 	return (NULL);
3744 }
3745 
3746 /*
3747  * Find a vfssw entry given a set of vfsops.
3748  */
3749 struct vfssw *
3750 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3751 {
3752 	struct vfssw *vswp;
3753 
3754 	RLOCK_VFSSW();
3755 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3756 		if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3757 			vfs_refvfssw(vswp);
3758 			RUNLOCK_VFSSW();
3759 			return (vswp);
3760 		}
3761 	}
3762 	RUNLOCK_VFSSW();
3763 
3764 	return (NULL);
3765 }
3766 
3767 /*
3768  * Reference a vfssw entry.
3769  */
3770 void
3771 vfs_refvfssw(struct vfssw *vswp)
3772 {
3773 
3774 	mutex_enter(&vswp->vsw_lock);
3775 	vswp->vsw_count++;
3776 	mutex_exit(&vswp->vsw_lock);
3777 }
3778 
3779 /*
3780  * Unreference a vfssw entry.
3781  */
3782 void
3783 vfs_unrefvfssw(struct vfssw *vswp)
3784 {
3785 
3786 	mutex_enter(&vswp->vsw_lock);
3787 	vswp->vsw_count--;
3788 	mutex_exit(&vswp->vsw_lock);
3789 }
3790 
/* timeout for syncing a page during panic */
int sync_timeout = 30;
/* portion of sync_timeout remaining */
int sync_timeleft;

/* number of retries when not making progress */
static int sync_retries = 20;
/* portion of sync_retries remaining */
static int sync_triesleft;
3796 
3797 static pgcnt_t old_pgcnt, new_pgcnt;
3798 static int new_bufcnt, old_bufcnt;
3799 
3800 /*
3801  * Sync all of the mounted filesystems, and then wait for the actual i/o to
3802  * complete.  We wait by counting the number of dirty pages and buffers,
3803  * pushing them out using bio_busy() and page_busy(), and then counting again.
3804  * This routine is used during both the uadmin A_SHUTDOWN code as well as
3805  * the SYNC phase of the panic code (see comments in panic.c).  It should only
3806  * be used after some higher-level mechanism has quiesced the system so that
3807  * new writes are not being initiated while we are waiting for completion.
3808  *
3809  * To ensure finite running time, our algorithm uses two timeout mechanisms:
3810  * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
3811  * sync_triesleft (a progress counter used by the vfs_syncall() loop below).
3812  * Together these ensure that syncing completes if our i/o paths are stuck.
3813  * The counters are declared above so they can be found easily in the debugger.
3814  *
3815  * The sync_timeleft counter is reset by bio_busy() and page_busy() using the
3816  * vfs_syncprogress() subroutine whenever we make progress through the lists of
3817  * pages and buffers.  It is decremented and expired by the deadman() cyclic.
3818  * When vfs_syncall() decides it is done, we disable the deadman() counter by
3819  * setting sync_timeleft to zero.  This timer guards against vfs_syncall()
3820  * deadlocking or hanging inside of a broken filesystem or driver routine.
3821  *
3822  * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
3823  * sync_retries consecutive calls to bio_busy() and page_busy() without
3824  * decreasing either the number of dirty buffers or dirty pages below the
3825  * lowest count we have seen so far, we give up and return from vfs_syncall().
3826  *
3827  * Each loop iteration ends with a call to delay() one second to allow time for
3828  * i/o completion and to permit the user time to read our progress messages.
3829  */
3830 void
3831 vfs_syncall(void)
3832 {
3833 	if (rootdir == NULL && !modrootloaded)
3834 		return; /* panic during boot - no filesystems yet */
3835 
3836 	printf("syncing file systems...");
3837 	vfs_syncprogress();
3838 	sync();
3839 
3840 	vfs_syncprogress();
3841 	sync_triesleft = sync_retries;
3842 
3843 	old_bufcnt = new_bufcnt = INT_MAX;
3844 	old_pgcnt = new_pgcnt = ULONG_MAX;
3845 
3846 	while (sync_triesleft > 0) {
3847 		old_bufcnt = MIN(old_bufcnt, new_bufcnt);
3848 		old_pgcnt = MIN(old_pgcnt, new_pgcnt);
3849 
3850 		new_bufcnt = bio_busy(B_TRUE);
3851 		new_pgcnt = page_busy(B_TRUE);
3852 		vfs_syncprogress();
3853 
3854 		if (new_bufcnt == 0 && new_pgcnt == 0)
3855 			break;
3856 
3857 		if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
3858 			sync_triesleft = sync_retries;
3859 		else
3860 			sync_triesleft--;
3861 
3862 		if (new_bufcnt)
3863 			printf(" [%d]", new_bufcnt);
3864 		if (new_pgcnt)
3865 			printf(" %lu", new_pgcnt);
3866 
3867 		delay(hz);
3868 	}
3869 
3870 	if (new_bufcnt != 0 || new_pgcnt != 0)
3871 		printf(" done (not all i/o completed)\n");
3872 	else
3873 		printf(" done\n");
3874 
3875 	sync_timeleft = 0;
3876 	delay(hz);
3877 }
3878 
3879 /*
3880  * If we are in the middle of the sync phase of panic, reset sync_timeleft to
3881  * sync_timeout to indicate that we are making progress and the deadman()
3882  * omnipresent cyclic should not yet time us out.  Note that it is safe to
3883  * store to sync_timeleft here since the deadman() is firing at high-level
3884  * on top of us.  If we are racing with the deadman(), either the deadman()
3885  * will decrement the old value and then we will reset it, or we will
3886  * reset it and then the deadman() will immediately decrement it.  In either
3887  * case, correct behavior results.
3888  */
3889 void
3890 vfs_syncprogress(void)
3891 {
3892 	if (panicstr)
3893 		sync_timeleft = sync_timeout;
3894 }
3895 
3896 /*
3897  * Map VFS flags to statvfs flags.  These shouldn't really be separate
3898  * flags at all.
3899  */
3900 uint_t
3901 vf_to_stf(uint_t vf)
3902 {
3903 	uint_t stf = 0;
3904 
3905 	if (vf & VFS_RDONLY)
3906 		stf |= ST_RDONLY;
3907 	if (vf & VFS_NOSETUID)
3908 		stf |= ST_NOSUID;
3909 	if (vf & VFS_NOTRUNC)
3910 		stf |= ST_NOTRUNC;
3911 
3912 	return (stf);
3913 }
3914 
3915 /*
3916  * Entries for (illegal) fstype 0.
3917  */
3918 /* ARGSUSED */
3919 int
3920 vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
3921 {
3922 	cmn_err(CE_PANIC, "stray vfs operation");
3923 	return (0);
3924 }
3925 
3926 /*
3927  * Entries for (illegal) fstype 0.
3928  */
3929 int
3930 vfsstray(void)
3931 {
3932 	cmn_err(CE_PANIC, "stray vfs operation");
3933 	return (0);
3934 }
3935 
/*
 * Operation that unconditionally fails with EIO.
 *
 * Support for dealing with forced UFS unmount and its interaction with
 * LOFS.  Could be used by any filesystem.  See bug 1203132.
 */
int
vfs_EIO(void)
{
	return (EIO);
}
3946 
/*
 * Sync operation that unconditionally fails with EIO; the arguments are not
 * used.
 *
 * The op for sync has to be defined separately, since the compiler gets
 * confused if we mix and match ANSI and normal style prototypes when a
 * "short" argument is present and spits out a warning.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	return (EIO);
}
3958 
3959 vfs_t EIO_vfs;
3960 vfsops_t *EIO_vfsops;
3961 
3962 /*
3963  * Called from startup() to initialize all loaded vfs's
3964  */
3965 void
3966 vfsinit(void)
3967 {
3968 	struct vfssw *vswp;
3969 	int error;
3970 	extern int vopstats_enabled;
3971 	extern void vopstats_startup();
3972 
3973 	static const fs_operation_def_t EIO_vfsops_template[] = {
3974 		VFSNAME_MOUNT,		{ .error = vfs_EIO },
3975 		VFSNAME_UNMOUNT,	{ .error = vfs_EIO },
3976 		VFSNAME_ROOT,		{ .error = vfs_EIO },
3977 		VFSNAME_STATVFS,	{ .error = vfs_EIO },
3978 		VFSNAME_SYNC, 		{ .vfs_sync = vfs_EIO_sync },
3979 		VFSNAME_VGET,		{ .error = vfs_EIO },
3980 		VFSNAME_MOUNTROOT,	{ .error = vfs_EIO },
3981 		VFSNAME_FREEVFS,	{ .error = vfs_EIO },
3982 		VFSNAME_VNSTATE,	{ .error = vfs_EIO },
3983 		NULL, NULL
3984 	};
3985 
3986 	static const fs_operation_def_t stray_vfsops_template[] = {
3987 		VFSNAME_MOUNT,		{ .error = vfsstray },
3988 		VFSNAME_UNMOUNT,	{ .error = vfsstray },
3989 		VFSNAME_ROOT,		{ .error = vfsstray },
3990 		VFSNAME_STATVFS,	{ .error = vfsstray },
3991 		VFSNAME_SYNC, 		{ .vfs_sync = vfsstray_sync },
3992 		VFSNAME_VGET,		{ .error = vfsstray },
3993 		VFSNAME_MOUNTROOT,	{ .error = vfsstray },
3994 		VFSNAME_FREEVFS,	{ .error = vfsstray },
3995 		VFSNAME_VNSTATE,	{ .error = vfsstray },
3996 		NULL, NULL
3997 	};
3998 
3999 	/* Initialize the vnode cache (file systems may use it during init). */
4000 
4001 	vn_create_cache();
4002 
4003 	/* Setup event monitor framework */
4004 
4005 	fem_init();
4006 
4007 	/* Initialize the dummy stray file system type. */
4008 	error = vfs_setfsops(0, stray_vfsops_template, NULL);
4009 
4010 	/* Initialize the dummy EIO file system. */
4011 	error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4012 	if (error != 0) {
4013 		cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4014 		/* Shouldn't happen, but not bad enough to panic */
4015 	}
4016 
4017 	VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4018 
4019 	/*
4020 	 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4021 	 * on this vfs can immediately notice it's invalid.
4022 	 */
4023 	EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4024 
4025 	/*
4026 	 * Call the init routines of non-loadable filesystems only.
4027 	 * Filesystems which are loaded as separate modules will be
4028 	 * initialized by the module loading code instead.
4029 	 */
4030 
4031 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4032 		RLOCK_VFSSW();
4033 		if (vswp->vsw_init != NULL)
4034 			(*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4035 		RUNLOCK_VFSSW();
4036 	}
4037 
4038 	vopstats_startup();
4039 
4040 	if (vopstats_enabled) {
4041 		/* EIO_vfs can collect stats, but we don't retrieve them */
4042 		initialize_vopstats(&EIO_vfs.vfs_vopstats);
4043 		EIO_vfs.vfs_fstypevsp = NULL;
4044 		EIO_vfs.vfs_vskap = NULL;
4045 		EIO_vfs.vfs_flag |= VFS_STATS;
4046 	}
4047 }
4048 
4049 /*
4050  * Increments the vfs reference count by one atomically.
4051  */
4052 void
4053 vfs_hold(vfs_t *vfsp)
4054 {
4055 	atomic_add_32(&vfsp->vfs_count, 1);
4056 	ASSERT(vfsp->vfs_count != 0);
4057 }
4058 
4059 /*
4060  * Decrements the vfs reference count by one atomically. When
4061  * vfs reference count becomes zero, it calls the file system
4062  * specific vfs_freevfs() to free up the resources.
4063  */
4064 void
4065 vfs_rele(vfs_t *vfsp)
4066 {
4067 	ASSERT(vfsp->vfs_count != 0);
4068 	if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) {
4069 		VFS_FREEVFS(vfsp);
4070 		if (vfsp->vfs_zone)
4071 			zone_rele(vfsp->vfs_zone);
4072 		vfs_freemnttab(vfsp);
4073 		if (vfsp->vfs_implp)
4074 			vfsimpl_teardown(vfsp);
4075 		sema_destroy(&vfsp->vfs_reflock);
4076 		kmem_free(vfsp, sizeof (*vfsp));
4077 	}
4078 }
4079 
4080 /*
4081  * Generic operations vector support.
4082  *
4083  * This is used to build operations vectors for both the vfs and vnode.
4084  * It's normally called only when a file system is loaded.
4085  *
4086  * There are many possible algorithms for this, including the following:
4087  *
4088  *   (1) scan the list of known operations; for each, see if the file system
4089  *       includes an entry for it, and fill it in as appropriate.
4090  *
4091  *   (2) set up defaults for all known operations.  scan the list of ops
4092  *       supplied by the file system; for each which is both supplied and
4093  *       known, fill it in.
4094  *
4095  *   (3) sort the lists of known ops & supplied ops; scan the list, filling
4096  *       in entries as we go.
4097  *
4098  * we choose (1) for simplicity, and because performance isn't critical here.
4099  * note that (2) could be sped up using a precomputed hash table on known ops.
4100  * (3) could be faster than either, but only if the lists were very large or
4101  * supplied in sorted order.
4102  *
4103  */
4104 
4105 int
4106 fs_build_vector(void *vector, int *unused_ops,
4107     const fs_operation_trans_def_t *translation,
4108     const fs_operation_def_t *operations)
4109 {
4110 	int i, num_trans, num_ops, used;
4111 
4112 	/*
4113 	 * Count the number of translations and the number of supplied
4114 	 * operations.
4115 	 */
4116 
4117 	{
4118 		const fs_operation_trans_def_t *p;
4119 
4120 		for (num_trans = 0, p = translation;
4121 		    p->name != NULL;
4122 		    num_trans++, p++)
4123 			;
4124 	}
4125 
4126 	{
4127 		const fs_operation_def_t *p;
4128 
4129 		for (num_ops = 0, p = operations;
4130 		    p->name != NULL;
4131 		    num_ops++, p++)
4132 			;
4133 	}
4134 
4135 	/* Walk through each operation known to our caller.  There will be */
4136 	/* one entry in the supplied "translation table" for each. */
4137 
4138 	used = 0;
4139 
4140 	for (i = 0; i < num_trans; i++) {
4141 		int j, found;
4142 		char *curname;
4143 		fs_generic_func_p result;
4144 		fs_generic_func_p *location;
4145 
4146 		curname = translation[i].name;
4147 
4148 		/* Look for a matching operation in the list supplied by the */
4149 		/* file system. */
4150 
4151 		found = 0;
4152 
4153 		for (j = 0; j < num_ops; j++) {
4154 			if (strcmp(operations[j].name, curname) == 0) {
4155 				used++;
4156 				found = 1;
4157 				break;
4158 			}
4159 		}
4160 
4161 		/*
4162 		 * If the file system is using a "placeholder" for default
4163 		 * or error functions, grab the appropriate function out of
4164 		 * the translation table.  If the file system didn't supply
4165 		 * this operation at all, use the default function.
4166 		 */
4167 
4168 		if (found) {
4169 			result = operations[j].func.fs_generic;
4170 			if (result == fs_default) {
4171 				result = translation[i].defaultFunc;
4172 			} else if (result == fs_error) {
4173 				result = translation[i].errorFunc;
4174 			} else if (result == NULL) {
4175 				/* Null values are PROHIBITED */
4176 				return (EINVAL);
4177 			}
4178 		} else {
4179 			result = translation[i].defaultFunc;
4180 		}
4181 
4182 		/* Now store the function into the operations vector. */
4183 
4184 		location = (fs_generic_func_p *)
4185 		    (((char *)vector) + translation[i].offset);
4186 
4187 		*location = result;
4188 	}
4189 
4190 	*unused_ops = num_ops - used;
4191 
4192 	return (0);
4193 }
4194 
4195 /* Placeholder functions, should never be called. */
4196 
4197 int
4198 fs_error(void)
4199 {
4200 	cmn_err(CE_PANIC, "fs_error called");
4201 	return (0);
4202 }
4203 
4204 int
4205 fs_default(void)
4206 {
4207 	cmn_err(CE_PANIC, "fs_default called");
4208 	return (0);
4209 }
4210 
4211 #ifdef __sparc
4212 
4213 /*
4214  * Part of the implementation of booting off a mirrored root
4215  * involves a change of dev_t for the root device.  To
4216  * accomplish this, first remove the existing hash table
4217  * entry for the root device, convert to the new dev_t,
4218  * then re-insert in the hash table at the head of the list.
4219  */
4220 void
4221 vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
4222 {
4223 	vfs_list_lock();
4224 
4225 	vfs_hash_remove(vfsp);
4226 
4227 	vfsp->vfs_dev = ndev;
4228 	vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);
4229 
4230 	vfs_hash_add(vfsp, 1);
4231 
4232 	vfs_list_unlock();
4233 }
4234 
4235 #else /* x86 NEWBOOT */
4236 
4237 int
4238 rootconf()
4239 {
4240 	int error;
4241 	struct vfssw *vsw;
4242 	extern void pm_init();
4243 	char *fstyp, *fsmod;
4244 
4245 	getrootfs(&fstyp, &fsmod);
4246 
4247 	if (error = clboot_rootconf())
4248 		return (error);
4249 
4250 	if (modload("fs", fsmod) == -1)
4251 		panic("Cannot _init %s module", fsmod);
4252 
4253 	RLOCK_VFSSW();
4254 	vsw = vfs_getvfsswbyname(fstyp);
4255 	RUNLOCK_VFSSW();
4256 	VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4257 	VFS_HOLD(rootvfs);
4258 
4259 	/* always mount readonly first */
4260 	rootvfs->vfs_flag |= VFS_RDONLY;
4261 
4262 	pm_init();
4263 
4264 	if (netboot)
4265 		(void) strplumb();
4266 
4267 	error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4268 	vfs_unrefvfssw(vsw);
4269 	rootdev = rootvfs->vfs_dev;
4270 
4271 	if (error)
4272 		panic("cannot mount root path %s", rootfs.bo_name);
4273 	return (error);
4274 }
4275 
4276 /*
4277  * XXX this is called by nfs only and should probably be removed
4278  * If booted with ASKNAME, prompt on the console for a filesystem
4279  * name and return it.
4280  */
4281 void
4282 getfsname(char *askfor, char *name, size_t namelen)
4283 {
4284 	if (boothowto & RB_ASKNAME) {
4285 		printf("%s name: ", askfor);
4286 		console_gets(name, namelen);
4287 	}
4288 }
4289 
4290 /*
4291  * If server_path exists, then we are booting a diskless
4292  * client. Otherwise, we default to ufs. Zfs should perhaps be
4293  * another property.
4294  */
4295 static void
4296 getrootfs(char **fstypp, char **fsmodp)
4297 {
4298 	extern char *strplumb_get_netdev_path(void);
4299 	char *propstr = NULL;
4300 
4301 	/* check fstype property; it should be nfsdyn for diskless */
4302 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4303 	    DDI_PROP_DONTPASS, "fstype", &propstr)
4304 	    == DDI_SUCCESS) {
4305 		(void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4306 		ddi_prop_free(propstr);
4307 
4308 	/*
4309 	 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4310 	 * assume the type of this root filesystem is 'zfs'.
4311 	 */
4312 	} else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4313 	    DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4314 	    == DDI_SUCCESS) {
4315 		(void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4316 		ddi_prop_free(propstr);
4317 	}
4318 
4319 	if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4320 		*fstypp = *fsmodp = rootfs.bo_fstype;
4321 		return;
4322 	}
4323 
4324 	++netboot;
4325 	/*
4326 	 * check if path to network interface is specified in bootpath
4327 	 * or by a hypervisor domain configuration file.
4328 	 * XXPV - enable strlumb_get_netdev_path()
4329 	 */
4330 	if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4331 	    "xpv-nfsroot")) {
4332 		(void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4333 	} else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4334 	    DDI_PROP_DONTPASS, "bootpath", &propstr)
4335 	    == DDI_SUCCESS) {
4336 		(void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4337 		ddi_prop_free(propstr);
4338 	} else {
4339 		/* attempt to determine netdev_path via boot_mac address */
4340 		netdev_path = strplumb_get_netdev_path();
4341 		if (netdev_path == NULL)
4342 			panic("cannot find boot network interface");
4343 		(void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME);
4344 	}
4345 	*fstypp = rootfs.bo_fstype;
4346 	*fsmodp = "nfs";
4347 }
4348 #endif
4349