xref: /titanic_44/usr/src/uts/common/fs/vfs.c (revision b60f2a0b921611326383e4789e0874e9e8a2e708)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 #include <sys/types.h>
43 #include <sys/t_lock.h>
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <sys/user.h>
47 #include <sys/fstyp.h>
48 #include <sys/kmem.h>
49 #include <sys/systm.h>
50 #include <sys/proc.h>
51 #include <sys/mount.h>
52 #include <sys/vfs.h>
53 #include <sys/vfs_opreg.h>
54 #include <sys/fem.h>
55 #include <sys/mntent.h>
56 #include <sys/stat.h>
57 #include <sys/statvfs.h>
58 #include <sys/statfs.h>
59 #include <sys/cred.h>
60 #include <sys/vnode.h>
61 #include <sys/rwstlock.h>
62 #include <sys/dnlc.h>
63 #include <sys/file.h>
64 #include <sys/time.h>
65 #include <sys/atomic.h>
66 #include <sys/cmn_err.h>
67 #include <sys/buf.h>
68 #include <sys/swap.h>
69 #include <sys/debug.h>
70 #include <sys/vnode.h>
71 #include <sys/modctl.h>
72 #include <sys/ddi.h>
73 #include <sys/pathname.h>
74 #include <sys/bootconf.h>
75 #include <sys/dumphdr.h>
76 #include <sys/dc_ki.h>
77 #include <sys/poll.h>
78 #include <sys/sunddi.h>
79 #include <sys/sysmacros.h>
80 #include <sys/zone.h>
81 #include <sys/policy.h>
82 #include <sys/ctfs.h>
83 #include <sys/objfs.h>
84 #include <sys/console.h>
85 #include <sys/reboot.h>
86 #include <sys/attr.h>
87 #include <sys/spa.h>
88 
89 #include <vm/page.h>
90 
91 #include <fs/fs_subr.h>
92 
93 /* Private interfaces to create vopstats-related data structures */
94 extern void		initialize_vopstats(vopstats_t *);
95 extern vopstats_t	*get_fstype_vopstats(struct vfs *, struct vfssw *);
96 extern vsk_anchor_t	*get_vskstat_anchor(struct vfs *);
97 
98 static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
99 static void vfs_setmntopt_nolock(mntopts_t *, const char *,
100     const char *, int, int);
101 static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
102 static void vfs_freemnttab(struct vfs *);
103 static void vfs_freeopt(mntopt_t *);
104 static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
105 static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
106 static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
107 static void vfs_createopttbl_extend(mntopts_t *, const char *,
108     const mntopts_t *);
109 static char **vfs_copycancelopt_extend(char **const, int);
110 static void vfs_freecancelopt(char **);
111 static void getrootfs(char **, char **);
112 static int getmacpath(dev_info_t *, void *);
113 static void vfs_mnttabvp_setup(void);
114 
/*
 * Node of a singly linked list (see mip_next) that pairs a device number
 * with a vfs pointer.
 * NOTE(review): "ipmnt"/"mip" presumably stands for "in-progress mount";
 * domount() keeps addmip/delmip flags -- confirm against the list users.
 */
115 struct ipmnt {
116 	struct ipmnt	*mip_next;	/* next node on the list */
117 	dev_t		mip_dev;	/* device number recorded in this node */
118 	struct vfs	*mip_vfsp;	/* vfs recorded in this node */
119 };
120 
/*
 * NOTE(review): presumably vfs_miplist_mutex protects the list whose first
 * and last nodes are vfs_miplist and vfs_miplist_end -- verify in the
 * routines that manipulate them (not visible here).
 */
121 static kmutex_t		vfs_miplist_mutex;
122 static struct ipmnt	*vfs_miplist = NULL;
123 static struct ipmnt	*vfs_miplist_end = NULL;
124 
125 static kmem_cache_t *vfs_cache;	/* Pointer to VFS kmem cache */
126 
127 /*
128  * VFS global data.
129  */
130 vnode_t *rootdir;		/* pointer to root inode vnode. */
131 vnode_t *devicesdir;		/* pointer to inode of devices root */
132 vnode_t	*devdir;		/* pointer to inode of dev root */
133 
134 char *server_rootpath;		/* root path for diskless clients */
135 char *server_hostname;		/* hostname of diskless server */
136 
137 static struct vfs root;
138 static struct vfs devices;
139 static struct vfs dev;
140 struct vfs *rootvfs = &root;	/* pointer to root vfs; head of VFS list. */
141 rvfs_t *rvfs_list;		/* array of vfs ptrs for vfs hash list */
142 int vfshsz = 512;		/* # of heads/locks in vfs hash arrays */
143 				/* must be power of 2!	*/
144 timespec_t vfs_mnttab_ctime;	/* mnttab created time */
145 timespec_t vfs_mnttab_mtime;	/* mnttab last modified time */
146 char *vfs_dummyfstype = "\0";
147 struct pollhead vfs_pollhd;	/* for mnttab pollers */
148 struct vnode *vfs_mntdummyvp;	/* to fake mnttab read/write for file events */
149 int	mntfstype;		/* will be set once mnt fs is mounted */
150 
151 /*
152  * Table for generic options recognized in the VFS layer and acted
153  * on at this level before parsing file system specific options.
154  * The nosuid option is stronger than any of the devices and setuid
155  * options, so those are canceled when nosuid is seen.
156  *
157  * All options which are added here need to be added to the
158  * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
159  */
160 /*
161  * VFS Mount options table
162  */
163 static char *ro_cancel[] = { MNTOPT_RW, NULL };
164 static char *rw_cancel[] = { MNTOPT_RO, NULL };
165 static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
166 static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
167     MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
168 static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
169 static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
170 static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
171 static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
172 static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
173 static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
174 static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
175 static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
176 
177 static const mntopt_t mntopts[] = {
178 /*
179  *	option name		cancel options		default arg	flags
180  */
181 	{ MNTOPT_REMOUNT,	NULL,			NULL,
182 		MO_NODISPLAY, (void *)0 },
183 	{ MNTOPT_RO,		ro_cancel,		NULL,		0,
184 		(void *)0 },
185 	{ MNTOPT_RW,		rw_cancel,		NULL,		0,
186 		(void *)0 },
187 	{ MNTOPT_SUID,		suid_cancel,		NULL,		0,
188 		(void *)0 },
189 	{ MNTOPT_NOSUID,	nosuid_cancel,		NULL,		0,
190 		(void *)0 },
191 	{ MNTOPT_DEVICES,	devices_cancel,		NULL,		0,
192 		(void *)0 },
193 	{ MNTOPT_NODEVICES,	nodevices_cancel,	NULL,		0,
194 		(void *)0 },
195 	{ MNTOPT_SETUID,	setuid_cancel,		NULL,		0,
196 		(void *)0 },
197 	{ MNTOPT_NOSETUID,	nosetuid_cancel,	NULL,		0,
198 		(void *)0 },
199 	{ MNTOPT_NBMAND,	nbmand_cancel,		NULL,		0,
200 		(void *)0 },
201 	{ MNTOPT_NONBMAND,	nonbmand_cancel,	NULL,		0,
202 		(void *)0 },
203 	{ MNTOPT_EXEC,		exec_cancel,		NULL,		0,
204 		(void *)0 },
205 	{ MNTOPT_NOEXEC,	noexec_cancel,		NULL,		0,
206 		(void *)0 },
207 };
208 
/*
 * Exported descriptor for the generic mntopts[] table: the number of
 * entries, computed from the array size, followed by a pointer to the
 * first entry.  The cast drops the const qualifier that mntopts[] carries.
 */
209 const mntopts_t vfs_mntopts = {
210 	sizeof (mntopts) / sizeof (mntopt_t),
211 	(mntopt_t *)&mntopts[0]
212 };
213 
214 /*
215  * File system operation dispatch functions.
216  */
217 
218 int
219 fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
220 {
221 	return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
222 }
223 
224 int
225 fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
226 {
227 	return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
228 }
229 
230 int
231 fsop_root(vfs_t *vfsp, vnode_t **vpp)
232 {
233 	refstr_t *mntpt;
234 	int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
235 	/*
236 	 * Make sure this root has a path.  With lofs, it is possible to have
237 	 * a NULL mountpoint.
238 	 */
239 	if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
240 		mntpt = vfs_getmntpoint(vfsp);
241 		vn_setpath_str(*vpp, refstr_value(mntpt),
242 		    strlen(refstr_value(mntpt)));
243 		refstr_rele(mntpt);
244 	}
245 
246 	return (ret);
247 }
248 
249 int
250 fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
251 {
252 	return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
253 }
254 
255 int
256 fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
257 {
258 	return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
259 }
260 
261 int
262 fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
263 {
264 	/*
265 	 * In order to handle system attribute fids in a manner
266 	 * transparent to the underlying fs, we embed the fid for
267 	 * the sysattr parent object in the sysattr fid and tack on
268 	 * some extra bytes that only the sysattr layer knows about.
269 	 *
270 	 * This guarantees that sysattr fids are larger than other fids
271 	 * for this vfs. If the vfs supports sysattrs (implied
272 	 * by VFSFT_XVATTR support), we cannot have a size collision
273 	 * with XATTR_FIDSZ.
274 	 */
275 	if (vfs_has_feature(vfsp, VFSFT_XVATTR) &&
276 	    fidp->fid_len == XATTR_FIDSZ)
277 		return (xattr_dir_vget(vfsp, vpp, fidp));
278 
279 	return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
280 }
281 
282 int
283 fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
284 {
285 	return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
286 }
287 
288 void
289 fsop_freefs(vfs_t *vfsp)
290 {
291 	(*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
292 }
293 
294 int
295 fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
296 {
297 	return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
298 }
299 
300 int
301 fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
302 {
303 	ASSERT((fstype >= 0) && (fstype < nfstype));
304 
305 	if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
306 		return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
307 	else
308 		return (ENOTSUP);
309 }
310 
311 /*
312  * File system initialization.  vfs_setfsops() must be called from a file
313  * system's init routine.
314  */
315 
316 static int
317 fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
318     int *unused_ops)
319 {
320 	static const fs_operation_trans_def_t vfs_ops_table[] = {
321 		VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
322 			fs_nosys, fs_nosys,
323 
324 		VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
325 			fs_nosys, fs_nosys,
326 
327 		VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
328 			fs_nosys, fs_nosys,
329 
330 		VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
331 			fs_nosys, fs_nosys,
332 
333 		VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
334 			(fs_generic_func_p) fs_sync,
335 			(fs_generic_func_p) fs_sync,	/* No errors allowed */
336 
337 		VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
338 			fs_nosys, fs_nosys,
339 
340 		VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
341 			fs_nosys, fs_nosys,
342 
343 		VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
344 			(fs_generic_func_p)fs_freevfs,
345 			(fs_generic_func_p)fs_freevfs,	/* Shouldn't fail */
346 
347 		VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
348 			(fs_generic_func_p)fs_nosys,
349 			(fs_generic_func_p)fs_nosys,
350 
351 		NULL, 0, NULL, NULL
352 	};
353 
354 	return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
355 }
356 
357 void
358 zfs_boot_init() {
359 
360 	if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
361 		spa_boot_init();
362 }
363 
364 int
365 vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
366 {
367 	int error;
368 	int unused_ops;
369 
370 	/*
371 	 * Verify that fstype refers to a valid fs.  Note that
372 	 * 0 is valid since it's used to set "stray" ops.
373 	 */
374 	if ((fstype < 0) || (fstype >= nfstype))
375 		return (EINVAL);
376 
377 	if (!ALLOCATED_VFSSW(&vfssw[fstype]))
378 		return (EINVAL);
379 
380 	/* Set up the operations vector. */
381 
382 	error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
383 
384 	if (error != 0)
385 		return (error);
386 
387 	vfssw[fstype].vsw_flag |= VSW_INSTALLED;
388 
389 	if (actual != NULL)
390 		*actual = &vfssw[fstype].vsw_vfsops;
391 
392 #if DEBUG
393 	if (unused_ops != 0)
394 		cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
395 		    "but not used", vfssw[fstype].vsw_name, unused_ops);
396 #endif
397 
398 	return (0);
399 }
400 
401 int
402 vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
403 {
404 	int error;
405 	int unused_ops;
406 
407 	*actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
408 
409 	error = fs_copyfsops(template, *actual, &unused_ops);
410 	if (error != 0) {
411 		kmem_free(*actual, sizeof (vfsops_t));
412 		*actual = NULL;
413 		return (error);
414 	}
415 
416 	return (0);
417 }
418 
419 /*
420  * Free a vfsops structure created as a result of vfs_makefsops().
421  * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
422  * vfs_freevfsops_by_type().
423  */
424 void
425 vfs_freevfsops(vfsops_t *vfsops)
426 {
427 	kmem_free(vfsops, sizeof (vfsops_t));
428 }
429 
430 /*
431  * Since the vfsops structure is part of the vfssw table and wasn't
432  * really allocated, we're not really freeing anything.  We keep
433  * the name for consistency with vfs_freevfsops().  We do, however,
434  * need to take care of a little bookkeeping.
435  * NOTE: For a vfsops structure created by vfs_setfsops(), use
436  * vfs_freevfsops_by_type().
437  */
438 int
439 vfs_freevfsops_by_type(int fstype)
440 {
441 
442 	/* Verify that fstype refers to a loaded fs (and not fsid 0). */
443 	if ((fstype <= 0) || (fstype >= nfstype))
444 		return (EINVAL);
445 
446 	WLOCK_VFSSW();
447 	if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
448 		WUNLOCK_VFSSW();
449 		return (EINVAL);
450 	}
451 
452 	vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
453 	WUNLOCK_VFSSW();
454 
455 	return (0);
456 }
457 
458 /* Support routines used to reference vfs_op */
459 
460 /* Set the operations vector for a vfs */
461 void
462 vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
463 {
464 	vfsops_t	*op;
465 
466 	ASSERT(vfsp != NULL);
467 	ASSERT(vfsops != NULL);
468 
469 	op = vfsp->vfs_op;
470 	membar_consumer();
471 	if (vfsp->vfs_femhead == NULL &&
472 	    casptr(&vfsp->vfs_op, op, vfsops) == op) {
473 		return;
474 	}
475 	fsem_setvfsops(vfsp, vfsops);
476 }
477 
478 /* Retrieve the operations vector for a vfs */
479 vfsops_t *
480 vfs_getops(vfs_t *vfsp)
481 {
482 	vfsops_t	*op;
483 
484 	ASSERT(vfsp != NULL);
485 
486 	op = vfsp->vfs_op;
487 	membar_consumer();
488 	if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
489 		return (op);
490 	} else {
491 		return (fsem_getvfsops(vfsp));
492 	}
493 }
494 
495 /*
496  * Returns non-zero (1) if the vfsops matches that of the vfs.
497  * Returns zero (0) if not.
498  */
499 int
500 vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
501 {
502 	return (vfs_getops(vfsp) == vfsops);
503 }
504 
505 /*
506  * Returns non-zero (1) if the file system has installed a non-default,
507  * non-error vfs_sync routine.  Returns zero (0) otherwise.
508  */
509 int
510 vfs_can_sync(vfs_t *vfsp)
511 {
512 	/* vfs_sync() routine is not the default/error function */
513 	return (vfs_getops(vfsp)->vfs_sync != fs_sync);
514 }
515 
516 /*
517  * Initialize a vfs structure.
518  */
519 void
520 vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
521 {
522 	/* Other initialization has been moved to vfs_alloc() */
523 	vfsp->vfs_count = 0;
524 	vfsp->vfs_next = vfsp;
525 	vfsp->vfs_prev = vfsp;
526 	vfsp->vfs_zone_next = vfsp;
527 	vfsp->vfs_zone_prev = vfsp;
528 	sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
529 	vfsimpl_setup(vfsp);
530 	vfsp->vfs_data = (data);
531 	vfs_setops((vfsp), (op));
532 }
533 
534 /*
535  * Allocate and initialize the vfs implementation private data
536  * structure, vfs_impl_t.
537  */
538 void
539 vfsimpl_setup(vfs_t *vfsp)
540 {
541 	int i;
542 
543 	if (vfsp->vfs_implp != NULL) {
544 		return;
545 	}
546 
547 	vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
548 	/* Note that these are #define'd in vfs.h */
549 	vfsp->vfs_vskap = NULL;
550 	vfsp->vfs_fstypevsp = NULL;
551 
552 	/* Set size of counted array, then zero the array */
553 	vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
554 	for (i = 1; i <  VFS_FEATURE_MAXSZ; i++) {
555 		vfsp->vfs_featureset[i] = 0;
556 	}
557 }
558 
559 /*
560  * Release the vfs_impl_t structure, if it exists. Some unbundled
561  * filesystems may not use the newer version of vfs and thus
562  * would not contain this implementation private data structure.
563  */
564 void
565 vfsimpl_teardown(vfs_t *vfsp)
566 {
567 	vfs_impl_t	*vip = vfsp->vfs_implp;
568 
569 	if (vip == NULL)
570 		return;
571 
572 	kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
573 	vfsp->vfs_implp = NULL;
574 }
575 
576 /*
577  * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
578  * fstatvfs, and sysfs moved to common/syscall.
579  */
580 
581 /*
582  * Update every mounted file system.  We call the vfs_sync operation of
583  * each file system type, passing it a NULL vfsp to indicate that all
584  * mounted file systems of that type should be updated.
585  */
586 void
587 vfs_sync(int flag)
588 {
589 	struct vfssw *vswp;
590 	RLOCK_VFSSW();
591 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
592 		if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
593 			vfs_refvfssw(vswp);
594 			RUNLOCK_VFSSW();
595 			(void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
596 			    CRED());
597 			vfs_unrefvfssw(vswp);
598 			RLOCK_VFSSW();
599 		}
600 	}
601 	RUNLOCK_VFSSW();
602 }
603 
/*
 * Update all mounted file systems by calling vfs_sync() with no flags.
 */
void
sync(void)
{
	const int noflags = 0;

	vfs_sync(noflags);
}
609 
610 /*
611  * External routines.
612  */
613 
614 krwlock_t vfssw_lock;	/* lock accesses to vfssw */
615 
616 /*
617  * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
618  * but otherwise should be accessed only via vfs_list_lock() and
619  * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
620  */
621 static krwlock_t vfslist;
622 
623 /*
624  * Mount devfs on /devices. This is done right after root is mounted
625  * to provide device access support for the system
626  */
627 static void
628 vfs_mountdevices(void)
629 {
630 	struct vfssw *vsw;
631 	struct vnode *mvp;
632 	struct mounta mounta = {	/* fake mounta for devfs_mount() */
633 		NULL,
634 		NULL,
635 		MS_SYSSPACE,
636 		NULL,
637 		NULL,
638 		0,
639 		NULL,
640 		0
641 	};
642 
643 	/*
644 	 * _init devfs module to fill in the vfssw
645 	 */
646 	if (modload("fs", "devfs") == -1)
647 		panic("Cannot _init devfs module");
648 
649 	/*
650 	 * Hold vfs
651 	 */
652 	RLOCK_VFSSW();
653 	vsw = vfs_getvfsswbyname("devfs");
654 	VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
655 	VFS_HOLD(&devices);
656 
657 	/*
658 	 * Locate mount point
659 	 */
660 	if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
661 		panic("Cannot find /devices");
662 
663 	/*
664 	 * Perform the mount of /devices
665 	 */
666 	if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
667 		panic("Cannot mount /devices");
668 
669 	RUNLOCK_VFSSW();
670 
671 	/*
672 	 * Set appropriate members and add to vfs list for mnttab display
673 	 */
674 	vfs_setresource(&devices, "/devices");
675 	vfs_setmntpoint(&devices, "/devices");
676 
677 	/*
678 	 * Hold the root of /devices so it won't go away
679 	 */
680 	if (VFS_ROOT(&devices, &devicesdir))
681 		panic("vfs_mountdevices: not devices root");
682 
683 	if (vfs_lock(&devices) != 0) {
684 		VN_RELE(devicesdir);
685 		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
686 		return;
687 	}
688 
689 	if (vn_vfswlock(mvp) != 0) {
690 		vfs_unlock(&devices);
691 		VN_RELE(devicesdir);
692 		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
693 		return;
694 	}
695 
696 	vfs_add(mvp, &devices, 0);
697 	vn_vfsunlock(mvp);
698 	vfs_unlock(&devices);
699 	VN_RELE(devicesdir);
700 }
701 
702 /*
703  * mount the first instance of /dev  to root and remain mounted
704  */
705 static void
706 vfs_mountdev1(void)
707 {
708 	struct vfssw *vsw;
709 	struct vnode *mvp;
710 	struct mounta mounta = {	/* fake mounta for sdev_mount() */
711 		NULL,
712 		NULL,
713 		MS_SYSSPACE | MS_OVERLAY,
714 		NULL,
715 		NULL,
716 		0,
717 		NULL,
718 		0
719 	};
720 
721 	/*
722 	 * _init dev module to fill in the vfssw
723 	 */
724 	if (modload("fs", "dev") == -1)
725 		cmn_err(CE_PANIC, "Cannot _init dev module\n");
726 
727 	/*
728 	 * Hold vfs
729 	 */
730 	RLOCK_VFSSW();
731 	vsw = vfs_getvfsswbyname("dev");
732 	VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
733 	VFS_HOLD(&dev);
734 
735 	/*
736 	 * Locate mount point
737 	 */
738 	if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
739 		cmn_err(CE_PANIC, "Cannot find /dev\n");
740 
741 	/*
742 	 * Perform the mount of /dev
743 	 */
744 	if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
745 		cmn_err(CE_PANIC, "Cannot mount /dev 1\n");
746 
747 	RUNLOCK_VFSSW();
748 
749 	/*
750 	 * Set appropriate members and add to vfs list for mnttab display
751 	 */
752 	vfs_setresource(&dev, "/dev");
753 	vfs_setmntpoint(&dev, "/dev");
754 
755 	/*
756 	 * Hold the root of /dev so it won't go away
757 	 */
758 	if (VFS_ROOT(&dev, &devdir))
759 		cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");
760 
761 	if (vfs_lock(&dev) != 0) {
762 		VN_RELE(devdir);
763 		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
764 		return;
765 	}
766 
767 	if (vn_vfswlock(mvp) != 0) {
768 		vfs_unlock(&dev);
769 		VN_RELE(devdir);
770 		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
771 		return;
772 	}
773 
774 	vfs_add(mvp, &dev, 0);
775 	vn_vfsunlock(mvp);
776 	vfs_unlock(&dev);
777 	VN_RELE(devdir);
778 }
779 
780 /*
781  * Mount required filesystem. This is done right after root is mounted.
782  */
783 static void
784 vfs_mountfs(char *module, char *spec, char *path)
785 {
786 	struct vnode *mvp;
787 	struct mounta mounta;
788 	vfs_t *vfsp;
789 
790 	mounta.flags = MS_SYSSPACE | MS_DATA;
791 	mounta.fstype = module;
792 	mounta.spec = spec;
793 	mounta.dir = path;
794 	if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
795 		cmn_err(CE_WARN, "Cannot find %s", path);
796 		return;
797 	}
798 	if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
799 		cmn_err(CE_WARN, "Cannot mount %s", path);
800 	else
801 		VFS_RELE(vfsp);
802 	VN_RELE(mvp);
803 }
804 
805 /*
806  * vfs_mountroot is called by main() to mount the root filesystem.
807  */
808 void
809 vfs_mountroot(void)
810 {
811 	struct vnode	*rvp = NULL;
812 	char		*path;
813 	size_t		plen;
814 	struct vfssw	*vswp;
815 
816 	rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
817 	rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
818 
819 	/*
820 	 * Alloc the vfs hash bucket array and locks
821 	 */
822 	rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);
823 
824 	/*
825 	 * Call machine-dependent routine "rootconf" to choose a root
826 	 * file system type.
827 	 */
828 	if (rootconf())
829 		panic("vfs_mountroot: cannot mount root");
830 	/*
831 	 * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
832 	 * to point to it.  These are used by lookuppn() so that it
833 	 * knows where to start from ('/' or '.').
834 	 */
835 	vfs_setmntpoint(rootvfs, "/");
836 	if (VFS_ROOT(rootvfs, &rootdir))
837 		panic("vfs_mountroot: no root vnode");
838 	PTOU(curproc)->u_cdir = rootdir;
839 	VN_HOLD(PTOU(curproc)->u_cdir);
840 	PTOU(curproc)->u_rdir = NULL;
841 
842 	/*
843 	 * Setup the global zone's rootvp, now that it exists.
844 	 */
845 	global_zone->zone_rootvp = rootdir;
846 	VN_HOLD(global_zone->zone_rootvp);
847 
848 	/*
849 	 * Notify the module code that it can begin using the
850 	 * root filesystem instead of the boot program's services.
851 	 */
852 	modrootloaded = 1;
853 
854 	/*
855 	 * Special handling for a ZFS root file system.
856 	 */
857 	zfs_boot_init();
858 
859 	/*
860 	 * Set up mnttab information for root
861 	 */
862 	vfs_setresource(rootvfs, rootfs.bo_name);
863 
864 	/*
865 	 * Notify cluster software that the root filesystem is available.
866 	 */
867 	clboot_mountroot();
868 
869 	/* Now that we're all done with the root FS, set up its vopstats */
870 	if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
871 		/* Set flag for statistics collection */
872 		if (vswp->vsw_flag & VSW_STATS) {
873 			initialize_vopstats(&rootvfs->vfs_vopstats);
874 			rootvfs->vfs_flag |= VFS_STATS;
875 			rootvfs->vfs_fstypevsp =
876 			    get_fstype_vopstats(rootvfs, vswp);
877 			rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
878 		}
879 		vfs_unrefvfssw(vswp);
880 	}
881 
882 	/*
883 	 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
884 	 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
885 	 */
886 	vfs_mountdevices();
887 	vfs_mountdev1();
888 
889 	vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
890 	vfs_mountfs("proc", "/proc", "/proc");
891 	vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
892 	vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
893 	vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
894 
895 	if (getzoneid() == GLOBAL_ZONEID) {
896 		vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
897 	}
898 
899 #ifdef __sparc
900 	/*
901 	 * This bit of magic can go away when we convert sparc to
902 	 * the new boot architecture based on ramdisk.
903 	 *
904 	 * Booting off a mirrored root volume:
905 	 * At this point, we have booted and mounted root on a
906 	 * single component of the mirror.  Complete the boot
907 	 * by configuring SVM and converting the root to the
908 	 * dev_t of the mirrored root device.  This dev_t conversion
909 	 * only works because the underlying device doesn't change.
910 	 */
911 	if (root_is_svm) {
912 		if (svm_rootconf()) {
913 			panic("vfs_mountroot: cannot remount root");
914 		}
915 
916 		/*
917 		 * mnttab should reflect the new root device
918 		 */
919 		vfs_lock_wait(rootvfs);
920 		vfs_setresource(rootvfs, rootfs.bo_name);
921 		vfs_unlock(rootvfs);
922 	}
923 #endif /* __sparc */
924 
925 	/*
926 	 * Look up the root device via devfs so that a dv_node is
927 	 * created for it. The vnode is never VN_RELE()ed.
928 	 * We allocate more than MAXPATHLEN so that the
929 	 * buffer passed to i_ddi_prompath_to_devfspath() is
930 	 * exactly MAXPATHLEN (the function expects a buffer
931 	 * of that length).
932 	 */
933 	plen = strlen("/devices");
934 	path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
935 	(void) strcpy(path, "/devices");
936 
937 	if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
938 	    != DDI_SUCCESS ||
939 	    lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {
940 
941 		/* NUL terminate in case "path" has garbage */
942 		path[plen + MAXPATHLEN - 1] = '\0';
943 #ifdef	DEBUG
944 		cmn_err(CE_WARN, "!Cannot lookup root device: %s", path);
945 #endif
946 	}
947 	kmem_free(path, plen + MAXPATHLEN);
948 	vfs_mnttabvp_setup();
949 }
950 
951 /*
952  * If remount failed and we're in a zone we need to check for the zone
953  * root path and strip it before the call to vfs_setpath().
954  *
955  * If strpath doesn't begin with the zone_rootpath the original
956  * strpath is returned unchanged.
957  */
958 static const char *
959 stripzonepath(const char *strpath)
960 {
961 	char *str1, *str2;
962 	int i;
963 	zone_t *zonep = curproc->p_zone;
964 
965 	if (zonep->zone_rootpath == NULL || strpath == NULL) {
966 		return (NULL);
967 	}
968 
969 	/*
970 	 * we check for the end of the string at one past the
971 	 * current position because the zone_rootpath always
972 	 * ends with "/" but we don't want to strip that off.
973 	 */
974 	str1 = zonep->zone_rootpath;
975 	str2 = (char *)strpath;
976 	ASSERT(str1[0] != '\0');
977 	for (i = 0; str1[i + 1] != '\0'; i++) {
978 		if (str1[i] != str2[i])
979 			return ((char *)strpath);
980 	}
981 	return (&str2[i]);
982 }
983 
984 /*
985  * Common mount code.  Called from the system call entry point, from autofs,
986  * nfsv4 trigger mounts, and from pxfs.
987  *
988  * Takes the effective file system type, mount arguments, the mount point
989  * vnode, flags specifying whether the mount is a remount and whether it
990  * should be entered into the vfs list, and credentials.  Fills in its vfspp
991  * parameter with the mounted file system instance's vfs.
992  *
993  * Note that the effective file system type is specified as a string.  It may
994  * be null, in which case it's determined from the mount arguments, and may
995  * differ from the type specified in the mount arguments; this is a hook to
996  * allow interposition when instantiating file system instances.
997  *
998  * The caller is responsible for releasing its own hold on the mount point
999  * vp (this routine does its own hold when necessary).
1000  * Also note that for remounts, the mount point vp should be the vnode for
1001  * the root of the file system rather than the vnode that the file system
1002  * is mounted on top of.
1003  */
1004 int
1005 domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1006 	struct vfs **vfspp)
1007 {
1008 	struct vfssw	*vswp;
1009 	vfsops_t	*vfsops;
1010 	struct vfs	*vfsp;
1011 	struct vnode	*bvp;
1012 	dev_t		bdev = 0;
1013 	mntopts_t	mnt_mntopts;
1014 	int		error = 0;
1015 	int		copyout_error = 0;
1016 	int		ovflags;
1017 	char		*opts = uap->optptr;
1018 	char		*inargs = opts;
1019 	int		optlen = uap->optlen;
1020 	int		remount;
1021 	int		rdonly;
1022 	int		nbmand = 0;
1023 	int		delmip = 0;
1024 	int		addmip = 0;
1025 	int		splice = ((uap->flags & MS_NOSPLICE) == 0);
1026 	int		fromspace = (uap->flags & MS_SYSSPACE) ?
1027 	    UIO_SYSSPACE : UIO_USERSPACE;
1028 	char		*resource = NULL, *mountpt = NULL;
1029 	refstr_t	*oldresource, *oldmntpt;
1030 	struct pathname	pn, rpn;
1031 	vsk_anchor_t	*vskap;
1032 
1033 	/*
1034 	 * The v_flag value for the mount point vp is permanently set
1035 	 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1036 	 * for mount point locking.
1037 	 */
1038 	mutex_enter(&vp->v_lock);
1039 	vp->v_flag |= VVFSLOCK;
1040 	mutex_exit(&vp->v_lock);
1041 
1042 	mnt_mntopts.mo_count = 0;
1043 	/*
1044 	 * Find the ops vector to use to invoke the file system-specific mount
1045 	 * method.  If the fsname argument is non-NULL, use it directly.
1046 	 * Otherwise, dig the file system type information out of the mount
1047 	 * arguments.
1048 	 *
1049 	 * A side effect is to hold the vfssw entry.
1050 	 *
1051 	 * Mount arguments can be specified in several ways, which are
1052 	 * distinguished by flag bit settings.  The preferred way is to set
1053 	 * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1054 	 * type supplied as a character string and the last two arguments
1055 	 * being a pointer to a character buffer and the size of the buffer.
1056 	 * On entry, the buffer holds a null terminated list of options; on
1057 	 * return, the string is the list of options the file system
1058 	 * recognized. If MS_DATA is set arguments five and six point to a
1059 	 * block of binary data which the file system interprets.
1060 	 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1061 	 * consistently with these conventions.  To handle them, we check to
1062 	 * see whether the pointer to the file system name has a numeric value
1063 	 * less than 256.  If so, we treat it as an index.
1064 	 */
1065 	if (fsname != NULL) {
1066 		if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1067 			return (EINVAL);
1068 		}
1069 	} else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1070 		size_t n;
1071 		uint_t fstype;
1072 		char name[FSTYPSZ];
1073 
1074 		if ((fstype = (uintptr_t)uap->fstype) < 256) {
1075 			RLOCK_VFSSW();
1076 			if (fstype == 0 || fstype >= nfstype ||
1077 			    !ALLOCATED_VFSSW(&vfssw[fstype])) {
1078 				RUNLOCK_VFSSW();
1079 				return (EINVAL);
1080 			}
1081 			(void) strcpy(name, vfssw[fstype].vsw_name);
1082 			RUNLOCK_VFSSW();
1083 			if ((vswp = vfs_getvfssw(name)) == NULL)
1084 				return (EINVAL);
1085 		} else {
1086 			/*
1087 			 * Handle either kernel or user address space.
1088 			 */
1089 			if (uap->flags & MS_SYSSPACE) {
1090 				error = copystr(uap->fstype, name,
1091 				    FSTYPSZ, &n);
1092 			} else {
1093 				error = copyinstr(uap->fstype, name,
1094 				    FSTYPSZ, &n);
1095 			}
1096 			if (error) {
1097 				if (error == ENAMETOOLONG)
1098 					return (EINVAL);
1099 				return (error);
1100 			}
1101 			if ((vswp = vfs_getvfssw(name)) == NULL)
1102 				return (EINVAL);
1103 		}
1104 	} else {
1105 		if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1106 			return (EINVAL);
1107 	}
1108 	if (!VFS_INSTALLED(vswp))
1109 		return (EINVAL);
1110 	vfsops = &vswp->vsw_vfsops;
1111 
1112 	vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1113 	/*
1114 	 * Fetch mount options and parse them for generic vfs options
1115 	 */
1116 	if (uap->flags & MS_OPTIONSTR) {
1117 		/*
1118 		 * Limit the buffer size
1119 		 */
1120 		if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1121 			error = EINVAL;
1122 			goto errout;
1123 		}
1124 		if ((uap->flags & MS_SYSSPACE) == 0) {
1125 			inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1126 			inargs[0] = '\0';
1127 			if (optlen) {
1128 				error = copyinstr(opts, inargs, (size_t)optlen,
1129 				    NULL);
1130 				if (error) {
1131 					goto errout;
1132 				}
1133 			}
1134 		}
1135 		vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1136 	}
1137 	/*
1138 	 * Flag bits override the options string.
1139 	 */
1140 	if (uap->flags & MS_REMOUNT)
1141 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1142 	if (uap->flags & MS_RDONLY)
1143 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1144 	if (uap->flags & MS_NOSUID)
1145 		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1146 
1147 	/*
1148 	 * Check if this is a remount; must be set in the option string and
1149 	 * the file system must support a remount option.
1150 	 */
1151 	if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1152 	    MNTOPT_REMOUNT, NULL)) {
1153 		if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1154 			error = ENOTSUP;
1155 			goto errout;
1156 		}
1157 		uap->flags |= MS_REMOUNT;
1158 	}
1159 
1160 	/*
1161 	 * uap->flags and vfs_optionisset() should agree.
1162 	 */
1163 	if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1164 		uap->flags |= MS_RDONLY;
1165 	}
1166 	if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1167 		uap->flags |= MS_NOSUID;
1168 	}
1169 	nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1170 	ASSERT(splice || !remount);
1171 	/*
1172 	 * If we are splicing the fs into the namespace,
1173 	 * perform mount point checks.
1174 	 *
1175 	 * We want to resolve the path for the mount point to eliminate
1176 	 * '.' and ".." and symlinks in mount points; we can't do the
1177 	 * same for the resource string, since it would turn
1178 	 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1179 	 * this before grabbing vn_vfswlock(), because otherwise we
1180 	 * would deadlock with lookuppn().
1181 	 */
1182 	if (splice) {
1183 		ASSERT(vp->v_count > 0);
1184 
1185 		/*
1186 		 * Pick up mount point and device from appropriate space.
1187 		 */
1188 		if (pn_get(uap->spec, fromspace, &pn) == 0) {
1189 			resource = kmem_alloc(pn.pn_pathlen + 1,
1190 			    KM_SLEEP);
1191 			(void) strcpy(resource, pn.pn_path);
1192 			pn_free(&pn);
1193 		}
1194 		/*
1195 		 * Do a lookupname prior to taking the
1196 		 * writelock. Mark this as completed if
1197 		 * successful for later cleanup and addition to
1198 		 * the mount in progress table.
1199 		 */
1200 		if ((uap->flags & MS_GLOBAL) == 0 &&
1201 		    lookupname(uap->spec, fromspace,
1202 		    FOLLOW, NULL, &bvp) == 0) {
1203 			addmip = 1;
1204 		}
1205 
1206 		if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1207 			pathname_t *pnp;
1208 
1209 			if (*pn.pn_path != '/') {
1210 				error = EINVAL;
1211 				pn_free(&pn);
1212 				goto errout;
1213 			}
1214 			pn_alloc(&rpn);
1215 			/*
1216 			 * Kludge to prevent autofs from deadlocking with
1217 			 * itself when it calls domount().
1218 			 *
1219 			 * If autofs is calling, it is because it is doing
1220 			 * (autofs) mounts in the process of an NFS mount.  A
1221 			 * lookuppn() here would cause us to block waiting for
			 * said NFS mount to complete, which it can't since this
			 * is the thread that is supposed to be doing it.
1224 			 */
1225 			if (fromspace == UIO_USERSPACE) {
1226 				if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1227 				    NULL)) == 0) {
1228 					pnp = &rpn;
1229 				} else {
1230 					/*
1231 					 * The file disappeared or otherwise
1232 					 * became inaccessible since we opened
1233 					 * it; might as well fail the mount
1234 					 * since the mount point is no longer
1235 					 * accessible.
1236 					 */
1237 					pn_free(&rpn);
1238 					pn_free(&pn);
1239 					goto errout;
1240 				}
1241 			} else {
1242 				pnp = &pn;
1243 			}
1244 			mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1245 			(void) strcpy(mountpt, pnp->pn_path);
1246 
1247 			/*
1248 			 * If the addition of the zone's rootpath
1249 			 * would push us over a total path length
1250 			 * of MAXPATHLEN, we fail the mount with
1251 			 * ENAMETOOLONG, which is what we would have
1252 			 * gotten if we were trying to perform the same
1253 			 * mount in the global zone.
1254 			 *
1255 			 * strlen() doesn't count the trailing
1256 			 * '\0', but zone_rootpathlen counts both a
1257 			 * trailing '/' and the terminating '\0'.
1258 			 */
1259 			if ((curproc->p_zone->zone_rootpathlen - 1 +
1260 			    strlen(mountpt)) > MAXPATHLEN ||
1261 			    (resource != NULL &&
1262 			    (curproc->p_zone->zone_rootpathlen - 1 +
1263 			    strlen(resource)) > MAXPATHLEN)) {
1264 				error = ENAMETOOLONG;
1265 			}
1266 
1267 			pn_free(&rpn);
1268 			pn_free(&pn);
1269 		}
1270 
1271 		if (error)
1272 			goto errout;
1273 
1274 		/*
1275 		 * Prevent path name resolution from proceeding past
1276 		 * the mount point.
1277 		 */
1278 		if (vn_vfswlock(vp) != 0) {
1279 			error = EBUSY;
1280 			goto errout;
1281 		}
1282 
1283 		/*
1284 		 * Verify that it's legitimate to establish a mount on
1285 		 * the prospective mount point.
1286 		 */
1287 		if (vn_mountedvfs(vp) != NULL) {
1288 			/*
1289 			 * The mount point lock was obtained after some
1290 			 * other thread raced through and established a mount.
1291 			 */
1292 			vn_vfsunlock(vp);
1293 			error = EBUSY;
1294 			goto errout;
1295 		}
1296 		if (vp->v_flag & VNOMOUNT) {
1297 			vn_vfsunlock(vp);
1298 			error = EINVAL;
1299 			goto errout;
1300 		}
1301 	}
1302 	if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1303 		uap->dataptr = NULL;
1304 		uap->datalen = 0;
1305 	}
1306 
1307 	/*
1308 	 * If this is a remount, we don't want to create a new VFS.
1309 	 * Instead, we pass the existing one with a remount flag.
1310 	 */
1311 	if (remount) {
1312 		/*
1313 		 * Confirm that the mount point is the root vnode of the
1314 		 * file system that is being remounted.
1315 		 * This can happen if the user specifies a different
1316 		 * mount point directory pathname in the (re)mount command.
1317 		 *
1318 		 * Code below can only be reached if splice is true, so it's
1319 		 * safe to do vn_vfsunlock() here.
1320 		 */
1321 		if ((vp->v_flag & VROOT) == 0) {
1322 			vn_vfsunlock(vp);
1323 			error = ENOENT;
1324 			goto errout;
1325 		}
1326 		/*
1327 		 * Disallow making file systems read-only unless file system
1328 		 * explicitly allows it in its vfssw.  Ignore other flags.
1329 		 */
1330 		if (rdonly && vn_is_readonly(vp) == 0 &&
1331 		    (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1332 			vn_vfsunlock(vp);
1333 			error = EINVAL;
1334 			goto errout;
1335 		}
1336 		/*
1337 		 * Disallow changing the NBMAND disposition of the file
1338 		 * system on remounts.
1339 		 */
1340 		if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1341 		    (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1342 			vn_vfsunlock(vp);
1343 			error = EINVAL;
1344 			goto errout;
1345 		}
1346 		vfsp = vp->v_vfsp;
1347 		ovflags = vfsp->vfs_flag;
1348 		vfsp->vfs_flag |= VFS_REMOUNT;
1349 		vfsp->vfs_flag &= ~VFS_RDONLY;
1350 	} else {
1351 		vfsp = vfs_alloc(KM_SLEEP);
1352 		VFS_INIT(vfsp, vfsops, NULL);
1353 	}
1354 
1355 	VFS_HOLD(vfsp);
1356 
1357 	/*
	 * The vfs_reflock is not used anymore; the code below explicitly
	 * holds it, preventing others from accessing it directly.
1360 	 */
1361 	if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1362 	    !(vfsp->vfs_flag & VFS_REMOUNT))
1363 		cmn_err(CE_WARN,
1364 		    "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1365 
1366 	/*
1367 	 * Lock the vfs. If this is a remount we want to avoid spurious umount
1368 	 * failures that happen as a side-effect of fsflush() and other mount
1369 	 * and unmount operations that might be going on simultaneously and
1370 	 * may have locked the vfs currently. To not return EBUSY immediately
	 * may have locked the vfs currently. To avoid returning EBUSY
	 * immediately here, we use vfs_lock_wait() instead of vfs_lock()
	 * for the remount case.
1373 	if (!remount) {
1374 		if (error = vfs_lock(vfsp)) {
1375 			vfsp->vfs_flag = ovflags;
1376 			if (splice)
1377 				vn_vfsunlock(vp);
1378 			vfs_free(vfsp);
1379 			goto errout;
1380 		}
1381 	} else {
1382 		vfs_lock_wait(vfsp);
1383 	}
1384 
1385 	/*
1386 	 * Add device to mount in progress table, global mounts require special
1387 	 * handling. It is possible that we have already done the lookupname
1388 	 * on a spliced, non-global fs. If so, we don't want to do it again
1389 	 * since we cannot do a lookupname after taking the
1390 	 * wlock above. This case is for a non-spliced, non-global filesystem.
1391 	 */
1392 	if (!addmip) {
1393 		if ((uap->flags & MS_GLOBAL) == 0 &&
1394 		    lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1395 			addmip = 1;
1396 		}
1397 	}
1398 
1399 	if (addmip) {
1400 		bdev = bvp->v_rdev;
1401 		VN_RELE(bvp);
1402 		vfs_addmip(bdev, vfsp);
1403 		addmip = 0;
1404 		delmip = 1;
1405 	}
1406 	/*
1407 	 * Invalidate cached entry for the mount point.
1408 	 */
1409 	if (splice)
1410 		dnlc_purge_vp(vp);
1411 
1412 	/*
1413 	 * If have an option string but the filesystem doesn't supply a
1414 	 * prototype options table, create a table with the global
1415 	 * options and sufficient room to accept all the options in the
1416 	 * string.  Then parse the passed in option string
1417 	 * accepting all the options in the string.  This gives us an
1418 	 * option table with all the proper cancel properties for the
1419 	 * global options.
1420 	 *
1421 	 * Filesystems that supply a prototype options table are handled
1422 	 * earlier in this function.
1423 	 */
1424 	if (uap->flags & MS_OPTIONSTR) {
1425 		if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1426 			mntopts_t tmp_mntopts;
1427 
1428 			tmp_mntopts.mo_count = 0;
1429 			vfs_createopttbl_extend(&tmp_mntopts, inargs,
1430 			    &mnt_mntopts);
1431 			vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1432 			vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1433 			vfs_freeopttbl(&tmp_mntopts);
1434 		}
1435 	}
1436 
1437 	/*
1438 	 * Serialize with zone creations.
1439 	 */
1440 	mount_in_progress();
1441 	/*
1442 	 * Instantiate (or reinstantiate) the file system.  If appropriate,
1443 	 * splice it into the file system name space.
1444 	 *
1445 	 * We want VFS_MOUNT() to be able to override the vfs_resource
1446 	 * string if necessary (ie, mntfs), and also for a remount to
1447 	 * change the same (necessary when remounting '/' during boot).
1448 	 * So we set up vfs_mntpt and vfs_resource to what we think they
1449 	 * should be, then hand off control to VFS_MOUNT() which can
1450 	 * override this.
1451 	 *
1452 	 * For safety's sake, when changing vfs_resource or vfs_mntpt of
1453 	 * a vfs which is on the vfs list (i.e. during a remount), we must
1454 	 * never set those fields to NULL. Several bits of code make
1455 	 * assumptions that the fields are always valid.
1456 	 */
1457 	vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1458 	if (remount) {
1459 		if ((oldresource = vfsp->vfs_resource) != NULL)
1460 			refstr_hold(oldresource);
1461 		if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1462 			refstr_hold(oldmntpt);
1463 	}
1464 	vfs_setresource(vfsp, resource);
1465 	vfs_setmntpoint(vfsp, mountpt);
1466 
1467 	/*
1468 	 * going to mount on this vnode, so notify.
1469 	 */
1470 	vnevent_mountedover(vp, NULL);
1471 	error = VFS_MOUNT(vfsp, vp, uap, credp);
1472 
1473 	if (uap->flags & MS_RDONLY)
1474 		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1475 	if (uap->flags & MS_NOSUID)
1476 		vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1477 	if (uap->flags & MS_GLOBAL)
1478 		vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1479 
1480 	if (error) {
1481 		if (remount) {
1482 			/* put back pre-remount options */
1483 			vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1484 			vfs_setmntpoint(vfsp, (stripzonepath(
1485 			    refstr_value(oldmntpt))));
1486 			if (oldmntpt)
1487 				refstr_rele(oldmntpt);
1488 			vfs_setresource(vfsp, (stripzonepath(
1489 			    refstr_value(oldresource))));
1490 			if (oldresource)
1491 				refstr_rele(oldresource);
1492 			vfsp->vfs_flag = ovflags;
1493 			vfs_unlock(vfsp);
1494 			VFS_RELE(vfsp);
1495 		} else {
1496 			vfs_unlock(vfsp);
1497 			vfs_freemnttab(vfsp);
1498 			vfs_free(vfsp);
1499 		}
1500 	} else {
1501 		/*
1502 		 * Set the mount time to now
1503 		 */
1504 		vfsp->vfs_mtime = ddi_get_time();
1505 		if (remount) {
1506 			vfsp->vfs_flag &= ~VFS_REMOUNT;
1507 			if (oldresource)
1508 				refstr_rele(oldresource);
1509 			if (oldmntpt)
1510 				refstr_rele(oldmntpt);
1511 		} else if (splice) {
1512 			/*
1513 			 * Link vfsp into the name space at the mount
1514 			 * point. Vfs_add() is responsible for
1515 			 * holding the mount point which will be
1516 			 * released when vfs_remove() is called.
1517 			 */
1518 			vfs_add(vp, vfsp, uap->flags);
1519 		} else {
1520 			/*
1521 			 * Hold the reference to file system which is
1522 			 * not linked into the name space.
1523 			 */
1524 			vfsp->vfs_zone = NULL;
1525 			VFS_HOLD(vfsp);
1526 			vfsp->vfs_vnodecovered = NULL;
1527 		}
1528 		/*
1529 		 * Set flags for global options encountered
1530 		 */
1531 		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1532 			vfsp->vfs_flag |= VFS_RDONLY;
1533 		else
1534 			vfsp->vfs_flag &= ~VFS_RDONLY;
1535 		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1536 			vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1537 		} else {
1538 			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1539 				vfsp->vfs_flag |= VFS_NODEVICES;
1540 			else
1541 				vfsp->vfs_flag &= ~VFS_NODEVICES;
1542 			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1543 				vfsp->vfs_flag |= VFS_NOSETUID;
1544 			else
1545 				vfsp->vfs_flag &= ~VFS_NOSETUID;
1546 		}
1547 		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1548 			vfsp->vfs_flag |= VFS_NBMAND;
1549 		else
1550 			vfsp->vfs_flag &= ~VFS_NBMAND;
1551 
1552 		if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1553 			vfsp->vfs_flag |= VFS_XATTR;
1554 		else
1555 			vfsp->vfs_flag &= ~VFS_XATTR;
1556 
1557 		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1558 			vfsp->vfs_flag |= VFS_NOEXEC;
1559 		else
1560 			vfsp->vfs_flag &= ~VFS_NOEXEC;
1561 
1562 		/*
1563 		 * Now construct the output option string of options
1564 		 * we recognized.
1565 		 */
1566 		if (uap->flags & MS_OPTIONSTR) {
1567 			vfs_list_read_lock();
1568 			copyout_error = vfs_buildoptionstr(
1569 			    &vfsp->vfs_mntopts, inargs, optlen);
1570 			vfs_list_unlock();
1571 			if (copyout_error == 0 &&
1572 			    (uap->flags & MS_SYSSPACE) == 0) {
1573 				copyout_error = copyoutstr(inargs, opts,
1574 				    optlen, NULL);
1575 			}
1576 		}
1577 
1578 		/*
1579 		 * If this isn't a remount, set up the vopstats before
1580 		 * anyone can touch this. We only allow spliced file
1581 		 * systems (file systems which are in the namespace) to
1582 		 * have the VFS_STATS flag set.
1583 		 * NOTE: PxFS mounts the underlying file system with
1584 		 * MS_NOSPLICE set and copies those vfs_flags to its private
1585 		 * vfs structure. As a result, PxFS should never have
1586 		 * the VFS_STATS flag or else we might access the vfs
1587 		 * statistics-related fields prior to them being
1588 		 * properly initialized.
1589 		 */
1590 		if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1591 			initialize_vopstats(&vfsp->vfs_vopstats);
1592 			/*
1593 			 * We need to set vfs_vskap to NULL because there's
1594 			 * a chance it won't be set below.  This is checked
1595 			 * in teardown_vopstats() so we can't have garbage.
1596 			 */
1597 			vfsp->vfs_vskap = NULL;
1598 			vfsp->vfs_flag |= VFS_STATS;
1599 			vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1600 		}
1601 
1602 		if (vswp->vsw_flag & VSW_XID)
1603 			vfsp->vfs_flag |= VFS_XID;
1604 
1605 		vfs_unlock(vfsp);
1606 	}
1607 	mount_completed();
1608 	if (splice)
1609 		vn_vfsunlock(vp);
1610 
1611 	if ((error == 0) && (copyout_error == 0)) {
1612 		if (!remount) {
1613 			/*
1614 			 * Don't call get_vskstat_anchor() while holding
1615 			 * locks since it allocates memory and calls
1616 			 * VFS_STATVFS().  For NFS, the latter can generate
1617 			 * an over-the-wire call.
1618 			 */
1619 			vskap = get_vskstat_anchor(vfsp);
1620 			/* Only take the lock if we have something to do */
1621 			if (vskap != NULL) {
1622 				vfs_lock_wait(vfsp);
1623 				if (vfsp->vfs_flag & VFS_STATS) {
1624 					vfsp->vfs_vskap = vskap;
1625 				}
1626 				vfs_unlock(vfsp);
1627 			}
1628 		}
1629 		/* Return vfsp to caller. */
1630 		*vfspp = vfsp;
1631 	}
1632 errout:
1633 	vfs_freeopttbl(&mnt_mntopts);
1634 	if (resource != NULL)
1635 		kmem_free(resource, strlen(resource) + 1);
1636 	if (mountpt != NULL)
1637 		kmem_free(mountpt, strlen(mountpt) + 1);
1638 	/*
1639 	 * It is possible we errored prior to adding to mount in progress
1640 	 * table. Must free vnode we acquired with successful lookupname.
1641 	 */
1642 	if (addmip)
1643 		VN_RELE(bvp);
1644 	if (delmip)
1645 		vfs_delmip(vfsp);
1646 	ASSERT(vswp != NULL);
1647 	vfs_unrefvfssw(vswp);
1648 	if (inargs != opts)
1649 		kmem_free(inargs, MAX_MNTOPT_STR);
1650 	if (copyout_error) {
1651 		VFS_RELE(vfsp);
1652 		error = copyout_error;
1653 	}
1654 	return (error);
1655 }
1656 
1657 static void
1658 vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath)
1659 {
1660 	size_t len;
1661 	refstr_t *ref;
1662 	zone_t *zone = curproc->p_zone;
1663 	char *sp;
1664 	int have_list_lock = 0;
1665 
1666 	ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1667 
1668 	/*
1669 	 * New path must be less than MAXPATHLEN because mntfs
1670 	 * will only display up to MAXPATHLEN bytes. This is currently
1671 	 * safe, because domount() uses pn_get(), and other callers
1672 	 * similarly cap the size to fewer than MAXPATHLEN bytes.
1673 	 */
1674 
1675 	ASSERT(strlen(newpath) < MAXPATHLEN);
1676 
1677 	/* mntfs requires consistency while vfs list lock is held */
1678 
1679 	if (VFS_ON_LIST(vfsp)) {
1680 		have_list_lock = 1;
1681 		vfs_list_lock();
1682 	}
1683 
1684 	if (*refp != NULL)
1685 		refstr_rele(*refp);
1686 
1687 	/* Do we need to modify the path? */
1688 
1689 	if (zone == global_zone || *newpath != '/') {
1690 		ref = refstr_alloc(newpath);
1691 		goto out;
1692 	}
1693 
1694 	/*
1695 	 * Truncate the trailing '/' in the zoneroot, and merge
1696 	 * in the zone's rootpath with the "newpath" (resource
1697 	 * or mountpoint) passed in.
1698 	 *
1699 	 * The size of the required buffer is thus the size of
1700 	 * the buffer required for the passed-in newpath
1701 	 * (strlen(newpath) + 1), plus the size of the buffer
1702 	 * required to hold zone_rootpath (zone_rootpathlen)
1703 	 * minus one for one of the now-superfluous NUL
1704 	 * terminations, minus one for the trailing '/'.
1705 	 *
1706 	 * That gives us:
1707 	 *
1708 	 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1709 	 *
1710 	 * Which is what we have below.
1711 	 */
1712 
1713 	len = strlen(newpath) + zone->zone_rootpathlen - 1;
1714 	sp = kmem_alloc(len, KM_SLEEP);
1715 
1716 	/*
1717 	 * Copy everything including the trailing slash, which
1718 	 * we then overwrite with the NUL character.
1719 	 */
1720 
1721 	(void) strcpy(sp, zone->zone_rootpath);
1722 	sp[zone->zone_rootpathlen - 2] = '\0';
1723 	(void) strcat(sp, newpath);
1724 
1725 	ref = refstr_alloc(sp);
1726 	kmem_free(sp, len);
1727 out:
1728 	*refp = ref;
1729 
1730 	if (have_list_lock) {
1731 		vfs_mnttab_modtimeupd();
1732 		vfs_list_unlock();
1733 	}
1734 }
1735 
1736 /*
1737  * Record a mounted resource name in a vfs structure.
1738  * If vfsp is already mounted, caller must hold the vfs lock.
1739  */
1740 void
1741 vfs_setresource(struct vfs *vfsp, const char *resource)
1742 {
1743 	if (resource == NULL || resource[0] == '\0')
1744 		resource = VFS_NORESOURCE;
1745 	vfs_setpath(vfsp, &vfsp->vfs_resource, resource);
1746 }
1747 
1748 /*
1749  * Record a mount point name in a vfs structure.
1750  * If vfsp is already mounted, caller must hold the vfs lock.
1751  */
1752 void
1753 vfs_setmntpoint(struct vfs *vfsp, const char *mntpt)
1754 {
1755 	if (mntpt == NULL || mntpt[0] == '\0')
1756 		mntpt = VFS_NOMNTPT;
1757 	vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt);
1758 }
1759 
1760 /* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1761 
1762 refstr_t *
1763 vfs_getresource(const struct vfs *vfsp)
1764 {
1765 	refstr_t *resource;
1766 
1767 	vfs_list_read_lock();
1768 	resource = vfsp->vfs_resource;
1769 	refstr_hold(resource);
1770 	vfs_list_unlock();
1771 
1772 	return (resource);
1773 }
1774 
1775 /* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1776 
1777 refstr_t *
1778 vfs_getmntpoint(const struct vfs *vfsp)
1779 {
1780 	refstr_t *mntpt;
1781 
1782 	vfs_list_read_lock();
1783 	mntpt = vfsp->vfs_mntpt;
1784 	refstr_hold(mntpt);
1785 	vfs_list_unlock();
1786 
1787 	return (mntpt);
1788 }
1789 
1790 /*
1791  * Create an empty options table with enough empty slots to hold all
1792  * The options in the options string passed as an argument.
1793  * Potentially prepend another options table.
1794  *
1795  * Note: caller is responsible for locking the vfs list, if needed,
1796  *       to protect mops.
1797  */
1798 static void
1799 vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1800     const mntopts_t *mtmpl)
1801 {
1802 	const char *s = opts;
1803 	uint_t count;
1804 
1805 	if (opts == NULL || *opts == '\0') {
1806 		count = 0;
1807 	} else {
1808 		count = 1;
1809 
1810 		/*
1811 		 * Count number of options in the string
1812 		 */
1813 		for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1814 			count++;
1815 			s++;
1816 		}
1817 	}
1818 	vfs_copyopttbl_extend(mtmpl, mops, count);
1819 }
1820 
1821 /*
1822  * Create an empty options table with enough empty slots to hold all
1823  * The options in the options string passed as an argument.
1824  *
1825  * This function is *not* for general use by filesystems.
1826  *
1827  * Note: caller is responsible for locking the vfs list, if needed,
1828  *       to protect mops.
1829  */
1830 void
1831 vfs_createopttbl(mntopts_t *mops, const char *opts)
1832 {
1833 	vfs_createopttbl_extend(mops, opts, NULL);
1834 }
1835 
1836 
1837 /*
1838  * Swap two mount options tables
1839  */
1840 static void
1841 vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
1842 {
1843 	uint_t tmpcnt;
1844 	mntopt_t *tmplist;
1845 
1846 	tmpcnt = optbl2->mo_count;
1847 	tmplist = optbl2->mo_list;
1848 	optbl2->mo_count = optbl1->mo_count;
1849 	optbl2->mo_list = optbl1->mo_list;
1850 	optbl1->mo_count = tmpcnt;
1851 	optbl1->mo_list = tmplist;
1852 }
1853 
1854 static void
1855 vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
1856 {
1857 	vfs_list_lock();
1858 	vfs_swapopttbl_nolock(optbl1, optbl2);
1859 	vfs_mnttab_modtimeupd();
1860 	vfs_list_unlock();
1861 }
1862 
1863 static char **
1864 vfs_copycancelopt_extend(char **const moc, int extend)
1865 {
1866 	int i = 0;
1867 	int j;
1868 	char **result;
1869 
1870 	if (moc != NULL) {
1871 		for (; moc[i] != NULL; i++)
1872 			/* count number of options to cancel */;
1873 	}
1874 
1875 	if (i + extend == 0)
1876 		return (NULL);
1877 
1878 	result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
1879 
1880 	for (j = 0; j < i; j++) {
1881 		result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
1882 		(void) strcpy(result[j], moc[j]);
1883 	}
1884 	for (; j <= i + extend; j++)
1885 		result[j] = NULL;
1886 
1887 	return (result);
1888 }
1889 
1890 static void
1891 vfs_copyopt(const mntopt_t *s, mntopt_t *d)
1892 {
1893 	char *sp, *dp;
1894 
1895 	d->mo_flags = s->mo_flags;
1896 	d->mo_data = s->mo_data;
1897 	sp = s->mo_name;
1898 	if (sp != NULL) {
1899 		dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
1900 		(void) strcpy(dp, sp);
1901 		d->mo_name = dp;
1902 	} else {
1903 		d->mo_name = NULL; /* should never happen */
1904 	}
1905 
1906 	d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
1907 
1908 	sp = s->mo_arg;
1909 	if (sp != NULL) {
1910 		dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
1911 		(void) strcpy(dp, sp);
1912 		d->mo_arg = dp;
1913 	} else {
1914 		d->mo_arg = NULL;
1915 	}
1916 }
1917 
1918 /*
1919  * Copy a mount options table, possibly allocating some spare
1920  * slots at the end.  It is permissible to copy_extend the NULL table.
1921  */
1922 static void
1923 vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
1924 {
1925 	uint_t i, count;
1926 	mntopt_t *motbl;
1927 
1928 	/*
1929 	 * Clear out any existing stuff in the options table being initialized
1930 	 */
1931 	vfs_freeopttbl(dmo);
1932 	count = (smo == NULL) ? 0 : smo->mo_count;
1933 	if ((count + extra) == 0)	/* nothing to do */
1934 		return;
1935 	dmo->mo_count = count + extra;
1936 	motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
1937 	dmo->mo_list = motbl;
1938 	for (i = 0; i < count; i++) {
1939 		vfs_copyopt(&smo->mo_list[i], &motbl[i]);
1940 	}
1941 	for (i = count; i < count + extra; i++) {
1942 		motbl[i].mo_flags = MO_EMPTY;
1943 	}
1944 }
1945 
1946 /*
1947  * Copy a mount options table.
1948  *
1949  * This function is *not* for general use by filesystems.
1950  *
1951  * Note: caller is responsible for locking the vfs list, if needed,
1952  *       to protect smo and dmo.
1953  */
1954 void
1955 vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
1956 {
1957 	vfs_copyopttbl_extend(smo, dmo, 0);
1958 }
1959 
1960 static char **
1961 vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
1962 {
1963 	int c1 = 0;
1964 	int c2 = 0;
1965 	char **result;
1966 	char **sp1, **sp2, **dp;
1967 
1968 	/*
1969 	 * First we count both lists of cancel options.
1970 	 * If either is NULL or has no elements, we return a copy of
1971 	 * the other.
1972 	 */
1973 	if (mop1->mo_cancel != NULL) {
1974 		for (; mop1->mo_cancel[c1] != NULL; c1++)
1975 			/* count cancel options in mop1 */;
1976 	}
1977 
1978 	if (c1 == 0)
1979 		return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
1980 
1981 	if (mop2->mo_cancel != NULL) {
1982 		for (; mop2->mo_cancel[c2] != NULL; c2++)
1983 			/* count cancel options in mop2 */;
1984 	}
1985 
1986 	result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
1987 
1988 	if (c2 == 0)
1989 		return (result);
1990 
1991 	/*
1992 	 * When we get here, we've got two sets of cancel options;
1993 	 * we need to merge the two sets.  We know that the result
1994 	 * array has "c1+c2+1" entries and in the end we might shrink
1995 	 * it.
1996 	 * Result now has a copy of the c1 entries from mop1; we'll
1997 	 * now lookup all the entries of mop2 in mop1 and copy it if
1998 	 * it is unique.
1999 	 * This operation is O(n^2) but it's only called once per
2000 	 * filesystem per duplicate option.  This is a situation
2001 	 * which doesn't arise with the filesystems in ON and
2002 	 * n is generally 1.
2003 	 */
2004 
2005 	dp = &result[c1];
2006 	for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2007 		for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2008 			if (strcmp(*sp1, *sp2) == 0)
2009 				break;
2010 		}
2011 		if (*sp1 == NULL) {
2012 			/*
2013 			 * Option *sp2 not found in mop1, so copy it.
2014 			 * The calls to vfs_copycancelopt_extend()
2015 			 * guarantee that there's enough room.
2016 			 */
2017 			*dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2018 			(void) strcpy(*dp++, *sp2);
2019 		}
2020 	}
2021 	if (dp != &result[c1+c2]) {
2022 		size_t bytes = (dp - result + 1) * sizeof (char *);
2023 		char **nres = kmem_alloc(bytes, KM_SLEEP);
2024 
2025 		bcopy(result, nres, bytes);
2026 		kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2027 		result = nres;
2028 	}
2029 	return (result);
2030 }
2031 
2032 /*
2033  * Merge two mount option tables (outer and inner) into one.  This is very
2034  * similar to "merging" global variables and automatic variables in C.
2035  *
2036  * This isn't (and doesn't have to be) fast.
2037  *
2038  * This function is *not* for general use by filesystems.
2039  *
2040  * Note: caller is responsible for locking the vfs list, if needed,
2041  *       to protect omo, imo & dmo.
2042  */
2043 void
2044 vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2045 {
2046 	uint_t i, count;
2047 	mntopt_t *mop, *motbl;
2048 	uint_t freeidx;
2049 
2050 	/*
2051 	 * First determine how much space we need to allocate.
2052 	 */
2053 	count = omo->mo_count;
2054 	for (i = 0; i < imo->mo_count; i++) {
2055 		if (imo->mo_list[i].mo_flags & MO_EMPTY)
2056 			continue;
2057 		if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2058 			count++;
2059 	}
2060 	ASSERT(count >= omo->mo_count &&
2061 	    count <= omo->mo_count + imo->mo_count);
2062 	motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2063 	for (i = 0; i < omo->mo_count; i++)
2064 		vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2065 	freeidx = omo->mo_count;
2066 	for (i = 0; i < imo->mo_count; i++) {
2067 		if (imo->mo_list[i].mo_flags & MO_EMPTY)
2068 			continue;
2069 		if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2070 			char **newcanp;
2071 			uint_t index = mop - omo->mo_list;
2072 
2073 			newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2074 
2075 			vfs_freeopt(&motbl[index]);
2076 			vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2077 
2078 			vfs_freecancelopt(motbl[index].mo_cancel);
2079 			motbl[index].mo_cancel = newcanp;
2080 		} else {
2081 			/*
2082 			 * If it's a new option, just copy it over to the first
2083 			 * free location.
2084 			 */
2085 			vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2086 		}
2087 	}
2088 	dmo->mo_count = count;
2089 	dmo->mo_list = motbl;
2090 }
2091 
2092 /*
2093  * Functions to set and clear mount options in a mount options table.
2094  */
2095 
2096 /*
2097  * Clear a mount option, if it exists.
2098  *
2099  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2100  * the vfs list.
2101  */
2102 static void
2103 vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2104 {
2105 	struct mntopt *mop;
2106 	uint_t i, count;
2107 
2108 	ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2109 
2110 	count = mops->mo_count;
2111 	for (i = 0; i < count; i++) {
2112 		mop = &mops->mo_list[i];
2113 
2114 		if (mop->mo_flags & MO_EMPTY)
2115 			continue;
2116 		if (strcmp(opt, mop->mo_name))
2117 			continue;
2118 		mop->mo_flags &= ~MO_SET;
2119 		if (mop->mo_arg != NULL) {
2120 			kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2121 		}
2122 		mop->mo_arg = NULL;
2123 		if (update_mnttab)
2124 			vfs_mnttab_modtimeupd();
2125 		break;
2126 	}
2127 }
2128 
2129 void
2130 vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2131 {
2132 	int gotlock = 0;
2133 
2134 	if (VFS_ON_LIST(vfsp)) {
2135 		gotlock = 1;
2136 		vfs_list_lock();
2137 	}
2138 	vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2139 	if (gotlock)
2140 		vfs_list_unlock();
2141 }
2142 
2143 
2144 /*
2145  * Set a mount option on.  If it's not found in the table, it's silently
2146  * ignored.  If the option has MO_IGNORE set, it is still set unless the
2147  * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2148  * bits can be used to toggle the MO_NODISPLAY bit for the option.
2149  * If the VFS_CREATEOPT flag bit is set then the first option slot with
2150  * MO_EMPTY set is created as the option passed in.
2151  *
2152  * The update_mnttab arg indicates whether mops is part of a vfs that is on
2153  * the vfs list.
2154  */
2155 static void
2156 vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2157     const char *arg, int flags, int update_mnttab)
2158 {
2159 	mntopt_t *mop;
2160 	uint_t i, count;
2161 	char *sp;
2162 
2163 	ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2164 
2165 	if (flags & VFS_CREATEOPT) {
2166 		if (vfs_hasopt(mops, opt) != NULL) {
2167 			flags &= ~VFS_CREATEOPT;
2168 		}
2169 	}
2170 	count = mops->mo_count;
2171 	for (i = 0; i < count; i++) {
2172 		mop = &mops->mo_list[i];
2173 
2174 		if (mop->mo_flags & MO_EMPTY) {
2175 			if ((flags & VFS_CREATEOPT) == 0)
2176 				continue;
2177 			sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2178 			(void) strcpy(sp, opt);
2179 			mop->mo_name = sp;
2180 			if (arg != NULL)
2181 				mop->mo_flags = MO_HASVALUE;
2182 			else
2183 				mop->mo_flags = 0;
2184 		} else if (strcmp(opt, mop->mo_name)) {
2185 			continue;
2186 		}
2187 		if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2188 			break;
2189 		if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2190 			sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2191 			(void) strcpy(sp, arg);
2192 		} else {
2193 			sp = NULL;
2194 		}
2195 		if (mop->mo_arg != NULL)
2196 			kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2197 		mop->mo_arg = sp;
2198 		if (flags & VFS_DISPLAY)
2199 			mop->mo_flags &= ~MO_NODISPLAY;
2200 		if (flags & VFS_NODISPLAY)
2201 			mop->mo_flags |= MO_NODISPLAY;
2202 		mop->mo_flags |= MO_SET;
2203 		if (mop->mo_cancel != NULL) {
2204 			char **cp;
2205 
2206 			for (cp = mop->mo_cancel; *cp != NULL; cp++)
2207 				vfs_clearmntopt_nolock(mops, *cp, 0);
2208 		}
2209 		if (update_mnttab)
2210 			vfs_mnttab_modtimeupd();
2211 		break;
2212 	}
2213 }
2214 
2215 void
2216 vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2217 {
2218 	int gotlock = 0;
2219 
2220 	if (VFS_ON_LIST(vfsp)) {
2221 		gotlock = 1;
2222 		vfs_list_lock();
2223 	}
2224 	vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2225 	if (gotlock)
2226 		vfs_list_unlock();
2227 }
2228 
2229 
2230 /*
2231  * Add a "tag" option to a mounted file system's options list.
2232  *
2233  * Note: caller is responsible for locking the vfs list, if needed,
2234  *       to protect mops.
2235  */
2236 static mntopt_t *
2237 vfs_addtag(mntopts_t *mops, const char *tag)
2238 {
2239 	uint_t count;
2240 	mntopt_t *mop, *motbl;
2241 
2242 	count = mops->mo_count + 1;
2243 	motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2244 	if (mops->mo_count) {
2245 		size_t len = (count - 1) * sizeof (mntopt_t);
2246 
2247 		bcopy(mops->mo_list, motbl, len);
2248 		kmem_free(mops->mo_list, len);
2249 	}
2250 	mops->mo_count = count;
2251 	mops->mo_list = motbl;
2252 	mop = &motbl[count - 1];
2253 	mop->mo_flags = MO_TAG;
2254 	mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2255 	(void) strcpy(mop->mo_name, tag);
2256 	return (mop);
2257 }
2258 
2259 /*
2260  * Allow users to set arbitrary "tags" in a vfs's mount options.
2261  * Broader use within the kernel is discouraged.
2262  */
2263 int
2264 vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2265     cred_t *cr)
2266 {
2267 	vfs_t *vfsp;
2268 	mntopts_t *mops;
2269 	mntopt_t *mop;
2270 	int found = 0;
2271 	dev_t dev = makedevice(major, minor);
2272 	int err = 0;
2273 	char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2274 
2275 	/*
2276 	 * Find the desired mounted file system
2277 	 */
2278 	vfs_list_lock();
2279 	vfsp = rootvfs;
2280 	do {
2281 		if (vfsp->vfs_dev == dev &&
2282 		    strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2283 			found = 1;
2284 			break;
2285 		}
2286 		vfsp = vfsp->vfs_next;
2287 	} while (vfsp != rootvfs);
2288 
2289 	if (!found) {
2290 		err = EINVAL;
2291 		goto out;
2292 	}
2293 	err = secpolicy_fs_config(cr, vfsp);
2294 	if (err != 0)
2295 		goto out;
2296 
2297 	mops = &vfsp->vfs_mntopts;
2298 	/*
2299 	 * Add tag if it doesn't already exist
2300 	 */
2301 	if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2302 		int len;
2303 
2304 		(void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2305 		len = strlen(buf);
2306 		if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2307 			err = ENAMETOOLONG;
2308 			goto out;
2309 		}
2310 		mop = vfs_addtag(mops, tag);
2311 	}
2312 	if ((mop->mo_flags & MO_TAG) == 0) {
2313 		err = EINVAL;
2314 		goto out;
2315 	}
2316 	vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2317 out:
2318 	vfs_list_unlock();
2319 	kmem_free(buf, MAX_MNTOPT_STR);
2320 	return (err);
2321 }
2322 
2323 /*
2324  * Allow users to remove arbitrary "tags" in a vfs's mount options.
2325  * Broader use within the kernel is discouraged.
2326  */
2327 int
2328 vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2329     cred_t *cr)
2330 {
2331 	vfs_t *vfsp;
2332 	mntopt_t *mop;
2333 	int found = 0;
2334 	dev_t dev = makedevice(major, minor);
2335 	int err = 0;
2336 
2337 	/*
2338 	 * Find the desired mounted file system
2339 	 */
2340 	vfs_list_lock();
2341 	vfsp = rootvfs;
2342 	do {
2343 		if (vfsp->vfs_dev == dev &&
2344 		    strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2345 			found = 1;
2346 			break;
2347 		}
2348 		vfsp = vfsp->vfs_next;
2349 	} while (vfsp != rootvfs);
2350 
2351 	if (!found) {
2352 		err = EINVAL;
2353 		goto out;
2354 	}
2355 	err = secpolicy_fs_config(cr, vfsp);
2356 	if (err != 0)
2357 		goto out;
2358 
2359 	if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2360 		err = EINVAL;
2361 		goto out;
2362 	}
2363 	if ((mop->mo_flags & MO_TAG) == 0) {
2364 		err = EINVAL;
2365 		goto out;
2366 	}
2367 	vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2368 out:
2369 	vfs_list_unlock();
2370 	return (err);
2371 }
2372 
2373 /*
2374  * Function to parse an option string and fill in a mount options table.
2375  * Unknown options are silently ignored.  The input option string is modified
2376  * by replacing separators with nulls.  If the create flag is set, options
2377  * not found in the table are just added on the fly.  The table must have
2378  * an option slot marked MO_EMPTY to add an option on the fly.
2379  *
2380  * This function is *not* for general use by filesystems.
2381  *
2382  * Note: caller is responsible for locking the vfs list, if needed,
2383  *       to protect mops..
2384  */
2385 void
2386 vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2387 {
2388 	char *s = osp, *p, *nextop, *valp, *cp, *ep;
2389 	int setflg = VFS_NOFORCEOPT;
2390 
2391 	if (osp == NULL)
2392 		return;
2393 	while (*s != '\0') {
2394 		p = strchr(s, ',');	/* find next option */
2395 		if (p == NULL) {
2396 			cp = NULL;
2397 			p = s + strlen(s);
2398 		} else {
2399 			cp = p;		/* save location of comma */
2400 			*p++ = '\0';	/* mark end and point to next option */
2401 		}
2402 		nextop = p;
2403 		p = strchr(s, '=');	/* look for value */
2404 		if (p == NULL) {
2405 			valp = NULL;	/* no value supplied */
2406 		} else {
2407 			ep = p;		/* save location of equals */
2408 			*p++ = '\0';	/* end option and point to value */
2409 			valp = p;
2410 		}
2411 		/*
2412 		 * set option into options table
2413 		 */
2414 		if (create)
2415 			setflg |= VFS_CREATEOPT;
2416 		vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2417 		if (cp != NULL)
2418 			*cp = ',';	/* restore the comma */
2419 		if (valp != NULL)
2420 			*ep = '=';	/* restore the equals */
2421 		s = nextop;
2422 	}
2423 }
2424 
2425 /*
2426  * Function to inquire if an option exists in a mount options table.
2427  * Returns a pointer to the option if it exists, else NULL.
2428  *
2429  * This function is *not* for general use by filesystems.
2430  *
2431  * Note: caller is responsible for locking the vfs list, if needed,
2432  *       to protect mops.
2433  */
2434 struct mntopt *
2435 vfs_hasopt(const mntopts_t *mops, const char *opt)
2436 {
2437 	struct mntopt *mop;
2438 	uint_t i, count;
2439 
2440 	count = mops->mo_count;
2441 	for (i = 0; i < count; i++) {
2442 		mop = &mops->mo_list[i];
2443 
2444 		if (mop->mo_flags & MO_EMPTY)
2445 			continue;
2446 		if (strcmp(opt, mop->mo_name) == 0)
2447 			return (mop);
2448 	}
2449 	return (NULL);
2450 }
2451 
2452 /*
2453  * Function to inquire if an option is set in a mount options table.
2454  * Returns non-zero if set and fills in the arg pointer with a pointer to
2455  * the argument string or NULL if there is no argument string.
2456  */
2457 static int
2458 vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2459 {
2460 	struct mntopt *mop;
2461 	uint_t i, count;
2462 
2463 	count = mops->mo_count;
2464 	for (i = 0; i < count; i++) {
2465 		mop = &mops->mo_list[i];
2466 
2467 		if (mop->mo_flags & MO_EMPTY)
2468 			continue;
2469 		if (strcmp(opt, mop->mo_name))
2470 			continue;
2471 		if ((mop->mo_flags & MO_SET) == 0)
2472 			return (0);
2473 		if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2474 			*argp = mop->mo_arg;
2475 		return (1);
2476 	}
2477 	return (0);
2478 }
2479 
2480 
2481 int
2482 vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2483 {
2484 	int ret;
2485 
2486 	vfs_list_read_lock();
2487 	ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2488 	vfs_list_unlock();
2489 	return (ret);
2490 }
2491 
2492 
2493 /*
2494  * Construct a comma separated string of the options set in the given
2495  * mount table, return the string in the given buffer.  Return non-zero if
2496  * the buffer would overflow.
2497  *
2498  * This function is *not* for general use by filesystems.
2499  *
2500  * Note: caller is responsible for locking the vfs list, if needed,
2501  *       to protect mp.
2502  */
2503 int
2504 vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2505 {
2506 	char *cp;
2507 	uint_t i;
2508 
2509 	buf[0] = '\0';
2510 	cp = buf;
2511 	for (i = 0; i < mp->mo_count; i++) {
2512 		struct mntopt *mop;
2513 
2514 		mop = &mp->mo_list[i];
2515 		if (mop->mo_flags & MO_SET) {
2516 			int optlen, comma = 0;
2517 
2518 			if (buf[0] != '\0')
2519 				comma = 1;
2520 			optlen = strlen(mop->mo_name);
2521 			if (strlen(buf) + comma + optlen + 1 > len)
2522 				goto err;
2523 			if (comma)
2524 				*cp++ = ',';
2525 			(void) strcpy(cp, mop->mo_name);
2526 			cp += optlen;
2527 			/*
2528 			 * Append option value if there is one
2529 			 */
2530 			if (mop->mo_arg != NULL) {
2531 				int arglen;
2532 
2533 				arglen = strlen(mop->mo_arg);
2534 				if (strlen(buf) + arglen + 2 > len)
2535 					goto err;
2536 				*cp++ = '=';
2537 				(void) strcpy(cp, mop->mo_arg);
2538 				cp += arglen;
2539 			}
2540 		}
2541 	}
2542 	return (0);
2543 err:
2544 	return (EOVERFLOW);
2545 }
2546 
/*
 * Free a NULL-terminated array of cancel option names: each string and
 * then the array itself (including its terminating slot).  A NULL array
 * is accepted and ignored.
 */
static void
vfs_freecancelopt(char **moc)
{
	size_t nent;

	if (moc == NULL)
		return;

	for (nent = 0; moc[nent] != NULL; nent++)
		kmem_free(moc[nent], strlen(moc[nent]) + 1);

	/* One extra pointer for the NULL terminator. */
	kmem_free(moc, (nent + 1) * sizeof (char *));
}
2561 
2562 static void
2563 vfs_freeopt(mntopt_t *mop)
2564 {
2565 	if (mop->mo_name != NULL)
2566 		kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2567 
2568 	vfs_freecancelopt(mop->mo_cancel);
2569 
2570 	if (mop->mo_arg != NULL)
2571 		kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2572 }
2573 
2574 /*
2575  * Free a mount options table
2576  *
2577  * This function is *not* for general use by filesystems.
2578  *
2579  * Note: caller is responsible for locking the vfs list, if needed,
2580  *       to protect mp.
2581  */
2582 void
2583 vfs_freeopttbl(mntopts_t *mp)
2584 {
2585 	uint_t i, count;
2586 
2587 	count = mp->mo_count;
2588 	for (i = 0; i < count; i++) {
2589 		vfs_freeopt(&mp->mo_list[i]);
2590 	}
2591 	if (count) {
2592 		kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2593 		mp->mo_count = 0;
2594 		mp->mo_list = NULL;
2595 	}
2596 }
2597 
2598 
/*
 * Read entry point for the dummy mnttab vnode.  Ignores all of its
 * arguments, transfers no data and always reports success.
 */
/* ARGSUSED */
static int
vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
	caller_context_t *ct)
{
	return (0);
}
2606 
/*
 * Write entry point for the dummy mnttab vnode.  Ignores all of its
 * arguments, transfers no data and always reports success.
 */
/* ARGSUSED */
static int
vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
	caller_context_t *ct)
{
	return (0);
}
2614 
2615 /*
2616  * The dummy vnode is currently used only by file events notification
2617  * module which is just interested in the timestamps.
2618  */
2619 /* ARGSUSED */
2620 static int
2621 vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2622     caller_context_t *ct)
2623 {
2624 	bzero(vap, sizeof (vattr_t));
2625 	vap->va_type = VREG;
2626 	vap->va_nlink = 1;
2627 	vap->va_ctime = vfs_mnttab_ctime;
2628 	/*
2629 	 * it is ok to just copy mtime as the time will be monotonically
2630 	 * increasing.
2631 	 */
2632 	vap->va_mtime = vfs_mnttab_mtime;
2633 	vap->va_atime = vap->va_mtime;
2634 	return (0);
2635 }
2636 
/*
 * Create the global dummy vnode (vfs_mntdummyvp) that stands in for the
 * mnttab file.  Builds a vnode ops vector from the dummy read/write/
 * getattr routines plus fs_vnevent_support, then allocates and
 * initialises the vnode.  If the ops vector cannot be built a warning is
 * logged and vfs_mntdummyvp is not assigned by this function.
 */
static void
vfs_mnttabvp_setup(void)
{
	vnode_t *tvp;
	vnodeops_t *vfs_mntdummyvnops;
	/* Operation name / handler pairs, terminated by a NULL pair. */
	const fs_operation_def_t mnt_dummyvnodeops_template[] = {
		VOPNAME_READ, 		{ .vop_read = vfs_mntdummyread },
		VOPNAME_WRITE, 		{ .vop_write = vfs_mntdummywrite },
		VOPNAME_GETATTR,	{ .vop_getattr = vfs_mntdummygetattr },
		VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
		NULL,			NULL
	};

	if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
	    &vfs_mntdummyvnops) != 0) {
		cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
		/* Shouldn't happen, but not bad enough to panic */
		return;
	}

	/*
	 * A global dummy vnode is allocated to represent mntfs files.
	 * The mntfs file (/etc/mnttab) can be monitored for file events
	 * and receive an event when mnttab changes. Dummy VOP calls
	 * will be made on this vnode. The file events notification module
	 * intercepts this vnode and delivers relevant events.
	 */
	tvp = vn_alloc(KM_SLEEP);
	tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
	vn_setops(tvp, vfs_mntdummyvnops);
	tvp->v_type = VREG;
	/*
	 * The mnt dummy ops do not reference v_data.
	 * No other module intercepting this vnode should either.
	 * Just set it to point to itself.
	 */
	tvp->v_data = (caddr_t)tvp;
	tvp->v_vfsp = rootvfs;
	/* Publish the vnode last, once it is fully initialised. */
	vfs_mntdummyvp = tvp;
}
2677 
2678 /*
2679  * performs fake read/write ops
2680  */
2681 static void
2682 vfs_mnttab_rwop(int rw)
2683 {
2684 	struct uio	uio;
2685 	struct iovec	iov;
2686 	char	buf[1];
2687 
2688 	if (vfs_mntdummyvp == NULL)
2689 		return;
2690 
2691 	bzero(&uio, sizeof (uio));
2692 	bzero(&iov, sizeof (iov));
2693 	iov.iov_base = buf;
2694 	iov.iov_len = 0;
2695 	uio.uio_iov = &iov;
2696 	uio.uio_iovcnt = 1;
2697 	uio.uio_loffset = 0;
2698 	uio.uio_segflg = UIO_SYSSPACE;
2699 	uio.uio_resid = 0;
2700 	if (rw) {
2701 		(void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2702 	} else {
2703 		(void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2704 	}
2705 }
2706 
/*
 * Generate a write operation: a fake zero-length write on the dummy
 * mnttab vnode, via vfs_mnttab_rwop().
 */
void
vfs_mnttab_writeop(void)
{
	vfs_mnttab_rwop(1);
}
2715 
/*
 * Generate a read operation: a fake zero-length read on the dummy
 * mnttab vnode, via vfs_mnttab_rwop().
 */
void
vfs_mnttab_readop(void)
{
	vfs_mnttab_rwop(0);
}
2724 
2725 /*
2726  * Free any mnttab information recorded in the vfs struct.
2727  * The vfs must not be on the vfs list.
2728  */
2729 static void
2730 vfs_freemnttab(struct vfs *vfsp)
2731 {
2732 	ASSERT(!VFS_ON_LIST(vfsp));
2733 
2734 	/*
2735 	 * Free device and mount point information
2736 	 */
2737 	if (vfsp->vfs_mntpt != NULL) {
2738 		refstr_rele(vfsp->vfs_mntpt);
2739 		vfsp->vfs_mntpt = NULL;
2740 	}
2741 	if (vfsp->vfs_resource != NULL) {
2742 		refstr_rele(vfsp->vfs_resource);
2743 		vfsp->vfs_resource = NULL;
2744 	}
2745 	/*
2746 	 * Now free mount options information
2747 	 */
2748 	vfs_freeopttbl(&vfsp->vfs_mntopts);
2749 }
2750 
/*
 * Return the last mnttab modification time in *ts.
 * The caller must hold the vfs list lock (the ASSERT checks that it is
 * held).
 */
void
vfs_mnttab_modtime(timespec_t *ts)
{
	ASSERT(RW_LOCK_HELD(&vfslist));
	*ts = vfs_mnttab_mtime;
}
2760 
2761 /*
2762  * See if mnttab is changed
2763  */
2764 void
2765 vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2766 {
2767 	int changed;
2768 
2769 	*phpp = (struct pollhead *)NULL;
2770 
2771 	/*
2772 	 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2773 	 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2774 	 * to not grab the vfs list lock because tv_sec is monotonically
2775 	 * increasing.
2776 	 */
2777 
2778 	changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2779 	    (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2780 	if (!changed) {
2781 		*phpp = &vfs_pollhd;
2782 	}
2783 }
2784 
2785 /*
2786  * Update the mnttab modification time and wake up any waiters for
2787  * mnttab changes
2788  */
2789 void
2790 vfs_mnttab_modtimeupd()
2791 {
2792 	hrtime_t oldhrt, newhrt;
2793 
2794 	ASSERT(RW_WRITE_HELD(&vfslist));
2795 	oldhrt = ts2hrt(&vfs_mnttab_mtime);
2796 	gethrestime(&vfs_mnttab_mtime);
2797 	newhrt = ts2hrt(&vfs_mnttab_mtime);
2798 	if (oldhrt == (hrtime_t)0)
2799 		vfs_mnttab_ctime = vfs_mnttab_mtime;
2800 	/*
2801 	 * Attempt to provide unique mtime (like uniqtime but not).
2802 	 */
2803 	if (newhrt == oldhrt) {
2804 		newhrt++;
2805 		hrt2ts(newhrt, &vfs_mnttab_mtime);
2806 	}
2807 	pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
2808 	vfs_mnttab_writeop();
2809 }
2810 
2811 int
2812 dounmount(struct vfs *vfsp, int flag, cred_t *cr)
2813 {
2814 	vnode_t *coveredvp;
2815 	int error;
2816 	extern void teardown_vopstats(vfs_t *);
2817 
2818 	/*
2819 	 * Get covered vnode. This will be NULL if the vfs is not linked
2820 	 * into the file system name space (i.e., domount() with MNT_NOSPICE).
2821 	 */
2822 	coveredvp = vfsp->vfs_vnodecovered;
2823 	ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
2824 
2825 	/*
2826 	 * Purge all dnlc entries for this vfs.
2827 	 */
2828 	(void) dnlc_purge_vfsp(vfsp, 0);
2829 
2830 	/* For forcible umount, skip VFS_SYNC() since it may hang */
2831 	if ((flag & MS_FORCE) == 0)
2832 		(void) VFS_SYNC(vfsp, 0, cr);
2833 
2834 	/*
2835 	 * Lock the vfs to maintain fs status quo during unmount.  This
2836 	 * has to be done after the sync because ufs_update tries to acquire
2837 	 * the vfs_reflock.
2838 	 */
2839 	vfs_lock_wait(vfsp);
2840 
2841 	if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
2842 		vfs_unlock(vfsp);
2843 		if (coveredvp != NULL)
2844 			vn_vfsunlock(coveredvp);
2845 	} else if (coveredvp != NULL) {
2846 		teardown_vopstats(vfsp);
2847 		/*
2848 		 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
2849 		 * when it frees vfsp so we do a VN_HOLD() so we can
2850 		 * continue to use coveredvp afterwards.
2851 		 */
2852 		VN_HOLD(coveredvp);
2853 		vfs_remove(vfsp);
2854 		vn_vfsunlock(coveredvp);
2855 		VN_RELE(coveredvp);
2856 	} else {
2857 		teardown_vopstats(vfsp);
2858 		/*
2859 		 * Release the reference to vfs that is not linked
2860 		 * into the name space.
2861 		 */
2862 		vfs_unlock(vfsp);
2863 		VFS_RELE(vfsp);
2864 	}
2865 	return (error);
2866 }
2867 
2868 
/*
 * Vfs_unmountall() is called by uadmin() to unmount all
 * mounted file systems (except the root file system) during shutdown.
 * It follows the existing locking protocol when traversing the vfs list
 * to sync and unmount vfses. Even though there should be no
 * other thread running while the system is shutting down, it is prudent
 * to still follow the locking protocol.
 *
 * The list is walked backwards from the entry before rootvfs.  Entries
 * whose vfs lock or covered-vnode lock cannot be obtained are skipped.
 * Errors from the sync and unmount calls are ignored.
 */
void
vfs_unmountall(void)
{
	struct vfs *vfsp;
	struct vfs *prev_vfsp = NULL;
	int error;

	/*
	 * Toss all dnlc entries now so that the per-vfs sync
	 * and unmount operations don't have to slog through
	 * a bunch of uninteresting vnodes over and over again.
	 */
	dnlc_purge();

	vfs_list_lock();
	for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
		/* Remember where to go next before vfsp can disappear. */
		prev_vfsp = vfsp->vfs_prev;

		if (vfs_lock(vfsp) != 0)
			continue;
		/*
		 * The vfs lock is only held while the covered vnode's
		 * lock is acquired; dounmount() expects the latter.
		 */
		error = vn_vfswlock(vfsp->vfs_vnodecovered);
		vfs_unlock(vfsp);
		if (error)
			continue;

		vfs_list_unlock();

		(void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
		(void) dounmount(vfsp, 0, CRED());

		/*
		 * Since we dropped the vfslist lock above we must
		 * verify that prev_vfsp still exists, else start over
		 * from the end of the list.
		 */
		vfs_list_lock();
		for (vfsp = rootvfs->vfs_prev;
		    vfsp != rootvfs; vfsp = vfsp->vfs_prev)
			if (vfsp == prev_vfsp)
				break;
		if (vfsp == rootvfs && prev_vfsp != rootvfs)
			prev_vfsp = rootvfs->vfs_prev;
	}
	vfs_list_unlock();
}
2921 
2922 /*
2923  * Called to add an entry to the end of the vfs mount in progress list
2924  */
2925 void
2926 vfs_addmip(dev_t dev, struct vfs *vfsp)
2927 {
2928 	struct ipmnt *mipp;
2929 
2930 	mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
2931 	mipp->mip_next = NULL;
2932 	mipp->mip_dev = dev;
2933 	mipp->mip_vfsp = vfsp;
2934 	mutex_enter(&vfs_miplist_mutex);
2935 	if (vfs_miplist_end != NULL)
2936 		vfs_miplist_end->mip_next = mipp;
2937 	else
2938 		vfs_miplist = mipp;
2939 	vfs_miplist_end = mipp;
2940 	mutex_exit(&vfs_miplist_mutex);
2941 }
2942 
2943 /*
2944  * Called to remove an entry from the mount in progress list
2945  * Either because the mount completed or it failed.
2946  */
2947 void
2948 vfs_delmip(struct vfs *vfsp)
2949 {
2950 	struct ipmnt *mipp, *mipprev;
2951 
2952 	mutex_enter(&vfs_miplist_mutex);
2953 	mipprev = NULL;
2954 	for (mipp = vfs_miplist;
2955 	    mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
2956 		mipprev = mipp;
2957 	}
2958 	if (mipp == NULL)
2959 		return; /* shouldn't happen */
2960 	if (mipp == vfs_miplist_end)
2961 		vfs_miplist_end = mipprev;
2962 	if (mipprev == NULL)
2963 		vfs_miplist = mipp->mip_next;
2964 	else
2965 		mipprev->mip_next = mipp->mip_next;
2966 	mutex_exit(&vfs_miplist_mutex);
2967 	kmem_free(mipp, sizeof (struct ipmnt));
2968 }
2969 
2970 /*
2971  * vfs_add is called by a specific filesystem's mount routine to add
2972  * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
2973  * The vfs should already have been locked by the caller.
2974  *
2975  * coveredvp is NULL if this is the root.
2976  */
2977 void
2978 vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
2979 {
2980 	int newflag;
2981 
2982 	ASSERT(vfs_lock_held(vfsp));
2983 	VFS_HOLD(vfsp);
2984 	newflag = vfsp->vfs_flag;
2985 	if (mflag & MS_RDONLY)
2986 		newflag |= VFS_RDONLY;
2987 	else
2988 		newflag &= ~VFS_RDONLY;
2989 	if (mflag & MS_NOSUID)
2990 		newflag |= (VFS_NOSETUID|VFS_NODEVICES);
2991 	else
2992 		newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
2993 	if (mflag & MS_NOMNTTAB)
2994 		newflag |= VFS_NOMNTTAB;
2995 	else
2996 		newflag &= ~VFS_NOMNTTAB;
2997 
2998 	if (coveredvp != NULL) {
2999 		ASSERT(vn_vfswlock_held(coveredvp));
3000 		coveredvp->v_vfsmountedhere = vfsp;
3001 		VN_HOLD(coveredvp);
3002 	}
3003 	vfsp->vfs_vnodecovered = coveredvp;
3004 	vfsp->vfs_flag = newflag;
3005 
3006 	vfs_list_add(vfsp);
3007 }
3008 
3009 /*
3010  * Remove a vfs from the vfs list, null out the pointer from the
3011  * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3012  * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3013  * reference to the vfs and to the covered vnode.
3014  *
3015  * Called from dounmount after it's confirmed with the file system
3016  * that the unmount is legal.
3017  */
3018 void
3019 vfs_remove(struct vfs *vfsp)
3020 {
3021 	vnode_t *vp;
3022 
3023 	ASSERT(vfs_lock_held(vfsp));
3024 
3025 	/*
3026 	 * Can't unmount root.  Should never happen because fs will
3027 	 * be busy.
3028 	 */
3029 	if (vfsp == rootvfs)
3030 		panic("vfs_remove: unmounting root");
3031 
3032 	vfs_list_remove(vfsp);
3033 
3034 	/*
3035 	 * Unhook from the file system name space.
3036 	 */
3037 	vp = vfsp->vfs_vnodecovered;
3038 	ASSERT(vn_vfswlock_held(vp));
3039 	vp->v_vfsmountedhere = NULL;
3040 	vfsp->vfs_vnodecovered = NULL;
3041 	VN_RELE(vp);
3042 
3043 	/*
3044 	 * Release lock and wakeup anybody waiting.
3045 	 */
3046 	vfs_unlock(vfsp);
3047 	VFS_RELE(vfsp);
3048 }
3049 
3050 /*
3051  * Lock a filesystem to prevent access to it while mounting,
3052  * unmounting and syncing.  Return EBUSY immediately if lock
3053  * can't be acquired.
3054  */
3055 int
3056 vfs_lock(vfs_t *vfsp)
3057 {
3058 	vn_vfslocks_entry_t *vpvfsentry;
3059 
3060 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3061 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3062 		return (0);
3063 
3064 	vn_vfslocks_rele(vpvfsentry);
3065 	return (EBUSY);
3066 }
3067 
3068 int
3069 vfs_rlock(vfs_t *vfsp)
3070 {
3071 	vn_vfslocks_entry_t *vpvfsentry;
3072 
3073 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3074 
3075 	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3076 		return (0);
3077 
3078 	vn_vfslocks_rele(vpvfsentry);
3079 	return (EBUSY);
3080 }
3081 
3082 void
3083 vfs_lock_wait(vfs_t *vfsp)
3084 {
3085 	vn_vfslocks_entry_t *vpvfsentry;
3086 
3087 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3088 	rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3089 }
3090 
3091 void
3092 vfs_rlock_wait(vfs_t *vfsp)
3093 {
3094 	vn_vfslocks_entry_t *vpvfsentry;
3095 
3096 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3097 	rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3098 }
3099 
/*
 * Unlock a locked filesystem.
 *
 * Releases the lock taken by vfs_lock(), vfs_rlock(), vfs_lock_wait() or
 * vfs_rlock_wait().  Does nothing once the system is panicking.
 */
void
vfs_unlock(vfs_t *vfsp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * vfs_unlock will mimic sema_v behaviour to fix 4748018.
	 * And these changes should remain for the patch changes as it is.
	 */
	if (panicstr)
		return;

	/*
	 * ve_refcount needs to be dropped twice here.
	 * 1. To release the reference after the call to
	 *    vn_vfslocks_getlock() below.
	 * 2. To release the reference kept by the locking routines
	 *    (vfs_lock, vfs_rlock, vfs_lock_wait, vfs_rlock_wait), which
	 *    return with the lock held without releasing the entry.
	 */

	vpvfsentry = vn_vfslocks_getlock(vfsp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
3128 
3129 /*
3130  * Utility routine that allows a filesystem to construct its
3131  * fsid in "the usual way" - by munging some underlying dev_t and
3132  * the filesystem type number into the 64-bit fsid.  Note that
3133  * this implicitly relies on dev_t persistence to make filesystem
3134  * id's persistent.
3135  *
3136  * There's nothing to prevent an individual fs from constructing its
3137  * fsid in a different way, and indeed they should.
3138  *
3139  * Since we want fsids to be 32-bit quantities (so that they can be
3140  * exported identically by either 32-bit or 64-bit APIs, as well as
3141  * the fact that fsid's are "known" to NFS), we compress the device
3142  * number given down to 32-bits, and panic if that isn't possible.
3143  */
3144 void
3145 vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3146 {
3147 	if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3148 		panic("device number too big for fsid!");
3149 	fsi->val[1] = val;
3150 }
3151 
3152 int
3153 vfs_lock_held(vfs_t *vfsp)
3154 {
3155 	int held;
3156 	vn_vfslocks_entry_t *vpvfsentry;
3157 
3158 	/*
3159 	 * vfs_lock_held will mimic sema_held behaviour
3160 	 * if panicstr is set. And these changes should remain
3161 	 * for the patch changes as it is.
3162 	 */
3163 	if (panicstr)
3164 		return (1);
3165 
3166 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3167 	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3168 
3169 	vn_vfslocks_rele(vpvfsentry);
3170 	return (held);
3171 }
3172 
3173 struct _kthread *
3174 vfs_lock_owner(vfs_t *vfsp)
3175 {
3176 	struct _kthread *owner;
3177 	vn_vfslocks_entry_t *vpvfsentry;
3178 
3179 	/*
3180 	 * vfs_wlock_held will mimic sema_held behaviour
3181 	 * if panicstr is set. And these changes should remain
3182 	 * for the patch changes as it is.
3183 	 */
3184 	if (panicstr)
3185 		return (NULL);
3186 
3187 	vpvfsentry = vn_vfslocks_getlock(vfsp);
3188 	owner = rwst_owner(&vpvfsentry->ve_lock);
3189 
3190 	vn_vfslocks_rele(vpvfsentry);
3191 	return (owner);
3192 }
3193 
3194 /*
3195  * vfs list locking.
3196  *
3197  * Rather than manipulate the vfslist lock directly, we abstract into lock
3198  * and unlock routines to allow the locking implementation to be changed for
3199  * clustering.
3200  *
3201  * Whenever the vfs list is modified through its hash links, the overall list
3202  * lock must be obtained before locking the relevant hash bucket.  But to see
3203  * whether a given vfs is on the list, it suffices to obtain the lock for the
3204  * hash bucket without getting the overall list lock.  (See getvfs() below.)
3205  */
3206 
/*
 * Acquire the vfs list lock as writer.
 */
void
vfs_list_lock()
{
	rw_enter(&vfslist, RW_WRITER);
}
3212 
/*
 * Acquire the vfs list lock as reader.
 */
void
vfs_list_read_lock()
{
	rw_enter(&vfslist, RW_READER);
}
3218 
/*
 * Release the vfs list lock, however it was acquired.
 */
void
vfs_list_unlock()
{
	rw_exit(&vfslist);
}
3224 
3225 /*
3226  * Low level worker routines for adding entries to and removing entries from
3227  * the vfs list.
3228  */
3229 
3230 static void
3231 vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3232 {
3233 	int vhno;
3234 	struct vfs **hp;
3235 	dev_t dev;
3236 
3237 	ASSERT(RW_WRITE_HELD(&vfslist));
3238 
3239 	dev = expldev(vfsp->vfs_fsid.val[0]);
3240 	vhno = VFSHASH(getmajor(dev), getminor(dev));
3241 
3242 	mutex_enter(&rvfs_list[vhno].rvfs_lock);
3243 
3244 	/*
3245 	 * Link into the hash table, inserting it at the end, so that LOFS
3246 	 * with the same fsid as UFS (or other) file systems will not hide the
3247 	 * UFS.
3248 	 */
3249 	if (insert_at_head) {
3250 		vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3251 		rvfs_list[vhno].rvfs_head = vfsp;
3252 	} else {
3253 		for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3254 		    hp = &(*hp)->vfs_hash)
3255 			continue;
3256 		/*
3257 		 * hp now contains the address of the pointer to update
3258 		 * to effect the insertion.
3259 		 */
3260 		vfsp->vfs_hash = NULL;
3261 		*hp = vfsp;
3262 	}
3263 
3264 	rvfs_list[vhno].rvfs_len++;
3265 	mutex_exit(&rvfs_list[vhno].rvfs_lock);
3266 }
3267 
3268 
3269 static void
3270 vfs_hash_remove(struct vfs *vfsp)
3271 {
3272 	int vhno;
3273 	struct vfs *tvfsp;
3274 	dev_t dev;
3275 
3276 	ASSERT(RW_WRITE_HELD(&vfslist));
3277 
3278 	dev = expldev(vfsp->vfs_fsid.val[0]);
3279 	vhno = VFSHASH(getmajor(dev), getminor(dev));
3280 
3281 	mutex_enter(&rvfs_list[vhno].rvfs_lock);
3282 
3283 	/*
3284 	 * Remove from hash.
3285 	 */
3286 	if (rvfs_list[vhno].rvfs_head == vfsp) {
3287 		rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3288 		rvfs_list[vhno].rvfs_len--;
3289 		goto foundit;
3290 	}
3291 	for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3292 	    tvfsp = tvfsp->vfs_hash) {
3293 		if (tvfsp->vfs_hash == vfsp) {
3294 			tvfsp->vfs_hash = vfsp->vfs_hash;
3295 			rvfs_list[vhno].rvfs_len--;
3296 			goto foundit;
3297 		}
3298 	}
3299 	cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3300 
3301 foundit:
3302 
3303 	mutex_exit(&rvfs_list[vhno].rvfs_lock);
3304 }
3305 
3306 
3307 void
3308 vfs_list_add(struct vfs *vfsp)
3309 {
3310 	zone_t *zone;
3311 
3312 	/*
3313 	 * The zone that owns the mount is the one that performed the mount.
3314 	 * Note that this isn't necessarily the same as the zone mounted into.
3315 	 * The corresponding zone_rele() will be done when the vfs_t is
3316 	 * being free'd.
3317 	 */
3318 	vfsp->vfs_zone = curproc->p_zone;
3319 	zone_hold(vfsp->vfs_zone);
3320 
3321 	/*
3322 	 * Find the zone mounted into, and put this mount on its vfs list.
3323 	 */
3324 	zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3325 	ASSERT(zone != NULL);
3326 	/*
3327 	 * Special casing for the root vfs.  This structure is allocated
3328 	 * statically and hooked onto rootvfs at link time.  During the
3329 	 * vfs_mountroot call at system startup time, the root file system's
3330 	 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3331 	 * as argument.  The code below must detect and handle this special
3332 	 * case.  The only apparent justification for this special casing is
3333 	 * to ensure that the root file system appears at the head of the
3334 	 * list.
3335 	 *
3336 	 * XXX:	I'm assuming that it's ok to do normal list locking when
3337 	 *	adding the entry for the root file system (this used to be
3338 	 *	done with no locks held).
3339 	 */
3340 	vfs_list_lock();
3341 	/*
3342 	 * Link into the vfs list proper.
3343 	 */
3344 	if (vfsp == &root) {
3345 		/*
3346 		 * Assert: This vfs is already on the list as its first entry.
3347 		 * Thus, there's nothing to do.
3348 		 */
3349 		ASSERT(rootvfs == vfsp);
3350 		/*
3351 		 * Add it to the head of the global zone's vfslist.
3352 		 */
3353 		ASSERT(zone == global_zone);
3354 		ASSERT(zone->zone_vfslist == NULL);
3355 		zone->zone_vfslist = vfsp;
3356 	} else {
3357 		/*
3358 		 * Link to end of list using vfs_prev (as rootvfs is now a
3359 		 * doubly linked circular list) so list is in mount order for
3360 		 * mnttab use.
3361 		 */
3362 		rootvfs->vfs_prev->vfs_next = vfsp;
3363 		vfsp->vfs_prev = rootvfs->vfs_prev;
3364 		rootvfs->vfs_prev = vfsp;
3365 		vfsp->vfs_next = rootvfs;
3366 
3367 		/*
3368 		 * Do it again for the zone-private list (which may be NULL).
3369 		 */
3370 		if (zone->zone_vfslist == NULL) {
3371 			ASSERT(zone != global_zone);
3372 			zone->zone_vfslist = vfsp;
3373 		} else {
3374 			zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3375 			vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3376 			zone->zone_vfslist->vfs_zone_prev = vfsp;
3377 			vfsp->vfs_zone_next = zone->zone_vfslist;
3378 		}
3379 	}
3380 
3381 	/*
3382 	 * Link into the hash table, inserting it at the end, so that LOFS
3383 	 * with the same fsid as UFS (or other) file systems will not hide
3384 	 * the UFS.
3385 	 */
3386 	vfs_hash_add(vfsp, 0);
3387 
3388 	/*
3389 	 * update the mnttab modification time
3390 	 */
3391 	vfs_mnttab_modtimeupd();
3392 	vfs_list_unlock();
3393 	zone_rele(zone);
3394 }
3395 
3396 void
3397 vfs_list_remove(struct vfs *vfsp)
3398 {
3399 	zone_t *zone;
3400 
3401 	zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3402 	ASSERT(zone != NULL);
3403 	/*
3404 	 * Callers are responsible for preventing attempts to unmount the
3405 	 * root.
3406 	 */
3407 	ASSERT(vfsp != rootvfs);
3408 
3409 	vfs_list_lock();
3410 
3411 	/*
3412 	 * Remove from hash.
3413 	 */
3414 	vfs_hash_remove(vfsp);
3415 
3416 	/*
3417 	 * Remove from vfs list.
3418 	 */
3419 	vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3420 	vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3421 	vfsp->vfs_next = vfsp->vfs_prev = NULL;
3422 
3423 	/*
3424 	 * Remove from zone-specific vfs list.
3425 	 */
3426 	if (zone->zone_vfslist == vfsp)
3427 		zone->zone_vfslist = vfsp->vfs_zone_next;
3428 
3429 	if (vfsp->vfs_zone_next == vfsp) {
3430 		ASSERT(vfsp->vfs_zone_prev == vfsp);
3431 		ASSERT(zone->zone_vfslist == vfsp);
3432 		zone->zone_vfslist = NULL;
3433 	}
3434 
3435 	vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3436 	vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3437 	vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3438 
3439 	/*
3440 	 * update the mnttab modification time
3441 	 */
3442 	vfs_mnttab_modtimeupd();
3443 	vfs_list_unlock();
3444 	zone_rele(zone);
3445 }
3446 
3447 struct vfs *
3448 getvfs(fsid_t *fsid)
3449 {
3450 	struct vfs *vfsp;
3451 	int val0 = fsid->val[0];
3452 	int val1 = fsid->val[1];
3453 	dev_t dev = expldev(val0);
3454 	int vhno = VFSHASH(getmajor(dev), getminor(dev));
3455 	kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3456 
3457 	mutex_enter(hmp);
3458 	for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3459 		if (vfsp->vfs_fsid.val[0] == val0 &&
3460 		    vfsp->vfs_fsid.val[1] == val1) {
3461 			VFS_HOLD(vfsp);
3462 			mutex_exit(hmp);
3463 			return (vfsp);
3464 		}
3465 	}
3466 	mutex_exit(hmp);
3467 	return (NULL);
3468 }
3469 
3470 /*
3471  * Search the vfs mount in progress list for a specified device/vfs entry.
3472  * Returns 0 if the first entry in the list that the device matches has the
3473  * given vfs pointer as well.  If the device matches but a different vfs
3474  * pointer is encountered in the list before the given vfs pointer then
3475  * a 1 is returned.
3476  */
3477 
3478 int
3479 vfs_devmounting(dev_t dev, struct vfs *vfsp)
3480 {
3481 	int retval = 0;
3482 	struct ipmnt *mipp;
3483 
3484 	mutex_enter(&vfs_miplist_mutex);
3485 	for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3486 		if (mipp->mip_dev == dev) {
3487 			if (mipp->mip_vfsp != vfsp)
3488 				retval = 1;
3489 			break;
3490 		}
3491 	}
3492 	mutex_exit(&vfs_miplist_mutex);
3493 	return (retval);
3494 }
3495 
3496 /*
3497  * Search the vfs list for a specified device.  Returns 1, if entry is found
3498  * or 0 if no suitable entry is found.
3499  */
3500 
3501 int
3502 vfs_devismounted(dev_t dev)
3503 {
3504 	struct vfs *vfsp;
3505 	int found;
3506 
3507 	vfs_list_read_lock();
3508 	vfsp = rootvfs;
3509 	found = 0;
3510 	do {
3511 		if (vfsp->vfs_dev == dev) {
3512 			found = 1;
3513 			break;
3514 		}
3515 		vfsp = vfsp->vfs_next;
3516 	} while (vfsp != rootvfs);
3517 
3518 	vfs_list_unlock();
3519 	return (found);
3520 }
3521 
3522 /*
3523  * Search the vfs list for a specified device.  Returns a pointer to it
3524  * or NULL if no suitable entry is found. The caller of this routine
3525  * is responsible for releasing the returned vfs pointer.
3526  */
3527 struct vfs *
3528 vfs_dev2vfsp(dev_t dev)
3529 {
3530 	struct vfs *vfsp;
3531 	int found;
3532 
3533 	vfs_list_read_lock();
3534 	vfsp = rootvfs;
3535 	found = 0;
3536 	do {
3537 		/*
3538 		 * The following could be made more efficient by making
3539 		 * the entire loop use vfs_zone_next if the call is from
3540 		 * a zone.  The only callers, however, ustat(2) and
3541 		 * umount2(2), don't seem to justify the added
3542 		 * complexity at present.
3543 		 */
3544 		if (vfsp->vfs_dev == dev &&
3545 		    ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3546 		    curproc->p_zone)) {
3547 			VFS_HOLD(vfsp);
3548 			found = 1;
3549 			break;
3550 		}
3551 		vfsp = vfsp->vfs_next;
3552 	} while (vfsp != rootvfs);
3553 	vfs_list_unlock();
3554 	return (found ? vfsp: NULL);
3555 }
3556 
3557 /*
3558  * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3559  * or NULL if no suitable entry is found. The caller of this routine
3560  * is responsible for releasing the returned vfs pointer.
3561  *
3562  * Note that if multiple mntpoints match, the last one matching is
3563  * returned in an attempt to return the "top" mount when overlay
3564  * mounts are covering the same mount point.  This is accomplished by starting
3565  * at the end of the list and working our way backwards, stopping at the first
3566  * matching mount.
3567  */
3568 struct vfs *
3569 vfs_mntpoint2vfsp(const char *mp)
3570 {
3571 	struct vfs *vfsp;
3572 	struct vfs *retvfsp = NULL;
3573 	zone_t *zone = curproc->p_zone;
3574 	struct vfs *list;
3575 
3576 	vfs_list_read_lock();
3577 	if (getzoneid() == GLOBAL_ZONEID) {
3578 		/*
3579 		 * The global zone may see filesystems in any zone.
3580 		 */
3581 		vfsp = rootvfs->vfs_prev;
3582 		do {
3583 			if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3584 				retvfsp = vfsp;
3585 				break;
3586 			}
3587 			vfsp = vfsp->vfs_prev;
3588 		} while (vfsp != rootvfs->vfs_prev);
3589 	} else if ((list = zone->zone_vfslist) != NULL) {
3590 		const char *mntpt;
3591 
3592 		vfsp = list->vfs_zone_prev;
3593 		do {
3594 			mntpt = refstr_value(vfsp->vfs_mntpt);
3595 			mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3596 			if (strcmp(mntpt, mp) == 0) {
3597 				retvfsp = vfsp;
3598 				break;
3599 			}
3600 			vfsp = vfsp->vfs_zone_prev;
3601 		} while (vfsp != list->vfs_zone_prev);
3602 	}
3603 	if (retvfsp)
3604 		VFS_HOLD(retvfsp);
3605 	vfs_list_unlock();
3606 	return (retvfsp);
3607 }
3608 
3609 /*
3610  * Search the vfs list for a specified vfsops.
3611  * if vfs entry is found then return 1, else 0.
3612  */
3613 int
3614 vfs_opsinuse(vfsops_t *ops)
3615 {
3616 	struct vfs *vfsp;
3617 	int found;
3618 
3619 	vfs_list_read_lock();
3620 	vfsp = rootvfs;
3621 	found = 0;
3622 	do {
3623 		if (vfs_getops(vfsp) == ops) {
3624 			found = 1;
3625 			break;
3626 		}
3627 		vfsp = vfsp->vfs_next;
3628 	} while (vfsp != rootvfs);
3629 	vfs_list_unlock();
3630 	return (found);
3631 }
3632 
3633 /*
3634  * Allocate an entry in vfssw for a file system type
3635  */
3636 struct vfssw *
3637 allocate_vfssw(char *type)
3638 {
3639 	struct vfssw *vswp;
3640 
3641 	if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3642 		/*
3643 		 * The vfssw table uses the empty string to identify an
3644 		 * available entry; we cannot add any type which has
3645 		 * a leading NUL. The string length is limited to
3646 		 * the size of the st_fstype array in struct stat.
3647 		 */
3648 		return (NULL);
3649 	}
3650 
3651 	ASSERT(VFSSW_WRITE_LOCKED());
3652 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3653 		if (!ALLOCATED_VFSSW(vswp)) {
3654 			vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3655 			(void) strcpy(vswp->vsw_name, type);
3656 			ASSERT(vswp->vsw_count == 0);
3657 			vswp->vsw_count = 1;
3658 			mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3659 			return (vswp);
3660 		}
3661 	return (NULL);
3662 }
3663 
/*
 * Translate a vfs type name into the name of the module implementing it:
 * "proc" maps to "procfs", "fd" maps to "fdfs", and any name beginning
 * with "nfs" maps to "nfs".  Every other name is returned unchanged.
 */
static char *
vfs_to_modname(char *vfstype)
{
	if (strcmp(vfstype, "proc") == 0)
		return ("procfs");

	if (strcmp(vfstype, "fd") == 0)
		return ("fdfs");

	/* Prefix match: covers every type name that starts with "nfs". */
	if (strncmp(vfstype, "nfs", 3) == 0)
		return ("nfs");

	return (vfstype);
}
3681 
3682 /*
3683  * Find a vfssw entry given a file system type name.
3684  * Try to autoload the filesystem if it's not found.
3685  * If it's installed, return the vfssw locked to prevent unloading.
3686  */
3687 struct vfssw *
3688 vfs_getvfssw(char *type)
3689 {
3690 	struct vfssw *vswp;
3691 	char	*modname;
3692 
3693 	RLOCK_VFSSW();
3694 	vswp = vfs_getvfsswbyname(type);
3695 	modname = vfs_to_modname(type);
3696 
3697 	if (rootdir == NULL) {
3698 		/*
3699 		 * If we haven't yet loaded the root file system, then our
3700 		 * _init won't be called until later. Allocate vfssw entry,
3701 		 * because mod_installfs won't be called.
3702 		 */
3703 		if (vswp == NULL) {
3704 			RUNLOCK_VFSSW();
3705 			WLOCK_VFSSW();
3706 			if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3707 				if ((vswp = allocate_vfssw(type)) == NULL) {
3708 					WUNLOCK_VFSSW();
3709 					return (NULL);
3710 				}
3711 			}
3712 			WUNLOCK_VFSSW();
3713 			RLOCK_VFSSW();
3714 		}
3715 		if (!VFS_INSTALLED(vswp)) {
3716 			RUNLOCK_VFSSW();
3717 			(void) modloadonly("fs", modname);
3718 		} else
3719 			RUNLOCK_VFSSW();
3720 		return (vswp);
3721 	}
3722 
3723 	/*
3724 	 * Try to load the filesystem.  Before calling modload(), we drop
3725 	 * our lock on the VFS switch table, and pick it up after the
3726 	 * module is loaded.  However, there is a potential race:  the
3727 	 * module could be unloaded after the call to modload() completes
3728 	 * but before we pick up the lock and drive on.  Therefore,
3729 	 * we keep reloading the module until we've loaded the module
3730 	 * _and_ we have the lock on the VFS switch table.
3731 	 */
3732 	while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3733 		RUNLOCK_VFSSW();
3734 		if (modload("fs", modname) == -1)
3735 			return (NULL);
3736 		RLOCK_VFSSW();
3737 		if (vswp == NULL)
3738 			if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3739 				break;
3740 	}
3741 	RUNLOCK_VFSSW();
3742 
3743 	return (vswp);
3744 }
3745 
3746 /*
3747  * Find a vfssw entry given a file system type name.
3748  */
3749 struct vfssw *
3750 vfs_getvfsswbyname(char *type)
3751 {
3752 	struct vfssw *vswp;
3753 
3754 	ASSERT(VFSSW_LOCKED());
3755 	if (type == NULL || *type == '\0')
3756 		return (NULL);
3757 
3758 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3759 		if (strcmp(type, vswp->vsw_name) == 0) {
3760 			vfs_refvfssw(vswp);
3761 			return (vswp);
3762 		}
3763 	}
3764 
3765 	return (NULL);
3766 }
3767 
3768 /*
3769  * Find a vfssw entry given a set of vfsops.
3770  */
3771 struct vfssw *
3772 vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3773 {
3774 	struct vfssw *vswp;
3775 
3776 	RLOCK_VFSSW();
3777 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3778 		if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3779 			vfs_refvfssw(vswp);
3780 			RUNLOCK_VFSSW();
3781 			return (vswp);
3782 		}
3783 	}
3784 	RUNLOCK_VFSSW();
3785 
3786 	return (NULL);
3787 }
3788 
3789 /*
3790  * Reference a vfssw entry.
3791  */
3792 void
3793 vfs_refvfssw(struct vfssw *vswp)
3794 {
3795 
3796 	mutex_enter(&vswp->vsw_lock);
3797 	vswp->vsw_count++;
3798 	mutex_exit(&vswp->vsw_lock);
3799 }
3800 
3801 /*
3802  * Unreference a vfssw entry.
3803  */
3804 void
3805 vfs_unrefvfssw(struct vfssw *vswp)
3806 {
3807 
3808 	mutex_enter(&vswp->vsw_lock);
3809 	vswp->vsw_count--;
3810 	mutex_exit(&vswp->vsw_lock);
3811 }
3812 
/*
 * State shared by vfs_syncall() and vfs_syncprogress().
 */
int sync_timeout = 30;		/* timeout for syncing a page during panic */
int sync_timeleft;		/* portion of sync_timeout remaining */

static int sync_retries = 20;	/* number of retries when not making progress */
static int sync_triesleft;	/* portion of sync_retries remaining */

/*
 * Dirty page and buffer counts used by vfs_syncall(): new_* hold the
 * counts from its most recent pass, old_* the lowest counts seen on
 * earlier passes.
 */
static pgcnt_t old_pgcnt, new_pgcnt;
static int new_bufcnt, old_bufcnt;
3821 
3822 /*
3823  * Sync all of the mounted filesystems, and then wait for the actual i/o to
3824  * complete.  We wait by counting the number of dirty pages and buffers,
3825  * pushing them out using bio_busy() and page_busy(), and then counting again.
3826  * This routine is used during both the uadmin A_SHUTDOWN code as well as
3827  * the SYNC phase of the panic code (see comments in panic.c).  It should only
3828  * be used after some higher-level mechanism has quiesced the system so that
3829  * new writes are not being initiated while we are waiting for completion.
3830  *
3831  * To ensure finite running time, our algorithm uses two timeout mechanisms:
3832  * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
3833  * sync_triesleft (a progress counter used by the vfs_syncall() loop below).
3834  * Together these ensure that syncing completes if our i/o paths are stuck.
3835  * The counters are declared above so they can be found easily in the debugger.
3836  *
3837  * The sync_timeleft counter is reset by bio_busy() and page_busy() using the
3838  * vfs_syncprogress() subroutine whenever we make progress through the lists of
3839  * pages and buffers.  It is decremented and expired by the deadman() cyclic.
3840  * When vfs_syncall() decides it is done, we disable the deadman() counter by
3841  * setting sync_timeleft to zero.  This timer guards against vfs_syncall()
3842  * deadlocking or hanging inside of a broken filesystem or driver routine.
3843  *
3844  * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
3845  * sync_retries consecutive calls to bio_busy() and page_busy() without
3846  * decreasing either the number of dirty buffers or dirty pages below the
3847  * lowest count we have seen so far, we give up and return from vfs_syncall().
3848  *
3849  * Each loop iteration ends with a call to delay() one second to allow time for
3850  * i/o completion and to permit the user time to read our progress messages.
3851  */
3852 void
3853 vfs_syncall(void)
3854 {
3855 	if (rootdir == NULL && !modrootloaded)
3856 		return; /* panic during boot - no filesystems yet */
3857 
3858 	printf("syncing file systems...");
3859 	vfs_syncprogress();
3860 	sync();
3861 
3862 	vfs_syncprogress();
3863 	sync_triesleft = sync_retries;
3864 
3865 	old_bufcnt = new_bufcnt = INT_MAX;
3866 	old_pgcnt = new_pgcnt = ULONG_MAX;
3867 
3868 	while (sync_triesleft > 0) {
3869 		old_bufcnt = MIN(old_bufcnt, new_bufcnt);
3870 		old_pgcnt = MIN(old_pgcnt, new_pgcnt);
3871 
3872 		new_bufcnt = bio_busy(B_TRUE);
3873 		new_pgcnt = page_busy(B_TRUE);
3874 		vfs_syncprogress();
3875 
3876 		if (new_bufcnt == 0 && new_pgcnt == 0)
3877 			break;
3878 
3879 		if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
3880 			sync_triesleft = sync_retries;
3881 		else
3882 			sync_triesleft--;
3883 
3884 		if (new_bufcnt)
3885 			printf(" [%d]", new_bufcnt);
3886 		if (new_pgcnt)
3887 			printf(" %lu", new_pgcnt);
3888 
3889 		delay(hz);
3890 	}
3891 
3892 	if (new_bufcnt != 0 || new_pgcnt != 0)
3893 		printf(" done (not all i/o completed)\n");
3894 	else
3895 		printf(" done\n");
3896 
3897 	sync_timeleft = 0;
3898 	delay(hz);
3899 }
3900 
3901 /*
3902  * If we are in the middle of the sync phase of panic, reset sync_timeleft to
3903  * sync_timeout to indicate that we are making progress and the deadman()
3904  * omnipresent cyclic should not yet time us out.  Note that it is safe to
3905  * store to sync_timeleft here since the deadman() is firing at high-level
3906  * on top of us.  If we are racing with the deadman(), either the deadman()
3907  * will decrement the old value and then we will reset it, or we will
3908  * reset it and then the deadman() will immediately decrement it.  In either
3909  * case, correct behavior results.
3910  */
3911 void
3912 vfs_syncprogress(void)
3913 {
3914 	if (panicstr)
3915 		sync_timeleft = sync_timeout;
3916 }
3917 
3918 /*
3919  * Map VFS flags to statvfs flags.  These shouldn't really be separate
3920  * flags at all.
3921  */
3922 uint_t
3923 vf_to_stf(uint_t vf)
3924 {
3925 	uint_t stf = 0;
3926 
3927 	if (vf & VFS_RDONLY)
3928 		stf |= ST_RDONLY;
3929 	if (vf & VFS_NOSETUID)
3930 		stf |= ST_NOSUID;
3931 	if (vf & VFS_NOTRUNC)
3932 		stf |= ST_NOTRUNC;
3933 
3934 	return (stf);
3935 }
3936 
/*
 * Sync entry for (illegal) fstype 0.  It is installed as the vfs_sync
 * member of the stray operations template in vfsinit().  It ignores its
 * arguments and raises a CE_PANIC-level cmn_err.
 */
/* ARGSUSED */
int
vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	cmn_err(CE_PANIC, "stray vfs operation");
	return (0);
}
3947 
/*
 * Generic entry for (illegal) fstype 0.  It is installed for every
 * operation except sync in the stray operations template in vfsinit().
 * It raises a CE_PANIC-level cmn_err.
 */
int
vfsstray(void)
{
	cmn_err(CE_PANIC, "stray vfs operation");
	return (0);
}
3957 
/*
 * Support for dealing with forced UFS unmount and its interaction with
 * LOFS. Could be used by any filesystem.
 * See bug 1203132.
 *
 * Operation that does nothing and always returns EIO.  It is installed
 * for every operation except sync in the EIO operations template in
 * vfsinit().
 */
int
vfs_EIO(void)
{
	return (EIO);
}
3968 
/*
 * We've gotta define the op for sync separately, since the compiler gets
 * confused if we mix and match ANSI and normal style prototypes when
 * a "short" argument is present and spits out a warning.
 *
 * Like vfs_EIO(), this ignores its arguments and always returns EIO.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	return (EIO);
}
3980 
/*
 * The dummy EIO file system and its operations vector.  Both are set up
 * by vfsinit(): EIO_vfsops is built from a template whose operations all
 * return EIO, and EIO_vfs is initialized with it and flagged
 * VFS_UNMOUNTED.
 */
vfs_t EIO_vfs;
vfsops_t *EIO_vfsops;
3983 
3984 /*
3985  * Called from startup() to initialize all loaded vfs's
3986  */
3987 void
3988 vfsinit(void)
3989 {
3990 	struct vfssw *vswp;
3991 	int error;
3992 	extern int vopstats_enabled;
3993 	extern void vopstats_startup();
3994 
3995 	static const fs_operation_def_t EIO_vfsops_template[] = {
3996 		VFSNAME_MOUNT,		{ .error = vfs_EIO },
3997 		VFSNAME_UNMOUNT,	{ .error = vfs_EIO },
3998 		VFSNAME_ROOT,		{ .error = vfs_EIO },
3999 		VFSNAME_STATVFS,	{ .error = vfs_EIO },
4000 		VFSNAME_SYNC, 		{ .vfs_sync = vfs_EIO_sync },
4001 		VFSNAME_VGET,		{ .error = vfs_EIO },
4002 		VFSNAME_MOUNTROOT,	{ .error = vfs_EIO },
4003 		VFSNAME_FREEVFS,	{ .error = vfs_EIO },
4004 		VFSNAME_VNSTATE,	{ .error = vfs_EIO },
4005 		NULL, NULL
4006 	};
4007 
4008 	static const fs_operation_def_t stray_vfsops_template[] = {
4009 		VFSNAME_MOUNT,		{ .error = vfsstray },
4010 		VFSNAME_UNMOUNT,	{ .error = vfsstray },
4011 		VFSNAME_ROOT,		{ .error = vfsstray },
4012 		VFSNAME_STATVFS,	{ .error = vfsstray },
4013 		VFSNAME_SYNC, 		{ .vfs_sync = vfsstray_sync },
4014 		VFSNAME_VGET,		{ .error = vfsstray },
4015 		VFSNAME_MOUNTROOT,	{ .error = vfsstray },
4016 		VFSNAME_FREEVFS,	{ .error = vfsstray },
4017 		VFSNAME_VNSTATE,	{ .error = vfsstray },
4018 		NULL, NULL
4019 	};
4020 
4021 	/* Create vfs cache */
4022 	vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4023 	    sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4024 
4025 	/* Initialize the vnode cache (file systems may use it during init). */
4026 	vn_create_cache();
4027 
4028 	/* Setup event monitor framework */
4029 	fem_init();
4030 
4031 	/* Initialize the dummy stray file system type. */
4032 	error = vfs_setfsops(0, stray_vfsops_template, NULL);
4033 
4034 	/* Initialize the dummy EIO file system. */
4035 	error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4036 	if (error != 0) {
4037 		cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4038 		/* Shouldn't happen, but not bad enough to panic */
4039 	}
4040 
4041 	VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4042 
4043 	/*
4044 	 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4045 	 * on this vfs can immediately notice it's invalid.
4046 	 */
4047 	EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4048 
4049 	/*
4050 	 * Call the init routines of non-loadable filesystems only.
4051 	 * Filesystems which are loaded as separate modules will be
4052 	 * initialized by the module loading code instead.
4053 	 */
4054 
4055 	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4056 		RLOCK_VFSSW();
4057 		if (vswp->vsw_init != NULL)
4058 			(*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4059 		RUNLOCK_VFSSW();
4060 	}
4061 
4062 	vopstats_startup();
4063 
4064 	if (vopstats_enabled) {
4065 		/* EIO_vfs can collect stats, but we don't retrieve them */
4066 		initialize_vopstats(&EIO_vfs.vfs_vopstats);
4067 		EIO_vfs.vfs_fstypevsp = NULL;
4068 		EIO_vfs.vfs_vskap = NULL;
4069 		EIO_vfs.vfs_flag |= VFS_STATS;
4070 	}
4071 
4072 	xattr_init();
4073 }
4074 
4075 vfs_t *
4076 vfs_alloc(int kmflag)
4077 {
4078 	vfs_t *vfsp;
4079 
4080 	vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4081 
4082 	/*
4083 	 * Do the simplest initialization here.
4084 	 * Everything else gets done in vfs_init()
4085 	 */
4086 	bzero(vfsp, sizeof (vfs_t));
4087 	return (vfsp);
4088 }
4089 
4090 void
4091 vfs_free(vfs_t *vfsp)
4092 {
4093 	/*
4094 	 * One would be tempted to assert that "vfsp->vfs_count == 0".
4095 	 * The problem is that this gets called out of domount() with
4096 	 * a partially initialized vfs and a vfs_count of 1.  This is
4097 	 * also called from vfs_rele() with a vfs_count of 0.  We can't
4098 	 * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4099 	 * returned.  This is because VFS_MOUNT() fully initializes the
4100 	 * vfs structure and its associated data.  VFS_RELE() will call
4101 	 * VFS_FREEVFS() which may panic the system if the data structures
4102 	 * aren't fully initialized from a successful VFS_MOUNT()).
4103 	 */
4104 
4105 	/* If FEM was in use, make sure everything gets cleaned up */
4106 	if (vfsp->vfs_femhead) {
4107 		ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4108 		mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4109 		kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4110 		vfsp->vfs_femhead = NULL;
4111 	}
4112 
4113 	if (vfsp->vfs_implp)
4114 		vfsimpl_teardown(vfsp);
4115 	sema_destroy(&vfsp->vfs_reflock);
4116 	kmem_cache_free(vfs_cache, vfsp);
4117 }
4118 
4119 /*
4120  * Increments the vfs reference count by one atomically.
4121  */
4122 void
4123 vfs_hold(vfs_t *vfsp)
4124 {
4125 	atomic_add_32(&vfsp->vfs_count, 1);
4126 	ASSERT(vfsp->vfs_count != 0);
4127 }
4128 
4129 /*
4130  * Decrements the vfs reference count by one atomically. When
4131  * vfs reference count becomes zero, it calls the file system
4132  * specific vfs_freevfs() to free up the resources.
4133  */
4134 void
4135 vfs_rele(vfs_t *vfsp)
4136 {
4137 	ASSERT(vfsp->vfs_count != 0);
4138 	if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) {
4139 		VFS_FREEVFS(vfsp);
4140 		if (vfsp->vfs_zone)
4141 			zone_rele(vfsp->vfs_zone);
4142 		vfs_freemnttab(vfsp);
4143 		vfs_free(vfsp);
4144 	}
4145 }
4146 
4147 /*
4148  * Generic operations vector support.
4149  *
4150  * This is used to build operations vectors for both the vfs and vnode.
4151  * It's normally called only when a file system is loaded.
4152  *
4153  * There are many possible algorithms for this, including the following:
4154  *
4155  *   (1) scan the list of known operations; for each, see if the file system
4156  *       includes an entry for it, and fill it in as appropriate.
4157  *
4158  *   (2) set up defaults for all known operations.  scan the list of ops
4159  *       supplied by the file system; for each which is both supplied and
4160  *       known, fill it in.
4161  *
4162  *   (3) sort the lists of known ops & supplied ops; scan the list, filling
4163  *       in entries as we go.
4164  *
4165  * we choose (1) for simplicity, and because performance isn't critical here.
4166  * note that (2) could be sped up using a precomputed hash table on known ops.
4167  * (3) could be faster than either, but only if the lists were very large or
4168  * supplied in sorted order.
4169  *
4170  */
4171 
4172 int
4173 fs_build_vector(void *vector, int *unused_ops,
4174     const fs_operation_trans_def_t *translation,
4175     const fs_operation_def_t *operations)
4176 {
4177 	int i, num_trans, num_ops, used;
4178 
4179 	/*
4180 	 * Count the number of translations and the number of supplied
4181 	 * operations.
4182 	 */
4183 
4184 	{
4185 		const fs_operation_trans_def_t *p;
4186 
4187 		for (num_trans = 0, p = translation;
4188 		    p->name != NULL;
4189 		    num_trans++, p++)
4190 			;
4191 	}
4192 
4193 	{
4194 		const fs_operation_def_t *p;
4195 
4196 		for (num_ops = 0, p = operations;
4197 		    p->name != NULL;
4198 		    num_ops++, p++)
4199 			;
4200 	}
4201 
4202 	/* Walk through each operation known to our caller.  There will be */
4203 	/* one entry in the supplied "translation table" for each. */
4204 
4205 	used = 0;
4206 
4207 	for (i = 0; i < num_trans; i++) {
4208 		int j, found;
4209 		char *curname;
4210 		fs_generic_func_p result;
4211 		fs_generic_func_p *location;
4212 
4213 		curname = translation[i].name;
4214 
4215 		/* Look for a matching operation in the list supplied by the */
4216 		/* file system. */
4217 
4218 		found = 0;
4219 
4220 		for (j = 0; j < num_ops; j++) {
4221 			if (strcmp(operations[j].name, curname) == 0) {
4222 				used++;
4223 				found = 1;
4224 				break;
4225 			}
4226 		}
4227 
4228 		/*
4229 		 * If the file system is using a "placeholder" for default
4230 		 * or error functions, grab the appropriate function out of
4231 		 * the translation table.  If the file system didn't supply
4232 		 * this operation at all, use the default function.
4233 		 */
4234 
4235 		if (found) {
4236 			result = operations[j].func.fs_generic;
4237 			if (result == fs_default) {
4238 				result = translation[i].defaultFunc;
4239 			} else if (result == fs_error) {
4240 				result = translation[i].errorFunc;
4241 			} else if (result == NULL) {
4242 				/* Null values are PROHIBITED */
4243 				return (EINVAL);
4244 			}
4245 		} else {
4246 			result = translation[i].defaultFunc;
4247 		}
4248 
4249 		/* Now store the function into the operations vector. */
4250 
4251 		location = (fs_generic_func_p *)
4252 		    (((char *)vector) + translation[i].offset);
4253 
4254 		*location = result;
4255 	}
4256 
4257 	*unused_ops = num_ops - used;
4258 
4259 	return (0);
4260 }
4261 
4262 /* Placeholder functions, should never be called. */
4263 
/*
 * Placeholder a file system can name in its operations list to request
 * the translation table's error function; fs_build_vector() recognizes
 * it by address and substitutes errorFunc.  Calling it raises a
 * CE_PANIC-level cmn_err.
 */
int
fs_error(void)
{
	cmn_err(CE_PANIC, "fs_error called");
	return (0);
}
4270 
/*
 * Placeholder a file system can name in its operations list to request
 * the translation table's default function; fs_build_vector() recognizes
 * it by address and substitutes defaultFunc.  Calling it raises a
 * CE_PANIC-level cmn_err.
 */
int
fs_default(void)
{
	cmn_err(CE_PANIC, "fs_default called");
	return (0);
}
4277 
4278 #ifdef __sparc
4279 
/*
 * Part of the implementation of booting off a mirrored root
 * involves a change of dev_t for the root device.  To
 * accomplish this, first remove the existing hash table
 * entry for the root device, convert to the new dev_t,
 * then re-insert in the hash table at the head of the list.
 *
 * vfsp is the vfs to re-key, ndev its new device, and fstype the file
 * system type handed to vfs_make_fsid() for the new fsid.  The vfs list
 * write lock is held across the whole sequence.
 */
void
vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
{
	vfs_list_lock();

	/* Must come first: the hash bucket is derived from the old fsid. */
	vfs_hash_remove(vfsp);

	vfsp->vfs_dev = ndev;
	vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);

	/* Re-hash under the new fsid, at the head of its bucket. */
	vfs_hash_add(vfsp, 1);

	vfs_list_unlock();
}
4301 
4302 #else /* x86 NEWBOOT */
4303 
4304 #if defined(__x86)
4305 extern int hvmboot_rootconf();
4306 #endif /* __x86 */
4307 
4308 int
4309 rootconf()
4310 {
4311 	int error;
4312 	struct vfssw *vsw;
4313 	extern void pm_init();
4314 	char *fstyp, *fsmod;
4315 
4316 	getrootfs(&fstyp, &fsmod);
4317 
4318 #if defined(__x86)
4319 	/*
4320 	 * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4321 	 * which lives in /platform/i86hvm, and hence is only available when
4322 	 * booted in an x86 hvm environment.  If the hvm_bootstrap misc module
4323 	 * is not available then the modstub for this function will return 0.
4324 	 * If the hvm_bootstrap misc module is available it will be loaded
4325 	 * and hvmboot_rootconf() will be invoked.
4326 	 */
4327 	if (error = hvmboot_rootconf())
4328 		return (error);
4329 #endif /* __x86 */
4330 
4331 	if (error = clboot_rootconf())
4332 		return (error);
4333 
4334 	if (modload("fs", fsmod) == -1)
4335 		panic("Cannot _init %s module", fsmod);
4336 
4337 	RLOCK_VFSSW();
4338 	vsw = vfs_getvfsswbyname(fstyp);
4339 	RUNLOCK_VFSSW();
4340 	VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4341 	VFS_HOLD(rootvfs);
4342 
4343 	/* always mount readonly first */
4344 	rootvfs->vfs_flag |= VFS_RDONLY;
4345 
4346 	pm_init();
4347 
4348 	if (netboot)
4349 		(void) strplumb();
4350 
4351 	error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4352 	vfs_unrefvfssw(vsw);
4353 	rootdev = rootvfs->vfs_dev;
4354 
4355 	if (error)
4356 		panic("cannot mount root path %s", rootfs.bo_name);
4357 	return (error);
4358 }
4359 
4360 /*
4361  * XXX this is called by nfs only and should probably be removed
4362  * If booted with ASKNAME, prompt on the console for a filesystem
4363  * name and return it.
4364  */
4365 void
4366 getfsname(char *askfor, char *name, size_t namelen)
4367 {
4368 	if (boothowto & RB_ASKNAME) {
4369 		printf("%s name: ", askfor);
4370 		console_gets(name, namelen);
4371 	}
4372 }
4373 
4374 /*
4375  * If server_path exists, then we are booting a diskless
4376  * client. Otherwise, we default to ufs. Zfs should perhaps be
4377  * another property.
4378  */
4379 static void
4380 getrootfs(char **fstypp, char **fsmodp)
4381 {
4382 	extern char *strplumb_get_netdev_path(void);
4383 	char *propstr = NULL;
4384 
4385 	/* check fstype property; it should be nfsdyn for diskless */
4386 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4387 	    DDI_PROP_DONTPASS, "fstype", &propstr)
4388 	    == DDI_SUCCESS) {
4389 		(void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4390 		ddi_prop_free(propstr);
4391 
4392 	/*
4393 	 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4394 	 * assume the type of this root filesystem is 'zfs'.
4395 	 */
4396 	} else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4397 	    DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4398 	    == DDI_SUCCESS) {
4399 		(void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4400 		ddi_prop_free(propstr);
4401 	}
4402 
4403 	if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4404 		*fstypp = *fsmodp = rootfs.bo_fstype;
4405 		return;
4406 	}
4407 
4408 	++netboot;
4409 	/*
4410 	 * check if path to network interface is specified in bootpath
4411 	 * or by a hypervisor domain configuration file.
4412 	 * XXPV - enable strlumb_get_netdev_path()
4413 	 */
4414 	if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4415 	    "xpv-nfsroot")) {
4416 		(void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4417 	} else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4418 	    DDI_PROP_DONTPASS, "bootpath", &propstr)
4419 	    == DDI_SUCCESS) {
4420 		(void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4421 		ddi_prop_free(propstr);
4422 	} else {
4423 		/* attempt to determine netdev_path via boot_mac address */
4424 		netdev_path = strplumb_get_netdev_path();
4425 		if (netdev_path == NULL)
4426 			panic("cannot find boot network interface");
4427 		(void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME);
4428 	}
4429 	*fstypp = rootfs.bo_fstype;
4430 	*fsmodp = "nfs";
4431 }
4432 #endif
4433 
4434 /*
4435  * VFS feature routines
4436  */
4437 
/*
 * A feature value is split into two 32-bit halves:
 *   VFTINDEX() yields the upper half, used below as the index of the
 *   word in vfs_featureset[];
 *   VFTBITS() yields the lower half, the bit mask within that word.
 */
#define	VFTINDEX(ft)	(((ft) >> 32) & 0xFFFFFFFF)
#define	VFTBITS(ft)	((ft) & 0xFFFFFFFFLL)
4440 
4441 /* Register a feature in the vfs */
4442 void
4443 vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4444 {
4445 	/* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4446 	if (vfsp->vfs_implp == NULL)
4447 		return;
4448 
4449 	vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4450 }
4451 
4452 /*
4453  * Query a vfs for a feature.
4454  * Returns 1 if feature is present, 0 if not
4455  */
4456 int
4457 vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4458 {
4459 	int	ret = 0;
4460 
4461 	/* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4462 	if (vfsp->vfs_implp == NULL)
4463 		return (ret);
4464 
4465 	if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4466 		ret = 1;
4467 
4468 	return (ret);
4469 }
4470 
4471 /*
4472  * Propagate feature set from one vfs to another
4473  */
4474 void
4475 vfs_propagate_features(vfs_t *from, vfs_t *to)
4476 {
4477 	int i;
4478 
4479 	if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4480 		return;
4481 
4482 	for (i = 1; i <= to->vfs_featureset[0]; i++) {
4483 		to->vfs_featureset[i] = from->vfs_featureset[i];
4484 	}
4485 }
4486