xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c (revision 46b592853d0f4f11781b6b0a7533f267c6aee132)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
29  * triggered from a "stub" rnode via a special set of vnodeops.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/vfs_opreg.h>
40 #include <sys/file.h>
41 #include <sys/filio.h>
42 #include <sys/uio.h>
43 #include <sys/buf.h>
44 #include <sys/mman.h>
45 #include <sys/pathname.h>
46 #include <sys/dirent.h>
47 #include <sys/debug.h>
48 #include <sys/vmsystm.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/swap.h>
52 #include <sys/errno.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/kmem.h>
56 #include <sys/mount.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathconf.h>
59 #include <sys/utsname.h>
60 #include <sys/dnlc.h>
61 #include <sys/acl.h>
62 #include <sys/systeminfo.h>
63 #include <sys/policy.h>
64 #include <sys/sdt.h>
65 #include <sys/list.h>
66 #include <sys/stat.h>
67 #include <sys/mntent.h>
68 
69 #include <rpc/types.h>
70 #include <rpc/auth.h>
71 #include <rpc/clnt.h>
72 
73 #include <nfs/nfs.h>
74 #include <nfs/nfs_clnt.h>
75 #include <nfs/nfs_acl.h>
76 #include <nfs/lm.h>
77 #include <nfs/nfs4.h>
78 #include <nfs/nfs4_kprot.h>
79 #include <nfs/rnode4.h>
80 #include <nfs/nfs4_clnt.h>
81 
82 #include <vm/hat.h>
83 #include <vm/as.h>
84 #include <vm/page.h>
85 #include <vm/pvn.h>
86 #include <vm/seg.h>
87 #include <vm/seg_map.h>
88 #include <vm/seg_kpm.h>
89 #include <vm/seg_vn.h>
90 
91 #include <fs/fs_subr.h>
92 
93 #include <sys/ddi.h>
94 #include <sys/int_fmtio.h>
95 
96 #include <sys/sunddi.h>
97 
98 #include <sys/priv_names.h>
99 
100 /*
101  * The automatic unmounter thread stuff!
102  */
103 static int nfs4_trigger_thread_timer = 20;	/* in seconds */
104 
105 /*
106  * Just a default....
107  */
108 static uint_t nfs4_trigger_mount_to = 240;
109 
110 typedef struct nfs4_trigger_globals {
111 	kmutex_t		ntg_forest_lock;
112 	uint_t			ntg_mount_to;
113 	int			ntg_thread_started;
114 	nfs4_ephemeral_tree_t	*ntg_forest;
115 } nfs4_trigger_globals_t;
116 
117 kmutex_t	nfs4_ephemeral_thread_lock;
118 
119 zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;
120 
121 static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
122 
123 /*
124  * Used for ephemeral mounts; contains data either duplicated from
125  * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
126  *
127  * It's intended that this structure is used solely for ephemeral
128  * mount-type specific data, for passing this data to
129  * nfs4_trigger_nargs_create().
130  */
131 typedef struct ephemeral_servinfo {
132 	char			*esi_hostname;
133 	char			*esi_netname;
134 	char			*esi_path;
135 	int			esi_path_len;
136 	int			esi_mount_flags;
137 	struct netbuf		*esi_addr;
138 	struct netbuf		*esi_syncaddr;
139 	struct knetconfig	*esi_knconf;
140 } ephemeral_servinfo_t;
141 
142 /*
143  * Collect together the mount-type specific and generic data args.
144  */
145 typedef struct domount_args {
146 	ephemeral_servinfo_t	*dma_esi;
147 	char			*dma_hostlist; /* comma-sep. for RO failover */
148 	struct nfs_args		*dma_nargs;
149 } domount_args_t;
150 
151 
152 /*
153  * The vnode ops functions for a trigger stub vnode
154  */
155 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
156 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
157     caller_context_t *);
158 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
159     caller_context_t *);
160 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
161     caller_context_t *);
162 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
163     caller_context_t *);
164 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
165     struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
166     int *, pathname_t *);
167 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
168     enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
169     vsecattr_t *);
170 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
171     int);
172 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
173     caller_context_t *, int);
174 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
175     cred_t *, caller_context_t *, int);
176 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
177     vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
178 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
179     caller_context_t *, int);
180 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
181     cred_t *, caller_context_t *, int);
182 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
183 
184 /*
185  * Regular NFSv4 vnodeops that we need to reference directly
186  */
187 extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
188 		    caller_context_t *);
189 extern void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
190 extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
191 extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
192 extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
193 		    struct pathname *, int, vnode_t *, cred_t *,
194 		    caller_context_t *, int *, pathname_t *);
195 extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
196 		    caller_context_t *);
197 extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
198 		    caller_context_t *);
199 extern int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
200 extern int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
201 
202 static int	nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
203 static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
204     cred_t *, vnode_t **);
205 static domount_args_t  *nfs4_trigger_domount_args_create(vnode_t *);
206 static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
207     vnode_t *vp);
208 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *);
209 static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
210 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
211     servinfo4_t *);
212 static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
213     ephemeral_servinfo_t *);
214 static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
215 static char	*nfs4_trigger_create_mntopts(vfs_t *);
216 static void	nfs4_trigger_destroy_mntopts(char *);
217 static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
218 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
219 
220 extern int	umount2_engine(vfs_t *, int, cred_t *, int);
221 
222 
223 vnodeops_t *nfs4_trigger_vnodeops;
224 
225 /*
226  * These are the vnodeops that we must define for stub vnodes.
227  *
228  *
229  * Many of the VOPs defined for NFSv4 do not need to be defined here,
230  * for various reasons. This will result in the VFS default function being
231  * used:
232  *
233  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
234  *   lost the reference to the stub vnode, meaning these should not be called:
235  *       close, read, write, ioctl, readdir, seek.
236  *
237  * - These VOPs are meaningless for vnodes without data pages. Since the
238  *   stub vnode is of type VDIR, these should not be called:
239  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
240  *
241  * - These VOPs are otherwise not applicable, and should not be called:
242  *       dump, setsecattr.
243  *
244  *
245  * These VOPs we do not want to define, but nor do we want the VFS default
246  * action. Instead, we specify the VFS error function, with fs_error(), but
247  * note that fs_error() is not actually called. Instead it results in the
248  * use of the error function defined for the particular VOP, in vn_ops_table[]:
249  *
250  * -   frlock, dispose, shrlock.
251  *
252  *
253  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
254  * NOTE: if any of these ops involve an OTW call with the stub FH, then
255  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
256  * to protect the security data in the servinfo4_t for the "parent"
257  * filesystem that contains the stub.
258  *
259  * - These VOPs should not trigger a mount, so that "ls -l" does not:
260  *       pathconf, getsecattr.
261  *
262  * - These VOPs would not make sense to trigger:
263  *       inactive, rwlock, rwunlock, fid, realvp.
264  */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	/* Ops that may trigger the ephemeral mount */
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	/* Ops passed straight to the regular NFSv4 implementation */
	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	/* Resolved via the per-VOP error entry in vn_ops_table[] */
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL, NULL
};
292 
/*
 * Bump the reference count on an ephemeral tree. The caller must
 * already hold net_cnt_lock; the post-increment ASSERT catches
 * counter wrap-around.
 */
static void
nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	net->net_refcnt++;
	ASSERT(net->net_refcnt != 0);
}
300 
/*
 * Take a hold on an ephemeral tree for callers that do not already
 * hold net_cnt_lock; the lock is acquired and dropped around the
 * increment.
 */
static void
nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	nfs4_ephemeral_tree_incr(net);
	mutex_exit(&net->net_cnt_lock);
}
308 
/*
 * We need a safe way to decrement the refcnt whilst the
 * lock is being held. The ASSERTs guard against dropping a
 * reference that was never taken (underflow).
 */
static void
nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	ASSERT(net->net_refcnt != 0);
	net->net_refcnt--;
}
320 
/*
 * Release a hold on an ephemeral tree, taking net_cnt_lock for
 * callers that do not already hold it.
 */
static void
nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	nfs4_ephemeral_tree_decr(net);
	mutex_exit(&net->net_cnt_lock);
}
328 
329 /*
330  * Trigger ops for stub vnodes; for mirror mounts, etc.
331  *
332  * The general idea is that a "triggering" op will first call
333  * nfs4_trigger_mount(), which will find out whether a mount has already
334  * been triggered.
335  *
336  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
337  * of the covering vfs.
338  *
339  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
340  * and again set newvp, as above.
341  *
342  * The triggering op may then re-issue the VOP by calling it on newvp.
343  *
344  * Note that some ops may perform custom action, and may or may not need
345  * to trigger a mount.
346  *
347  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
348  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
349  * and that would just recurse. Instead, we call the v4 op directly,
350  * by name.  This is OK, since we know that the vnode is for NFSv4,
351  * otherwise it couldn't be a stub.
352  *
353  */
354 
355 static int
356 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
357 {
358 	int error;
359 	vnode_t *newvp;
360 
361 	error = nfs4_trigger_mount(*vpp, cr, &newvp);
362 	if (error)
363 		return (error);
364 
365 	/* Release the stub vnode, as we're losing the reference to it */
366 	VN_RELE(*vpp);
367 
368 	/* Give the caller the root vnode of the newly-mounted fs */
369 	*vpp = newvp;
370 
371 	/* return with VN_HELD(newvp) */
372 	return (VOP_OPEN(vpp, flag, cr, ct));
373 }
374 
375 /*
376  * For the majority of cases, nfs4_trigger_getattr() will not trigger
377  * a mount. However, if ATTR_TRIGGER is set, we are being informed
378  * that we need to force the mount before we attempt to determine
379  * the attributes. The intent is an atomic operation for security
380  * testing.
381  */
382 static int
383 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
384     caller_context_t *ct)
385 {
386 	int error;
387 
388 	if (flags & ATTR_TRIGGER) {
389 		vnode_t	*newvp;
390 
391 		error = nfs4_trigger_mount(vp, cr, &newvp);
392 		if (error)
393 			return (error);
394 
395 		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
396 		VN_RELE(newvp);
397 	} else {
398 		error = nfs4_getattr(vp, vap, flags, cr, ct);
399 	}
400 
401 	return (error);
402 }
403 
404 static int
405 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
406 		caller_context_t *ct)
407 {
408 	int error;
409 	vnode_t *newvp;
410 
411 	error = nfs4_trigger_mount(vp, cr, &newvp);
412 	if (error)
413 		return (error);
414 
415 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
416 	VN_RELE(newvp);
417 
418 	return (error);
419 }
420 
421 static int
422 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
423     caller_context_t *ct)
424 {
425 	int error;
426 	vnode_t *newvp;
427 
428 	error = nfs4_trigger_mount(vp, cr, &newvp);
429 	if (error)
430 		return (error);
431 
432 	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
433 	VN_RELE(newvp);
434 
435 	return (error);
436 }
437 
438 static int
439 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
440     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
441     caller_context_t *ct, int *deflags, pathname_t *rpnp)
442 {
443 	int error;
444 	vnode_t *newdvp;
445 	rnode4_t *drp = VTOR4(dvp);
446 
447 	ASSERT(RP_ISSTUB(drp));
448 
449 	/* for now, we only support mirror-mounts */
450 	ASSERT(RP_ISSTUB_MIRRORMOUNT(drp));
451 
452 	/*
453 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
454 	 * that up. Instead, pass onto the regular op, regardless of whether
455 	 * we've triggered a mount.
456 	 */
457 	if (strcmp(nm, "..") == 0)
458 		return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
459 		    ct, deflags, rpnp));
460 
461 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
462 	if (error)
463 		return (error);
464 
465 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
466 	    deflags, rpnp);
467 	VN_RELE(newdvp);
468 
469 	return (error);
470 }
471 
472 static int
473 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
474     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
475     int flags, caller_context_t *ct, vsecattr_t *vsecp)
476 {
477 	int error;
478 	vnode_t *newdvp;
479 
480 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
481 	if (error)
482 		return (error);
483 
484 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
485 	    flags, ct, vsecp);
486 	VN_RELE(newdvp);
487 
488 	return (error);
489 }
490 
491 static int
492 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
493     int flags)
494 {
495 	int error;
496 	vnode_t *newdvp;
497 
498 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
499 	if (error)
500 		return (error);
501 
502 	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
503 	VN_RELE(newdvp);
504 
505 	return (error);
506 }
507 
508 static int
509 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
510     caller_context_t *ct, int flags)
511 {
512 	int error;
513 	vnode_t *newtdvp;
514 
515 	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
516 	if (error)
517 		return (error);
518 
519 	/*
520 	 * We don't check whether svp is a stub. Let the NFSv4 code
521 	 * detect that error, and return accordingly.
522 	 */
523 	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
524 	VN_RELE(newtdvp);
525 
526 	return (error);
527 }
528 
529 static int
530 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
531     cred_t *cr, caller_context_t *ct, int flags)
532 {
533 	int error;
534 	vnode_t *newsdvp;
535 	rnode4_t *tdrp = VTOR4(tdvp);
536 
537 	/*
538 	 * We know that sdvp is a stub, otherwise we would not be here.
539 	 *
540 	 * If tdvp is also be a stub, there are two possibilities: it
541 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
542 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
543 	 *
544 	 * In the former case, just trigger sdvp, and treat tdvp as
545 	 * though it were not a stub.
546 	 *
547 	 * In the latter case, it might be a different stub for the
548 	 * same server fs as sdvp, or for a different server fs.
549 	 * Regardless, from the client perspective this would still
550 	 * be a cross-filesystem rename, and should not be allowed,
551 	 * so return EXDEV, without triggering either mount.
552 	 */
553 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
554 		return (EXDEV);
555 
556 	error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
557 	if (error)
558 		return (error);
559 
560 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
561 
562 	VN_RELE(newsdvp);
563 
564 	return (error);
565 }
566 
567 /* ARGSUSED */
568 static int
569 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
570     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
571 {
572 	int error;
573 	vnode_t *newdvp;
574 
575 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
576 	if (error)
577 		return (error);
578 
579 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
580 	VN_RELE(newdvp);
581 
582 	return (error);
583 }
584 
585 static int
586 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
587     caller_context_t *ct, int flags)
588 {
589 	int error;
590 	vnode_t *newdvp;
591 
592 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
593 	if (error)
594 		return (error);
595 
596 	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
597 	VN_RELE(newdvp);
598 
599 	return (error);
600 }
601 
602 static int
603 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
604     cred_t *cr, caller_context_t *ct, int flags)
605 {
606 	int error;
607 	vnode_t *newdvp;
608 
609 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
610 	if (error)
611 		return (error);
612 
613 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
614 	VN_RELE(newdvp);
615 
616 	return (error);
617 }
618 
619 static int
620 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
621     caller_context_t *ct)
622 {
623 	int error;
624 	vnode_t *newvp;
625 
626 	error = nfs4_trigger_mount(vp, cr, &newvp);
627 	if (error)
628 		return (error);
629 
630 	error = VOP_READLINK(newvp, uiop, cr, ct);
631 	VN_RELE(newvp);
632 
633 	return (error);
634 }
635 
636 /* end of trigger vnode ops */
637 
/*
 * See if the mount has already been done by another caller.
 *
 * On return, *vfsp is the vfs covering vp (NULL if none). If the
 * covering fs root could be obtained, *newvpp holds it (held) and
 * *was_mounted is set to TRUE, with the ephemeral node's reference
 * time refreshed so the harvester does not reap a busy mount.
 *
 * NOTE(review): if VFS_ROOT() fails, its error is discarded and 0 is
 * returned with *was_mounted still FALSE, so the caller falls through
 * to attempting the mount itself -- confirm that this is the intended
 * recovery path rather than propagating the error.
 */
static int
nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
    bool_t *was_mounted, vfs_t **vfsp)
{
	int		error;
	mntinfo4_t	*mi = VTOMI4(vp);

	*was_mounted = FALSE;

	/* Serialize against a concurrent mount/unmount of vp */
	error = vn_vfsrlock_wait(vp);
	if (error)
		return (error);

	*vfsp = vn_mountedvfs(vp);
	if (*vfsp != NULL) {
		/* the mount has already occurred */
		error = VFS_ROOT(*vfsp, newvpp);
		if (!error) {
			/* need to update the reference time  */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_ephemeral)
				mi->mi_ephemeral->ne_ref_time =
				    gethrestime_sec();
			mutex_exit(&mi->mi_lock);

			*was_mounted = TRUE;
		}
	}

	vn_vfsunlock(vp);
	return (0);
}
673 
/*
 * Mount upon a trigger vnode; for mirror-mounts, etc.
 *
 * The mount may have already occurred, via another thread. If not,
 * assemble the location information - which may require fetching - and
 * perform the mount.
 *
 * Sets newvp to be the root of the fs that is now covering vp. Note
 * that we return with VN_HELD(*newvp).
 *
 * The caller is responsible for passing the VOP onto the covering fs.
 *
 * Returns EIO if the ephemeral tree is locked or being torn down, and
 * ENOSYS if, despite no error, no covering root vnode was produced.
 */
static int
nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
{
	int			 error;
	vfs_t			*vfsp;
	rnode4_t		*rp = VTOR4(vp);
	mntinfo4_t		*mi = VTOMI4(vp);
	domount_args_t		*dma;

	nfs4_ephemeral_tree_t	*net;

	/* TRUE once net_tree_lock is held and must be dropped at done: */
	bool_t			must_unlock = FALSE;
	bool_t			is_building = FALSE;
	bool_t			was_mounted = FALSE;

	cred_t			*mcred = NULL;

	nfs4_trigger_globals_t	*ntg;

	zone_t			*zone = curproc->p_zone;

	ASSERT(RP_ISSTUB(rp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));

	*newvpp = NULL;

	/*
	 * Has the mount already occurred?
	 */
	error = nfs4_trigger_mounted_already(vp, newvpp,
	    &was_mounted, &vfsp);
	if (error || was_mounted)
		goto done;

	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		/* First ephemeral mount under this fs: build the tree */
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);
	} else {
		/* Tree already exists: take a hold before locking it */
		net = mi->mi_ephemeral_tree;
		nfs4_ephemeral_tree_hold(net);

		mutex_exit(&mi->mi_lock);

		mutex_enter(&net->net_tree_lock);

		/*
		 * We can only proceed if the tree is neither locked
		 * nor being torn down.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
			nfs4_ephemeral_tree_decr(net);
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			return (EIO);
		}
		mutex_exit(&net->net_cnt_lock);
	}

	/* Mark the tree busy with this mount attempt */
	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	must_unlock = TRUE;

	dma = nfs4_trigger_domount_args_create(vp);
	if (dma == NULL) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Note that since we define mirror mounts to work
	 * for any user, we simply extend the privileges of
	 * the user's credentials to allow the mount to
	 * proceed.
	 */
	mcred = crdup(cr);
	if (mcred == NULL) {
		error = EINVAL;
		goto done;
	}

	crset_zone_privall(mcred);

	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
	nfs4_trigger_domount_args_destroy(dma, vp);

	crfree(mcred);

done:

	if (must_unlock) {
		/* Clear the busy flags and drop our hold on the tree */
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	/* No root vnode despite apparent success: report ENOSYS */
	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}
828 
/*
 * Collect together both the generic & mount-type specific args.
 *
 * Builds one nfs_args per responsive server (for NFS RO failover),
 * linked via nfs_ext_u.nfs_extB.next, plus a comma-separated hostlist
 * and the single esi used for the actual mount. Returns NULL only if
 * an esi could not be created for a responsive current server;
 * otherwise loops (possibly indefinitely, sleeping a second between
 * passes) until at least one server responds.
 */
static domount_args_t *
nfs4_trigger_domount_args_create(vnode_t *vp)
{
	int nointr;
	char *hostlist;
	servinfo4_t *svp;
	struct nfs_args *nargs, *nargs_head;
	enum clnt_stat status;
	ephemeral_servinfo_t *esi, *esi_first;
	domount_args_t *dma;
	mntinfo4_t *mi = VTOMI4(vp);

	/* honour the parent mount's interruptibility when pinging */
	nointr = !(mi->mi_flags & MI4_INT);
	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	svp = mi->mi_curr_serv;
	/* check if the current server is responding */
	status = nfs4_trigger_ping_server(svp, nointr);
	if (status == RPC_SUCCESS) {
		esi_first = nfs4_trigger_esi_create(vp, svp);
		if (esi_first == NULL) {
			kmem_free(hostlist, MAXPATHLEN);
			return (NULL);
		}

		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);

		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
	} else {
		/* current server did not respond */
		esi_first = NULL;
		nargs_head = NULL;
	}
	nargs = nargs_head;

	/*
	 * NFS RO failover.
	 *
	 * If we have multiple servinfo4 structures, linked via sv_next,
	 * we must create one nfs_args for each, linking the nfs_args via
	 * nfs_ext_u.nfs_extB.next.
	 *
	 * We need to build a corresponding esi for each, too, but that is
	 * used solely for building nfs_args, and may be immediately
	 * discarded, as domount() requires the info from just one esi,
	 * but all the nfs_args.
	 *
	 * Currently, the NFS mount code will hang if not all servers
	 * requested are available. To avoid that, we need to ping each
	 * server, here, and remove it from the list if it is not
	 * responding. This has the side-effect of that server then
	 * being permanently unavailable for this failover mount, even if
	 * it recovers. That's unfortunate, but the best we can do until
	 * the mount code path is fixed.
	 */

	/*
	 * If the current server was down, loop indefinitely until we find
	 * at least one responsive server.
	 */
	do {
		/* no locking needed for sv_next; it is only set at fs mount */
		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
			struct nfs_args *next;

			/*
			 * nargs_head: the head of the nfs_args list
			 * nargs: the current tail of the list
			 * next: the newly-created element to be added
			 */

			/*
			 * We've already tried the current server, above;
			 * if it was responding, we have already included it
			 * and it may now be ignored.
			 *
			 * Otherwise, try it again, since it may now have
			 * recovered.
			 */
			if (svp == mi->mi_curr_serv && esi_first != NULL)
				continue;

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			/* check if the server is responding */
			status = nfs4_trigger_ping_server(svp, nointr);
			/* if the server did not respond, ignore it */
			if (status != RPC_SUCCESS)
				continue;

			esi = nfs4_trigger_esi_create(vp, svp);
			if (esi == NULL)
				continue;

			/*
			 * If the original current server (mi_curr_serv)
			 * was down when we first tried it,
			 * (i.e. esi_first == NULL),
			 * we select this new server (svp) to be the server
			 * that we will actually contact (esi_first).
			 *
			 * Note that it's possible that mi_curr_serv == svp,
			 * if that mi_curr_serv was down but has now recovered.
			 */
			next = nfs4_trigger_nargs_create(mi, svp, esi);
			if (esi_first == NULL) {
				ASSERT(nargs == NULL);
				ASSERT(nargs_head == NULL);
				nargs_head = next;
				esi_first = esi;
				(void) strlcpy(hostlist,
				    esi_first->esi_hostname, MAXPATHLEN);
			} else {
				ASSERT(nargs_head != NULL);
				nargs->nfs_ext_u.nfs_extB.next = next;
				(void) strlcat(hostlist, ",", MAXPATHLEN);
				(void) strlcat(hostlist, esi->esi_hostname,
				    MAXPATHLEN);
				/* esi was only needed for hostname & nargs */
				nfs4_trigger_esi_destroy(esi, vp);
			}

			nargs = next;
		}

		/* if we've had no response at all, wait a second */
		if (esi_first == NULL)
			delay(drv_usectohz(1000000));

	} while (esi_first == NULL);
	ASSERT(nargs_head != NULL);

	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
	dma->dma_esi = esi_first;
	dma->dma_hostlist = hostlist;
	dma->dma_nargs = nargs_head;

	return (dma);
}
976 
977 static void
978 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
979 {
980 	if (dma != NULL) {
981 		if (dma->dma_esi != NULL && vp != NULL)
982 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
983 
984 		if (dma->dma_hostlist != NULL)
985 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
986 
987 		if (dma->dma_nargs != NULL) {
988 			struct nfs_args *nargs = dma->dma_nargs;
989 
990 			do {
991 				struct nfs_args *next =
992 				    nargs->nfs_ext_u.nfs_extB.next;
993 
994 				nfs4_trigger_nargs_destroy(nargs);
995 				nargs = next;
996 			} while (nargs != NULL);
997 		}
998 
999 		kmem_free(dma, sizeof (domount_args_t));
1000 	}
1001 }
1002 
1003 /*
1004  * The ephemeral_servinfo_t struct contains basic information we will need to
1005  * perform the mount. Whilst the structure is generic across different
1006  * types of ephemeral mount, the way we gather its contents differs.
1007  */
1008 static ephemeral_servinfo_t *
1009 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp)
1010 {
1011 	ephemeral_servinfo_t *esi;
1012 	rnode4_t *rp = VTOR4(vp);
1013 
1014 	ASSERT(RP_ISSTUB(rp));
1015 
1016 	/* Call the ephemeral type-specific routine */
1017 	if (RP_ISSTUB_MIRRORMOUNT(rp))
1018 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
1019 	else
1020 		esi = NULL;
1021 
1022 	/* for now, we only support mirror-mounts */
1023 	ASSERT(esi != NULL);
1024 
1025 	return (esi);
1026 }
1027 
1028 static void
1029 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1030 {
1031 	rnode4_t *rp = VTOR4(vp);
1032 
1033 	ASSERT(RP_ISSTUB(rp));
1034 
1035 	/* for now, we only support mirror-mounts */
1036 	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));
1037 
1038 	/* Currently, no need for an ephemeral type-specific routine */
1039 
1040 	/*
1041 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
1042 	 * and will be handled by nfs4_trigger_nargs_destroy().
1043 	 * We need only free the structure itself.
1044 	 */
1045 	if (esi != NULL)
1046 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
1047 }
1048 
1049 /*
1050  * Some of this may turn out to be common with other ephemeral types,
1051  * in which case it should be moved to nfs4_trigger_esi_create(), or a
1052  * common function called.
1053  */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
{
	char			*stubpath;
	struct knetconfig	*sikncp, *svkncp;
	struct netbuf		*bufp;
	ephemeral_servinfo_t	*esi;

	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);

	/* initially set to be our type of ephemeral mount; may be added to */
	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;

	/*
	 * We're copying info from the stub rnode's servinfo4, but
	 * we must create new copies, not pointers, since this information
	 * is to be associated with the new mount, which will be
	 * unmounted (and its structures freed) separately
	 */

	/*
	 * Sizes passed to kmem_[z]alloc here must match those freed
	 * in nfs4_free_args()
	 */

	/*
	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
	 * is difficult to avoid: as we need to read svp to calculate the
	 * sizes to be allocated.
	 */
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	/* strcat() into freshly-zeroed memory behaves as strcpy() here */
	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
	(void) strcat(esi->esi_hostname, svp->sv_hostname);

	/* deep-copy the server's transport address */
	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
	bufp = esi->esi_addr;
	bufp->len = svp->sv_addr.len;
	bufp->maxlen = svp->sv_addr.maxlen;
	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);

	/* deep-copy the parent mount's knetconfig */
	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
	sikncp = esi->esi_knconf;
	svkncp = svp->sv_knconf;
	sikncp->knc_semantics = svkncp->knc_semantics;
	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_protofmly,
	    (char *)svkncp->knc_protofmly);
	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
	sikncp->knc_rdev = svkncp->knc_rdev;

	/*
	 * Used when AUTH_DH is negotiated.
	 *
	 * This is ephemeral mount-type specific, since it contains the
	 * server's time-sync syncaddr.
	 */
	if (svp->sv_dhsec) {
		struct netbuf *bufp;
		sec_data_t *sdata;
		dh_k4_clntdata_t *data;

		sdata = svp->sv_dhsec;
		data = (dh_k4_clntdata_t *)sdata->data;
		ASSERT(sdata->rpcflavor == AUTH_DH);

		/* deep-copy the time-sync address for AUTH_DH */
		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
		bufp->len = data->syncaddr.len;
		bufp->maxlen = data->syncaddr.maxlen;
		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
		esi->esi_syncaddr = bufp;

		if (data->netname != NULL) {
			int nmlen = data->netnamelen;

			/*
			 * We need to copy from a dh_k4_clntdata_t
			 * netname/netnamelen pair to a NUL-terminated
			 * netname string suitable for putting in nfs_args,
			 * where the latter has no netnamelen field.
			 */
			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
			bcopy(data->netname, esi->esi_netname, nmlen);
		}
	} else {
		esi->esi_syncaddr = NULL;
		esi->esi_netname = NULL;
	}

	/* fn_path() returns an allocated path that begins with '.' */
	stubpath = fn_path(VTOSV(vp)->sv_name);
	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
	ASSERT(*stubpath == '.');
	stubpath += 1;

	/* for nfs_args->fh */
	esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1;
	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
	(void) strcat(esi->esi_path, svp->sv_path);
	(void) strcat(esi->esi_path, stubpath);

	/* restore the original pointer so the free size is correct */
	stubpath -= 1;
	/* stubpath allocated by fn_path() */
	kmem_free(stubpath, strlen(stubpath) + 1);

	nfs_rw_exit(&svp->sv_lock);

	return (esi);
}
1165 
1166 /*
1167  * Assemble the args, and call the generic VFS mount function to
1168  * finally perform the ephemeral mount.
1169  */
static int
nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
    cred_t *cr, vnode_t **newvpp)
{
	struct mounta	*uap;
	char		*mntpt, *orig_path, *path;
	const char	*orig_mntpt;
	int		retval;
	int		mntpt_len;
	int		spec_len;
	zone_t		*zone = curproc->p_zone;
	bool_t		has_leading_slash;
	int		i;

	vfs_t			*stubvfsp = stubvp->v_vfsp;
	ephemeral_servinfo_t	*esi = dma->dma_esi;
	struct nfs_args		*nargs = dma->dma_nargs;

	/* first, construct the mount point for the ephemeral mount */
	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);

	/* fn_path() prefixes the name with '.'; skip it for the mntpt */
	if (*orig_path == '.')
		orig_path++;

	/*
	 * Get rid of zone's root path
	 */
	if (zone != global_zone) {
		/*
		 * -1 for trailing '/' and -1 for EOS.
		 */
		if (strncmp(zone->zone_rootpath, orig_mntpt,
		    zone->zone_rootpathlen - 1) == 0) {
			orig_mntpt += (zone->zone_rootpathlen - 2);
		}
	}

	/* mntpt = parent mount point + path of stub within parent */
	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
	(void) strcat(mntpt, orig_mntpt);
	(void) strcat(mntpt, orig_path);

	/* free via the original pointer, not the possibly-advanced copy */
	kmem_free(path, strlen(path) + 1);
	path = esi->esi_path;
	if (*path == '.')
		path++;
	/* collapse a doubled leading slash, e.g. "//export" -> "/export" */
	if (path[0] == '/' && path[1] == '/')
		path++;
	has_leading_slash = (*path == '/');

	/* spec is "host[,host...]:/path"; size its pieces */
	spec_len = strlen(dma->dma_hostlist);
	spec_len += strlen(path);

	/* We are going to have to add this in */
	if (!has_leading_slash)
		spec_len++;

	/* We need to get the ':' for dma_hostlist:esi_path */
	spec_len++;

	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
	    has_leading_slash ? "" : "/", path);

	uap->dir = mntpt;

	uap->flags = MS_SYSSPACE | MS_DATA;
	/* fstype-independent mount options not covered elsewhere */
	/* copy parent's mount(1M) "-m" flag */
	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
		uap->flags |= MS_NOMNTTAB;

	uap->fstype = MNTTYPE_NFS4;
	uap->dataptr = (char *)nargs;
	/* not needed for MS_SYSSPACE */
	uap->datalen = 0;

	/* use optptr to pass in extra mount options */
	uap->flags |= MS_OPTIONSTR;
	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
	if (uap->optptr == NULL) {
		retval = EINVAL;
		goto done;
	}

	/* domount() expects us to count the trailing NUL */
	uap->optlen = strlen(uap->optptr) + 1;

	/*
	 * If we get EBUSY, we try again once to see if we can perform
	 * the mount. We do this because of a spurious race condition.
	 */
	for (i = 0; i < 2; i++) {
		int	error;
		bool_t	was_mounted;

		retval = domount(NULL, uap, stubvp, cr, vfsp);
		if (retval == 0) {
			/* success: hand back the root vnode of the mount */
			retval = VFS_ROOT(*vfsp, newvpp);
			VFS_RELE(*vfsp);
			break;
		} else if (retval != EBUSY) {
			break;
		}

		/*
		 * We might find it mounted by the other racer...
		 */
		error = nfs4_trigger_mounted_already(stubvp,
		    newvpp, &was_mounted, vfsp);
		if (error) {
			goto done;
		} else if (was_mounted) {
			retval = 0;
			break;
		}
	}

done:
	if (uap->optptr)
		nfs4_trigger_destroy_mntopts(uap->optptr);

	kmem_free(uap->spec, spec_len + 1);
	kmem_free(uap, sizeof (struct mounta));
	kmem_free(mntpt, mntpt_len + 1);

	return (retval);
}
1300 
1301 /*
1302  * Build an nfs_args structure for passing to domount().
1303  *
1304  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1305  * generic data - common to all ephemeral mount types - is read directly
1306  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1307  */
static struct nfs_args *
nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
    ephemeral_servinfo_t *esi)
{
	sec_data_t *secdata;
	struct nfs_args *nargs;

	/* setup the nfs args */
	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);

	/* hold the parent's servinfo4 stable while we copy from it */
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	nargs->addr = esi->esi_addr;

	/* for AUTH_DH by negotiation */
	if (esi->esi_syncaddr || esi->esi_netname) {
		nargs->flags |= NFSMNT_SECURE;
		nargs->syncaddr = esi->esi_syncaddr;
		nargs->netname = esi->esi_netname;
	}

	/* transfer ownership of the esi pointers into nargs */
	nargs->flags |= NFSMNT_KNCONF;
	nargs->knconf = esi->esi_knconf;
	nargs->flags |= NFSMNT_HOSTNAME;
	nargs->hostname = esi->esi_hostname;
	nargs->fh = esi->esi_path;

	/* general mount settings, all copied from parent mount */
	mutex_enter(&mi->mi_lock);

	if (!(mi->mi_flags & MI4_HARD))
		nargs->flags |= NFSMNT_SOFT;

	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
	    NFSMNT_RETRANS;
	nargs->wsize = mi->mi_stsize;
	nargs->rsize = mi->mi_tsize;
	nargs->timeo = mi->mi_timeo;
	nargs->retrans = mi->mi_retrans;

	if (mi->mi_flags & MI4_INT)
		nargs->flags |= NFSMNT_INT;
	if (mi->mi_flags & MI4_NOAC)
		nargs->flags |= NFSMNT_NOAC;

	/* attribute-cache timeouts, converted from internal hrtime units */
	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
	    NFSMNT_ACDIRMAX;
	nargs->acregmin = HR2SEC(mi->mi_acregmin);
	nargs->acregmax = HR2SEC(mi->mi_acregmax);
	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);

	if (mi->mi_flags & MI4_NOCTO)
		nargs->flags |= NFSMNT_NOCTO;
	if (mi->mi_flags & MI4_GRPID)
		nargs->flags |= NFSMNT_GRPID;
	if (mi->mi_flags & MI4_LLOCK)
		nargs->flags |= NFSMNT_LLOCK;
	if (mi->mi_flags & MI4_NOPRINT)
		nargs->flags |= NFSMNT_NOPRINT;
	if (mi->mi_flags & MI4_DIRECTIO)
		nargs->flags |= NFSMNT_DIRECTIO;
	if (mi->mi_flags & MI4_PUBLIC)
		nargs->flags |= NFSMNT_PUBLIC;

	mutex_exit(&mi->mi_lock);

	/* add any specific flags for this type of ephemeral mount */
	nargs->flags |= esi->esi_mount_flags;

	/*
	 * Security data & negotiation policy.
	 *
	 * We need to preserve the parent mount's preference for security
	 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT.
	 *
	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
	 * security flavour was requested, with data in sv_secdata, and that
	 * no negotiation should occur. If this specified flavour fails, that's
	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
	 *
	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
	 * default flavour, in sv_secdata, but then negotiate a new flavour.
	 * Possible flavours are recorded in an array in sv_secinfo, with
	 * currently in-use flavour pointed to by sv_currsec.
	 *
	 * If sv_currsec is set, i.e. if negotiation has already occurred,
	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
	 */
	if (svp->sv_flags & SV4_TRYSECDEFAULT) {
		/* enable negotiation for ephemeral mount */
		nargs->flags |= NFSMNT_SECDEFAULT;

		/*
		 * As a starting point for negotiation, copy parent
		 * mount's negotiated flavour (sv_currsec) if available,
		 * or its passed-in flavour (sv_secdata) if not.
		 */
		if (svp->sv_currsec != NULL)
			secdata = copy_sec_data(svp->sv_currsec);
		else if (svp->sv_secdata != NULL)
			secdata = copy_sec_data(svp->sv_secdata);
		else
			secdata = NULL;
	} else {
		/* do not enable negotiation; copy parent's passed-in flavour */
		if (svp->sv_secdata != NULL)
			secdata = copy_sec_data(svp->sv_secdata);
		else
			secdata = NULL;
	}

	nfs_rw_exit(&svp->sv_lock);

	nargs->flags |= NFSMNT_NEWARGS;
	nargs->nfs_args_ext = NFS_ARGS_EXTB;
	nargs->nfs_ext_u.nfs_extB.secdata = secdata;

	/* for NFS RO failover; caller will set if necessary */
	nargs->nfs_ext_u.nfs_extB.next = NULL;

	return (nargs);
}
1432 
1433 static void
1434 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
1435 {
1436 	/*
1437 	 * Either the mount failed, in which case the data is not needed, or
1438 	 * nfs4_mount() has either taken copies of what it needs or,
1439 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
1440 	 * whereby nfs4_free_args() will ignore it.
1441 	 */
1442 	nfs4_free_args(nargs);
1443 	kmem_free(nargs, sizeof (struct nfs_args));
1444 }
1445 
1446 /*
1447  * When we finally get into the mounting, we need to add this
1448  * node to the ephemeral tree.
1449  *
1450  * This is called from nfs4_mount().
1451  */
int
nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
{
	mntinfo4_t		*mi_parent;
	nfs4_ephemeral_t	*eph;
	nfs4_ephemeral_tree_t	*net;

	nfs4_ephemeral_t	*prior;
	nfs4_ephemeral_t	*child;

	nfs4_ephemeral_t	*peer;

	nfs4_trigger_globals_t	*ntg;
	zone_t			*zone = curproc->p_zone;

	int			rc = 0;

	mi_parent = VTOMI4(mvp);

	/*
	 * Get this before grabbing anything else!
	 */
	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	if (!ntg->ntg_thread_started) {
		nfs4_ephemeral_start_harvester(ntg);
	}

	/* lock order: parent's mi_lock before the child's */
	mutex_enter(&mi_parent->mi_lock);
	mutex_enter(&mi->mi_lock);

	/* the new mount joins its parent's ephemeral tree */
	net = mi->mi_ephemeral_tree =
	    mi_parent->mi_ephemeral_tree;

	/*
	 * If the mi_ephemeral_tree is NULL, then it
	 * means that either the harvester or a manual
	 * umount has cleared the tree out right before
	 * we got here.
	 *
	 * There is nothing we can do here, so return
	 * to the caller and let them decide whether they
	 * try again.
	 */
	if (net == NULL) {
		mutex_exit(&mi->mi_lock);
		mutex_exit(&mi_parent->mi_lock);

		return (EBUSY);
	}

	nfs4_ephemeral_tree_hold(net);

	/*
	 * We need to tack together the ephemeral mount
	 * with this new mntinfo.
	 */
	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
	eph->ne_mount = mi;
	eph->ne_ref_time = gethrestime_sec();

	/*
	 * We need to tell the ephemeral mount when
	 * to time out.
	 */
	eph->ne_mount_to = ntg->ntg_mount_to;

	mi->mi_flags |= MI4_EPHEMERAL;
	mi->mi_ephemeral = eph;

	/*
	 * If the enclosing mntinfo4 is also ephemeral,
	 * then we need to point to its enclosing parent.
	 * Else the enclosing mntinfo4 is the enclosing parent.
	 *
	 * We also need to weave this ephemeral node
	 * into the tree.
	 */
	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
		/*
		 * We need to decide if we are
		 * the root node of this branch
		 * or if we are a sibling of this
		 * branch.
		 */
		prior = mi_parent->mi_ephemeral;
		if (prior == NULL) {
			/*
			 * Race condition, clean up, and
			 * let caller handle mntinfo.
			 */
			mi->mi_flags &= ~MI4_EPHEMERAL;
			mi->mi_ephemeral = NULL;
			kmem_free(eph, sizeof (*eph));
			rc = EBUSY;
		} else {
			/* push the new node at the head of the child list */
			if (prior->ne_child == NULL) {
				prior->ne_child = eph;
			} else {
				child = prior->ne_child;

				prior->ne_child = eph;
				eph->ne_peer = child;

				child->ne_prior = eph;
			}

			eph->ne_prior = prior;
		}
	} else {
		/*
		 * The parent mntinfo4 is the non-ephemeral
		 * root of the ephemeral tree. We
		 * need to decide if we are the root
		 * node of that tree or if we are a
		 * sibling of the root node.
		 *
		 * We are the root if there is no
		 * other node.
		 */
		if (net->net_root == NULL) {
			net->net_root = eph;
		} else {
			/* push the new node at the head of the root list */
			eph->ne_peer = peer = net->net_root;
			ASSERT(peer != NULL);
			net->net_root = eph;

			peer->ne_prior = eph;
		}

		eph->ne_prior = NULL;
	}

	nfs4_ephemeral_tree_rele(net);

	mutex_exit(&mi->mi_lock);
	mutex_exit(&mi_parent->mi_lock);

	return (rc);
}
1591 
1592 /*
1593  * Commit the changes to the ephemeral tree for removing this node.
1594  */
1595 static void
1596 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
1597 {
1598 	nfs4_ephemeral_t	*e = eph;
1599 	nfs4_ephemeral_t	*peer;
1600 	nfs4_ephemeral_t	*prior;
1601 
1602 	peer = eph->ne_peer;
1603 	prior = e->ne_prior;
1604 
1605 	/*
1606 	 * If this branch root was not the
1607 	 * tree root, then we need to fix back pointers.
1608 	 */
1609 	if (prior) {
1610 		if (prior->ne_child == e) {
1611 			prior->ne_child = peer;
1612 		} else {
1613 			prior->ne_peer = peer;
1614 		}
1615 
1616 		if (peer)
1617 			peer->ne_prior = prior;
1618 	} else if (peer) {
1619 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
1620 		peer->ne_prior = NULL;
1621 	} else {
1622 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
1623 	}
1624 }
1625 
1626 /*
1627  * We want to avoid recursion at all costs. So we need to
1628  * unroll the tree. We do this by a depth first traversal to
1629  * leaf nodes. We blast away the leaf and work our way back
1630  * up and down the tree.
1631  */
static int
nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
    int isTreeRoot, int flag, cred_t *cr)
{
	nfs4_ephemeral_t	*e = eph;
	nfs4_ephemeral_t	*prior;
	mntinfo4_t		*mi;
	vfs_t			*vfsp;
	int			error;

	/*
	 * We use the loop while unrolling the ephemeral tree.
	 */
	for (;;) {
		/*
		 * First we walk down the child.
		 */
		if (e->ne_child) {
			prior = e;
			e = e->ne_child;
			continue;
		}

		/*
		 * If we are the root of the branch we are removing,
		 * we end it here. But if the branch is the root of
		 * the tree, we have to forge on. We do not consider
		 * the peer list for the root because while it may
		 * be okay to remove, it is both extra work and a
		 * potential for a false-positive error to stall the
		 * unmount attempt.
		 */
		if (e == eph && isTreeRoot == FALSE)
			return (0);

		/*
		 * Next we walk down the peer list.
		 */
		if (e->ne_peer) {
			prior = e;
			e = e->ne_peer;
			continue;
		}

		/*
		 * We can only remove the node passed in by the
		 * caller if it is the root of the ephemeral tree.
		 * Otherwise, the caller will remove it.
		 */
		if (e == eph && isTreeRoot == FALSE)
			return (0);

		/*
		 * Okay, we have a leaf node, time
		 * to prune it!
		 *
		 * Note that prior can only be NULL if
		 * and only if it is the root of the
		 * ephemeral tree.
		 */
		prior = e->ne_prior;

		mi = e->ne_mount;
		mutex_enter(&mi->mi_lock);
		vfsp = mi->mi_vfsp;

		/*
		 * Cleared by umount2_engine.
		 */
		VFS_HOLD(vfsp);

		/*
		 * Inform nfs4_unmount to not recursively
		 * descend into this node's children when it
		 * gets processed.
		 */
		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
		mutex_exit(&mi->mi_lock);

		error = umount2_engine(vfsp, flag, cr, FALSE);
		if (error) {
			/*
			 * We need to reenable nfs4_unmount's ability
			 * to recursively descend on this node.
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
			mutex_exit(&mi->mi_lock);

			return (error);
		}

		/*
		 * If we are the current node, we do not want to
		 * touch anything else. At this point, the only
		 * way the current node can have survived to here
		 * is if it is the root of the ephemeral tree and
		 * we are unmounting the enclosing mntinfo4.
		 */
		if (e == eph) {
			ASSERT(prior == NULL);
			return (0);
		}

		/*
		 * Stitch up the prior node. Note that since
		 * we have handled the root of the tree, prior
		 * must be non-NULL.
		 */
		ASSERT(prior != NULL);
		if (prior->ne_child == e) {
			prior->ne_child = NULL;
		} else {
			ASSERT(prior->ne_peer == e);

			prior->ne_peer = NULL;
		}

		/* back up to the prior node and keep pruning */
		e = prior;
	}

	/* NOTREACHED */
}
1755 
1756 /*
1757  * Common code to safely release net_cnt_lock and net_tree_lock
1758  */
1759 void
1760 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
1761     bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
1762 {
1763 	nfs4_ephemeral_tree_t	*net = *pnet;
1764 
1765 	if (*pmust_unlock) {
1766 		mutex_enter(&net->net_cnt_lock);
1767 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
1768 		if (*pmust_rele)
1769 			nfs4_ephemeral_tree_decr(net);
1770 		mutex_exit(&net->net_cnt_lock);
1771 
1772 		mutex_exit(&net->net_tree_lock);
1773 
1774 		*pmust_unlock = FALSE;
1775 	}
1776 }
1777 
1778 /*
1779  * While we may have removed any child or sibling nodes of this
1780  * ephemeral node, we can not nuke it until we know that there
1781  * were no actived vnodes on it. This will do that final
1782  * work once we know it is not busy.
1783  */
void
nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
    bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
{
	/*
	 * Now we need to get rid of the ephemeral data if it exists.
	 */
	mutex_enter(&mi->mi_lock);
	if (mi->mi_ephemeral) {
		/*
		 * If we are the root node of an ephemeral branch
		 * which is being removed, then we need to fixup
		 * pointers into and out of the node.
		 */
		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);

		ASSERT(mi->mi_ephemeral != NULL);

		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
		mi->mi_ephemeral = NULL;
	}
	mutex_exit(&mi->mi_lock);

	/* drop the tree lock/refcount the umount path acquired */
	nfs4_ephemeral_umount_unlock(pmust_unlock, pmust_rele, pnet);
}
1810 
1811 /*
1812  * Unmount an ephemeral node.
1813  */
int
nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
    bool_t *pmust_unlock, bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
{
	int			error = 0;
	nfs4_ephemeral_t	*eph;
	nfs4_ephemeral_tree_t	*net;
	int			is_derooting = FALSE;
	int			is_recursed = FALSE;
	int			was_locked = FALSE;

	/*
	 * Make sure to set the default state for cleaning
	 * up the tree in the caller (and on the way out).
	 */
	*pmust_unlock = *pmust_rele = FALSE;

	/*
	 * The active vnodes on this file system may be ephemeral
	 * children. We need to check for and try to unmount them
	 * here. If any can not be unmounted, we are going
	 * to return EBUSY.
	 */
	mutex_enter(&mi->mi_lock);

	/*
	 * If an ephemeral tree, we need to check to see if
	 * the lock is already held. If it is, then we need
	 * to see if we are being called as a result of
	 * the recursive removal of some node of the tree or
	 * if we are another attempt to remove the tree.
	 *
	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
	 * node. mi_ephemeral being non-NULL also does this.
	 *
	 * mi_ephemeral_tree being non-NULL is sufficient
	 * to also indicate either it is an ephemeral node
	 * or the enclosing mntinfo4.
	 *
	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
	 * when we delete the ephemeral node and need to
	 * differentiate from an ephemeral node and the
	 * enclosing root node.
	 */
	*pnet = net = mi->mi_ephemeral_tree;
	if (net == NULL) {
		/* not part of any ephemeral tree; nothing to do */
		mutex_exit(&mi->mi_lock);
		return (0);
	}

	eph = mi->mi_ephemeral;
	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
	is_derooting = (eph == NULL);

	/*
	 * If this is not recursion, then we need to
	 * grab a ref count.
	 *
	 * But wait, we also do not want to do that
	 * if a harvester thread has already grabbed
	 * the lock.
	 */
	if (!is_recursed) {
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status &
		    NFS4_EPHEMERAL_TREE_LOCKED) {
			/*
			 * If the tree is locked, we need
			 * to decide whether we are the
			 * harvester or some explicit call
			 * for a umount. The only way that
			 * we are the harvester is if
			 * MS_SYSSPACE is set.
			 *
			 * We only let the harvester through
			 * at this point.
			 *
			 * We return EBUSY so that the
			 * caller knows something is
			 * going on. Note that by that
			 * time, the umount in the other
			 * thread may have already occured.
			 */
			if (!(flag & MS_SYSSPACE)) {
				mutex_exit(&net->net_cnt_lock);
				mutex_exit(&mi->mi_lock);

				return (EBUSY);
			}

			was_locked = TRUE;
		} else {
			/* take a reference so the tree can't vanish */
			nfs4_ephemeral_tree_incr(net);
			*pmust_rele = TRUE;
		}

		mutex_exit(&net->net_cnt_lock);
	}
	mutex_exit(&mi->mi_lock);

	/*
	 * If we are not the harvester, we need to check
	 * to see if we need to grab the tree lock.
	 */
	if (was_locked == FALSE) {
		/*
		 * If we grab the lock, it means that no other
		 * operation is working on the tree. If we don't
		 * grab it, we need to decide if this is because
		 * we are a recursive call or a new operation.
		 */
		if (mutex_tryenter(&net->net_tree_lock)) {
			*pmust_unlock = TRUE;
		} else {
			/*
			 * If we are a recursive call, we can
			 * proceed without the lock.
			 * Otherwise we have to wait until
			 * the lock becomes free.
			 */
			if (!is_recursed) {
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    (NFS4_EPHEMERAL_TREE_DEROOTING
				    | NFS4_EPHEMERAL_TREE_INVALID)) {
					/* tree is being torn down; bail */
					nfs4_ephemeral_tree_decr(net);
					mutex_exit(&net->net_cnt_lock);
					*pmust_rele = FALSE;
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);

				/*
				 * We can't hold any other locks whilst
				 * we wait on this to free up.
				 */
				mutex_enter(&net->net_tree_lock);

				/*
				 * Note that while mi->mi_ephemeral
				 * may change and thus we have to
				 * update eph, it is the case that
				 * we have tied down net and
				 * do not care if mi->mi_ephemeral_tree
				 * has changed.
				 */
				mutex_enter(&mi->mi_lock);
				eph = mi->mi_ephemeral;
				mutex_exit(&mi->mi_lock);

				/*
				 * Okay, we need to see if either the
				 * tree got nuked or the current node
				 * got nuked. Both of which will cause
				 * an error.
				 *
				 * Note that a subsequent retry of the
				 * umount shall work.
				 */
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    NFS4_EPHEMERAL_TREE_INVALID ||
				    (!is_derooting && eph == NULL)) {
					nfs4_ephemeral_tree_decr(net);
					mutex_exit(&net->net_cnt_lock);
					mutex_exit(&net->net_tree_lock);
					*pmust_rele = FALSE;
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);
				*pmust_unlock = TRUE;
			}
		}
	}

	/*
	 * Only once we have grabbed the lock can we mark what we
	 * are planning on doing to the ephemeral tree.
	 */
	if (*pmust_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;

		/*
		 * Check to see if we are nuking the root.
		 */
		if (is_derooting)
			net->net_status |=
			    NFS4_EPHEMERAL_TREE_DEROOTING;
		mutex_exit(&net->net_cnt_lock);
	}

	if (!is_derooting) {
		/*
		 * Only work on children if the caller has not already
		 * done so.
		 */
		if (!is_recursed) {
			ASSERT(eph != NULL);

			error = nfs4_ephemeral_unmount_engine(eph,
			    FALSE, flag, cr);
			if (error)
				goto is_busy;
		}
	} else {
		eph = net->net_root;

		/*
		 * Only work if there is something there.
		 */
		if (eph) {
			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
			    flag, cr);
			if (error) {
				/* undo the derooting mark before bailing */
				mutex_enter(&net->net_cnt_lock);
				net->net_status &=
				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
				mutex_exit(&net->net_cnt_lock);
				goto is_busy;
			}

			/*
			 * Nothing else which goes wrong will
			 * invalidate the blowing away of the
			 * ephmeral tree.
			 */
			net->net_root = NULL;
		}

		/*
		 * We have derooted and we have caused the tree to be
		 * invalidated.
		 */
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
		if (was_locked == FALSE)
			nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		if (was_locked == FALSE)
			mutex_exit(&net->net_tree_lock);

		/*
		 * We have just blown away any notation of this
		 * tree being locked. We can't let the caller
		 * try to clean things up.
		 */
		*pmust_unlock = FALSE;

		/*
		 * At this point, the tree should no longer be
		 * associated with the mntinfo4. We need to pull
		 * it off there and let the harvester take
		 * care of it once the refcnt drops.
		 */
		mutex_enter(&mi->mi_lock);
		mi->mi_ephemeral_tree = NULL;
		mutex_exit(&mi->mi_lock);
	}

	return (0);

is_busy:

	nfs4_ephemeral_umount_unlock(pmust_unlock, pmust_rele,
	    pnet);

	return (error);
}
2085 
2086 /*
2087  * Do the umount and record any error in the parent.
2088  */
2089 static void
2090 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2091     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2092 {
2093 	int	error;
2094 
2095 	error = umount2_engine(vfsp, flag, kcred, FALSE);
2096 	if (error) {
2097 		if (prior) {
2098 			if (prior->ne_child == e)
2099 				prior->ne_state |=
2100 				    NFS4_EPHEMERAL_CHILD_ERROR;
2101 			else
2102 				prior->ne_state |=
2103 				    NFS4_EPHEMERAL_PEER_ERROR;
2104 		}
2105 	}
2106 }
2107 
/*
 * For each tree in the forest (where the forest is in
 * effect all of the ephemeral trees for this zone),
 * scan to see if a node can be unmounted. Note that
 * unlike nfs4_ephemeral_unmount_engine(), we do
 * not process the current node before children or
 * siblings. I.e., if a node can be unmounted, we
 * do not recursively check to see if the nodes
 * hanging off of it can also be unmounted.
 *
 * Instead, we delve down deep to try and remove the
 * children first. Then, because we share code with
 * nfs4_ephemeral_unmount_engine(), we will try
 * them again. This could be a performance issue in
 * the future.
 *
 * Also note that unlike nfs4_ephemeral_unmount_engine(),
 * we do not halt on an error. We will not remove the
 * current node, but we will keep on trying to remove
 * the others.
 *
 * force indicates that we want the unmount to occur
 * even if there is something blocking it.
 *
 * time_check indicates that we want to see if the
 * mount has expired past mount_to or not. Typically
 * we want to do this and only on a shutdown of the
 * zone would we want to ignore the check.
 */
static void
nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
    bool_t force, bool_t time_check)
{
	nfs4_ephemeral_tree_t	*net;
	nfs4_ephemeral_tree_t	*prev = NULL;
	nfs4_ephemeral_tree_t	*next;
	nfs4_ephemeral_t	*e;
	nfs4_ephemeral_t	*prior;
	time_t			now = gethrestime_sec();

	/*
	 * Invalid trees with no other references are unlinked from
	 * the forest and pushed onto this local list; they are torn
	 * down at the end, after all locks have been dropped.
	 */
	nfs4_ephemeral_tree_t	*harvest = NULL;

	int			flag;

	mntinfo4_t		*mi;
	vfs_t			*vfsp;

	if (force)
		flag = MS_FORCE | MS_SYSSPACE;
	else
		flag = MS_SYSSPACE;

	mutex_enter(&ntg->ntg_forest_lock);
	for (net = ntg->ntg_forest; net != NULL; net = next) {
		next = net->net_next;

		nfs4_ephemeral_tree_hold(net);

		mutex_enter(&net->net_tree_lock);

		/*
		 * Let the unmount code know that the
		 * tree is already locked!
		 */
		mutex_enter(&net->net_cnt_lock);
		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
		mutex_exit(&net->net_cnt_lock);

		/*
		 * If the intent is force all ephemeral nodes to
		 * be unmounted in this zone, we can short circuit a
		 * lot of tree traversal and simply zap the root node.
		 */
		if (force) {
			if (net->net_root) {
				mi = net->net_root->ne_mount;
				vfsp = mi->mi_vfsp;

				/*
				 * Cleared by umount2_engine.
				 */
				VFS_HOLD(vfsp);

				(void) umount2_engine(vfsp, flag,
				    kcred, FALSE);

				goto check_done;
			}
		}

		/*
		 * Iterative post-order walk driven by a per-node state
		 * machine: first visit children, then peers, and only
		 * then process the node itself. The error states let a
		 * failed unmount propagate back up to the prior node.
		 */
		e = net->net_root;
		if (e)
			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;

		while (e) {
			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
				if (e->ne_child) {
					e = e->ne_child;
					e->ne_state =
					    NFS4_EPHEMERAL_VISIT_CHILD;
				}

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_VISIT_SIBLING) {
				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
				if (e->ne_peer) {
					e = e->ne_peer;
					e->ne_state =
					    NFS4_EPHEMERAL_VISIT_CHILD;
				}

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_CHILD_ERROR) {
				prior = e->ne_prior;

				/*
				 * If a child reported an error, do
				 * not bother trying to unmount.
				 *
				 * If your prior node is a parent,
				 * pass the error up such that they
				 * also do not try to unmount.
				 *
				 * However, if your prior is a sibling,
				 * let them try to unmount if they can.
				 */
				if (prior) {
					if (prior->ne_child == e)
						prior->ne_state |=
						    NFS4_EPHEMERAL_CHILD_ERROR;
					else
						prior->ne_state |=
						    NFS4_EPHEMERAL_PEER_ERROR;
				}

				/*
				 * Clear the error and if needed, process peers.
				 *
				 * Once we mask out the error, we know whether
				 * or not we have to process another node.
				 */
				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
					e = prior;

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_PEER_ERROR) {
				prior = e->ne_prior;

				if (prior) {
					if (prior->ne_child == e)
						prior->ne_state =
						    NFS4_EPHEMERAL_CHILD_ERROR;
					else
						prior->ne_state =
						    NFS4_EPHEMERAL_PEER_ERROR;
				}

				/*
				 * Clear the error from this node and do the
				 * correct processing.
				 */
				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
				continue;
			}

			prior = e->ne_prior;
			e->ne_state = NFS4_EPHEMERAL_OK;

			/*
			 * It must be the case that we need to process
			 * this node.
			 */
			if (!time_check ||
			    now - e->ne_ref_time > e->ne_mount_to) {
				mi = e->ne_mount;
				vfsp = mi->mi_vfsp;

				/*
				 * Cleared by umount2_engine.
				 */
				VFS_HOLD(vfsp);

				/*
				 * Note that we effectively work down to the
				 * leaf nodes first, try to unmount them,
				 * then work our way back up into the leaf
				 * nodes.
				 *
				 * Also note that we deal with a lot of
				 * complexity by sharing the work with
				 * the manual unmount code.
				 */
				nfs4_ephemeral_record_umount(vfsp, flag,
				    e, prior);
			}

			e = prior;
		}

check_done:

		/*
		 * At this point we are done processing this tree.
		 *
		 * If the tree is invalid and we are the only reference
		 * to it, then we push it on the local linked list
		 * to remove it at the end. We avoid that action now
		 * to keep the tree processing going along at a fair clip.
		 *
		 * Else, even if we are the only reference, we drop
		 * our hold on the current tree and allow it to be
		 * reused as needed.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_refcnt == 1 &&
		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
			nfs4_ephemeral_tree_decr(net);
			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			/* Unlink this tree from the forest. */
			if (prev)
				prev->net_next = net->net_next;
			else
				ntg->ntg_forest = net->net_next;

			net->net_next = harvest;
			harvest = net;
			continue;
		}

		nfs4_ephemeral_tree_decr(net);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
		mutex_exit(&net->net_cnt_lock);
		mutex_exit(&net->net_tree_lock);

		prev = net;
	}
	mutex_exit(&ntg->ntg_forest_lock);

	/*
	 * Now that no locks are held, tear down the trees we
	 * unlinked above.
	 */
	for (net = harvest; net != NULL; net = next) {
		next = net->net_next;

		mutex_destroy(&net->net_tree_lock);
		mutex_destroy(&net->net_cnt_lock);
		kmem_free(net, sizeof (*net));
	}
}
2361 
2362 /*
2363  * This is the thread which decides when the harvesting
2364  * can proceed and when to kill it off for this zone.
2365  */
2366 static void
2367 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
2368 {
2369 	clock_t		timeleft;
2370 	zone_t		*zone = curproc->p_zone;
2371 
2372 	for (;;) {
2373 		timeleft = zone_status_timedwait(zone, lbolt +
2374 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
2375 
2376 		/*
2377 		 * zone is exiting...
2378 		 */
2379 		if (timeleft != -1) {
2380 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
2381 			zthread_exit();
2382 			/* NOTREACHED */
2383 		}
2384 
2385 		/*
2386 		 * Only bother scanning if there is potential
2387 		 * work to be done.
2388 		 */
2389 		if (ntg->ntg_forest == NULL)
2390 			continue;
2391 
2392 		/*
2393 		 * Now scan the list and get rid of everything which
2394 		 * is old.
2395 		 */
2396 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
2397 	}
2398 
2399 	/* NOTREACHED */
2400 }
2401 
2402 /*
2403  * The zone specific glue needed to start the unmount harvester.
2404  *
2405  * Note that we want to avoid holding the mutex as long as possible,
2406  * hence the multiple checks.
2407  *
2408  * The caller should avoid us getting down here in the first
2409  * place.
2410  */
2411 static void
2412 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
2413 {
2414 	/*
2415 	 * It got started before we got here...
2416 	 */
2417 	if (ntg->ntg_thread_started)
2418 		return;
2419 
2420 	mutex_enter(&nfs4_ephemeral_thread_lock);
2421 
2422 	if (ntg->ntg_thread_started) {
2423 		mutex_exit(&nfs4_ephemeral_thread_lock);
2424 		return;
2425 	}
2426 
2427 	/*
2428 	 * Start the unmounter harvester thread for this zone.
2429 	 */
2430 	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
2431 	    ntg, 0, minclsyspri);
2432 
2433 	ntg->ntg_thread_started = TRUE;
2434 	mutex_exit(&nfs4_ephemeral_thread_lock);
2435 }
2436 
2437 /*ARGSUSED*/
2438 static void *
2439 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
2440 {
2441 	nfs4_trigger_globals_t	*ntg;
2442 
2443 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
2444 	ntg->ntg_thread_started = FALSE;
2445 
2446 	/*
2447 	 * This is the default....
2448 	 */
2449 	ntg->ntg_mount_to = nfs4_trigger_thread_timer;
2450 
2451 	mutex_init(&ntg->ntg_forest_lock, NULL,
2452 	    MUTEX_DEFAULT, NULL);
2453 
2454 	return (ntg);
2455 }
2456 
2457 /*
2458  * Try a nice gentle walk down the forest and convince
2459  * all of the trees to gracefully give it up.
2460  */
2461 /*ARGSUSED*/
2462 static void
2463 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
2464 {
2465 	nfs4_trigger_globals_t	*ntg = arg;
2466 
2467 	if (!ntg)
2468 		return;
2469 
2470 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
2471 }
2472 
2473 /*
2474  * Race along the forest and rip all of the trees out by
2475  * their rootballs!
2476  */
2477 /*ARGSUSED*/
2478 static void
2479 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
2480 {
2481 	nfs4_trigger_globals_t	*ntg = arg;
2482 
2483 	if (!ntg)
2484 		return;
2485 
2486 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
2487 
2488 	mutex_destroy(&ntg->ntg_forest_lock);
2489 	kmem_free(ntg, sizeof (*ntg));
2490 }
2491 
/*
 * This is the zone independent cleanup needed for
 * ephemeral mount processing.
 *
 * Undoes nfs4_ephemeral_init(): drops the ZSD key (which
 * runs the shutdown/destroy callbacks for live zones) and
 * tears down the harvester-start lock.
 */
void
nfs4_ephemeral_fini(void)
{
	(void) zone_key_delete(nfs4_ephemeral_key);
	mutex_destroy(&nfs4_ephemeral_thread_lock);
}
2502 
/*
 * This is the zone independent initialization needed for
 * ephemeral mount processing.
 *
 * Registers the ZSD key so each zone gets its own
 * nfs4_trigger_globals_t, and initializes the lock that
 * serializes starting the per-zone harvester thread.
 */
void
nfs4_ephemeral_init(void)
{
	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
	    NULL);

	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
}
2516 
2517 /*
2518  * nfssys() calls this function to set the per-zone
2519  * value of mount_to to drive when an ephemeral mount is
2520  * timed out. Each mount will grab a copy of this value
2521  * when mounted.
2522  */
2523 void
2524 nfs4_ephemeral_set_mount_to(uint_t mount_to)
2525 {
2526 	nfs4_trigger_globals_t	*ntg;
2527 	zone_t			*zone = curproc->p_zone;
2528 
2529 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2530 
2531 	ntg->ntg_mount_to = mount_to;
2532 }
2533 
2534 /*
2535  * Walk the list of v4 mount options; if they are currently set in vfsp,
2536  * append them to a new comma-separated mount option string, and return it.
2537  *
2538  * Caller should free by calling nfs4_trigger_destroy_mntopts().
2539  */
2540 static char *
2541 nfs4_trigger_create_mntopts(vfs_t *vfsp)
2542 {
2543 	uint_t i;
2544 	char *mntopts;
2545 	struct vfssw *vswp;
2546 	mntopts_t *optproto;
2547 
2548 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
2549 
2550 	/* get the list of applicable mount options for v4; locks *vswp */
2551 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
2552 	optproto = &vswp->vsw_optproto;
2553 
2554 	for (i = 0; i < optproto->mo_count; i++) {
2555 		struct mntopt *mop = &optproto->mo_list[i];
2556 
2557 		if (mop->mo_flags & MO_EMPTY)
2558 			continue;
2559 
2560 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
2561 			kmem_free(mntopts, MAX_MNTOPT_STR);
2562 			vfs_unrefvfssw(vswp);
2563 			return (NULL);
2564 		}
2565 	}
2566 
2567 	vfs_unrefvfssw(vswp);
2568 
2569 	/*
2570 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
2571 	 * and it may only be passed via MS_OPTIONSTR, so we
2572 	 * must handle it here.
2573 	 *
2574 	 * Ideally, it would be in the list, but NFS does not specify its
2575 	 * own opt proto list, it uses instead the default one. Since
2576 	 * not all filesystems support extended attrs, it would not be
2577 	 * appropriate to add it there.
2578 	 */
2579 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
2580 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
2581 		kmem_free(mntopts, MAX_MNTOPT_STR);
2582 		return (NULL);
2583 	}
2584 
2585 	return (mntopts);
2586 }
2587 
2588 static void
2589 nfs4_trigger_destroy_mntopts(char *mntopts)
2590 {
2591 	if (mntopts)
2592 		kmem_free(mntopts, MAX_MNTOPT_STR);
2593 }
2594 
2595 /*
2596  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
2597  */
2598 static int
2599 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
2600 {
2601 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
2602 		return (EINVAL);
2603 
2604 	if (vfs_optionisset(vfsp, optname, NULL)) {
2605 		size_t mntoptslen = strlen(mntopts);
2606 		size_t optnamelen = strlen(optname);
2607 
2608 		/* +1 for ',', +1 for NUL */
2609 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
2610 			return (EOVERFLOW);
2611 
2612 		/* first or subsequent mount option? */
2613 		if (*mntopts != '\0')
2614 			(void) strcat(mntopts, ",");
2615 
2616 		(void) strcat(mntopts, optname);
2617 	}
2618 
2619 	return (0);
2620 }
2621 
2622 static enum clnt_stat
2623 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
2624 {
2625 	int retries, error;
2626 	uint_t max_msgsize;
2627 	enum clnt_stat status;
2628 	CLIENT *cl;
2629 	struct timeval timeout;
2630 
2631 	/* as per recov_newserver() */
2632 	max_msgsize = 0;
2633 	retries = 1;
2634 	timeout.tv_sec = 2;
2635 	timeout.tv_usec = 0;
2636 
2637 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM,
2638 	    NFS_V4, max_msgsize, retries, CRED(), &cl);
2639 	if (error)
2640 		return (RPC_FAILED);
2641 
2642 	if (nointr)
2643 		cl->cl_nosignal = TRUE;
2644 	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
2645 	    timeout);
2646 	if (nointr)
2647 		cl->cl_nosignal = FALSE;
2648 
2649 	AUTH_DESTROY(cl->cl_auth);
2650 	CLNT_DESTROY(cl);
2651 
2652 	return (status);
2653 }
2654