xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c (revision 62c8caf3fac65817982e780c1efa988846153bf0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
29  * triggered from a "stub" rnode via a special set of vnodeops.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/vfs_opreg.h>
40 #include <sys/file.h>
41 #include <sys/filio.h>
42 #include <sys/uio.h>
43 #include <sys/buf.h>
44 #include <sys/mman.h>
45 #include <sys/pathname.h>
46 #include <sys/dirent.h>
47 #include <sys/debug.h>
48 #include <sys/vmsystm.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/swap.h>
52 #include <sys/errno.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/kmem.h>
56 #include <sys/mount.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathconf.h>
59 #include <sys/utsname.h>
60 #include <sys/dnlc.h>
61 #include <sys/acl.h>
62 #include <sys/systeminfo.h>
63 #include <sys/policy.h>
64 #include <sys/sdt.h>
65 #include <sys/list.h>
66 #include <sys/stat.h>
67 #include <sys/mntent.h>
68 
69 #include <rpc/types.h>
70 #include <rpc/auth.h>
71 #include <rpc/clnt.h>
72 
73 #include <nfs/nfs.h>
74 #include <nfs/nfs_clnt.h>
75 #include <nfs/nfs_acl.h>
76 #include <nfs/lm.h>
77 #include <nfs/nfs4.h>
78 #include <nfs/nfs4_kprot.h>
79 #include <nfs/rnode4.h>
80 #include <nfs/nfs4_clnt.h>
81 
82 #include <vm/hat.h>
83 #include <vm/as.h>
84 #include <vm/page.h>
85 #include <vm/pvn.h>
86 #include <vm/seg.h>
87 #include <vm/seg_map.h>
88 #include <vm/seg_kpm.h>
89 #include <vm/seg_vn.h>
90 
91 #include <fs/fs_subr.h>
92 
93 #include <sys/ddi.h>
94 #include <sys/int_fmtio.h>
95 
96 #include <sys/sunddi.h>
97 
98 /*
99  * The automatic unmounter thread stuff!
100  */
101 static int nfs4_trigger_thread_timer = 20;	/* in seconds */
102 
103 /*
104  * Just a default....
105  */
106 static uint_t nfs4_trigger_mount_to = 240;
107 
108 typedef struct nfs4_trigger_globals {
109 	kmutex_t		ntg_forest_lock;
110 	uint_t			ntg_mount_to;
111 	int			ntg_thread_started;
112 	nfs4_ephemeral_tree_t	*ntg_forest;
113 } nfs4_trigger_globals_t;
114 
115 kmutex_t	nfs4_ephemeral_thread_lock;
116 
117 zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;
118 
119 static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
120 
/*
 * Per-server data for an ephemeral mount.  Depending on the type of
 * ephemeral mount, the contents are either duplicated from a
 * servinfo4_t or hand-crafted.
 *
 * This structure is meant to carry only the data that is specific to
 * the ephemeral mount type, so that it can be handed to
 * nfs4_trigger_nargs_create().
 */
struct ephemeral_servinfo {
	char			*esi_hostname;
	char			*esi_netname;
	char			*esi_path;
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
};
typedef struct ephemeral_servinfo ephemeral_servinfo_t;
139 
140 /*
141  * Collect together the mount-type specific and generic data args.
142  */
143 typedef struct domount_args {
144 	ephemeral_servinfo_t	*dma_esi;
145 	char			*dma_hostlist; /* comma-sep. for RO failover */
146 	struct nfs_args		*dma_nargs;
147 } domount_args_t;
148 
149 
150 /*
151  * The vnode ops functions for a trigger stub vnode
152  */
153 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
154 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
155     caller_context_t *);
156 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
157     caller_context_t *);
158 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
159     caller_context_t *);
160 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
161     caller_context_t *);
162 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
163     struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
164     int *, pathname_t *);
165 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
166     enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
167     vsecattr_t *);
168 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
169     int);
170 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
171     caller_context_t *, int);
172 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
173     cred_t *, caller_context_t *, int);
174 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
175     vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
176 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
177     caller_context_t *, int);
178 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
179     cred_t *, caller_context_t *, int);
180 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
181 
182 /*
183  * Regular NFSv4 vnodeops that we need to reference directly
184  */
185 extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
186 		    caller_context_t *);
187 extern void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
188 extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
189 extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
190 extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
191 		    struct pathname *, int, vnode_t *, cred_t *,
192 		    caller_context_t *, int *, pathname_t *);
193 extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
194 		    caller_context_t *);
195 extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
196 		    caller_context_t *);
197 extern int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
198 extern int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
199 
200 static int	nfs4_trigger_mount(vnode_t *, vnode_t **);
201 static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
202     cred_t *);
203 static domount_args_t  *nfs4_trigger_domount_args_create(vnode_t *);
204 static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
205     vnode_t *vp);
206 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *);
207 static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
208 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
209     servinfo4_t *);
210 static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
211     ephemeral_servinfo_t *);
212 static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
213 static char	*nfs4_trigger_create_mntopts(vfs_t *);
214 static void	nfs4_trigger_destroy_mntopts(char *);
215 static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
216 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
217 
218 extern int	umount2_engine(vfs_t *, int, cred_t *, int);
219 
220 
221 vnodeops_t *nfs4_trigger_vnodeops;
222 
223 /*
224  * These are the vnodeops that we must define for stub vnodes.
225  *
226  *
227  * Many of the VOPs defined for NFSv4 do not need to be defined here,
228  * for various reasons. This will result in the VFS default function being
229  * used:
230  *
231  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
232  *   lost the reference to the stub vnode, meaning these should not be called:
233  *       close, read, write, ioctl, readdir, seek.
234  *
235  * - These VOPs are meaningless for vnodes without data pages. Since the
236  *   stub vnode is of type VDIR, these should not be called:
237  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
238  *
239  * - These VOPs are otherwise not applicable, and should not be called:
240  *       dump, setsecattr.
241  *
242  *
243  * These VOPs we do not want to define, but nor do we want the VFS default
244  * action. Instead, we specify the VFS error function, with fs_error(), but
245  * note that fs_error() is not actually called. Instead it results in the
246  * use of the error function defined for the particular VOP, in vn_ops_table[]:
247  *
248  * -   frlock, dispose, shrlock.
249  *
250  *
251  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
252  * NOTE: if any of these ops involve an OTW call with the stub FH, then
253  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
254  * to protect the security data in the servinfo4_t for the "parent"
255  * filesystem that contains the stub.
256  *
257  * - These VOPs should not trigger a mount, so that "ls -l" does not:
258  *       pathconf, getsecattr.
259  *
260  * - These VOPs would not make sense to trigger:
261  *       inactive, rwlock, rwunlock, fid, realvp.
262  */
263 const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
264 	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
265 	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
266 	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
267 	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
268 	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
269 	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
270 	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
271 	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
272 	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
273 	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
274 	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
275 	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
276 	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
277 	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
278 	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
279 	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
280 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
281 	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
282 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
283 	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
284 	VOPNAME_FRLOCK,		{ .error = fs_error },
285 	VOPNAME_DISPOSE,	{ .error = fs_error },
286 	VOPNAME_SHRLOCK,	{ .error = fs_error },
287 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
288 	NULL, NULL
289 };
290 
291 static void
292 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
293 {
294 	mutex_enter(&net->net_cnt_lock);
295 	net->net_refcnt++;
296 	ASSERT(net->net_refcnt != 0);
297 	mutex_exit(&net->net_cnt_lock);
298 }
299 
300 /*
301  * We need a safe way to decrement the refcnt whilst the
302  * lock is being held.
303  */
304 static void
305 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
306 {
307 	ASSERT(mutex_owned(&net->net_cnt_lock));
308 	ASSERT(net->net_refcnt != 0);
309 	net->net_refcnt--;
310 }
311 
312 static void
313 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
314 {
315 	mutex_enter(&net->net_cnt_lock);
316 	nfs4_ephemeral_tree_decr(net);
317 	mutex_exit(&net->net_cnt_lock);
318 }
319 
320 /*
321  * Trigger ops for stub vnodes; for mirror mounts, etc.
322  *
323  * The general idea is that a "triggering" op will first call
324  * nfs4_trigger_mount(), which will find out whether a mount has already
325  * been triggered.
326  *
327  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
328  * of the covering vfs.
329  *
330  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
331  * and again set newvp, as above.
332  *
333  * The triggering op may then re-issue the VOP by calling it on newvp.
334  *
335  * Note that some ops may perform custom action, and may or may not need
336  * to trigger a mount.
337  *
338  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
339  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
340  * and that would just recurse. Instead, we call the v4 op directly,
341  * by name.  This is OK, since we know that the vnode is for NFSv4,
342  * otherwise it couldn't be a stub.
343  *
344  */
345 
346 static int
347 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
348 {
349 	int error;
350 	vnode_t *newvp;
351 
352 	error = nfs4_trigger_mount(*vpp, &newvp);
353 	if (error)
354 		return (error);
355 
356 	/* Release the stub vnode, as we're losing the reference to it */
357 	VN_RELE(*vpp);
358 
359 	/* Give the caller the root vnode of the newly-mounted fs */
360 	*vpp = newvp;
361 
362 	/* return with VN_HELD(newvp) */
363 	return (VOP_OPEN(vpp, flag, cr, ct));
364 }
365 
366 /*
367  * For the majority of cases, nfs4_trigger_getattr() will not trigger
368  * a mount. However, if ATTR_TRIGGER is set, we are being informed
369  * that we need to force the mount before we attempt to determine
370  * the attributes. The intent is an atomic operation for security
371  * testing.
372  */
373 static int
374 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
375     caller_context_t *ct)
376 {
377 	int error;
378 
379 	if (flags & ATTR_TRIGGER) {
380 		vnode_t	*newvp;
381 
382 		error = nfs4_trigger_mount(vp, &newvp);
383 		if (error)
384 			return (error);
385 
386 		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
387 		VN_RELE(newvp);
388 	} else {
389 		error = nfs4_getattr(vp, vap, flags, cr, ct);
390 	}
391 
392 	return (error);
393 }
394 
395 static int
396 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
397 		caller_context_t *ct)
398 {
399 	int error;
400 	vnode_t *newvp;
401 
402 	error = nfs4_trigger_mount(vp, &newvp);
403 	if (error)
404 		return (error);
405 
406 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
407 	VN_RELE(newvp);
408 
409 	return (error);
410 }
411 
412 static int
413 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
414     caller_context_t *ct)
415 {
416 	int error;
417 	vnode_t *newvp;
418 
419 	error = nfs4_trigger_mount(vp, &newvp);
420 	if (error)
421 		return (error);
422 
423 	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
424 	VN_RELE(newvp);
425 
426 	return (error);
427 }
428 
429 static int
430 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
431     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
432     caller_context_t *ct, int *deflags, pathname_t *rpnp)
433 {
434 	int error;
435 	vnode_t *newdvp;
436 	rnode4_t *drp = VTOR4(dvp);
437 
438 	ASSERT(RP_ISSTUB(drp));
439 
440 	/* for now, we only support mirror-mounts */
441 	ASSERT(RP_ISSTUB_MIRRORMOUNT(drp));
442 
443 	/*
444 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
445 	 * that up. Instead, pass onto the regular op, regardless of whether
446 	 * we've triggered a mount.
447 	 */
448 	if (strcmp(nm, "..") == 0)
449 		return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
450 		    ct, deflags, rpnp));
451 
452 	error = nfs4_trigger_mount(dvp, &newdvp);
453 	if (error)
454 		return (error);
455 
456 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
457 	    deflags, rpnp);
458 	VN_RELE(newdvp);
459 
460 	return (error);
461 }
462 
463 static int
464 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
465     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
466     int flags, caller_context_t *ct, vsecattr_t *vsecp)
467 {
468 	int error;
469 	vnode_t *newdvp;
470 
471 	error = nfs4_trigger_mount(dvp, &newdvp);
472 	if (error)
473 		return (error);
474 
475 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
476 	    flags, ct, vsecp);
477 	VN_RELE(newdvp);
478 
479 	return (error);
480 }
481 
482 static int
483 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
484     int flags)
485 {
486 	int error;
487 	vnode_t *newdvp;
488 
489 	error = nfs4_trigger_mount(dvp, &newdvp);
490 	if (error)
491 		return (error);
492 
493 	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
494 	VN_RELE(newdvp);
495 
496 	return (error);
497 }
498 
499 static int
500 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
501     caller_context_t *ct, int flags)
502 {
503 	int error;
504 	vnode_t *newtdvp;
505 
506 	error = nfs4_trigger_mount(tdvp, &newtdvp);
507 	if (error)
508 		return (error);
509 
510 	/*
511 	 * We don't check whether svp is a stub. Let the NFSv4 code
512 	 * detect that error, and return accordingly.
513 	 */
514 	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
515 	VN_RELE(newtdvp);
516 
517 	return (error);
518 }
519 
520 static int
521 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
522     cred_t *cr, caller_context_t *ct, int flags)
523 {
524 	int error;
525 	vnode_t *newsdvp;
526 	rnode4_t *tdrp = VTOR4(tdvp);
527 
528 	/*
529 	 * We know that sdvp is a stub, otherwise we would not be here.
530 	 *
531 	 * If tdvp is also be a stub, there are two possibilities: it
532 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
533 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
534 	 *
535 	 * In the former case, just trigger sdvp, and treat tdvp as
536 	 * though it were not a stub.
537 	 *
538 	 * In the latter case, it might be a different stub for the
539 	 * same server fs as sdvp, or for a different server fs.
540 	 * Regardless, from the client perspective this would still
541 	 * be a cross-filesystem rename, and should not be allowed,
542 	 * so return EXDEV, without triggering either mount.
543 	 */
544 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
545 		return (EXDEV);
546 
547 	error = nfs4_trigger_mount(sdvp, &newsdvp);
548 	if (error)
549 		return (error);
550 
551 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
552 
553 	VN_RELE(newsdvp);
554 
555 	return (error);
556 }
557 
558 /* ARGSUSED */
559 static int
560 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
561     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
562 {
563 	int error;
564 	vnode_t *newdvp;
565 
566 	error = nfs4_trigger_mount(dvp, &newdvp);
567 	if (error)
568 		return (error);
569 
570 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
571 	VN_RELE(newdvp);
572 
573 	return (error);
574 }
575 
576 static int
577 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
578     caller_context_t *ct, int flags)
579 {
580 	int error;
581 	vnode_t *newdvp;
582 
583 	error = nfs4_trigger_mount(dvp, &newdvp);
584 	if (error)
585 		return (error);
586 
587 	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
588 	VN_RELE(newdvp);
589 
590 	return (error);
591 }
592 
593 static int
594 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
595     cred_t *cr, caller_context_t *ct, int flags)
596 {
597 	int error;
598 	vnode_t *newdvp;
599 
600 	error = nfs4_trigger_mount(dvp, &newdvp);
601 	if (error)
602 		return (error);
603 
604 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
605 	VN_RELE(newdvp);
606 
607 	return (error);
608 }
609 
610 static int
611 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
612     caller_context_t *ct)
613 {
614 	int error;
615 	vnode_t *newvp;
616 
617 	error = nfs4_trigger_mount(vp, &newvp);
618 	if (error)
619 		return (error);
620 
621 	error = VOP_READLINK(newvp, uiop, cr, ct);
622 	VN_RELE(newvp);
623 
624 	return (error);
625 }
626 
627 /* end of trigger vnode ops */
628 
629 
630 /*
631  * Mount upon a trigger vnode; for mirror-mounts, etc.
632  *
633  * The mount may have already occurred, via another thread. If not,
634  * assemble the location information - which may require fetching - and
635  * perform the mount.
636  *
637  * Sets newvp to be the root of the fs that is now covering vp. Note
638  * that we return with VN_HELD(*newvp).
639  *
640  * The caller is responsible for passing the VOP onto the covering fs.
641  */
642 static int
643 nfs4_trigger_mount(vnode_t *vp, vnode_t **newvpp)
644 {
645 	int			 error;
646 	vfs_t			*vfsp;
647 	rnode4_t		*rp = VTOR4(vp);
648 	mntinfo4_t		*mi = VTOMI4(vp);
649 	domount_args_t		*dma;
650 
651 	nfs4_ephemeral_tree_t	*net;
652 
653 	bool_t			must_unlock = FALSE;
654 	bool_t			is_building = FALSE;
655 
656 	cred_t			*zcred;
657 
658 	nfs4_trigger_globals_t	*ntg;
659 
660 	zone_t			*zone = curproc->p_zone;
661 
662 	ASSERT(RP_ISSTUB(rp));
663 
664 	/* for now, we only support mirror-mounts */
665 	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));
666 
667 	*newvpp = NULL;
668 
669 	/*
670 	 * Has the mount already occurred?
671 	 */
672 	error = vn_vfsrlock_wait(vp);
673 	if (error)
674 		goto done;
675 	vfsp = vn_mountedvfs(vp);
676 	if (vfsp != NULL) {
677 		/* the mount has already occurred */
678 		error = VFS_ROOT(vfsp, newvpp);
679 		if (!error) {
680 			/* need to update the reference time  */
681 			mutex_enter(&mi->mi_lock);
682 			if (mi->mi_ephemeral)
683 				mi->mi_ephemeral->ne_ref_time =
684 				    gethrestime_sec();
685 			mutex_exit(&mi->mi_lock);
686 		}
687 
688 		vn_vfsunlock(vp);
689 		goto done;
690 	}
691 	vn_vfsunlock(vp);
692 
693 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
694 	ASSERT(ntg != NULL);
695 
696 	mutex_enter(&mi->mi_lock);
697 
698 	/*
699 	 * We need to lock down the ephemeral tree.
700 	 */
701 	if (mi->mi_ephemeral_tree == NULL) {
702 		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
703 		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
704 		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
705 		net->net_refcnt = 1;
706 		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
707 		is_building = TRUE;
708 
709 		/*
710 		 * We need to add it to the zone specific list for
711 		 * automatic unmounting and harvesting of deadwood.
712 		 */
713 		mutex_enter(&ntg->ntg_forest_lock);
714 		if (ntg->ntg_forest != NULL)
715 			net->net_next = ntg->ntg_forest;
716 		ntg->ntg_forest = net;
717 		mutex_exit(&ntg->ntg_forest_lock);
718 
719 		/*
720 		 * No lock order confusion with mi_lock because no
721 		 * other node could have grabbed net_tree_lock.
722 		 */
723 		mutex_enter(&net->net_tree_lock);
724 		mi->mi_ephemeral_tree = net;
725 		net->net_mount = mi;
726 		mutex_exit(&mi->mi_lock);
727 	} else {
728 		net = mi->mi_ephemeral_tree;
729 		mutex_exit(&mi->mi_lock);
730 
731 		nfs4_ephemeral_tree_hold(net);
732 
733 		mutex_enter(&net->net_tree_lock);
734 
735 		/*
736 		 * We can only procede if the tree is neither locked
737 		 * nor being torn down.
738 		 */
739 		mutex_enter(&net->net_cnt_lock);
740 		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
741 			nfs4_ephemeral_tree_decr(net);
742 			mutex_exit(&net->net_cnt_lock);
743 			mutex_exit(&net->net_tree_lock);
744 
745 			return (EIO);
746 		}
747 		mutex_exit(&net->net_cnt_lock);
748 	}
749 
750 	mutex_enter(&net->net_cnt_lock);
751 	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
752 	mutex_exit(&net->net_cnt_lock);
753 
754 	must_unlock = TRUE;
755 
756 	dma = nfs4_trigger_domount_args_create(vp);
757 	if (dma == NULL) {
758 		error = EINVAL;
759 		goto done;
760 	}
761 
762 	/*
763 	 * Need to be root for this call to make mount work.
764 	 * Note that since we define mirror mounts to work
765 	 * for any user, we allow the mount to proceed. And
766 	 * we realize that the server will perform security
767 	 * checks to make sure that the client is allowed
768 	 * access. Finally, once the mount takes place,
769 	 * directory permissions will ensure that the
770 	 * content is secure.
771 	 */
772 	zcred = zone_get_kcred(getzoneid());
773 	ASSERT(zcred != NULL);
774 
775 	error = nfs4_trigger_domount(vp, dma, &vfsp, zcred);
776 	nfs4_trigger_domount_args_destroy(dma, vp);
777 
778 	crfree(zcred);
779 
780 	if (!error)
781 		error = VFS_ROOT(vfsp, newvpp);
782 done:
783 	if (must_unlock) {
784 		mutex_enter(&net->net_cnt_lock);
785 		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
786 		if (is_building)
787 			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
788 		nfs4_ephemeral_tree_decr(net);
789 		mutex_exit(&net->net_cnt_lock);
790 
791 		mutex_exit(&net->net_tree_lock);
792 	}
793 
794 	if (!error && (newvpp == NULL || *newvpp == NULL))
795 		error = ENOSYS;
796 
797 	return (error);
798 }
799 
800 /*
801  * Collect together both the generic & mount-type specific args.
802  */
803 static domount_args_t *
804 nfs4_trigger_domount_args_create(vnode_t *vp)
805 {
806 	int nointr;
807 	char *hostlist;
808 	servinfo4_t *svp;
809 	struct nfs_args *nargs, *nargs_head;
810 	enum clnt_stat status;
811 	ephemeral_servinfo_t *esi, *esi_first;
812 	domount_args_t *dma;
813 	mntinfo4_t *mi = VTOMI4(vp);
814 
815 	nointr = !(mi->mi_flags & MI4_INT);
816 	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
817 
818 	svp = mi->mi_curr_serv;
819 	/* check if the current server is responding */
820 	status = nfs4_trigger_ping_server(svp, nointr);
821 	if (status == RPC_SUCCESS) {
822 		esi_first = nfs4_trigger_esi_create(vp, svp);
823 		if (esi_first == NULL) {
824 			kmem_free(hostlist, MAXPATHLEN);
825 			return (NULL);
826 		}
827 
828 		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
829 
830 		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
831 	} else {
832 		/* current server did not respond */
833 		esi_first = NULL;
834 		nargs_head = NULL;
835 	}
836 	nargs = nargs_head;
837 
838 	/*
839 	 * NFS RO failover.
840 	 *
841 	 * If we have multiple servinfo4 structures, linked via sv_next,
842 	 * we must create one nfs_args for each, linking the nfs_args via
843 	 * nfs_ext_u.nfs_extB.next.
844 	 *
845 	 * We need to build a corresponding esi for each, too, but that is
846 	 * used solely for building nfs_args, and may be immediately
847 	 * discarded, as domount() requires the info from just one esi,
848 	 * but all the nfs_args.
849 	 *
850 	 * Currently, the NFS mount code will hang if not all servers
851 	 * requested are available. To avoid that, we need to ping each
852 	 * server, here, and remove it from the list if it is not
853 	 * responding. This has the side-effect of that server then
854 	 * being permanently unavailable for this failover mount, even if
855 	 * it recovers. That's unfortunate, but the best we can do until
856 	 * the mount code path is fixed.
857 	 */
858 
859 	/*
860 	 * If the current server was down, loop indefinitely until we find
861 	 * at least one responsive server.
862 	 */
863 	do {
864 		/* no locking needed for sv_next; it is only set at fs mount */
865 		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
866 			struct nfs_args *next;
867 
868 			/*
869 			 * nargs_head: the head of the nfs_args list
870 			 * nargs: the current tail of the list
871 			 * next: the newly-created element to be added
872 			 */
873 
874 			/*
875 			 * We've already tried the current server, above;
876 			 * if it was responding, we have already included it
877 			 * and it may now be ignored.
878 			 *
879 			 * Otherwise, try it again, since it may now have
880 			 * recovered.
881 			 */
882 			if (svp == mi->mi_curr_serv && esi_first != NULL)
883 				continue;
884 
885 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
886 			if (svp->sv_flags & SV4_NOTINUSE) {
887 				nfs_rw_exit(&svp->sv_lock);
888 				continue;
889 			}
890 			nfs_rw_exit(&svp->sv_lock);
891 
892 			/* check if the server is responding */
893 			status = nfs4_trigger_ping_server(svp, nointr);
894 			/* if the server did not respond, ignore it */
895 			if (status != RPC_SUCCESS)
896 				continue;
897 
898 			esi = nfs4_trigger_esi_create(vp, svp);
899 			if (esi == NULL)
900 				continue;
901 
902 			/*
903 			 * If the original current server (mi_curr_serv)
904 			 * was down when when we first tried it,
905 			 * (i.e. esi_first == NULL),
906 			 * we select this new server (svp) to be the server
907 			 * that we will actually contact (esi_first).
908 			 *
909 			 * Note that it's possible that mi_curr_serv == svp,
910 			 * if that mi_curr_serv was down but has now recovered.
911 			 */
912 			next = nfs4_trigger_nargs_create(mi, svp, esi);
913 			if (esi_first == NULL) {
914 				ASSERT(nargs == NULL);
915 				ASSERT(nargs_head == NULL);
916 				nargs_head = next;
917 				esi_first = esi;
918 				(void) strlcpy(hostlist,
919 				    esi_first->esi_hostname, MAXPATHLEN);
920 			} else {
921 				ASSERT(nargs_head != NULL);
922 				nargs->nfs_ext_u.nfs_extB.next = next;
923 				(void) strlcat(hostlist, ",", MAXPATHLEN);
924 				(void) strlcat(hostlist, esi->esi_hostname,
925 				    MAXPATHLEN);
926 				/* esi was only needed for hostname & nargs */
927 				nfs4_trigger_esi_destroy(esi, vp);
928 			}
929 
930 			nargs = next;
931 		}
932 
933 		/* if we've had no response at all, wait a second */
934 		if (esi_first == NULL)
935 			delay(drv_usectohz(1000000));
936 
937 	} while (esi_first == NULL);
938 	ASSERT(nargs_head != NULL);
939 
940 	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
941 	dma->dma_esi = esi_first;
942 	dma->dma_hostlist = hostlist;
943 	dma->dma_nargs = nargs_head;
944 
945 	return (dma);
946 }
947 
948 static void
949 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
950 {
951 	if (dma != NULL) {
952 		if (dma->dma_esi != NULL && vp != NULL)
953 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
954 
955 		if (dma->dma_hostlist != NULL)
956 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
957 
958 		if (dma->dma_nargs != NULL) {
959 			struct nfs_args *nargs = dma->dma_nargs;
960 
961 			do {
962 				struct nfs_args *next =
963 				    nargs->nfs_ext_u.nfs_extB.next;
964 
965 				nfs4_trigger_nargs_destroy(nargs);
966 				nargs = next;
967 			} while (nargs != NULL);
968 		}
969 
970 		kmem_free(dma, sizeof (domount_args_t));
971 	}
972 }
973 
974 /*
975  * The ephemeral_servinfo_t struct contains basic information we will need to
976  * perform the mount. Whilst the structure is generic across different
977  * types of ephemeral mount, the way we gather its contents differs.
978  */
979 static ephemeral_servinfo_t *
980 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp)
981 {
982 	ephemeral_servinfo_t *esi;
983 	rnode4_t *rp = VTOR4(vp);
984 
985 	ASSERT(RP_ISSTUB(rp));
986 
987 	/* Call the ephemeral type-specific routine */
988 	if (RP_ISSTUB_MIRRORMOUNT(rp))
989 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
990 	else
991 		esi = NULL;
992 
993 	/* for now, we only support mirror-mounts */
994 	ASSERT(esi != NULL);
995 
996 	return (esi);
997 }
998 
999 static void
1000 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1001 {
1002 	rnode4_t *rp = VTOR4(vp);
1003 
1004 	ASSERT(RP_ISSTUB(rp));
1005 
1006 	/* for now, we only support mirror-mounts */
1007 	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));
1008 
1009 	/* Currently, no need for an ephemeral type-specific routine */
1010 
1011 	/*
1012 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
1013 	 * and will be handled by nfs4_trigger_nargs_destroy().
1014 	 * We need only free the structure itself.
1015 	 */
1016 	if (esi != NULL)
1017 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
1018 }
1019 
1020 /*
1021  * Some of this may turn out to be common with other ephemeral types,
1022  * in which case it should be moved to nfs4_trigger_esi_create(), or a
1023  * common function called.
1024  */
1025 static ephemeral_servinfo_t *
1026 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
1027 {
1028 	char			*stubpath;
1029 	struct knetconfig	*sikncp, *svkncp;
1030 	struct netbuf		*bufp;
1031 	ephemeral_servinfo_t	*esi;
1032 
1033 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1034 
1035 	/* initially set to be our type of ephemeral mount; may be added to */
1036 	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
1037 
1038 	/*
1039 	 * We're copying info from the stub rnode's servinfo4, but
1040 	 * we must create new copies, not pointers, since this information
1041 	 * is to be associated with the new mount, which will be
1042 	 * unmounted (and its structures freed) separately
1043 	 */
1044 
1045 	/*
1046 	 * Sizes passed to kmem_[z]alloc here must match those freed
1047 	 * in nfs4_free_args()
1048 	 */
1049 
1050 	/*
1051 	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
1052 	 * is difficult to avoid: as we need to read svp to calculate the
1053 	 * sizes to be allocated.
1054 	 */
1055 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1056 
1057 	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1058 	(void) strcat(esi->esi_hostname, svp->sv_hostname);
1059 
1060 	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1061 	bufp = esi->esi_addr;
1062 	bufp->len = svp->sv_addr.len;
1063 	bufp->maxlen = svp->sv_addr.maxlen;
1064 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1065 	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1066 
1067 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1068 	sikncp = esi->esi_knconf;
1069 	svkncp = svp->sv_knconf;
1070 	sikncp->knc_semantics = svkncp->knc_semantics;
1071 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1072 	(void) strcat((char *)sikncp->knc_protofmly,
1073 	    (char *)svkncp->knc_protofmly);
1074 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1075 	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1076 	sikncp->knc_rdev = svkncp->knc_rdev;
1077 
1078 	/*
1079 	 * Used when AUTH_DH is negotiated.
1080 	 *
1081 	 * This is ephemeral mount-type specific, since it contains the
1082 	 * server's time-sync syncaddr.
1083 	 */
1084 	if (svp->sv_dhsec) {
1085 		struct netbuf *bufp;
1086 		sec_data_t *sdata;
1087 		dh_k4_clntdata_t *data;
1088 
1089 		sdata = svp->sv_dhsec;
1090 		data = (dh_k4_clntdata_t *)sdata->data;
1091 		ASSERT(sdata->rpcflavor == AUTH_DH);
1092 
1093 		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1094 		bufp->len = data->syncaddr.len;
1095 		bufp->maxlen = data->syncaddr.maxlen;
1096 		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1097 		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1098 		esi->esi_syncaddr = bufp;
1099 
1100 		if (data->netname != NULL) {
1101 			int nmlen = data->netnamelen;
1102 
1103 			/*
1104 			 * We need to copy from a dh_k4_clntdata_t
1105 			 * netname/netnamelen pair to a NUL-terminated
1106 			 * netname string suitable for putting in nfs_args,
1107 			 * where the latter has no netnamelen field.
1108 			 */
1109 			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1110 			bcopy(data->netname, esi->esi_netname, nmlen);
1111 		}
1112 	} else {
1113 		esi->esi_syncaddr = NULL;
1114 		esi->esi_netname = NULL;
1115 	}
1116 
1117 	stubpath = fn_path(VTOSV(vp)->sv_name);
1118 	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1119 	ASSERT(*stubpath == '.');
1120 	stubpath += 1;
1121 
1122 	/* for nfs_args->fh */
1123 	esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1;
1124 	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1125 	(void) strcat(esi->esi_path, svp->sv_path);
1126 	(void) strcat(esi->esi_path, stubpath);
1127 
1128 	stubpath -= 1;
1129 	/* stubpath allocated by fn_path() */
1130 	kmem_free(stubpath, strlen(stubpath) + 1);
1131 
1132 	nfs_rw_exit(&svp->sv_lock);
1133 
1134 	return (esi);
1135 }
1136 
1137 /*
1138  * Assemble the args, and call the generic VFS mount function to
1139  * finally perform the ephemeral mount.
1140  */
1141 static int
1142 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1143     cred_t *cr)
1144 {
1145 	struct mounta	*uap;
1146 	char		*mntpt, *orig_path, *path;
1147 	const char	*orig_mntpt;
1148 	int		retval;
1149 	int		mntpt_len;
1150 	int		spec_len;
1151 	zone_t		*zone = curproc->p_zone;
1152 	bool_t		has_leading_slash;
1153 
1154 	vfs_t			*stubvfsp = stubvp->v_vfsp;
1155 	ephemeral_servinfo_t	*esi = dma->dma_esi;
1156 	struct nfs_args		*nargs = dma->dma_nargs;
1157 
1158 	/* first, construct the mount point for the ephemeral mount */
1159 	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1160 	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1161 
1162 	if (*orig_path == '.')
1163 		orig_path++;
1164 
1165 	/*
1166 	 * Get rid of zone's root path
1167 	 */
1168 	if (zone != global_zone) {
1169 		/*
1170 		 * -1 for trailing '/' and -1 for EOS.
1171 		 */
1172 		if (strncmp(zone->zone_rootpath, orig_mntpt,
1173 		    zone->zone_rootpathlen - 1) == 0) {
1174 			orig_mntpt += (zone->zone_rootpathlen - 2);
1175 		}
1176 	}
1177 
1178 	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1179 	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1180 	(void) strcat(mntpt, orig_mntpt);
1181 	(void) strcat(mntpt, orig_path);
1182 
1183 	kmem_free(path, strlen(path) + 1);
1184 	path = esi->esi_path;
1185 	if (*path == '.')
1186 		path++;
1187 	if (path[0] == '/' && path[1] == '/')
1188 		path++;
1189 	has_leading_slash = (*path == '/');
1190 
1191 	spec_len = strlen(dma->dma_hostlist);
1192 	spec_len += strlen(path);
1193 
1194 	/* We are going to have to add this in */
1195 	if (!has_leading_slash)
1196 		spec_len++;
1197 
1198 	/* We need to get the ':' for dma_hostlist:esi_path */
1199 	spec_len++;
1200 
1201 	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1202 	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1203 	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1204 	    has_leading_slash ? "" : "/", path);
1205 
1206 	uap->dir = mntpt;
1207 
1208 	uap->flags = MS_SYSSPACE | MS_DATA;
1209 	/* fstype-independent mount options not covered elsewhere */
1210 	/* copy parent's mount(1M) "-m" flag */
1211 	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1212 		uap->flags |= MS_NOMNTTAB;
1213 
1214 	uap->fstype = MNTTYPE_NFS4;
1215 	uap->dataptr = (char *)nargs;
1216 	/* not needed for MS_SYSSPACE */
1217 	uap->datalen = 0;
1218 
1219 	/* use optptr to pass in extra mount options */
1220 	uap->flags |= MS_OPTIONSTR;
1221 	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1222 	if (uap->optptr == NULL) {
1223 		retval = EINVAL;
1224 		goto done;
1225 	}
1226 	/* domount() expects us to count the trailing NUL */
1227 	uap->optlen = strlen(uap->optptr) + 1;
1228 
1229 	retval = domount(NULL, uap, stubvp, cr, vfsp);
1230 	if (retval == 0)
1231 		VFS_RELE(*vfsp);
1232 done:
1233 	if (uap->optptr)
1234 		nfs4_trigger_destroy_mntopts(uap->optptr);
1235 
1236 	kmem_free(uap->spec, spec_len + 1);
1237 	kmem_free(uap, sizeof (struct mounta));
1238 	kmem_free(mntpt, mntpt_len + 1);
1239 
1240 	return (retval);
1241 }
1242 
1243 /*
1244  * Build an nfs_args structure for passing to domount().
1245  *
1246  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1247  * generic data - common to all ephemeral mount types - is read directly
1248  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1249  */
1250 static struct nfs_args *
1251 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1252     ephemeral_servinfo_t *esi)
1253 {
1254 	sec_data_t *secdata;
1255 	struct nfs_args *nargs;
1256 
1257 	/* setup the nfs args */
1258 	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1259 
1260 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1261 
1262 	nargs->addr = esi->esi_addr;
1263 
1264 	/* for AUTH_DH by negotiation */
1265 	if (esi->esi_syncaddr || esi->esi_netname) {
1266 		nargs->flags |= NFSMNT_SECURE;
1267 		nargs->syncaddr = esi->esi_syncaddr;
1268 		nargs->netname = esi->esi_netname;
1269 	}
1270 
1271 	nargs->flags |= NFSMNT_KNCONF;
1272 	nargs->knconf = esi->esi_knconf;
1273 	nargs->flags |= NFSMNT_HOSTNAME;
1274 	nargs->hostname = esi->esi_hostname;
1275 	nargs->fh = esi->esi_path;
1276 
1277 	/* general mount settings, all copied from parent mount */
1278 	mutex_enter(&mi->mi_lock);
1279 
1280 	if (!(mi->mi_flags & MI4_HARD))
1281 		nargs->flags |= NFSMNT_SOFT;
1282 
1283 	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
1284 	    NFSMNT_RETRANS;
1285 	nargs->wsize = mi->mi_stsize;
1286 	nargs->rsize = mi->mi_tsize;
1287 	nargs->timeo = mi->mi_timeo;
1288 	nargs->retrans = mi->mi_retrans;
1289 
1290 	if (mi->mi_flags & MI4_INT)
1291 		nargs->flags |= NFSMNT_INT;
1292 	if (mi->mi_flags & MI4_NOAC)
1293 		nargs->flags |= NFSMNT_NOAC;
1294 
1295 	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
1296 	    NFSMNT_ACDIRMAX;
1297 	nargs->acregmin = HR2SEC(mi->mi_acregmin);
1298 	nargs->acregmax = HR2SEC(mi->mi_acregmax);
1299 	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
1300 	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
1301 
1302 	if (mi->mi_flags & MI4_NOCTO)
1303 		nargs->flags |= NFSMNT_NOCTO;
1304 	if (mi->mi_flags & MI4_GRPID)
1305 		nargs->flags |= NFSMNT_GRPID;
1306 	if (mi->mi_flags & MI4_LLOCK)
1307 		nargs->flags |= NFSMNT_LLOCK;
1308 	if (mi->mi_flags & MI4_NOPRINT)
1309 		nargs->flags |= NFSMNT_NOPRINT;
1310 	if (mi->mi_flags & MI4_DIRECTIO)
1311 		nargs->flags |= NFSMNT_DIRECTIO;
1312 	if (mi->mi_flags & MI4_PUBLIC)
1313 		nargs->flags |= NFSMNT_PUBLIC;
1314 
1315 	mutex_exit(&mi->mi_lock);
1316 
1317 	/* add any specific flags for this type of ephemeral mount */
1318 	nargs->flags |= esi->esi_mount_flags;
1319 
1320 	/*
1321 	 * Security data & negotiation policy.
1322 	 *
1323 	 * We need to preserve the parent mount's preference for security
1324 	 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT.
1325 	 *
1326 	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
1327 	 * security flavour was requested, with data in sv_secdata, and that
1328 	 * no negotiation should occur. If this specified flavour fails, that's
1329 	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
1330 	 *
1331 	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
1332 	 * default flavour, in sv_secdata, but then negotiate a new flavour.
1333 	 * Possible flavours are recorded in an array in sv_secinfo, with
1334 	 * currently in-use flavour pointed to by sv_currsec.
1335 	 *
1336 	 * If sv_currsec is set, i.e. if negotiation has already occurred,
1337 	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
1338 	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
1339 	 */
1340 	if (svp->sv_flags & SV4_TRYSECDEFAULT) {
1341 		/* enable negotiation for ephemeral mount */
1342 		nargs->flags |= NFSMNT_SECDEFAULT;
1343 
1344 		/*
1345 		 * As a starting point for negotiation, copy parent
1346 		 * mount's negotiated flavour (sv_currsec) if available,
1347 		 * or its passed-in flavour (sv_secdata) if not.
1348 		 */
1349 		if (svp->sv_currsec != NULL)
1350 			secdata = copy_sec_data(svp->sv_currsec);
1351 		else if (svp->sv_secdata != NULL)
1352 			secdata = copy_sec_data(svp->sv_secdata);
1353 		else
1354 			secdata = NULL;
1355 	} else {
1356 		/* do not enable negotiation; copy parent's passed-in flavour */
1357 		if (svp->sv_secdata != NULL)
1358 			secdata = copy_sec_data(svp->sv_secdata);
1359 		else
1360 			secdata = NULL;
1361 	}
1362 
1363 	nfs_rw_exit(&svp->sv_lock);
1364 
1365 	nargs->flags |= NFSMNT_NEWARGS;
1366 	nargs->nfs_args_ext = NFS_ARGS_EXTB;
1367 	nargs->nfs_ext_u.nfs_extB.secdata = secdata;
1368 
1369 	/* for NFS RO failover; caller will set if necessary */
1370 	nargs->nfs_ext_u.nfs_extB.next = NULL;
1371 
1372 	return (nargs);
1373 }
1374 
1375 static void
1376 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
1377 {
1378 	/*
1379 	 * Either the mount failed, in which case the data is not needed, or
1380 	 * nfs4_mount() has either taken copies of what it needs or,
1381 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
1382 	 * whereby nfs4_free_args() will ignore it.
1383 	 */
1384 	nfs4_free_args(nargs);
1385 	kmem_free(nargs, sizeof (struct nfs_args));
1386 }
1387 
1388 /*
1389  * When we finally get into the mounting, we need to add this
1390  * node to the ephemeral tree.
1391  *
1392  * This is called from nfs4_mount().
1393  */
1394 int
1395 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
1396 {
1397 	mntinfo4_t		*mi_parent;
1398 	nfs4_ephemeral_t	*eph;
1399 	nfs4_ephemeral_tree_t	*net;
1400 
1401 	nfs4_ephemeral_t	*prior;
1402 	nfs4_ephemeral_t	*child;
1403 
1404 	nfs4_ephemeral_t	*peer;
1405 
1406 	nfs4_trigger_globals_t	*ntg;
1407 	zone_t			*zone = curproc->p_zone;
1408 
1409 	int			rc = 0;
1410 
1411 	mi_parent = VTOMI4(mvp);
1412 
1413 	/*
1414 	 * Get this before grabbing anything else!
1415 	 */
1416 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
1417 	if (!ntg->ntg_thread_started) {
1418 		nfs4_ephemeral_start_harvester(ntg);
1419 	}
1420 
1421 	mutex_enter(&mi_parent->mi_lock);
1422 	mutex_enter(&mi->mi_lock);
1423 
1424 	net = mi->mi_ephemeral_tree =
1425 	    mi_parent->mi_ephemeral_tree;
1426 
1427 	/*
1428 	 * If the mi_ephemeral_tree is NULL, then it
1429 	 * means that either the harvester or a manual
1430 	 * umount has cleared the tree out right before
1431 	 * we got here.
1432 	 *
1433 	 * There is nothing we can do here, so return
1434 	 * to the caller and let them decide whether they
1435 	 * try again.
1436 	 */
1437 	if (net == NULL) {
1438 		mutex_exit(&mi->mi_lock);
1439 		mutex_exit(&mi_parent->mi_lock);
1440 
1441 		return (EBUSY);
1442 	}
1443 
1444 	nfs4_ephemeral_tree_hold(net);
1445 
1446 	/*
1447 	 * We need to tack together the ephemeral mount
1448 	 * with this new mntinfo.
1449 	 */
1450 	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
1451 	eph->ne_mount = mi;
1452 	eph->ne_ref_time = gethrestime_sec();
1453 
1454 	/*
1455 	 * We need to tell the ephemeral mount when
1456 	 * to time out.
1457 	 */
1458 	eph->ne_mount_to = ntg->ntg_mount_to;
1459 
1460 	mi->mi_flags |= MI4_EPHEMERAL;
1461 	mi->mi_ephemeral = eph;
1462 
1463 	/*
1464 	 * If the enclosing mntinfo4 is also ephemeral,
1465 	 * then we need to point to its enclosing parent.
1466 	 * Else the enclosing mntinfo4 is the enclosing parent.
1467 	 *
1468 	 * We also need to weave this ephemeral node
1469 	 * into the tree.
1470 	 */
1471 	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
1472 		/*
1473 		 * We need to decide if we are
1474 		 * the root node of this branch
1475 		 * or if we are a sibling of this
1476 		 * branch.
1477 		 */
1478 		prior = mi_parent->mi_ephemeral;
1479 		if (prior == NULL) {
1480 			/*
1481 			 * Race condition, clean up, and
1482 			 * let caller handle mntinfo.
1483 			 */
1484 			mi->mi_flags &= ~MI4_EPHEMERAL;
1485 			mi->mi_ephemeral = NULL;
1486 			kmem_free(eph, sizeof (*eph));
1487 			rc = EBUSY;
1488 		} else {
1489 			if (prior->ne_child == NULL) {
1490 				prior->ne_child = eph;
1491 			} else {
1492 				child = prior->ne_child;
1493 
1494 				prior->ne_child = eph;
1495 				eph->ne_peer = child;
1496 
1497 				child->ne_prior = eph;
1498 			}
1499 
1500 			eph->ne_prior = prior;
1501 		}
1502 	} else {
1503 		/*
1504 		 * The parent mntinfo4 is the non-ephemeral
1505 		 * root of the ephemeral tree. We
1506 		 * need to decide if we are the root
1507 		 * node of that tree or if we are a
1508 		 * sibling of the root node.
1509 		 *
1510 		 * We are the root if there is no
1511 		 * other node.
1512 		 */
1513 		if (net->net_root == NULL) {
1514 			net->net_root = eph;
1515 		} else {
1516 			eph->ne_peer = peer = net->net_root;
1517 			ASSERT(peer != NULL);
1518 			net->net_root = eph;
1519 
1520 			peer->ne_prior = eph;
1521 		}
1522 
1523 		eph->ne_prior = NULL;
1524 	}
1525 
1526 	nfs4_ephemeral_tree_rele(net);
1527 
1528 	mutex_exit(&mi->mi_lock);
1529 	mutex_exit(&mi_parent->mi_lock);
1530 
1531 	return (rc);
1532 }
1533 
1534 /*
1535  * Commit the changes to the ephemeral tree for removing this node.
1536  */
1537 static void
1538 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
1539 {
1540 	nfs4_ephemeral_t	*e = eph;
1541 	nfs4_ephemeral_t	*peer;
1542 	nfs4_ephemeral_t	*prior;
1543 
1544 	peer = eph->ne_peer;
1545 	prior = e->ne_prior;
1546 
1547 	/*
1548 	 * If this branch root was not the
1549 	 * tree root, then we need to fix back pointers.
1550 	 */
1551 	if (prior) {
1552 		if (prior->ne_child == e) {
1553 			prior->ne_child = peer;
1554 		} else {
1555 			prior->ne_peer = peer;
1556 		}
1557 
1558 		if (peer)
1559 			peer->ne_prior = prior;
1560 	} else if (peer) {
1561 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
1562 		peer->ne_prior = NULL;
1563 	} else {
1564 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
1565 	}
1566 }
1567 
1568 /*
1569  * We want to avoid recursion at all costs. So we need to
1570  * unroll the tree. We do this by a depth first traversal to
1571  * leaf nodes. We blast away the leaf and work our way back
1572  * up and down the tree.
1573  */
1574 static int
1575 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
1576     int isTreeRoot, int flag, cred_t *cr)
1577 {
1578 	nfs4_ephemeral_t	*e = eph;
1579 	nfs4_ephemeral_t	*prior;
1580 	mntinfo4_t		*mi;
1581 	vfs_t			*vfsp;
1582 	int			error;
1583 
1584 	/*
1585 	 * We use the loop while unrolling the ephemeral tree.
1586 	 */
1587 	for (;;) {
1588 		/*
1589 		 * First we walk down the child.
1590 		 */
1591 		if (e->ne_child) {
1592 			prior = e;
1593 			e = e->ne_child;
1594 			continue;
1595 		}
1596 
1597 		/*
1598 		 * If we are the root of the branch we are removing,
1599 		 * we end it here. But if the branch is the root of
1600 		 * the tree, we have to forge on. We do not consider
1601 		 * the peer list for the root because while it may
1602 		 * be okay to remove, it is both extra work and a
1603 		 * potential for a false-positive error to stall the
1604 		 * unmount attempt.
1605 		 */
1606 		if (e == eph && isTreeRoot == FALSE)
1607 			return (0);
1608 
1609 		/*
1610 		 * Next we walk down the peer list.
1611 		 */
1612 		if (e->ne_peer) {
1613 			prior = e;
1614 			e = e->ne_peer;
1615 			continue;
1616 		}
1617 
1618 		/*
1619 		 * We can only remove the node passed in by the
1620 		 * caller if it is the root of the ephemeral tree.
1621 		 * Otherwise, the caller will remove it.
1622 		 */
1623 		if (e == eph && isTreeRoot == FALSE)
1624 			return (0);
1625 
1626 		/*
1627 		 * Okay, we have a leaf node, time
1628 		 * to prune it!
1629 		 *
1630 		 * Note that prior can only be NULL if
1631 		 * and only if it is the root of the
1632 		 * ephemeral tree.
1633 		 */
1634 		prior = e->ne_prior;
1635 
1636 		mi = e->ne_mount;
1637 		mutex_enter(&mi->mi_lock);
1638 		vfsp = mi->mi_vfsp;
1639 
1640 		/*
1641 		 * Cleared by umount2_engine.
1642 		 */
1643 		VFS_HOLD(vfsp);
1644 
1645 		/*
1646 		 * Inform nfs4_unmount to not recursively
1647 		 * descend into this node's children when it
1648 		 * gets processed.
1649 		 */
1650 		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
1651 		mutex_exit(&mi->mi_lock);
1652 
1653 		error = umount2_engine(vfsp, flag, cr, FALSE);
1654 		if (error) {
1655 			/*
1656 			 * We need to reenable nfs4_unmount's ability
1657 			 * to recursively descend on this node.
1658 			 */
1659 			mutex_enter(&mi->mi_lock);
1660 			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
1661 			mutex_exit(&mi->mi_lock);
1662 
1663 			return (error);
1664 		}
1665 
1666 		/*
1667 		 * If we are the current node, we do not want to
1668 		 * touch anything else. At this point, the only
1669 		 * way the current node can have survived to here
1670 		 * is if it is the root of the ephemeral tree and
1671 		 * we are unmounting the enclosing mntinfo4.
1672 		 */
1673 		if (e == eph) {
1674 			ASSERT(prior == NULL);
1675 			return (0);
1676 		}
1677 
1678 		/*
1679 		 * Stitch up the prior node. Note that since
1680 		 * we have handled the root of the tree, prior
1681 		 * must be non-NULL.
1682 		 */
1683 		ASSERT(prior != NULL);
1684 		if (prior->ne_child == e) {
1685 			prior->ne_child = NULL;
1686 		} else {
1687 			ASSERT(prior->ne_peer == e);
1688 
1689 			prior->ne_peer = NULL;
1690 		}
1691 
1692 		e = prior;
1693 	}
1694 
1695 	/* NOTREACHED */
1696 }
1697 
1698 /*
1699  * Common code to safely release net_cnt_lock and net_tree_lock
1700  */
1701 void
1702 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
1703     nfs4_ephemeral_tree_t **pnet)
1704 {
1705 	nfs4_ephemeral_tree_t	*net = *pnet;
1706 
1707 	if (*pmust_unlock) {
1708 		mutex_enter(&net->net_cnt_lock);
1709 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
1710 		nfs4_ephemeral_tree_decr(net);
1711 		mutex_exit(&net->net_cnt_lock);
1712 
1713 		mutex_exit(&net->net_tree_lock);
1714 
1715 		*pmust_unlock = FALSE;
1716 	}
1717 }
1718 
1719 /*
1720  * While we may have removed any child or sibling nodes of this
1721  * ephemeral node, we can not nuke it until we know that there
1722  * were no actived vnodes on it. This will do that final
1723  * work once we know it is not busy.
1724  */
1725 void
1726 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
1727     nfs4_ephemeral_tree_t **pnet)
1728 {
1729 	/*
1730 	 * Now we need to get rid of the ephemeral data if it exists.
1731 	 */
1732 	mutex_enter(&mi->mi_lock);
1733 	if (mi->mi_ephemeral) {
1734 		/*
1735 		 * If we are the root node of an ephemeral branch
1736 		 * which is being removed, then we need to fixup
1737 		 * pointers into and out of the node.
1738 		 */
1739 		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
1740 			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
1741 
1742 		ASSERT(mi->mi_ephemeral != NULL);
1743 
1744 		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
1745 		mi->mi_ephemeral = NULL;
1746 	}
1747 	mutex_exit(&mi->mi_lock);
1748 
1749 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
1750 }
1751 
/*
 * Unmount an ephemeral node.
 *
 * mi		mntinfo4 being unmounted; either an ephemeral node or
 *		the mntinfo4 enclosing an ephemeral tree
 * flag, cr	handed on to nfs4_ephemeral_unmount_engine(); flag is
 *		also inspected for MS_SYSSPACE
 * pmust_unlock	set to TRUE on return only when this call acquired
 *		net_tree_lock and left it held; it is FALSE on every
 *		error return and after a deroot
 * pnet		set to mi->mi_ephemeral_tree as it was on entry
 *		(NULL when mi is not part of any ephemeral tree)
 *
 * Returns 0 on success (including when there is no tree), EBUSY when
 * another operation owns or has invalidated the tree, or the error
 * from nfs4_ephemeral_unmount_engine().
 */
int
nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
    bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
{
	int			error = 0;
	nfs4_ephemeral_t	*eph;
	nfs4_ephemeral_tree_t	*net;
	int			is_derooting = FALSE;
	int			is_recursed = FALSE;
	int			was_locked = FALSE;

	/*
	 * Make sure to set the default state for cleaning
	 * up the tree in the caller (and on the way out).
	 */
	*pmust_unlock = FALSE;

	/*
	 * The active vnodes on this file system may be ephemeral
	 * children. We need to check for and try to unmount them
	 * here. If any can not be unmounted, we are going
	 * to return EBUSY.
	 */
	mutex_enter(&mi->mi_lock);

	/*
	 * If an ephemeral tree, we need to check to see if
	 * the lock is already held. If it is, then we need
	 * to see if we are being called as a result of
	 * the recursive removal of some node of the tree or
	 * if we are another attempt to remove the tree.
	 *
	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
	 * node. mi_ephemeral being non-NULL also does this.
	 *
	 * mi_ephemeral_tree being non-NULL is sufficient
	 * to also indicate either it is an ephemeral node
	 * or the enclosing mntinfo4.
	 *
	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
	 * when we delete the ephemeral node and need to
	 * differentiate from an ephemeral node and the
	 * enclosing root node.
	 */
	*pnet = net = mi->mi_ephemeral_tree;
	if (net == NULL) {
		mutex_exit(&mi->mi_lock);
		return (0);
	}

	/*
	 * A mntinfo4 that belongs to a tree but has no ephemeral node
	 * of its own is treated as the enclosing mount, and unmounting
	 * it deroots the whole tree.
	 */
	eph = mi->mi_ephemeral;
	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
	is_derooting = (eph == NULL);

	/*
	 * If this is not recursion, then we need to
	 * grab a ref count.
	 *
	 * But wait, we also do not want to do that
	 * if a harvester thread has already grabbed
	 * the lock.
	 */
	if (!is_recursed) {
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status &
		    NFS4_EPHEMERAL_TREE_LOCKED) {
			/*
			 * If the tree is locked, we need
			 * to decide whether we are the
			 * harvester or some explicit call
			 * for a umount. The only way that
			 * we are the harvester is if
			 * MS_SYSSPACE is set.
			 *
			 * We only let the harvester through
			 * at this point.
			 *
			 * We return EBUSY so that the
			 * caller knows something is
			 * going on. Note that by that
			 * time, the umount in the other
			 * thread may have already occurred.
			 */
			if (!(flag & MS_SYSSPACE)) {
				mutex_exit(&net->net_cnt_lock);
				mutex_exit(&mi->mi_lock);

				return (EBUSY);
			}

			was_locked = TRUE;
		} else {
			net->net_refcnt++;
			ASSERT(net->net_refcnt != 0);
		}

		mutex_exit(&net->net_cnt_lock);
	}
	mutex_exit(&mi->mi_lock);

	/*
	 * If we are not the harvester, we need to check
	 * to see if we need to grab the tree lock.
	 */
	if (was_locked == FALSE) {
		/*
		 * If we grab the lock, it means that no other
		 * operation is working on the tree. If we don't
		 * grab it, we need to decide if this is because
		 * we are a recursive call or a new operation.
		 */
		if (mutex_tryenter(&net->net_tree_lock)) {
			*pmust_unlock = TRUE;
		} else {
			/*
			 * If we are a recursive call, we can
			 * proceed without the lock.
			 * Otherwise we have to wait until
			 * the lock becomes free.
			 */
			if (!is_recursed) {
				/*
				 * Do not bother waiting if the tree is
				 * already being derooted or is invalid;
				 * give back the count taken above.
				 */
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    (NFS4_EPHEMERAL_TREE_DEROOTING
				    | NFS4_EPHEMERAL_TREE_INVALID)) {
					nfs4_ephemeral_tree_decr(net);
					mutex_exit(&net->net_cnt_lock);
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);

				/*
				 * We can't hold any other locks whilst
				 * we wait on this to free up.
				 */
				mutex_enter(&net->net_tree_lock);

				/*
				 * Note that while mi->mi_ephemeral
				 * may change and thus we have to
				 * update eph, it is the case that
				 * we have tied down net and
				 * do not care if mi->mi_ephemeral_tree
				 * has changed.
				 */
				mutex_enter(&mi->mi_lock);
				eph = mi->mi_ephemeral;
				mutex_exit(&mi->mi_lock);

				/*
				 * Okay, we need to see if either the
				 * tree got nuked or the current node
				 * got nuked. Both of which will cause
				 * an error.
				 *
				 * Note that a subsequent retry of the
				 * umount shall work.
				 */
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    NFS4_EPHEMERAL_TREE_INVALID ||
				    (!is_derooting && eph == NULL)) {
					nfs4_ephemeral_tree_decr(net);
					mutex_exit(&net->net_cnt_lock);
					mutex_exit(&net->net_tree_lock);
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);
				*pmust_unlock = TRUE;
			}
		}
	}

	/*
	 * Only once we have grabbed the lock can we mark what we
	 * are planning on doing to the ephemeral tree.
	 */
	if (*pmust_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;

		/*
		 * Check to see if we are nuking the root.
		 */
		if (is_derooting)
			net->net_status |=
			    NFS4_EPHEMERAL_TREE_DEROOTING;
		mutex_exit(&net->net_cnt_lock);
	}

	if (!is_derooting) {
		/*
		 * Only work on children if the caller has not already
		 * done so.
		 */
		if (!is_recursed) {
			ASSERT(eph != NULL);

			error = nfs4_ephemeral_unmount_engine(eph,
			    FALSE, flag, cr);
			if (error)
				goto is_busy;
		}
	} else {
		eph = net->net_root;

		/*
		 * Only work if there is something there.
		 */
		if (eph) {
			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
			    flag, cr);
			if (error) {
				mutex_enter(&net->net_cnt_lock);
				net->net_status &=
				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
				mutex_exit(&net->net_cnt_lock);
				goto is_busy;
			}

			/*
			 * Nothing else which goes wrong will
			 * invalidate the blowing away of the
			 * ephemeral tree.
			 */
			net->net_root = NULL;
		}

		/*
		 * We have derooted and we have caused the tree to be
		 * invalidated.
		 */
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
		nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		if (was_locked == FALSE)
			mutex_exit(&net->net_tree_lock);

		/*
		 * We have just blown away any notation of this
		 * tree being locked. We can't let the caller
		 * try to clean things up.
		 */
		*pmust_unlock = FALSE;

		/*
		 * At this point, the tree should no
		 * longer be associated with the
		 * mntinfo4. We need to pull it off
		 * there and let the harvester take
		 * care of it once the refcnt drops.
		 */
		mutex_enter(&mi->mi_lock);
		mi->mi_ephemeral_tree = NULL;
		mutex_exit(&mi->mi_lock);
	}

	return (0);

is_busy:

	/* A no-op unless *pmust_unlock is still set at this point. */
	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);

	return (error);
}
2023 
2024 /*
2025  * Do the umount and record any error in the parent.
2026  */
2027 static void
2028 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2029     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2030 {
2031 	int	error;
2032 
2033 	error = umount2_engine(vfsp, flag, kcred, FALSE);
2034 	if (error) {
2035 		if (prior) {
2036 			if (prior->ne_child == e)
2037 				prior->ne_state |=
2038 				    NFS4_EPHEMERAL_CHILD_ERROR;
2039 			else
2040 				prior->ne_state |=
2041 				    NFS4_EPHEMERAL_PEER_ERROR;
2042 		}
2043 	}
2044 }
2045 
2046 /*
2047  * For each tree in the forest (where the forest is in
2048  * effect all of the ephemeral trees for this zone),
2049  * scan to see if a node can be unmounted. Note that
2050  * unlike nfs4_ephemeral_unmount_engine(), we do
2051  * not process the current node before children or
2052  * siblings. I.e., if a node can be unmounted, we
2053  * do not recursively check to see if the nodes
2054  * hanging off of it can also be unmounted.
2055  *
2056  * Instead, we delve down deep to try and remove the
2057  * children first. Then, because we share code with
2058  * nfs4_ephemeral_unmount_engine(), we will try
2059  * them again. This could be a performance issue in
2060  * the future.
2061  *
2062  * Also note that unlike nfs4_ephemeral_unmount_engine(),
2063  * we do not halt on an error. We will not remove the
2064  * current node, but we will keep on trying to remove
2065  * the others.
2066  *
2067  * force indicates that we want the unmount to occur
2068  * even if there is something blocking it.
2069  *
2070  * time_check indicates that we want to see if the
2071  * mount has expired past mount_to or not. Typically
2072  * we want to do this and only on a shutdown of the
2073  * zone would we want to ignore the check.
 *
 * ntg supplies the forest (ntg_forest) to walk. ntg_forest_lock
 * is held for the whole walk; each tree is additionally held and
 * has its net_tree_lock taken while it is being processed. Trees
 * that end up invalid with no other references are unlinked from
 * the forest and freed after the forest lock has been dropped.
2074  */
2075 static void
2076 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
2077     bool_t force, bool_t time_check)
2078 {
2079 	nfs4_ephemeral_tree_t	*net;
	/* last tree left on the forest list; used when unlinking net */
2080 	nfs4_ephemeral_tree_t	*prev = NULL;
2081 	nfs4_ephemeral_tree_t	*next;
2082 	nfs4_ephemeral_t	*e;
2083 	nfs4_ephemeral_t	*prior;
2084 	time_t			now = gethrestime_sec();
2085 
	/* local list of trees to be freed once the forest lock is dropped */
2086 	nfs4_ephemeral_tree_t	*harvest = NULL;
2087 
2088 	int			flag;
2089 
2090 	mntinfo4_t		*mi;
2091 	vfs_t			*vfsp;
2092 
2093 	if (force)
2094 		flag = MS_FORCE | MS_SYSSPACE;
2095 	else
2096 		flag = MS_SYSSPACE;
2097 
2098 	mutex_enter(&ntg->ntg_forest_lock);
2099 	for (net = ntg->ntg_forest; net != NULL; net = next) {
		/*
		 * Remember the successor now: net_next is rewritten
		 * below if this tree is moved to the harvest list.
		 */
2100 		next = net->net_next;
2101 
2102 		nfs4_ephemeral_tree_hold(net);
2103 
2104 		mutex_enter(&net->net_tree_lock);
2105 
2106 		/*
2107 		 * Let the unmount code know that the
2108 		 * tree is already locked!
2109 		 */
2110 		mutex_enter(&net->net_cnt_lock);
2111 		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2112 		mutex_exit(&net->net_cnt_lock);
2113 
2114 		/*
2115 		 * If the intent is force all ephemeral nodes to
2116 		 * be unmounted in this zone, we can short circuit a
2117 		 * lot of tree traversal and simply zap the root node.
2118 		 */
2119 		if (force) {
2120 			if (net->net_root) {
2121 				mi = net->net_root->ne_mount;
2122 				vfsp = mi->mi_vfsp;
2123 
2124 				/*
2125 				 * Cleared by umount2_engine.
2126 				 */
2127 				VFS_HOLD(vfsp);
2128 
2129 				(void) umount2_engine(vfsp, flag,
2130 				    kcred, FALSE);
2131 
2132 				goto check_done;
2133 			}
2134 		}
2135 
2136 		e = net->net_root;
2137 		if (e)
2138 			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2139 
		/*
		 * Walk the tree without recursion, using ne_state on
		 * each node to remember how far we have gotten with
		 * it: first descend into the child, then move across
		 * to the peer, then process the node itself and step
		 * back to ne_prior.
		 *
		 * NOTE(review): the error flags are OR'ed into
		 * ne_state (below and in
		 * nfs4_ephemeral_record_umount()), yet the branches
		 * here compare ne_state with ==. Whether the error
		 * branches can actually be taken depends on the flag
		 * values, which are defined elsewhere - TODO confirm.
		 */
2140 		while (e) {
2141 			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2142 				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2143 				if (e->ne_child) {
2144 					e = e->ne_child;
2145 					e->ne_state =
2146 					    NFS4_EPHEMERAL_VISIT_CHILD;
2147 				}
2148 
2149 				continue;
2150 			} else if (e->ne_state ==
2151 			    NFS4_EPHEMERAL_VISIT_SIBLING) {
2152 				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2153 				if (e->ne_peer) {
2154 					e = e->ne_peer;
2155 					e->ne_state =
2156 					    NFS4_EPHEMERAL_VISIT_CHILD;
2157 				}
2158 
2159 				continue;
2160 			} else if (e->ne_state ==
2161 			    NFS4_EPHEMERAL_CHILD_ERROR) {
2162 				prior = e->ne_prior;
2163 
2164 				/*
2165 				 * If a child reported an error, do
2166 				 * not bother trying to unmount.
2167 				 *
2168 				 * If your prior node is a parent,
2169 				 * pass the error up such that they
2170 				 * also do not try to unmount.
2171 				 *
2172 				 * However, if your prior is a sibling,
2173 				 * let them try to unmount if they can.
2174 				 */
2175 				if (prior) {
2176 					if (prior->ne_child == e)
2177 						prior->ne_state |=
2178 						    NFS4_EPHEMERAL_CHILD_ERROR;
2179 					else
2180 						prior->ne_state |=
2181 						    NFS4_EPHEMERAL_PEER_ERROR;
2182 				}
2183 
2184 				/*
2185 				 * Clear the error and if needed, process peers.
2186 				 *
2187 				 * Once we mask out the error, we know whether
2188 				 * or not we have to process another node.
2189 				 */
2190 				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2191 				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2192 					e = prior;
2193 
2194 				continue;
2195 			} else if (e->ne_state ==
2196 			    NFS4_EPHEMERAL_PEER_ERROR) {
2197 				prior = e->ne_prior;
2198 
				/*
				 * Hand the error on to the prior node.
				 *
				 * NOTE(review): unlike the CHILD_ERROR
				 * case above, this overwrites the prior
				 * node's state (=) instead of OR'ing the
				 * flag in (|=) - confirm this is intended.
				 */
2199 				if (prior) {
2200 					if (prior->ne_child == e)
2201 						prior->ne_state =
2202 						    NFS4_EPHEMERAL_CHILD_ERROR;
2203 					else
2204 						prior->ne_state =
2205 						    NFS4_EPHEMERAL_PEER_ERROR;
2206 				}
2207 
2208 				/*
2209 				 * Clear the error from this node and do the
2210 				 * correct processing.
2211 				 */
2212 				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2213 				continue;
2214 			}
2215 
			/*
			 * Pick up the way back before attempting the
			 * unmount below; the walk resumes from prior.
			 */
2216 			prior = e->ne_prior;
2217 			e->ne_state = NFS4_EPHEMERAL_OK;
2218 
2219 			/*
2220 			 * It must be the case that we need to process
2221 			 * this node.
2222 			 */
2223 			if (!time_check ||
2224 			    now - e->ne_ref_time > e->ne_mount_to) {
2225 				mi = e->ne_mount;
2226 				vfsp = mi->mi_vfsp;
2227 
2228 				/*
2229 				 * Cleared by umount2_engine.
2230 				 */
2231 				VFS_HOLD(vfsp);
2232 
2233 				/*
2234 				 * Note that we effectively work down to the
2235 				 * leaf nodes first, try to unmount them,
2236 				 * then work our way back up toward the
2237 				 * root.
2238 				 *
2239 				 * Also note that we deal with a lot of
2240 				 * complexity by sharing the work with
2241 				 * the manual unmount code.
2242 				 */
2243 				nfs4_ephemeral_record_umount(vfsp, flag,
2244 				    e, prior);
2245 			}
2246 
2247 			e = prior;
2248 		}
2249 
2250 check_done:
2251 
2252 		/*
2253 		 * At this point we are done processing this tree.
2254 		 *
2255 		 * If the tree is invalid and we are the only reference
2256 		 * to it, then we push it on the local linked list
2257 		 * to remove it at the end. We avoid that action now
2258 		 * to keep the tree processing going along at a fair clip.
2259 		 *
2260 		 * Else, even if we are the only reference, we drop
2261 		 * our hold on the current tree and allow it to be
2262 		 * reused as needed.
2263 		 */
2264 		mutex_enter(&net->net_cnt_lock);
2265 		if (net->net_refcnt == 1 &&
2266 		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
2267 			nfs4_ephemeral_tree_decr(net);
2268 			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
2269 			mutex_exit(&net->net_cnt_lock);
2270 			mutex_exit(&net->net_tree_lock);
2271 
			/*
			 * Unlink net from the forest and push it onto
			 * the harvest list. prev is deliberately left
			 * alone since net is no longer on the forest.
			 */
2272 			if (prev)
2273 				prev->net_next = net->net_next;
2274 			else
2275 				ntg->ntg_forest = net->net_next;
2276 
2277 			net->net_next = harvest;
2278 			harvest = net;
2279 			continue;
2280 		}
2281 
2282 		nfs4_ephemeral_tree_decr(net);
2283 		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
2284 		mutex_exit(&net->net_cnt_lock);
2285 		mutex_exit(&net->net_tree_lock);
2286 
2287 		prev = net;
2288 	}
2289 	mutex_exit(&ntg->ntg_forest_lock);
2290 
	/*
	 * With the forest lock dropped, tear down and free every
	 * tree that was set aside on the harvest list above.
	 */
2291 	for (net = harvest; net != NULL; net = next) {
2292 		next = net->net_next;
2293 
2294 		mutex_destroy(&net->net_tree_lock);
2295 		mutex_destroy(&net->net_cnt_lock);
2296 		kmem_free(net, sizeof (*net));
2297 	}
2298 }
2299 
2300 /*
2301  * This is the thread which decides when the harvesting
2302  * can proceed and when to kill it off for this zone.
2303  */
2304 static void
2305 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
2306 {
2307 	clock_t		timeleft;
2308 	zone_t		*zone = curproc->p_zone;
2309 
2310 	for (;;) {
2311 		timeleft = zone_status_timedwait(zone, lbolt +
2312 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
2313 
2314 		/*
2315 		 * zone is exiting...
2316 		 */
2317 		if (timeleft != -1) {
2318 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
2319 			zthread_exit();
2320 			/* NOTREACHED */
2321 		}
2322 
2323 		/*
2324 		 * Only bother scanning if there is potential
2325 		 * work to be done.
2326 		 */
2327 		if (ntg->ntg_forest == NULL)
2328 			continue;
2329 
2330 		/*
2331 		 * Now scan the list and get rid of everything which
2332 		 * is old.
2333 		 */
2334 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
2335 	}
2336 
2337 	/* NOTREACHED */
2338 }
2339 
2340 /*
2341  * The zone specific glue needed to start the unmount harvester.
2342  *
2343  * Note that we want to avoid holding the mutex as long as possible,
2344  * hence the multiple checks.
2345  *
2346  * The caller should avoid us getting down here in the first
2347  * place.
2348  */
2349 static void
2350 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
2351 {
2352 	/*
2353 	 * It got started before we got here...
2354 	 */
2355 	if (ntg->ntg_thread_started)
2356 		return;
2357 
2358 	mutex_enter(&nfs4_ephemeral_thread_lock);
2359 
2360 	if (ntg->ntg_thread_started) {
2361 		mutex_exit(&nfs4_ephemeral_thread_lock);
2362 		return;
2363 	}
2364 
2365 	/*
2366 	 * Start the unmounter harvester thread for this zone.
2367 	 */
2368 	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
2369 	    ntg, 0, minclsyspri);
2370 
2371 	ntg->ntg_thread_started = TRUE;
2372 	mutex_exit(&nfs4_ephemeral_thread_lock);
2373 }
2374 
2375 /*ARGSUSED*/
2376 static void *
2377 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
2378 {
2379 	nfs4_trigger_globals_t	*ntg;
2380 
2381 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
2382 	ntg->ntg_thread_started = FALSE;
2383 
2384 	/*
2385 	 * This is the default....
2386 	 */
2387 	ntg->ntg_mount_to = nfs4_trigger_thread_timer;
2388 
2389 	mutex_init(&ntg->ntg_forest_lock, NULL,
2390 	    MUTEX_DEFAULT, NULL);
2391 
2392 	return (ntg);
2393 }
2394 
2395 /*
2396  * Try a nice gentle walk down the forest and convince
2397  * all of the trees to gracefully give it up.
2398  */
2399 /*ARGSUSED*/
2400 static void
2401 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
2402 {
2403 	nfs4_trigger_globals_t	*ntg = arg;
2404 
2405 	if (!ntg)
2406 		return;
2407 
2408 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
2409 }
2410 
2411 /*
2412  * Race along the forest and rip all of the trees out by
2413  * their rootballs!
2414  */
2415 /*ARGSUSED*/
2416 static void
2417 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
2418 {
2419 	nfs4_trigger_globals_t	*ntg = arg;
2420 
2421 	if (!ntg)
2422 		return;
2423 
2424 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
2425 
2426 	mutex_destroy(&ntg->ntg_forest_lock);
2427 	kmem_free(ntg, sizeof (*ntg));
2428 }
2429 
2430 /*
2431  * This is the zone independent cleanup needed for
2432  * emphemeral mount processing.
2433  */
2434 void
2435 nfs4_ephemeral_fini(void)
2436 {
2437 	(void) zone_key_delete(nfs4_ephemeral_key);
2438 	mutex_destroy(&nfs4_ephemeral_thread_lock);
2439 }
2440 
2441 /*
2442  * This is the zone independent initialization needed for
2443  * emphemeral mount processing.
2444  */
2445 void
2446 nfs4_ephemeral_init(void)
2447 {
2448 	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
2449 	    NULL);
2450 
2451 	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
2452 	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
2453 }
2454 
2455 /*
2456  * nfssys() calls this function to set the per-zone
2457  * value of mount_to to drive when an ephemeral mount is
2458  * timed out. Each mount will grab a copy of this value
2459  * when mounted.
2460  */
2461 void
2462 nfs4_ephemeral_set_mount_to(uint_t mount_to)
2463 {
2464 	nfs4_trigger_globals_t	*ntg;
2465 	zone_t			*zone = curproc->p_zone;
2466 
2467 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2468 
2469 	ntg->ntg_mount_to = mount_to;
2470 }
2471 
2472 /*
2473  * Walk the list of v4 mount options; if they are currently set in vfsp,
2474  * append them to a new comma-separated mount option string, and return it.
2475  *
2476  * Caller should free by calling nfs4_trigger_destroy_mntopts().
2477  */
2478 static char *
2479 nfs4_trigger_create_mntopts(vfs_t *vfsp)
2480 {
2481 	uint_t i;
2482 	char *mntopts;
2483 	struct vfssw *vswp;
2484 	mntopts_t *optproto;
2485 
2486 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
2487 
2488 	/* get the list of applicable mount options for v4; locks *vswp */
2489 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
2490 	optproto = &vswp->vsw_optproto;
2491 
2492 	for (i = 0; i < optproto->mo_count; i++) {
2493 		struct mntopt *mop = &optproto->mo_list[i];
2494 
2495 		if (mop->mo_flags & MO_EMPTY)
2496 			continue;
2497 
2498 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
2499 			kmem_free(mntopts, MAX_MNTOPT_STR);
2500 			vfs_unrefvfssw(vswp);
2501 			return (NULL);
2502 		}
2503 	}
2504 
2505 	vfs_unrefvfssw(vswp);
2506 
2507 	/*
2508 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
2509 	 * and it may only be passed via MS_OPTIONSTR, so we
2510 	 * must handle it here.
2511 	 *
2512 	 * Ideally, it would be in the list, but NFS does not specify its
2513 	 * own opt proto list, it uses instead the default one. Since
2514 	 * not all filesystems support extended attrs, it would not be
2515 	 * appropriate to add it there.
2516 	 */
2517 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
2518 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
2519 		kmem_free(mntopts, MAX_MNTOPT_STR);
2520 		return (NULL);
2521 	}
2522 
2523 	return (mntopts);
2524 }
2525 
2526 static void
2527 nfs4_trigger_destroy_mntopts(char *mntopts)
2528 {
2529 	if (mntopts)
2530 		kmem_free(mntopts, MAX_MNTOPT_STR);
2531 }
2532 
2533 /*
2534  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
2535  */
2536 static int
2537 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
2538 {
2539 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
2540 		return (EINVAL);
2541 
2542 	if (vfs_optionisset(vfsp, optname, NULL)) {
2543 		size_t mntoptslen = strlen(mntopts);
2544 		size_t optnamelen = strlen(optname);
2545 
2546 		/* +1 for ',', +1 for NUL */
2547 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
2548 			return (EOVERFLOW);
2549 
2550 		/* first or subsequent mount option? */
2551 		if (*mntopts != '\0')
2552 			(void) strcat(mntopts, ",");
2553 
2554 		(void) strcat(mntopts, optname);
2555 	}
2556 
2557 	return (0);
2558 }
2559 
2560 static enum clnt_stat
2561 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
2562 {
2563 	int retries, error;
2564 	uint_t max_msgsize;
2565 	enum clnt_stat status;
2566 	CLIENT *cl;
2567 	struct timeval timeout;
2568 
2569 	/* as per recov_newserver() */
2570 	max_msgsize = 0;
2571 	retries = 1;
2572 	timeout.tv_sec = 2;
2573 	timeout.tv_usec = 0;
2574 
2575 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM,
2576 	    NFS_V4, max_msgsize, retries, CRED(), &cl);
2577 	if (error)
2578 		return (RPC_FAILED);
2579 
2580 	if (nointr)
2581 		cl->cl_nosignal = TRUE;
2582 	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
2583 	    timeout);
2584 	if (nointr)
2585 		cl->cl_nosignal = FALSE;
2586 
2587 	AUTH_DESTROY(cl->cl_auth);
2588 	CLNT_DESTROY(cl);
2589 
2590 	return (status);
2591 }
2592