xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c (revision ab5a7454a6d76e82a121d74c74d5589cc3d37a8f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
29  * triggered from a "stub" rnode via a special set of vnodeops.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/vfs_opreg.h>
40 #include <sys/file.h>
41 #include <sys/filio.h>
42 #include <sys/uio.h>
43 #include <sys/buf.h>
44 #include <sys/mman.h>
45 #include <sys/pathname.h>
46 #include <sys/dirent.h>
47 #include <sys/debug.h>
48 #include <sys/vmsystm.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/swap.h>
52 #include <sys/errno.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/kmem.h>
56 #include <sys/mount.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathconf.h>
59 #include <sys/utsname.h>
60 #include <sys/dnlc.h>
61 #include <sys/acl.h>
62 #include <sys/systeminfo.h>
63 #include <sys/policy.h>
64 #include <sys/sdt.h>
65 #include <sys/list.h>
66 #include <sys/stat.h>
67 #include <sys/mntent.h>
68 #include <sys/priv.h>
69 
70 #include <rpc/types.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73 
74 #include <nfs/nfs.h>
75 #include <nfs/nfs_clnt.h>
76 #include <nfs/nfs_acl.h>
77 #include <nfs/lm.h>
78 #include <nfs/nfs4.h>
79 #include <nfs/nfs4_kprot.h>
80 #include <nfs/rnode4.h>
81 #include <nfs/nfs4_clnt.h>
82 #include <nfs/nfsid_map.h>
83 #include <nfs/nfs4_idmap_impl.h>
84 
85 #include <vm/hat.h>
86 #include <vm/as.h>
87 #include <vm/page.h>
88 #include <vm/pvn.h>
89 #include <vm/seg.h>
90 #include <vm/seg_map.h>
91 #include <vm/seg_kpm.h>
92 #include <vm/seg_vn.h>
93 
94 #include <fs/fs_subr.h>
95 
96 #include <sys/ddi.h>
97 #include <sys/int_fmtio.h>
98 
99 #include <sys/sunddi.h>
100 
101 #include <sys/priv_names.h>
102 
/* Zone keys that are defined outside this file and only declared here. */
extern zone_key_t	nfs4clnt_zone_key;
extern zone_key_t	nfsidmap_zone_key;

/*
 * The automatic unmounter thread stuff!
 *
 * NOTE(review): presumably the period of the harvester thread started by
 * nfs4_ephemeral_start_harvester(); the consumer is not visible in this
 * part of the file - confirm.
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Just a default....
 *
 * NOTE(review): presumably the initial value for ntg_mount_to in
 * nfs4_trigger_globals_t; units are not stated here - confirm.
 */
static uint_t nfs4_trigger_mount_to = 240;
115 
/*
 * Ephemeral-mount state looked up per zone: nfs4_trigger_mount() fetches
 * it with zone_getspecific(nfs4_ephemeral_key, zone).
 */
typedef struct nfs4_trigger_globals {
	/* held while ntg_forest is being updated */
	kmutex_t		ntg_forest_lock;
	/* NOTE(review): presumably a mount timeout - confirm with users */
	uint_t			ntg_mount_to;
	/* NOTE(review): presumably set once the harvester runs - confirm */
	int			ntg_thread_started;
	/* head of the list of ephemeral trees, chained through net_next */
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;
122 
/*
 * NOTE(review): not used in the visible part of this file; presumably
 * serializes harvester thread start-up - confirm.
 */
kmutex_t	nfs4_ephemeral_thread_lock;

/*
 * Zone key under which the per-zone nfs4_trigger_globals_t is stored;
 * nfs4_trigger_mount() passes it to zone_getspecific().
 */
zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
128 
129 /*
130  * Used for ephemeral mounts; contains data either duplicated from
131  * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
132  *
133  * It's intended that this structure is used solely for ephemeral
134  * mount-type specific data, for passing this data to
135  * nfs4_trigger_nargs_create().
136  */
typedef struct ephemeral_servinfo {
	/* server name; copied into the comma-separated host list */
	char			*esi_hostname;
	/*
	 * NOTE(review): the remaining fields are not read in the visible
	 * part of the file; the meanings suggested by their names (netname,
	 * path and its length, mount flags, server addresses, transport
	 * configuration) should be confirmed against the esi_create and
	 * nargs_create routines.
	 */
	char			*esi_netname;
	char			*esi_path;
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;
147 
148 /*
149  * Collect together the mount-type specific and generic data args.
150  */
typedef struct domount_args {
	/* esi of the one server that will actually be contacted */
	ephemeral_servinfo_t	*dma_esi;
	/* MAXPATHLEN-sized buffer of host names */
	char			*dma_hostlist; /* comma-sep. for RO failover */
	/* head of the nfs_args list, linked via nfs_ext_u.nfs_extB.next */
	struct nfs_args		*dma_nargs;
} domount_args_t;
156 
157 
/*
 * The vnode ops functions for a trigger stub vnode.
 *
 * These are forward declarations; the functions are wired into
 * nfs4_trigger_vnodeops_template[].
 *
 * NOTE(review): nfs4_trigger_cmp() is declared here but is neither in
 * the template nor defined in the part of the file seen during review;
 * confirm that it is still needed.
 */
static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
    caller_context_t *);
static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
    struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
    int *, pathname_t *);
static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
    enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
    vsecattr_t *);
static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
    int);
static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
    vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
189 
/*
 * Regular NFSv4 vnodeops that we need to reference directly.
 *
 * They are either installed unchanged in the stub vnodeops template or
 * called by name from the trigger ops, because issuing VOP_<op>() on a
 * stub vnode would re-enter the trigger op.
 */
extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
		    caller_context_t *);
extern void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
		    struct pathname *, int, vnode_t *, cred_t *,
		    caller_context_t *, int *, pathname_t *);
extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
		    caller_context_t *);
extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
		    caller_context_t *);
extern int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
extern int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
207 
/*
 * Forward declarations of the helpers that perform the ephemeral mount:
 * the mount entry points, then the constructors and destructors for the
 * domount args, the ephemeral servinfo, the nfs_args and the mount
 * options, and finally the server ping routines.
 */
static int	nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
    cred_t *, vnode_t **);
static domount_args_t  *nfs4_trigger_domount_args_create(vnode_t *, cred_t *);
static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
    vnode_t *vp);
static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
    cred_t *);
static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
    servinfo4_t *);
static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
    cred_t *);
static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
    ephemeral_servinfo_t *);
static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
static char	*nfs4_trigger_create_mntopts(vfs_t *);
static void	nfs4_trigger_destroy_mntopts(char *);
static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
    struct netbuf *, int);

/* Defined outside this file. */
extern int	umount2_engine(vfs_t *, int, cred_t *, int);

/*
 * NOTE(review): presumably the vnodeops built from
 * nfs4_trigger_vnodeops_template[]; the initialization is not in the part
 * of the file seen during review - confirm.
 */
vnodeops_t *nfs4_trigger_vnodeops;
234 
235 /*
236  * These are the vnodeops that we must define for stub vnodes.
237  *
238  *
239  * Many of the VOPs defined for NFSv4 do not need to be defined here,
240  * for various reasons. This will result in the VFS default function being
241  * used:
242  *
243  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
244  *   lost the reference to the stub vnode, meaning these should not be called:
245  *       close, read, write, ioctl, readdir, seek.
246  *
247  * - These VOPs are meaningless for vnodes without data pages. Since the
248  *   stub vnode is of type VDIR, these should not be called:
249  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
250  *
251  * - These VOPs are otherwise not applicable, and should not be called:
252  *       dump, setsecattr.
253  *
254  *
255  * These VOPs we do not want to define, but nor do we want the VFS default
256  * action. Instead, we specify the VFS error function, with fs_error(), but
257  * note that fs_error() is not actually called. Instead it results in the
258  * use of the error function defined for the particular VOP, in vn_ops_table[]:
259  *
260  * -   frlock, dispose, shrlock.
261  *
262  *
263  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
264  * NOTE: if any of these ops involve an OTW call with the stub FH, then
265  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
266  * to protect the security data in the servinfo4_t for the "parent"
267  * filesystem that contains the stub.
268  *
269  * - These VOPs should not trigger a mount, so that "ls -l" does not:
270  *       pathconf, getsecattr.
271  *
272  * - These VOPs would not make sense to trigger:
273  *       inactive, rwlock, rwunlock, fid, realvp.
274  */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	/* stub-specific ops; all of them may call nfs4_trigger_mount() */
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	/* regular NFSv4 ops, applied to the stub itself without a mount */
	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	/* fs_error selects the per-VOP error function, not the default */
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	/* end-of-table marker */
	NULL, NULL
};
302 
303 static void
304 nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
305 {
306 	ASSERT(mutex_owned(&net->net_cnt_lock));
307 	net->net_refcnt++;
308 	ASSERT(net->net_refcnt != 0);
309 }
310 
311 static void
312 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
313 {
314 	mutex_enter(&net->net_cnt_lock);
315 	nfs4_ephemeral_tree_incr(net);
316 	mutex_exit(&net->net_cnt_lock);
317 }
318 
319 /*
320  * We need a safe way to decrement the refcnt whilst the
321  * lock is being held.
322  */
323 static void
324 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
325 {
326 	ASSERT(mutex_owned(&net->net_cnt_lock));
327 	ASSERT(net->net_refcnt != 0);
328 	net->net_refcnt--;
329 }
330 
331 static void
332 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
333 {
334 	mutex_enter(&net->net_cnt_lock);
335 	nfs4_ephemeral_tree_decr(net);
336 	mutex_exit(&net->net_cnt_lock);
337 }
338 
339 /*
340  * Trigger ops for stub vnodes; for mirror mounts, etc.
341  *
342  * The general idea is that a "triggering" op will first call
343  * nfs4_trigger_mount(), which will find out whether a mount has already
344  * been triggered.
345  *
346  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
347  * of the covering vfs.
348  *
349  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
350  * and again set newvp, as above.
351  *
352  * The triggering op may then re-issue the VOP by calling it on newvp.
353  *
354  * Note that some ops may perform custom action, and may or may not need
355  * to trigger a mount.
356  *
357  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
358  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
359  * and that would just recurse. Instead, we call the v4 op directly,
360  * by name.  This is OK, since we know that the vnode is for NFSv4,
361  * otherwise it couldn't be a stub.
362  *
363  */
364 
365 static int
366 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
367 {
368 	int error;
369 	vnode_t *newvp;
370 
371 	error = nfs4_trigger_mount(*vpp, cr, &newvp);
372 	if (error)
373 		return (error);
374 
375 	/* Release the stub vnode, as we're losing the reference to it */
376 	VN_RELE(*vpp);
377 
378 	/* Give the caller the root vnode of the newly-mounted fs */
379 	*vpp = newvp;
380 
381 	/* return with VN_HELD(newvp) */
382 	return (VOP_OPEN(vpp, flag, cr, ct));
383 }
384 
385 void
386 nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
387 {
388 	uint_t mask;
389 	timespec_t now;
390 
391 	/*
392 	 * Set some attributes here for referrals.
393 	 */
394 	mask = vap->va_mask;
395 	bzero(vap, sizeof (struct vattr));
396 	vap->va_mask	= mask;
397 	vap->va_uid	= 0;
398 	vap->va_gid	= 0;
399 	vap->va_nlink	= 1;
400 	vap->va_size	= 1;
401 	gethrestime(&now);
402 	vap->va_atime	= now;
403 	vap->va_mtime	= now;
404 	vap->va_ctime	= now;
405 	vap->va_type	= VDIR;
406 	vap->va_mode	= 0555;
407 	vap->va_fsid	= vp->v_vfsp->vfs_dev;
408 	vap->va_rdev	= 0;
409 	vap->va_blksize	= MAXBSIZE;
410 	vap->va_nblocks	= 1;
411 	vap->va_seq	= 0;
412 }
413 
414 /*
415  * For the majority of cases, nfs4_trigger_getattr() will not trigger
416  * a mount. However, if ATTR_TRIGGER is set, we are being informed
417  * that we need to force the mount before we attempt to determine
418  * the attributes. The intent is an atomic operation for security
419  * testing.
420  *
421  * If we're not triggering a mount, we can still inquire about the
422  * actual attributes from the server in the mirror mount case,
423  * and will return manufactured attributes for a referral (see
424  * the 'create' branch of find_referral_stubvp()).
425  */
426 static int
427 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
428     caller_context_t *ct)
429 {
430 	int error;
431 
432 	if (flags & ATTR_TRIGGER) {
433 		vnode_t	*newvp;
434 
435 		error = nfs4_trigger_mount(vp, cr, &newvp);
436 		if (error)
437 			return (error);
438 
439 		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
440 		VN_RELE(newvp);
441 
442 	} else if (RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
443 
444 		error = nfs4_getattr(vp, vap, flags, cr, ct);
445 
446 	} else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
447 
448 		nfs4_fake_attrs(vp, vap);
449 		error = 0;
450 	}
451 
452 	return (error);
453 }
454 
455 static int
456 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
457 		caller_context_t *ct)
458 {
459 	int error;
460 	vnode_t *newvp;
461 
462 	error = nfs4_trigger_mount(vp, cr, &newvp);
463 	if (error)
464 		return (error);
465 
466 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
467 	VN_RELE(newvp);
468 
469 	return (error);
470 }
471 
472 static int
473 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
474     caller_context_t *ct)
475 {
476 	int error;
477 	vnode_t *newvp;
478 
479 	error = nfs4_trigger_mount(vp, cr, &newvp);
480 	if (error)
481 		return (error);
482 
483 	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
484 	VN_RELE(newvp);
485 
486 	return (error);
487 }
488 
489 static int
490 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
491     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
492     caller_context_t *ct, int *deflags, pathname_t *rpnp)
493 {
494 	int error;
495 	vnode_t *newdvp;
496 	rnode4_t *drp = VTOR4(dvp);
497 
498 	ASSERT(RP_ISSTUB(drp));
499 
500 	/*
501 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
502 	 * that up. Instead, pass onto the regular op, regardless of whether
503 	 * we've triggered a mount.
504 	 */
505 	if (strcmp(nm, "..") == 0)
506 		if (RP_ISSTUB_MIRRORMOUNT(drp)) {
507 			return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
508 			    ct, deflags, rpnp));
509 		} else if (RP_ISSTUB_REFERRAL(drp)) {
510 			/* Return the parent vnode */
511 			return (vtodv(dvp, vpp, cr, TRUE));
512 		}
513 
514 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
515 	if (error)
516 		return (error);
517 
518 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
519 	    deflags, rpnp);
520 	VN_RELE(newdvp);
521 
522 	return (error);
523 }
524 
525 static int
526 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
527     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
528     int flags, caller_context_t *ct, vsecattr_t *vsecp)
529 {
530 	int error;
531 	vnode_t *newdvp;
532 
533 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
534 	if (error)
535 		return (error);
536 
537 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
538 	    flags, ct, vsecp);
539 	VN_RELE(newdvp);
540 
541 	return (error);
542 }
543 
544 static int
545 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
546     int flags)
547 {
548 	int error;
549 	vnode_t *newdvp;
550 
551 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
552 	if (error)
553 		return (error);
554 
555 	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
556 	VN_RELE(newdvp);
557 
558 	return (error);
559 }
560 
561 static int
562 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
563     caller_context_t *ct, int flags)
564 {
565 	int error;
566 	vnode_t *newtdvp;
567 
568 	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
569 	if (error)
570 		return (error);
571 
572 	/*
573 	 * We don't check whether svp is a stub. Let the NFSv4 code
574 	 * detect that error, and return accordingly.
575 	 */
576 	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
577 	VN_RELE(newtdvp);
578 
579 	return (error);
580 }
581 
582 static int
583 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
584     cred_t *cr, caller_context_t *ct, int flags)
585 {
586 	int error;
587 	vnode_t *newsdvp;
588 	rnode4_t *tdrp = VTOR4(tdvp);
589 
590 	/*
591 	 * We know that sdvp is a stub, otherwise we would not be here.
592 	 *
593 	 * If tdvp is also be a stub, there are two possibilities: it
594 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
595 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
596 	 *
597 	 * In the former case, just trigger sdvp, and treat tdvp as
598 	 * though it were not a stub.
599 	 *
600 	 * In the latter case, it might be a different stub for the
601 	 * same server fs as sdvp, or for a different server fs.
602 	 * Regardless, from the client perspective this would still
603 	 * be a cross-filesystem rename, and should not be allowed,
604 	 * so return EXDEV, without triggering either mount.
605 	 */
606 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
607 		return (EXDEV);
608 
609 	error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
610 	if (error)
611 		return (error);
612 
613 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
614 
615 	VN_RELE(newsdvp);
616 
617 	return (error);
618 }
619 
620 /* ARGSUSED */
621 static int
622 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
623     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
624 {
625 	int error;
626 	vnode_t *newdvp;
627 
628 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
629 	if (error)
630 		return (error);
631 
632 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
633 	VN_RELE(newdvp);
634 
635 	return (error);
636 }
637 
638 static int
639 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
640     caller_context_t *ct, int flags)
641 {
642 	int error;
643 	vnode_t *newdvp;
644 
645 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
646 	if (error)
647 		return (error);
648 
649 	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
650 	VN_RELE(newdvp);
651 
652 	return (error);
653 }
654 
655 static int
656 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
657     cred_t *cr, caller_context_t *ct, int flags)
658 {
659 	int error;
660 	vnode_t *newdvp;
661 
662 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
663 	if (error)
664 		return (error);
665 
666 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
667 	VN_RELE(newdvp);
668 
669 	return (error);
670 }
671 
672 static int
673 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
674     caller_context_t *ct)
675 {
676 	int error;
677 	vnode_t *newvp;
678 
679 	error = nfs4_trigger_mount(vp, cr, &newvp);
680 	if (error)
681 		return (error);
682 
683 	error = VOP_READLINK(newvp, uiop, cr, ct);
684 	VN_RELE(newvp);
685 
686 	return (error);
687 }
688 
689 /* end of trigger vnode ops */
690 
691 /*
692  * See if the mount has already been done by another caller.
693  */
694 static int
695 nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
696     bool_t *was_mounted, vfs_t **vfsp)
697 {
698 	int		error;
699 	mntinfo4_t	*mi = VTOMI4(vp);
700 
701 	*was_mounted = FALSE;
702 
703 	error = vn_vfsrlock_wait(vp);
704 	if (error)
705 		return (error);
706 
707 	*vfsp = vn_mountedvfs(vp);
708 	if (*vfsp != NULL) {
709 		/* the mount has already occurred */
710 		error = VFS_ROOT(*vfsp, newvpp);
711 		if (!error) {
712 			/* need to update the reference time  */
713 			mutex_enter(&mi->mi_lock);
714 			if (mi->mi_ephemeral)
715 				mi->mi_ephemeral->ne_ref_time =
716 				    gethrestime_sec();
717 			mutex_exit(&mi->mi_lock);
718 
719 			*was_mounted = TRUE;
720 		}
721 	}
722 
723 	vn_vfsunlock(vp);
724 	return (0);
725 }
726 
727 /*
728  * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
729  *
730  * The mount may have already occurred, via another thread. If not,
731  * assemble the location information - which may require fetching - and
732  * perform the mount.
733  *
734  * Sets newvp to be the root of the fs that is now covering vp. Note
735  * that we return with VN_HELD(*newvp).
736  *
737  * The caller is responsible for passing the VOP onto the covering fs.
738  */
739 static int
740 nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
741 {
742 	int			 error;
743 	vfs_t			*vfsp;
744 	rnode4_t		*rp = VTOR4(vp);
745 	mntinfo4_t		*mi = VTOMI4(vp);
746 	domount_args_t		*dma;
747 
748 	nfs4_ephemeral_tree_t	*net;
749 
750 	bool_t			must_unlock = FALSE;
751 	bool_t			is_building = FALSE;
752 	bool_t			was_mounted = FALSE;
753 
754 	cred_t			*mcred = NULL;
755 
756 	nfs4_trigger_globals_t	*ntg;
757 
758 	zone_t			*zone = curproc->p_zone;
759 
760 	ASSERT(RP_ISSTUB(rp));
761 
762 	*newvpp = NULL;
763 
764 	/*
765 	 * Has the mount already occurred?
766 	 */
767 	error = nfs4_trigger_mounted_already(vp, newvpp,
768 	    &was_mounted, &vfsp);
769 	if (error || was_mounted)
770 		goto done;
771 
772 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
773 	ASSERT(ntg != NULL);
774 
775 	mutex_enter(&mi->mi_lock);
776 
777 	/*
778 	 * We need to lock down the ephemeral tree.
779 	 */
780 	if (mi->mi_ephemeral_tree == NULL) {
781 		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
782 		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
783 		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
784 		net->net_refcnt = 1;
785 		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
786 		is_building = TRUE;
787 
788 		/*
789 		 * We need to add it to the zone specific list for
790 		 * automatic unmounting and harvesting of deadwood.
791 		 */
792 		mutex_enter(&ntg->ntg_forest_lock);
793 		if (ntg->ntg_forest != NULL)
794 			net->net_next = ntg->ntg_forest;
795 		ntg->ntg_forest = net;
796 		mutex_exit(&ntg->ntg_forest_lock);
797 
798 		/*
799 		 * No lock order confusion with mi_lock because no
800 		 * other node could have grabbed net_tree_lock.
801 		 */
802 		mutex_enter(&net->net_tree_lock);
803 		mi->mi_ephemeral_tree = net;
804 		net->net_mount = mi;
805 		mutex_exit(&mi->mi_lock);
806 	} else {
807 		net = mi->mi_ephemeral_tree;
808 		nfs4_ephemeral_tree_hold(net);
809 
810 		mutex_exit(&mi->mi_lock);
811 
812 		mutex_enter(&net->net_tree_lock);
813 
814 		/*
815 		 * We can only procede if the tree is neither locked
816 		 * nor being torn down.
817 		 */
818 		mutex_enter(&net->net_cnt_lock);
819 		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
820 			nfs4_ephemeral_tree_decr(net);
821 			mutex_exit(&net->net_cnt_lock);
822 			mutex_exit(&net->net_tree_lock);
823 
824 			return (EIO);
825 		}
826 		mutex_exit(&net->net_cnt_lock);
827 	}
828 
829 	mutex_enter(&net->net_cnt_lock);
830 	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
831 	mutex_exit(&net->net_cnt_lock);
832 
833 	must_unlock = TRUE;
834 
835 	dma = nfs4_trigger_domount_args_create(vp, cr);
836 	if (dma == NULL) {
837 		error = EINVAL;
838 		goto done;
839 	}
840 
841 	/*
842 	 * Note that since we define mirror mounts to work
843 	 * for any user, we simply extend the privileges of
844 	 * the user's credentials to allow the mount to
845 	 * proceed.
846 	 */
847 	mcred = crdup(cr);
848 	if (mcred == NULL) {
849 		error = EINVAL;
850 		goto done;
851 	}
852 
853 	crset_zone_privall(mcred);
854 	if (is_system_labeled())
855 		(void) setpflags(NET_MAC_AWARE, 1, mcred);
856 
857 	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
858 	nfs4_trigger_domount_args_destroy(dma, vp);
859 
860 	DTRACE_PROBE2(nfs4clnt__func__referral__mount,
861 	    vnode_t *, vp, int, error);
862 
863 	crfree(mcred);
864 
865 done:
866 
867 	if (must_unlock) {
868 		mutex_enter(&net->net_cnt_lock);
869 		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
870 
871 		/*
872 		 * REFCNT: If we are the root of the tree, then we need
873 		 * to keep a reference because we malloced the tree and
874 		 * this is where we tied it to our mntinfo.
875 		 *
876 		 * If we are not the root of the tree, then our tie to
877 		 * the mntinfo occured elsewhere and we need to
878 		 * decrement the reference to the tree.
879 		 */
880 		if (is_building)
881 			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
882 		else
883 			nfs4_ephemeral_tree_decr(net);
884 		mutex_exit(&net->net_cnt_lock);
885 
886 		mutex_exit(&net->net_tree_lock);
887 	}
888 
889 	if (!error && (newvpp == NULL || *newvpp == NULL))
890 		error = ENOSYS;
891 
892 	return (error);
893 }
894 
895 /*
896  * Collect together both the generic & mount-type specific args.
897  */
898 static domount_args_t *
899 nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr)
900 {
901 	int nointr;
902 	char *hostlist;
903 	servinfo4_t *svp;
904 	struct nfs_args *nargs, *nargs_head;
905 	enum clnt_stat status;
906 	ephemeral_servinfo_t *esi, *esi_first;
907 	domount_args_t *dma;
908 	mntinfo4_t *mi = VTOMI4(vp);
909 
910 	nointr = !(mi->mi_flags & MI4_INT);
911 	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
912 
913 	svp = mi->mi_curr_serv;
914 	/* check if the current server is responding */
915 	status = nfs4_trigger_ping_server(svp, nointr);
916 	if (status == RPC_SUCCESS) {
917 		esi_first = nfs4_trigger_esi_create(vp, svp, cr);
918 		if (esi_first == NULL) {
919 			kmem_free(hostlist, MAXPATHLEN);
920 			return (NULL);
921 		}
922 
923 		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
924 
925 		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
926 	} else {
927 		/* current server did not respond */
928 		esi_first = NULL;
929 		nargs_head = NULL;
930 	}
931 	nargs = nargs_head;
932 
933 	/*
934 	 * NFS RO failover.
935 	 *
936 	 * If we have multiple servinfo4 structures, linked via sv_next,
937 	 * we must create one nfs_args for each, linking the nfs_args via
938 	 * nfs_ext_u.nfs_extB.next.
939 	 *
940 	 * We need to build a corresponding esi for each, too, but that is
941 	 * used solely for building nfs_args, and may be immediately
942 	 * discarded, as domount() requires the info from just one esi,
943 	 * but all the nfs_args.
944 	 *
945 	 * Currently, the NFS mount code will hang if not all servers
946 	 * requested are available. To avoid that, we need to ping each
947 	 * server, here, and remove it from the list if it is not
948 	 * responding. This has the side-effect of that server then
949 	 * being permanently unavailable for this failover mount, even if
950 	 * it recovers. That's unfortunate, but the best we can do until
951 	 * the mount code path is fixed.
952 	 */
953 
954 	/*
955 	 * If the current server was down, loop indefinitely until we find
956 	 * at least one responsive server.
957 	 */
958 	do {
959 		/* no locking needed for sv_next; it is only set at fs mount */
960 		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
961 			struct nfs_args *next;
962 
963 			/*
964 			 * nargs_head: the head of the nfs_args list
965 			 * nargs: the current tail of the list
966 			 * next: the newly-created element to be added
967 			 */
968 
969 			/*
970 			 * We've already tried the current server, above;
971 			 * if it was responding, we have already included it
972 			 * and it may now be ignored.
973 			 *
974 			 * Otherwise, try it again, since it may now have
975 			 * recovered.
976 			 */
977 			if (svp == mi->mi_curr_serv && esi_first != NULL)
978 				continue;
979 
980 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
981 			if (svp->sv_flags & SV4_NOTINUSE) {
982 				nfs_rw_exit(&svp->sv_lock);
983 				continue;
984 			}
985 			nfs_rw_exit(&svp->sv_lock);
986 
987 			/* check if the server is responding */
988 			status = nfs4_trigger_ping_server(svp, nointr);
989 			/* if the server did not respond, ignore it */
990 			if (status != RPC_SUCCESS)
991 				continue;
992 
993 			esi = nfs4_trigger_esi_create(vp, svp, cr);
994 			if (esi == NULL)
995 				continue;
996 
997 			/*
998 			 * If the original current server (mi_curr_serv)
999 			 * was down when when we first tried it,
1000 			 * (i.e. esi_first == NULL),
1001 			 * we select this new server (svp) to be the server
1002 			 * that we will actually contact (esi_first).
1003 			 *
1004 			 * Note that it's possible that mi_curr_serv == svp,
1005 			 * if that mi_curr_serv was down but has now recovered.
1006 			 */
1007 			next = nfs4_trigger_nargs_create(mi, svp, esi);
1008 			if (esi_first == NULL) {
1009 				ASSERT(nargs == NULL);
1010 				ASSERT(nargs_head == NULL);
1011 				nargs_head = next;
1012 				esi_first = esi;
1013 				(void) strlcpy(hostlist,
1014 				    esi_first->esi_hostname, MAXPATHLEN);
1015 			} else {
1016 				ASSERT(nargs_head != NULL);
1017 				nargs->nfs_ext_u.nfs_extB.next = next;
1018 				(void) strlcat(hostlist, ",", MAXPATHLEN);
1019 				(void) strlcat(hostlist, esi->esi_hostname,
1020 				    MAXPATHLEN);
1021 				/* esi was only needed for hostname & nargs */
1022 				nfs4_trigger_esi_destroy(esi, vp);
1023 			}
1024 
1025 			nargs = next;
1026 		}
1027 
1028 		/* if we've had no response at all, wait a second */
1029 		if (esi_first == NULL)
1030 			delay(drv_usectohz(1000000));
1031 
1032 	} while (esi_first == NULL);
1033 	ASSERT(nargs_head != NULL);
1034 
1035 	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
1036 	dma->dma_esi = esi_first;
1037 	dma->dma_hostlist = hostlist;
1038 	dma->dma_nargs = nargs_head;
1039 
1040 	return (dma);
1041 }
1042 
1043 static void
1044 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
1045 {
1046 	if (dma != NULL) {
1047 		if (dma->dma_esi != NULL && vp != NULL)
1048 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
1049 
1050 		if (dma->dma_hostlist != NULL)
1051 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
1052 
1053 		if (dma->dma_nargs != NULL) {
1054 			struct nfs_args *nargs = dma->dma_nargs;
1055 
1056 			do {
1057 				struct nfs_args *next =
1058 				    nargs->nfs_ext_u.nfs_extB.next;
1059 
1060 				nfs4_trigger_nargs_destroy(nargs);
1061 				nargs = next;
1062 			} while (nargs != NULL);
1063 		}
1064 
1065 		kmem_free(dma, sizeof (domount_args_t));
1066 	}
1067 }
1068 
1069 /*
1070  * The ephemeral_servinfo_t struct contains basic information we will need to
1071  * perform the mount. Whilst the structure is generic across different
1072  * types of ephemeral mount, the way we gather its contents differs.
1073  */
1074 static ephemeral_servinfo_t *
1075 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
1076 {
1077 	ephemeral_servinfo_t *esi;
1078 	rnode4_t *rp = VTOR4(vp);
1079 
1080 	ASSERT(RP_ISSTUB(rp));
1081 
1082 	/* Call the ephemeral type-specific routine */
1083 	if (RP_ISSTUB_MIRRORMOUNT(rp))
1084 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
1085 	else if (RP_ISSTUB_REFERRAL(rp))
1086 		esi = nfs4_trigger_esi_create_referral(vp, cr);
1087 	else
1088 		esi = NULL;
1089 	return (esi);
1090 }
1091 
1092 static void
1093 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1094 {
1095 	rnode4_t *rp = VTOR4(vp);
1096 
1097 	ASSERT(RP_ISSTUB(rp));
1098 
1099 	/* Currently, no need for an ephemeral type-specific routine */
1100 
1101 	/*
1102 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
1103 	 * and will be handled by nfs4_trigger_nargs_destroy().
1104 	 * We need only free the structure itself.
1105 	 */
1106 	if (esi != NULL)
1107 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
1108 }
1109 
1110 /*
1111  * Some of this may turn out to be common with other ephemeral types,
1112  * in which case it should be moved to nfs4_trigger_esi_create(), or a
1113  * common function called.
1114  */
1115 
1116 /*
1117  * Mirror mounts case - should have all data available
1118  */
1119 static ephemeral_servinfo_t *
1120 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
1121 {
1122 	char			*stubpath;
1123 	struct knetconfig	*sikncp, *svkncp;
1124 	struct netbuf		*bufp;
1125 	ephemeral_servinfo_t	*esi;
1126 
1127 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1128 
1129 	/* initially set to be our type of ephemeral mount; may be added to */
1130 	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
1131 
1132 	/*
1133 	 * We're copying info from the stub rnode's servinfo4, but
1134 	 * we must create new copies, not pointers, since this information
1135 	 * is to be associated with the new mount, which will be
1136 	 * unmounted (and its structures freed) separately
1137 	 */
1138 
1139 	/*
1140 	 * Sizes passed to kmem_[z]alloc here must match those freed
1141 	 * in nfs4_free_args()
1142 	 */
1143 
1144 	/*
1145 	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
1146 	 * is difficult to avoid: as we need to read svp to calculate the
1147 	 * sizes to be allocated.
1148 	 */
1149 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1150 
1151 	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1152 	(void) strcat(esi->esi_hostname, svp->sv_hostname);
1153 
1154 	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1155 	bufp = esi->esi_addr;
1156 	bufp->len = svp->sv_addr.len;
1157 	bufp->maxlen = svp->sv_addr.maxlen;
1158 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1159 	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1160 
1161 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1162 	sikncp = esi->esi_knconf;
1163 	svkncp = svp->sv_knconf;
1164 	sikncp->knc_semantics = svkncp->knc_semantics;
1165 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1166 	(void) strcat((char *)sikncp->knc_protofmly,
1167 	    (char *)svkncp->knc_protofmly);
1168 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1169 	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1170 	sikncp->knc_rdev = svkncp->knc_rdev;
1171 
1172 	/*
1173 	 * Used when AUTH_DH is negotiated.
1174 	 *
1175 	 * This is ephemeral mount-type specific, since it contains the
1176 	 * server's time-sync syncaddr.
1177 	 */
1178 	if (svp->sv_dhsec) {
1179 		struct netbuf *bufp;
1180 		sec_data_t *sdata;
1181 		dh_k4_clntdata_t *data;
1182 
1183 		sdata = svp->sv_dhsec;
1184 		data = (dh_k4_clntdata_t *)sdata->data;
1185 		ASSERT(sdata->rpcflavor == AUTH_DH);
1186 
1187 		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1188 		bufp->len = data->syncaddr.len;
1189 		bufp->maxlen = data->syncaddr.maxlen;
1190 		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1191 		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1192 		esi->esi_syncaddr = bufp;
1193 
1194 		if (data->netname != NULL) {
1195 			int nmlen = data->netnamelen;
1196 
1197 			/*
1198 			 * We need to copy from a dh_k4_clntdata_t
1199 			 * netname/netnamelen pair to a NUL-terminated
1200 			 * netname string suitable for putting in nfs_args,
1201 			 * where the latter has no netnamelen field.
1202 			 */
1203 			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1204 			bcopy(data->netname, esi->esi_netname, nmlen);
1205 		}
1206 	} else {
1207 		esi->esi_syncaddr = NULL;
1208 		esi->esi_netname = NULL;
1209 	}
1210 
1211 	stubpath = fn_path(VTOSV(vp)->sv_name);
1212 	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1213 	ASSERT(*stubpath == '.');
1214 	stubpath += 1;
1215 
1216 	/* for nfs_args->fh */
1217 	esi->esi_path_len = strlen(stubpath) + 1;
1218 	if (strcmp(svp->sv_path, "/") != 0)
1219 		esi->esi_path_len += strlen(svp->sv_path);
1220 	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1221 	if (strcmp(svp->sv_path, "/") != 0)
1222 		(void) strcat(esi->esi_path, svp->sv_path);
1223 	(void) strcat(esi->esi_path, stubpath);
1224 
1225 	stubpath -= 1;
1226 	/* stubpath allocated by fn_path() */
1227 	kmem_free(stubpath, strlen(stubpath) + 1);
1228 
1229 	nfs_rw_exit(&svp->sv_lock);
1230 
1231 	return (esi);
1232 }
1233 
1234 /*
1235  * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to
1236  * get network information required to do the mount call.
1237  */
1238 int
1239 nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
1240 {
1241 	door_arg_t	door_args;
1242 	door_handle_t	dh;
1243 	XDR		xdr;
1244 	refd_door_args_t *xdr_argsp;
1245 	refd_door_res_t  *orig_resp;
1246 	k_sigset_t	smask;
1247 	int		xdr_len = 0;
1248 	int 		res_len = 16; /* length of an ip adress */
1249 	int		orig_reslen = res_len;
1250 	int		error = 0;
1251 	struct nfsidmap_globals *nig;
1252 
1253 	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
1254 		return (ECONNREFUSED);
1255 
1256 	nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
1257 	ASSERT(nig != NULL);
1258 
1259 	mutex_enter(&nig->nfsidmap_daemon_lock);
1260 	dh = nig->nfsidmap_daemon_dh;
1261 	if (dh == NULL) {
1262 		mutex_exit(&nig->nfsidmap_daemon_lock);
1263 		cmn_err(CE_NOTE,
1264 		    "nfs4_callmapid: nfsmapid daemon not " \
1265 		    "running unable to resolve host name\n");
1266 		return (EINVAL);
1267 	}
1268 	door_ki_hold(dh);
1269 	mutex_exit(&nig->nfsidmap_daemon_lock);
1270 
1271 	xdr_len = xdr_sizeof(&(xdr_utf8string), server);
1272 
1273 	xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
1274 	xdr_argsp->xdr_len = xdr_len;
1275 	xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;
1276 
1277 	xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
1278 	    xdr_len, XDR_ENCODE);
1279 
1280 	if (!xdr_utf8string(&xdr, server)) {
1281 		kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1282 		door_ki_rele(dh);
1283 		return (1);
1284 	}
1285 
1286 	if (orig_reslen)
1287 		orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);
1288 
1289 	door_args.data_ptr = (char *)xdr_argsp;
1290 	door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
1291 	door_args.desc_ptr = NULL;
1292 	door_args.desc_num = 0;
1293 	door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
1294 	door_args.rsize = res_len;
1295 
1296 	sigintr(&smask, 1);
1297 	error = door_ki_upcall(dh, &door_args);
1298 	sigunintr(&smask);
1299 
1300 	door_ki_rele(dh);
1301 
1302 	kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1303 	if (error) {
1304 		kmem_free(orig_resp, orig_reslen);
1305 		/*
1306 		 * There is no door to connect to. The referral daemon
1307 		 * must not be running yet.
1308 		 */
1309 		cmn_err(CE_WARN,
1310 		    "nfsmapid not running cannot resolve host name");
1311 		goto out;
1312 	}
1313 
1314 	/*
1315 	 * If the results buffer passed back are not the same as
1316 	 * what was sent free the old buffer and use the new one.
1317 	 */
1318 	if (orig_resp && orig_reslen) {
1319 		refd_door_res_t *door_resp;
1320 
1321 		door_resp = (refd_door_res_t *)door_args.rbuf;
1322 		if ((void *)door_args.rbuf != orig_resp)
1323 			kmem_free(orig_resp, orig_reslen);
1324 		if (door_resp->res_status == 0) {
1325 			xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
1326 			    door_resp->xdr_len, XDR_DECODE);
1327 			bzero(resp, sizeof (struct nfs_fsl_info));
1328 			if (!xdr_nfs_fsl_info(&xdr, resp)) {
1329 				DTRACE_PROBE2(
1330 				    nfs4clnt__debug__referral__upcall__xdrfail,
1331 				    struct nfs_fsl_info *, resp,
1332 				    char *, "nfs4_callmapid");
1333 				error = EINVAL;
1334 			}
1335 		} else {
1336 			DTRACE_PROBE2(
1337 			    nfs4clnt__debug__referral__upcall__badstatus,
1338 			    int, door_resp->res_status,
1339 			    char *, "nfs4_callmapid");
1340 			error = door_resp->res_status;
1341 		}
1342 		kmem_free(door_args.rbuf, door_args.rsize);
1343 	}
1344 out:
1345 	DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
1346 	    char *, server, int, error);
1347 	return (error);
1348 }
1349 
1350 /*
1351  * Fetches the fs_locations attribute. Typically called
1352  * from a Replication/Migration/Referrals/Mirror-mount context
1353  *
1354  * Fills in the attributes in garp. The caller is assumed
1355  * to have allocated memory for garp.
1356  *
1357  * lock: if set do not lock s_recovlock and mi_recovlock mutex,
1358  *	 it's already done by caller. Otherwise lock these mutexes
1359  *	 before doing the rfs4call().
1360  *
1361  * Returns
1362  * 	1	 for success
1363  * 	0	 for failure
1364  */
1365 int
1366 nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
1367     cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
1368 {
1369 	COMPOUND4args_clnt args;
1370 	COMPOUND4res_clnt res;
1371 	nfs_argop4 *argop;
1372 	int argoplist_size = 3 * sizeof (nfs_argop4);
1373 	nfs4_server_t *sp = NULL;
1374 	int doqueue = 1;
1375 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1376 	int retval = 1;
1377 	struct nfs4_clnt *nfscl;
1378 
1379 	if (lock == TRUE)
1380 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1381 	else
1382 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
1383 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
1384 
1385 	sp = find_nfs4_server(mi);
1386 	if (lock == TRUE)
1387 		nfs_rw_exit(&mi->mi_recovlock);
1388 
1389 	if (sp != NULL)
1390 		mutex_exit(&sp->s_lock);
1391 
1392 	if (lock == TRUE) {
1393 		if (sp != NULL)
1394 			(void) nfs_rw_enter_sig(&sp->s_recovlock,
1395 			    RW_WRITER, 0);
1396 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1397 	} else {
1398 		if (sp != NULL) {
1399 			ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
1400 			    nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
1401 		}
1402 	}
1403 
1404 	/*
1405 	 * Do we want to do the setup for recovery here?
1406 	 *
1407 	 * We know that the server responded to a null ping a very
1408 	 * short time ago, and we know that we intend to do a
1409 	 * single stateless operation - we want to fetch attributes,
1410 	 * so we know we can't encounter errors about state.  If
1411 	 * something goes wrong with the GETATTR, like not being
1412 	 * able to get a response from the server or getting any
1413 	 * kind of FH error, we should fail the mount.
1414 	 *
1415 	 * We may want to re-visited this at a later time.
1416 	 */
1417 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
1418 
1419 	args.ctag = TAG_GETATTR_FSLOCATION;
1420 	/* PUTFH LOOKUP GETATTR */
1421 	args.array_len = 3;
1422 	args.array = argop;
1423 
1424 	/* 0. putfh file */
1425 	argop[0].argop = OP_CPUTFH;
1426 	argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1427 
1428 	/* 1. lookup name, can't be dotdot */
1429 	argop[1].argop = OP_CLOOKUP;
1430 	argop[1].nfs_argop4_u.opclookup.cname = nm;
1431 
1432 	/* 2. file attrs */
1433 	argop[2].argop = OP_GETATTR;
1434 	argop[2].nfs_argop4_u.opgetattr.attr_request =
1435 	    FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
1436 	    FATTR4_MOUNTED_ON_FILEID_MASK;
1437 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
1438 
1439 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1440 
1441 	if (lock == TRUE) {
1442 		nfs_rw_exit(&mi->mi_recovlock);
1443 		if (sp != NULL)
1444 			nfs_rw_exit(&sp->s_recovlock);
1445 	}
1446 
1447 	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1448 	nfscl->nfscl_stat.referrals.value.ui64++;
1449 	DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
1450 	    nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);
1451 
1452 	if (e.error != 0) {
1453 		if (sp != NULL)
1454 			nfs4_server_rele(sp);
1455 		kmem_free(argop, argoplist_size);
1456 		return (0);
1457 	}
1458 
1459 	/*
1460 	 * Check for all possible error conditions.
1461 	 * For valid replies without an ops array or for illegal
1462 	 * replies, return a failure.
1463 	 */
1464 	if (res.status != NFS4_OK || res.array_len < 3 ||
1465 	    res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
1466 		retval = 0;
1467 		goto exit;
1468 	}
1469 
1470 	/*
1471 	 * There isn't much value in putting the attributes
1472 	 * in the attr cache since fs_locations4 aren't
1473 	 * encountered very frequently, so just make them
1474 	 * available to the caller.
1475 	 */
1476 	*garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;
1477 
1478 	DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
1479 	    nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");
1480 
1481 	/* No fs_locations? -- return a failure */
1482 	if (garp->n4g_ext_res == NULL ||
1483 	    garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
1484 		retval = 0;
1485 		goto exit;
1486 	}
1487 
1488 	if (!garp->n4g_fsid_valid)
1489 		retval = 0;
1490 
1491 exit:
1492 	if (retval == 0) {
1493 		/* the call was ok but failed validating the call results */
1494 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1495 	} else {
1496 		ASSERT(callres != NULL);
1497 		*callres = res;
1498 	}
1499 
1500 	if (sp != NULL)
1501 		nfs4_server_rele(sp);
1502 	kmem_free(argop, argoplist_size);
1503 	return (retval);
1504 }
1505 
/*
 * Tunable to disable referral mounts: when set non-zero,
 * find_referral_stubvp() returns NULL before doing any other work.
 */
int nfs4_no_referrals = 0;
1508 
1509 /*
1510  * Returns NULL if the vnode cannot be created or found.
1511  */
1512 vnode_t *
1513 find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
1514 {
1515 	nfs_fh4 *stub_fh, *dfh;
1516 	nfs4_sharedfh_t *sfhp;
1517 	char *newfhval;
1518 	vnode_t *vp = NULL;
1519 	fattr4_mounted_on_fileid mnt_on_fileid;
1520 	nfs4_ga_res_t garp;
1521 	mntinfo4_t *mi;
1522 	COMPOUND4res_clnt callres;
1523 	hrtime_t t;
1524 
1525 	if (nfs4_no_referrals)
1526 		return (NULL);
1527 
1528 	/*
1529 	 * Get the mounted_on_fileid, unique on that server::fsid
1530 	 */
1531 	mi = VTOMI4(dvp);
1532 	if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
1533 	    &garp, &callres, FALSE) == 0)
1534 		return (NULL);
1535 	mnt_on_fileid = garp.n4g_mon_fid;
1536 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1537 
1538 	/*
1539 	 * Build a fake filehandle from the dir FH and the mounted_on_fileid
1540 	 */
1541 	dfh = &VTOR4(dvp)->r_fh->sfh_fh;
1542 	stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
1543 	stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
1544 	    sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
1545 	newfhval = stub_fh->nfs_fh4_val;
1546 
1547 	/* copy directory's file handle */
1548 	bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
1549 	stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
1550 	newfhval = newfhval + dfh->nfs_fh4_len;
1551 
1552 	/* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
1553 	bcopy((char *)&mnt_on_fileid, newfhval,
1554 	    sizeof (fattr4_mounted_on_fileid));
1555 	stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);
1556 
1557 	sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
1558 	kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
1559 	    sizeof (fattr4_mounted_on_fileid));
1560 	kmem_free(stub_fh, sizeof (nfs_fh4));
1561 	if (sfhp == NULL)
1562 		return (NULL);
1563 
1564 	t = gethrtime();
1565 	garp.n4g_va.va_type = VDIR;
1566 	vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
1567 	    cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
1568 
1569 	if (vp != NULL)
1570 		vp->v_type = VDIR;
1571 
1572 	sfh4_rele(&sfhp);
1573 	return (vp);
1574 }
1575 
1576 int
1577 nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1578 {
1579 	vnode_t *nvp;
1580 	rnode4_t *rp;
1581 
1582 	if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
1583 		return (EINVAL);
1584 
1585 	rp = VTOR4(nvp);
1586 	mutex_enter(&rp->r_statelock);
1587 	r4_stub_referral(rp);
1588 	mutex_exit(&rp->r_statelock);
1589 	dnlc_enter(dvp, nm, nvp);
1590 
1591 	if (*vpp != NULL)
1592 		VN_RELE(*vpp);	/* no longer need this vnode */
1593 
1594 	*vpp = nvp;
1595 
1596 	return (0);
1597 }
1598 
1599 /*
1600  * Fetch the location information and resolve the new server.
1601  * Caller needs to free up the XDR data which is returned.
1602  * Input: mount info, shared filehandle, nodename
1603  * Return: Index to the result or Error(-1)
1604  * Output: FsLocations Info, Resolved Server Info.
1605  */
1606 int
1607 nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
1608     char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
1609     struct nfs_fsl_info *fsloc)
1610 {
1611 	fs_location4 *fsp;
1612 	struct nfs_fsl_info nfsfsloc;
1613 	int ret, i, error;
1614 	nfs4_ga_res_t garp;
1615 	COMPOUND4res_clnt callres;
1616 	struct knetconfig *knc;
1617 
1618 	ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
1619 	if (ret == 0)
1620 		return (-1);
1621 
1622 	/*
1623 	 * As a lame attempt to figuring out if we're
1624 	 * handling a migration event or a referral,
1625 	 * look for rnodes with this fsid in the rnode
1626 	 * cache.
1627 	 *
1628 	 * If we can find one or more such rnodes, it
1629 	 * means we're handling a migration event and
1630 	 * we want to bail out in that case.
1631 	 */
1632 	if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
1633 		DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
1634 		    mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
1635 		    char *, "nfs4_process_referral");
1636 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1637 		return (-1);
1638 	}
1639 
1640 	/*
1641 	 * Find the first responsive server to mount.  When we find
1642 	 * one, fsp will point to it.
1643 	 */
1644 	for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {
1645 
1646 		fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
1647 		if (fsp->server_len == 0 || fsp->server_val == NULL)
1648 			continue;
1649 
1650 		error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
1651 		if (error != 0)
1652 			continue;
1653 
1654 		error = nfs4_ping_server_common(nfsfsloc.knconf,
1655 		    nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
1656 		if (error == RPC_SUCCESS)
1657 			break;
1658 
1659 		DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
1660 		    sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
1661 		    char *, "nfs4_process_referral");
1662 
1663 		(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1664 	}
1665 	knc = nfsfsloc.knconf;
1666 	if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
1667 	    (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
1668 		DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
1669 		    nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
1670 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1671 		return (-1);
1672 	}
1673 
1674 	/* Send the results back */
1675 	*fsloc = nfsfsloc;
1676 	*grp = garp;
1677 	*res = callres;
1678 	return (i);
1679 }
1680 
1681 /*
1682  * Referrals case - need to fetch referral data and then upcall to
1683  * user-level to get complete mount data.
1684  */
1685 static ephemeral_servinfo_t *
1686 nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
1687 {
1688 	struct knetconfig	*sikncp, *svkncp;
1689 	struct netbuf		*bufp;
1690 	ephemeral_servinfo_t	*esi;
1691 	vnode_t			*dvp;
1692 	rnode4_t		*drp;
1693 	fs_location4		*fsp;
1694 	struct nfs_fsl_info	nfsfsloc;
1695 	nfs4_ga_res_t		garp;
1696 	char			*p;
1697 	char			fn[MAXNAMELEN];
1698 	int			i, index = -1;
1699 	mntinfo4_t		*mi;
1700 	COMPOUND4res_clnt	callres;
1701 
1702 	/*
1703 	 * If we're passed in a stub vnode that
1704 	 * isn't a "referral" stub, bail out
1705 	 * and return a failure
1706 	 */
1707 	if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
1708 		return (NULL);
1709 
1710 	if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
1711 		return (NULL);
1712 
1713 	drp = VTOR4(dvp);
1714 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
1715 		VN_RELE(dvp);
1716 		return (NULL);
1717 	}
1718 
1719 	if (vtoname(vp, fn, MAXNAMELEN) != 0) {
1720 		nfs_rw_exit(&drp->r_rwlock);
1721 		VN_RELE(dvp);
1722 		return (NULL);
1723 	}
1724 
1725 	mi = VTOMI4(dvp);
1726 	index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
1727 	    &garp, &callres, &nfsfsloc);
1728 	nfs_rw_exit(&drp->r_rwlock);
1729 	VN_RELE(dvp);
1730 	if (index < 0)
1731 		return (NULL);
1732 
1733 	fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1734 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1735 
1736 	/* initially set to be our type of ephemeral mount; may be added to */
1737 	esi->esi_mount_flags = NFSMNT_REFERRAL;
1738 
1739 	esi->esi_hostname =
1740 	    kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
1741 	bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
1742 	    fsp->server_val->utf8string_len);
1743 	esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';
1744 
1745 	bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
1746 	bufp->len = nfsfsloc.addr->len;
1747 	bufp->maxlen = nfsfsloc.addr->maxlen;
1748 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1749 	bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
1750 	esi->esi_addr = bufp;
1751 
1752 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1753 	sikncp = esi->esi_knconf;
1754 
1755 	DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
1756 	    struct nfs_fsl_info *, &nfsfsloc,
1757 	    char *, "nfs4_trigger_esi_create_referral");
1758 
1759 	svkncp = nfsfsloc.knconf;
1760 	sikncp->knc_semantics = svkncp->knc_semantics;
1761 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1762 	(void) strlcat((char *)sikncp->knc_protofmly,
1763 	    (char *)svkncp->knc_protofmly, KNC_STRSIZE);
1764 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1765 	(void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
1766 	    KNC_STRSIZE);
1767 	sikncp->knc_rdev = svkncp->knc_rdev;
1768 
1769 	DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
1770 	    struct knetconfig *, sikncp,
1771 	    char *, "nfs4_trigger_esi_create_referral");
1772 
1773 	esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
1774 	bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
1775 	esi->esi_syncaddr = NULL;
1776 
1777 	esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1778 	esi->esi_path_len = MAXPATHLEN;
1779 	*p++ = '/';
1780 	for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1781 		component4 *comp;
1782 
1783 		comp = &fsp->rootpath.pathname4_val[i];
1784 		/* If no space, null the string and bail */
1785 		if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
1786 			goto err;
1787 		bcopy(comp->utf8string_val, p, comp->utf8string_len);
1788 		p += comp->utf8string_len;
1789 		*p++ = '/';
1790 	}
1791 	if (fsp->rootpath.pathname4_len != 0)
1792 		*(p - 1) = '\0';
1793 	else
1794 		*p = '\0';
1795 	p = esi->esi_path;
1796 	esi->esi_path = strdup(p);
1797 	esi->esi_path_len = strlen(p) + 1;
1798 	kmem_free(p, MAXPATHLEN);
1799 
1800 	/* Allocated in nfs4_process_referral() */
1801 	(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1802 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1803 
1804 	return (esi);
1805 err:
1806 	kmem_free(esi->esi_path, esi->esi_path_len);
1807 	kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
1808 	kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
1809 	kmem_free(esi->esi_addr, sizeof (struct netbuf));
1810 	kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
1811 	kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
1812 	kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
1813 	kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
1814 	kmem_free(esi, sizeof (ephemeral_servinfo_t));
1815 	(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1816 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1817 	return (NULL);
1818 }
1819 
1820 /*
1821  * Assemble the args, and call the generic VFS mount function to
1822  * finally perform the ephemeral mount.
1823  */
1824 static int
1825 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1826     cred_t *cr, vnode_t **newvpp)
1827 {
1828 	struct mounta	*uap;
1829 	char		*mntpt, *orig_path, *path;
1830 	const char	*orig_mntpt;
1831 	int		retval;
1832 	int		mntpt_len;
1833 	int		spec_len;
1834 	zone_t		*zone = curproc->p_zone;
1835 	bool_t		has_leading_slash;
1836 	int		i;
1837 
1838 	vfs_t			*stubvfsp = stubvp->v_vfsp;
1839 	ephemeral_servinfo_t	*esi = dma->dma_esi;
1840 	struct nfs_args		*nargs = dma->dma_nargs;
1841 
1842 	/* first, construct the mount point for the ephemeral mount */
1843 	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1844 	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1845 
1846 	if (*orig_path == '.')
1847 		orig_path++;
1848 
1849 	/*
1850 	 * Get rid of zone's root path
1851 	 */
1852 	if (zone != global_zone) {
1853 		/*
1854 		 * -1 for trailing '/' and -1 for EOS.
1855 		 */
1856 		if (strncmp(zone->zone_rootpath, orig_mntpt,
1857 		    zone->zone_rootpathlen - 1) == 0) {
1858 			orig_mntpt += (zone->zone_rootpathlen - 2);
1859 		}
1860 	}
1861 
1862 	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1863 	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1864 	(void) strcat(mntpt, orig_mntpt);
1865 	(void) strcat(mntpt, orig_path);
1866 
1867 	kmem_free(path, strlen(path) + 1);
1868 	path = esi->esi_path;
1869 	if (*path == '.')
1870 		path++;
1871 	if (path[0] == '/' && path[1] == '/')
1872 		path++;
1873 	has_leading_slash = (*path == '/');
1874 
1875 	spec_len = strlen(dma->dma_hostlist);
1876 	spec_len += strlen(path);
1877 
1878 	/* We are going to have to add this in */
1879 	if (!has_leading_slash)
1880 		spec_len++;
1881 
1882 	/* We need to get the ':' for dma_hostlist:esi_path */
1883 	spec_len++;
1884 
1885 	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1886 	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1887 	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1888 	    has_leading_slash ? "" : "/", path);
1889 
1890 	uap->dir = mntpt;
1891 
1892 	uap->flags = MS_SYSSPACE | MS_DATA;
1893 	/* fstype-independent mount options not covered elsewhere */
1894 	/* copy parent's mount(1M) "-m" flag */
1895 	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1896 		uap->flags |= MS_NOMNTTAB;
1897 
1898 	uap->fstype = MNTTYPE_NFS4;
1899 	uap->dataptr = (char *)nargs;
1900 	/* not needed for MS_SYSSPACE */
1901 	uap->datalen = 0;
1902 
1903 	/* use optptr to pass in extra mount options */
1904 	uap->flags |= MS_OPTIONSTR;
1905 	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1906 	if (uap->optptr == NULL) {
1907 		retval = EINVAL;
1908 		goto done;
1909 	}
1910 
1911 	/* domount() expects us to count the trailing NUL */
1912 	uap->optlen = strlen(uap->optptr) + 1;
1913 
1914 	/*
1915 	 * If we get EBUSY, we try again once to see if we can perform
1916 	 * the mount. We do this because of a spurious race condition.
1917 	 */
1918 	for (i = 0; i < 2; i++) {
1919 		int	error;
1920 		bool_t	was_mounted;
1921 
1922 		retval = domount(NULL, uap, stubvp, cr, vfsp);
1923 		if (retval == 0) {
1924 			retval = VFS_ROOT(*vfsp, newvpp);
1925 			VFS_RELE(*vfsp);
1926 			break;
1927 		} else if (retval != EBUSY) {
1928 			break;
1929 		}
1930 
1931 		/*
1932 		 * We might find it mounted by the other racer...
1933 		 */
1934 		error = nfs4_trigger_mounted_already(stubvp,
1935 		    newvpp, &was_mounted, vfsp);
1936 		if (error) {
1937 			goto done;
1938 		} else if (was_mounted) {
1939 			retval = 0;
1940 			break;
1941 		}
1942 	}
1943 
1944 done:
1945 	if (uap->optptr)
1946 		nfs4_trigger_destroy_mntopts(uap->optptr);
1947 
1948 	kmem_free(uap->spec, spec_len + 1);
1949 	kmem_free(uap, sizeof (struct mounta));
1950 	kmem_free(mntpt, mntpt_len + 1);
1951 
1952 	return (retval);
1953 }
1954 
1955 /*
1956  * Build an nfs_args structure for passing to domount().
1957  *
1958  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1959  * generic data - common to all ephemeral mount types - is read directly
1960  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1961  */
1962 static struct nfs_args *
1963 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1964     ephemeral_servinfo_t *esi)
1965 {
1966 	sec_data_t *secdata;
1967 	struct nfs_args *nargs;
1968 
1969 	/* setup the nfs args */
1970 	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1971 
1972 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1973 
1974 	nargs->addr = esi->esi_addr;
1975 
1976 	/* for AUTH_DH by negotiation */
1977 	if (esi->esi_syncaddr || esi->esi_netname) {
1978 		nargs->flags |= NFSMNT_SECURE;
1979 		nargs->syncaddr = esi->esi_syncaddr;
1980 		nargs->netname = esi->esi_netname;
1981 	}
1982 
1983 	nargs->flags |= NFSMNT_KNCONF;
1984 	nargs->knconf = esi->esi_knconf;
1985 	nargs->flags |= NFSMNT_HOSTNAME;
1986 	nargs->hostname = esi->esi_hostname;
1987 	nargs->fh = esi->esi_path;
1988 
1989 	/* general mount settings, all copied from parent mount */
1990 	mutex_enter(&mi->mi_lock);
1991 
1992 	if (!(mi->mi_flags & MI4_HARD))
1993 		nargs->flags |= NFSMNT_SOFT;
1994 
1995 	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
1996 	    NFSMNT_RETRANS;
1997 	nargs->wsize = mi->mi_stsize;
1998 	nargs->rsize = mi->mi_tsize;
1999 	nargs->timeo = mi->mi_timeo;
2000 	nargs->retrans = mi->mi_retrans;
2001 
2002 	if (mi->mi_flags & MI4_INT)
2003 		nargs->flags |= NFSMNT_INT;
2004 	if (mi->mi_flags & MI4_NOAC)
2005 		nargs->flags |= NFSMNT_NOAC;
2006 
2007 	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
2008 	    NFSMNT_ACDIRMAX;
2009 	nargs->acregmin = HR2SEC(mi->mi_acregmin);
2010 	nargs->acregmax = HR2SEC(mi->mi_acregmax);
2011 	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
2012 	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
2013 
2014 	/* add any specific flags for this type of ephemeral mount */
2015 	nargs->flags |= esi->esi_mount_flags;
2016 
2017 	if (mi->mi_flags & MI4_NOCTO)
2018 		nargs->flags |= NFSMNT_NOCTO;
2019 	if (mi->mi_flags & MI4_GRPID)
2020 		nargs->flags |= NFSMNT_GRPID;
2021 	if (mi->mi_flags & MI4_LLOCK)
2022 		nargs->flags |= NFSMNT_LLOCK;
2023 	if (mi->mi_flags & MI4_NOPRINT)
2024 		nargs->flags |= NFSMNT_NOPRINT;
2025 	if (mi->mi_flags & MI4_DIRECTIO)
2026 		nargs->flags |= NFSMNT_DIRECTIO;
2027 	if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
2028 		nargs->flags |= NFSMNT_PUBLIC;
2029 
2030 	/* Do some referral-specific option tweaking */
2031 	if (nargs->flags & NFSMNT_REFERRAL) {
2032 		nargs->flags &= ~NFSMNT_DORDMA;
2033 		nargs->flags |= NFSMNT_TRYRDMA;
2034 	}
2035 
2036 	mutex_exit(&mi->mi_lock);
2037 
2038 	/*
2039 	 * Security data & negotiation policy.
2040 	 *
2041 	 * For mirror mounts, we need to preserve the parent mount's
2042 	 * preference for security negotiation, translating SV4_TRYSECDEFAULT
2043 	 * to NFSMNT_SECDEFAULT if present.
2044 	 *
2045 	 * For referrals, we always want security negotiation and will
2046 	 * set NFSMNT_SECDEFAULT and we will not copy current secdata.
2047 	 * The reason is that we can't negotiate down from a parent's
2048 	 * Kerberos flavor to AUTH_SYS.
2049 	 *
2050 	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
2051 	 * security flavour was requested, with data in sv_secdata, and that
2052 	 * no negotiation should occur. If this specified flavour fails, that's
2053 	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
2054 	 *
2055 	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
2056 	 * default flavour, in sv_secdata, but then negotiate a new flavour.
2057 	 * Possible flavours are recorded in an array in sv_secinfo, with
2058 	 * currently in-use flavour pointed to by sv_currsec.
2059 	 *
2060 	 * If sv_currsec is set, i.e. if negotiation has already occurred,
2061 	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
2062 	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
2063 	 */
2064 	if (nargs->flags & NFSMNT_REFERRAL) {
2065 		/* enable negotiation for referral mount */
2066 		nargs->flags |= NFSMNT_SECDEFAULT;
2067 		secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
2068 		secdata->secmod = secdata->rpcflavor = AUTH_SYS;
2069 		secdata->data = NULL;
2070 	}
2071 
2072 	else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
2073 		/* enable negotiation for mirror mount */
2074 		nargs->flags |= NFSMNT_SECDEFAULT;
2075 
2076 		/*
2077 		 * As a starting point for negotiation, copy parent
2078 		 * mount's negotiated flavour (sv_currsec) if available,
2079 		 * or its passed-in flavour (sv_secdata) if not.
2080 		 */
2081 		if (svp->sv_currsec != NULL)
2082 			secdata = copy_sec_data(svp->sv_currsec);
2083 		else if (svp->sv_secdata != NULL)
2084 			secdata = copy_sec_data(svp->sv_secdata);
2085 		else
2086 			secdata = NULL;
2087 	} else {
2088 		/* do not enable negotiation; copy parent's passed-in flavour */
2089 		if (svp->sv_secdata != NULL)
2090 			secdata = copy_sec_data(svp->sv_secdata);
2091 		else
2092 			secdata = NULL;
2093 	}
2094 
2095 	nfs_rw_exit(&svp->sv_lock);
2096 
2097 	nargs->flags |= NFSMNT_NEWARGS;
2098 	nargs->nfs_args_ext = NFS_ARGS_EXTB;
2099 	nargs->nfs_ext_u.nfs_extB.secdata = secdata;
2100 
2101 	/* for NFS RO failover; caller will set if necessary */
2102 	nargs->nfs_ext_u.nfs_extB.next = NULL;
2103 
2104 	return (nargs);
2105 }
2106 
2107 static void
2108 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
2109 {
2110 	/*
2111 	 * Either the mount failed, in which case the data is not needed, or
2112 	 * nfs4_mount() has either taken copies of what it needs or,
2113 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
2114 	 * whereby nfs4_free_args() will ignore it.
2115 	 */
2116 	nfs4_free_args(nargs);
2117 	kmem_free(nargs, sizeof (struct nfs_args));
2118 }
2119 
2120 /*
2121  * When we finally get into the mounting, we need to add this
2122  * node to the ephemeral tree.
2123  *
2124  * This is called from nfs4_mount().
2125  */
2126 int
2127 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
2128 {
2129 	mntinfo4_t		*mi_parent;
2130 	nfs4_ephemeral_t	*eph;
2131 	nfs4_ephemeral_tree_t	*net;
2132 
2133 	nfs4_ephemeral_t	*prior;
2134 	nfs4_ephemeral_t	*child;
2135 
2136 	nfs4_ephemeral_t	*peer;
2137 
2138 	nfs4_trigger_globals_t	*ntg;
2139 	zone_t			*zone = curproc->p_zone;
2140 
2141 	int			rc = 0;
2142 
2143 	mi_parent = VTOMI4(mvp);
2144 
2145 	/*
2146 	 * Get this before grabbing anything else!
2147 	 */
2148 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2149 	if (!ntg->ntg_thread_started) {
2150 		nfs4_ephemeral_start_harvester(ntg);
2151 	}
2152 
2153 	mutex_enter(&mi_parent->mi_lock);
2154 	mutex_enter(&mi->mi_lock);
2155 
2156 	net = mi->mi_ephemeral_tree =
2157 	    mi_parent->mi_ephemeral_tree;
2158 
2159 	/*
2160 	 * If the mi_ephemeral_tree is NULL, then it
2161 	 * means that either the harvester or a manual
2162 	 * umount has cleared the tree out right before
2163 	 * we got here.
2164 	 *
2165 	 * There is nothing we can do here, so return
2166 	 * to the caller and let them decide whether they
2167 	 * try again.
2168 	 */
2169 	if (net == NULL) {
2170 		mutex_exit(&mi->mi_lock);
2171 		mutex_exit(&mi_parent->mi_lock);
2172 
2173 		return (EBUSY);
2174 	}
2175 
2176 	/*
2177 	 * We've just tied the mntinfo to the tree, so
2178 	 * now we bump the refcnt and hold it there until
2179 	 * this mntinfo is removed from the tree.
2180 	 */
2181 	nfs4_ephemeral_tree_hold(net);
2182 
2183 	/*
2184 	 * We need to tack together the ephemeral mount
2185 	 * with this new mntinfo.
2186 	 */
2187 	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
2188 	eph->ne_mount = mi;
2189 	eph->ne_ref_time = gethrestime_sec();
2190 
2191 	/*
2192 	 * We need to tell the ephemeral mount when
2193 	 * to time out.
2194 	 */
2195 	eph->ne_mount_to = ntg->ntg_mount_to;
2196 
2197 	mi->mi_ephemeral = eph;
2198 
2199 	/*
2200 	 * If the enclosing mntinfo4 is also ephemeral,
2201 	 * then we need to point to its enclosing parent.
2202 	 * Else the enclosing mntinfo4 is the enclosing parent.
2203 	 *
2204 	 * We also need to weave this ephemeral node
2205 	 * into the tree.
2206 	 */
2207 	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
2208 		/*
2209 		 * We need to decide if we are
2210 		 * the root node of this branch
2211 		 * or if we are a sibling of this
2212 		 * branch.
2213 		 */
2214 		prior = mi_parent->mi_ephemeral;
2215 		if (prior == NULL) {
2216 			/*
2217 			 * Race condition, clean up, and
2218 			 * let caller handle mntinfo.
2219 			 */
2220 			mi->mi_flags &= ~MI4_EPHEMERAL;
2221 			mi->mi_ephemeral = NULL;
2222 			kmem_free(eph, sizeof (*eph));
2223 			nfs4_ephemeral_tree_rele(net);
2224 			rc = EBUSY;
2225 		} else {
2226 			if (prior->ne_child == NULL) {
2227 				prior->ne_child = eph;
2228 			} else {
2229 				child = prior->ne_child;
2230 
2231 				prior->ne_child = eph;
2232 				eph->ne_peer = child;
2233 
2234 				child->ne_prior = eph;
2235 			}
2236 
2237 			eph->ne_prior = prior;
2238 		}
2239 	} else {
2240 		/*
2241 		 * The parent mntinfo4 is the non-ephemeral
2242 		 * root of the ephemeral tree. We
2243 		 * need to decide if we are the root
2244 		 * node of that tree or if we are a
2245 		 * sibling of the root node.
2246 		 *
2247 		 * We are the root if there is no
2248 		 * other node.
2249 		 */
2250 		if (net->net_root == NULL) {
2251 			net->net_root = eph;
2252 		} else {
2253 			eph->ne_peer = peer = net->net_root;
2254 			ASSERT(peer != NULL);
2255 			net->net_root = eph;
2256 
2257 			peer->ne_prior = eph;
2258 		}
2259 
2260 		eph->ne_prior = NULL;
2261 	}
2262 
2263 	mutex_exit(&mi->mi_lock);
2264 	mutex_exit(&mi_parent->mi_lock);
2265 
2266 	return (rc);
2267 }
2268 
2269 /*
2270  * Commit the changes to the ephemeral tree for removing this node.
2271  */
2272 static void
2273 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
2274 {
2275 	nfs4_ephemeral_t	*e = eph;
2276 	nfs4_ephemeral_t	*peer;
2277 	nfs4_ephemeral_t	*prior;
2278 
2279 	peer = eph->ne_peer;
2280 	prior = e->ne_prior;
2281 
2282 	/*
2283 	 * If this branch root was not the
2284 	 * tree root, then we need to fix back pointers.
2285 	 */
2286 	if (prior) {
2287 		if (prior->ne_child == e) {
2288 			prior->ne_child = peer;
2289 		} else {
2290 			prior->ne_peer = peer;
2291 		}
2292 
2293 		if (peer)
2294 			peer->ne_prior = prior;
2295 	} else if (peer) {
2296 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
2297 		peer->ne_prior = NULL;
2298 	} else {
2299 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
2300 	}
2301 }
2302 
2303 /*
2304  * We want to avoid recursion at all costs. So we need to
2305  * unroll the tree. We do this by a depth first traversal to
2306  * leaf nodes. We blast away the leaf and work our way back
2307  * up and down the tree.
2308  */
2309 static int
2310 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
2311     int isTreeRoot, int flag, cred_t *cr)
2312 {
2313 	nfs4_ephemeral_t	*e = eph;
2314 	nfs4_ephemeral_t	*prior;
2315 	mntinfo4_t		*mi;
2316 	vfs_t			*vfsp;
2317 	int			error;
2318 
2319 	/*
2320 	 * We use the loop while unrolling the ephemeral tree.
2321 	 */
2322 	for (;;) {
2323 		/*
2324 		 * First we walk down the child.
2325 		 */
2326 		if (e->ne_child) {
2327 			prior = e;
2328 			e = e->ne_child;
2329 			continue;
2330 		}
2331 
2332 		/*
2333 		 * If we are the root of the branch we are removing,
2334 		 * we end it here. But if the branch is the root of
2335 		 * the tree, we have to forge on. We do not consider
2336 		 * the peer list for the root because while it may
2337 		 * be okay to remove, it is both extra work and a
2338 		 * potential for a false-positive error to stall the
2339 		 * unmount attempt.
2340 		 */
2341 		if (e == eph && isTreeRoot == FALSE)
2342 			return (0);
2343 
2344 		/*
2345 		 * Next we walk down the peer list.
2346 		 */
2347 		if (e->ne_peer) {
2348 			prior = e;
2349 			e = e->ne_peer;
2350 			continue;
2351 		}
2352 
2353 		/*
2354 		 * We can only remove the node passed in by the
2355 		 * caller if it is the root of the ephemeral tree.
2356 		 * Otherwise, the caller will remove it.
2357 		 */
2358 		if (e == eph && isTreeRoot == FALSE)
2359 			return (0);
2360 
2361 		/*
2362 		 * Okay, we have a leaf node, time
2363 		 * to prune it!
2364 		 *
2365 		 * Note that prior can only be NULL if
2366 		 * and only if it is the root of the
2367 		 * ephemeral tree.
2368 		 */
2369 		prior = e->ne_prior;
2370 
2371 		mi = e->ne_mount;
2372 		mutex_enter(&mi->mi_lock);
2373 		vfsp = mi->mi_vfsp;
2374 
2375 		/*
2376 		 * Cleared by umount2_engine.
2377 		 */
2378 		VFS_HOLD(vfsp);
2379 
2380 		/*
2381 		 * Inform nfs4_unmount to not recursively
2382 		 * descend into this node's children when it
2383 		 * gets processed.
2384 		 */
2385 		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
2386 		mutex_exit(&mi->mi_lock);
2387 
2388 		error = umount2_engine(vfsp, flag, cr, FALSE);
2389 		if (error) {
2390 			/*
2391 			 * We need to reenable nfs4_unmount's ability
2392 			 * to recursively descend on this node.
2393 			 */
2394 			mutex_enter(&mi->mi_lock);
2395 			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
2396 			mutex_exit(&mi->mi_lock);
2397 
2398 			return (error);
2399 		}
2400 
2401 		/*
2402 		 * If we are the current node, we do not want to
2403 		 * touch anything else. At this point, the only
2404 		 * way the current node can have survived to here
2405 		 * is if it is the root of the ephemeral tree and
2406 		 * we are unmounting the enclosing mntinfo4.
2407 		 */
2408 		if (e == eph) {
2409 			ASSERT(prior == NULL);
2410 			return (0);
2411 		}
2412 
2413 		/*
2414 		 * Stitch up the prior node. Note that since
2415 		 * we have handled the root of the tree, prior
2416 		 * must be non-NULL.
2417 		 */
2418 		ASSERT(prior != NULL);
2419 		if (prior->ne_child == e) {
2420 			prior->ne_child = NULL;
2421 		} else {
2422 			ASSERT(prior->ne_peer == e);
2423 
2424 			prior->ne_peer = NULL;
2425 		}
2426 
2427 		e = prior;
2428 	}
2429 
2430 	/* NOTREACHED */
2431 }
2432 
2433 /*
2434  * Common code to safely release net_cnt_lock and net_tree_lock
2435  */
2436 void
2437 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
2438     nfs4_ephemeral_tree_t **pnet)
2439 {
2440 	nfs4_ephemeral_tree_t	*net = *pnet;
2441 
2442 	if (*pmust_unlock) {
2443 		mutex_enter(&net->net_cnt_lock);
2444 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
2445 		mutex_exit(&net->net_cnt_lock);
2446 
2447 		mutex_exit(&net->net_tree_lock);
2448 
2449 		*pmust_unlock = FALSE;
2450 	}
2451 }
2452 
2453 /*
2454  * While we may have removed any child or sibling nodes of this
2455  * ephemeral node, we can not nuke it until we know that there
2456  * were no actived vnodes on it. This will do that final
2457  * work once we know it is not busy.
2458  */
2459 void
2460 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
2461     nfs4_ephemeral_tree_t **pnet)
2462 {
2463 	/*
2464 	 * Now we need to get rid of the ephemeral data if it exists.
2465 	 */
2466 	mutex_enter(&mi->mi_lock);
2467 	if (mi->mi_ephemeral) {
2468 		/*
2469 		 * If we are the root node of an ephemeral branch
2470 		 * which is being removed, then we need to fixup
2471 		 * pointers into and out of the node.
2472 		 */
2473 		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
2474 			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
2475 
2476 		nfs4_ephemeral_tree_rele(*pnet);
2477 		ASSERT(mi->mi_ephemeral != NULL);
2478 
2479 		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
2480 		mi->mi_ephemeral = NULL;
2481 	}
2482 	mutex_exit(&mi->mi_lock);
2483 
2484 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2485 }
2486 
2487 /*
2488  * Unmount an ephemeral node.
2489  *
2490  * Note that if this code fails, then it must unlock.
2491  *
2492  * If it succeeds, then the caller must be prepared to do so.
2493  */
2494 int
2495 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
2496     bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
2497 {
2498 	int			error = 0;
2499 	nfs4_ephemeral_t	*eph;
2500 	nfs4_ephemeral_tree_t	*net;
2501 	int			is_derooting = FALSE;
2502 	int			is_recursed = FALSE;
2503 	int			was_locked = FALSE;
2504 
2505 	/*
2506 	 * Make sure to set the default state for cleaning
2507 	 * up the tree in the caller (and on the way out).
2508 	 */
2509 	*pmust_unlock = FALSE;
2510 
2511 	/*
2512 	 * The active vnodes on this file system may be ephemeral
2513 	 * children. We need to check for and try to unmount them
2514 	 * here. If any can not be unmounted, we are going
2515 	 * to return EBUSY.
2516 	 */
2517 	mutex_enter(&mi->mi_lock);
2518 
2519 	/*
2520 	 * If an ephemeral tree, we need to check to see if
2521 	 * the lock is already held. If it is, then we need
2522 	 * to see if we are being called as a result of
2523 	 * the recursive removal of some node of the tree or
2524 	 * if we are another attempt to remove the tree.
2525 	 *
2526 	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
2527 	 * node. mi_ephemeral being non-NULL also does this.
2528 	 *
2529 	 * mi_ephemeral_tree being non-NULL is sufficient
2530 	 * to also indicate either it is an ephemeral node
2531 	 * or the enclosing mntinfo4.
2532 	 *
2533 	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
2534 	 * when we delete the ephemeral node and need to
2535 	 * differentiate from an ephemeral node and the
2536 	 * enclosing root node.
2537 	 */
2538 	*pnet = net = mi->mi_ephemeral_tree;
2539 	if (net == NULL) {
2540 		mutex_exit(&mi->mi_lock);
2541 		return (0);
2542 	}
2543 
2544 	eph = mi->mi_ephemeral;
2545 	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
2546 	is_derooting = (eph == NULL);
2547 
2548 	mutex_enter(&net->net_cnt_lock);
2549 
2550 	/*
2551 	 * If this is not recursion, then we need to
2552 	 * check to see if a harvester thread has
2553 	 * already grabbed the lock.
2554 	 *
2555 	 * After we exit this branch, we may not
2556 	 * blindly return, we need to jump to
2557 	 * is_busy!
2558 	 */
2559 	if (!is_recursed) {
2560 		if (net->net_status &
2561 		    NFS4_EPHEMERAL_TREE_LOCKED) {
2562 			/*
2563 			 * If the tree is locked, we need
2564 			 * to decide whether we are the
2565 			 * harvester or some explicit call
2566 			 * for a umount. The only way that
2567 			 * we are the harvester is if
2568 			 * MS_SYSSPACE is set.
2569 			 *
2570 			 * We only let the harvester through
2571 			 * at this point.
2572 			 *
2573 			 * We return EBUSY so that the
2574 			 * caller knows something is
2575 			 * going on. Note that by that
2576 			 * time, the umount in the other
2577 			 * thread may have already occured.
2578 			 */
2579 			if (!(flag & MS_SYSSPACE)) {
2580 				mutex_exit(&net->net_cnt_lock);
2581 				mutex_exit(&mi->mi_lock);
2582 
2583 				return (EBUSY);
2584 			}
2585 
2586 			was_locked = TRUE;
2587 		}
2588 	}
2589 
2590 	mutex_exit(&net->net_cnt_lock);
2591 	mutex_exit(&mi->mi_lock);
2592 
2593 	/*
2594 	 * If we are not the harvester, we need to check
2595 	 * to see if we need to grab the tree lock.
2596 	 */
2597 	if (was_locked == FALSE) {
2598 		/*
2599 		 * If we grab the lock, it means that no other
2600 		 * operation is working on the tree. If we don't
2601 		 * grab it, we need to decide if this is because
2602 		 * we are a recursive call or a new operation.
2603 		 */
2604 		if (mutex_tryenter(&net->net_tree_lock)) {
2605 			*pmust_unlock = TRUE;
2606 		} else {
2607 			/*
2608 			 * If we are a recursive call, we can
2609 			 * proceed without the lock.
2610 			 * Otherwise we have to wait until
2611 			 * the lock becomes free.
2612 			 */
2613 			if (!is_recursed) {
2614 				mutex_enter(&net->net_cnt_lock);
2615 				if (net->net_status &
2616 				    (NFS4_EPHEMERAL_TREE_DEROOTING
2617 				    | NFS4_EPHEMERAL_TREE_INVALID)) {
2618 					mutex_exit(&net->net_cnt_lock);
2619 					goto is_busy;
2620 				}
2621 				mutex_exit(&net->net_cnt_lock);
2622 
2623 				/*
2624 				 * We can't hold any other locks whilst
2625 				 * we wait on this to free up.
2626 				 */
2627 				mutex_enter(&net->net_tree_lock);
2628 
2629 				/*
2630 				 * Note that while mi->mi_ephemeral
2631 				 * may change and thus we have to
2632 				 * update eph, it is the case that
2633 				 * we have tied down net and
2634 				 * do not care if mi->mi_ephemeral_tree
2635 				 * has changed.
2636 				 */
2637 				mutex_enter(&mi->mi_lock);
2638 				eph = mi->mi_ephemeral;
2639 				mutex_exit(&mi->mi_lock);
2640 
2641 				/*
2642 				 * Okay, we need to see if either the
2643 				 * tree got nuked or the current node
2644 				 * got nuked. Both of which will cause
2645 				 * an error.
2646 				 *
2647 				 * Note that a subsequent retry of the
2648 				 * umount shall work.
2649 				 */
2650 				mutex_enter(&net->net_cnt_lock);
2651 				if (net->net_status &
2652 				    NFS4_EPHEMERAL_TREE_INVALID ||
2653 				    (!is_derooting && eph == NULL)) {
2654 					mutex_exit(&net->net_cnt_lock);
2655 					mutex_exit(&net->net_tree_lock);
2656 					goto is_busy;
2657 				}
2658 				mutex_exit(&net->net_cnt_lock);
2659 				*pmust_unlock = TRUE;
2660 			}
2661 		}
2662 	}
2663 
2664 	/*
2665 	 * Only once we have grabbed the lock can we mark what we
2666 	 * are planning on doing to the ephemeral tree.
2667 	 */
2668 	if (*pmust_unlock) {
2669 		mutex_enter(&net->net_cnt_lock);
2670 		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
2671 
2672 		/*
2673 		 * Check to see if we are nuking the root.
2674 		 */
2675 		if (is_derooting)
2676 			net->net_status |=
2677 			    NFS4_EPHEMERAL_TREE_DEROOTING;
2678 		mutex_exit(&net->net_cnt_lock);
2679 	}
2680 
2681 	if (!is_derooting) {
2682 		/*
2683 		 * Only work on children if the caller has not already
2684 		 * done so.
2685 		 */
2686 		if (!is_recursed) {
2687 			ASSERT(eph != NULL);
2688 
2689 			error = nfs4_ephemeral_unmount_engine(eph,
2690 			    FALSE, flag, cr);
2691 			if (error)
2692 				goto is_busy;
2693 		}
2694 	} else {
2695 		eph = net->net_root;
2696 
2697 		/*
2698 		 * Only work if there is something there.
2699 		 */
2700 		if (eph) {
2701 			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
2702 			    flag, cr);
2703 			if (error) {
2704 				mutex_enter(&net->net_cnt_lock);
2705 				net->net_status &=
2706 				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
2707 				mutex_exit(&net->net_cnt_lock);
2708 				goto is_busy;
2709 			}
2710 
2711 			/*
2712 			 * Nothing else which goes wrong will
2713 			 * invalidate the blowing away of the
2714 			 * ephmeral tree.
2715 			 */
2716 			net->net_root = NULL;
2717 		}
2718 
2719 		/*
2720 		 * We have derooted and we have caused the tree to be
2721 		 * invalidated.
2722 		 */
2723 		mutex_enter(&net->net_cnt_lock);
2724 		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
2725 		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
2726 		DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
2727 		    uint_t, net->net_refcnt);
2728 
2729 		/*
2730 		 * We will not finalize this node, so safe to
2731 		 * release it.
2732 		 */
2733 		nfs4_ephemeral_tree_decr(net);
2734 		mutex_exit(&net->net_cnt_lock);
2735 
2736 		if (was_locked == FALSE)
2737 			mutex_exit(&net->net_tree_lock);
2738 
2739 		/*
2740 		 * We have just blown away any notation of this
2741 		 * tree being locked or having a refcnt.
2742 		 * We can't let the caller try to clean things up.
2743 		 */
2744 		*pmust_unlock = FALSE;
2745 
2746 		/*
2747 		 * At this point, the tree should no longer be
2748 		 * associated with the mntinfo4. We need to pull
2749 		 * it off there and let the harvester take
2750 		 * care of it once the refcnt drops.
2751 		 */
2752 		mutex_enter(&mi->mi_lock);
2753 		mi->mi_ephemeral_tree = NULL;
2754 		mutex_exit(&mi->mi_lock);
2755 	}
2756 
2757 	return (0);
2758 
2759 is_busy:
2760 
2761 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2762 
2763 	return (error);
2764 }
2765 
2766 /*
2767  * Do the umount and record any error in the parent.
2768  */
2769 static void
2770 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2771     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2772 {
2773 	int	error;
2774 
2775 	error = umount2_engine(vfsp, flag, kcred, FALSE);
2776 	if (error) {
2777 		if (prior) {
2778 			if (prior->ne_child == e)
2779 				prior->ne_state |=
2780 				    NFS4_EPHEMERAL_CHILD_ERROR;
2781 			else
2782 				prior->ne_state |=
2783 				    NFS4_EPHEMERAL_PEER_ERROR;
2784 		}
2785 	}
2786 }
2787 
2788 /*
2789  * For each tree in the forest (where the forest is in
2790  * effect all of the ephemeral trees for this zone),
2791  * scan to see if a node can be unmounted. Note that
2792  * unlike nfs4_ephemeral_unmount_engine(), we do
2793  * not process the current node before children or
2794  * siblings. I.e., if a node can be unmounted, we
2795  * do not recursively check to see if the nodes
2796  * hanging off of it can also be unmounted.
2797  *
2798  * Instead, we delve down deep to try and remove the
2799  * children first. Then, because we share code with
2800  * nfs4_ephemeral_unmount_engine(), we will try
2801  * them again. This could be a performance issue in
2802  * the future.
2803  *
2804  * Also note that unlike nfs4_ephemeral_unmount_engine(),
2805  * we do not halt on an error. We will not remove the
2806  * current node, but we will keep on trying to remove
2807  * the others.
2808  *
2809  * force indicates that we want the unmount to occur
2810  * even if there is something blocking it.
2811  *
2812  * time_check indicates that we want to see if the
2813  * mount has expired past mount_to or not. Typically
2814  * we want to do this and only on a shutdown of the
2815  * zone would we want to ignore the check.
2816  */
2817 static void
2818 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
2819     bool_t force, bool_t time_check)
2820 {
2821 	nfs4_ephemeral_tree_t	*net;
2822 	nfs4_ephemeral_tree_t	*prev = NULL;
2823 	nfs4_ephemeral_tree_t	*next;
2824 	nfs4_ephemeral_t	*e;
2825 	nfs4_ephemeral_t	*prior;
2826 	time_t			now = gethrestime_sec();
2827 
2828 	nfs4_ephemeral_tree_t	*harvest = NULL;
2829 
2830 	int			flag;
2831 
2832 	mntinfo4_t		*mi;
2833 	vfs_t			*vfsp;
2834 
2835 	if (force)
2836 		flag = MS_FORCE | MS_SYSSPACE;
2837 	else
2838 		flag = MS_SYSSPACE;
2839 
2840 	mutex_enter(&ntg->ntg_forest_lock);
2841 	for (net = ntg->ntg_forest; net != NULL; net = next) {
2842 		next = net->net_next;
2843 
2844 		nfs4_ephemeral_tree_hold(net);
2845 
2846 		mutex_enter(&net->net_tree_lock);
2847 
2848 		/*
2849 		 * Let the unmount code know that the
2850 		 * tree is already locked!
2851 		 */
2852 		mutex_enter(&net->net_cnt_lock);
2853 		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2854 		mutex_exit(&net->net_cnt_lock);
2855 
2856 		/*
2857 		 * If the intent is force all ephemeral nodes to
2858 		 * be unmounted in this zone, we can short circuit a
2859 		 * lot of tree traversal and simply zap the root node.
2860 		 */
2861 		if (force) {
2862 			if (net->net_root) {
2863 				mi = net->net_root->ne_mount;
2864 				vfsp = mi->mi_vfsp;
2865 
2866 				/*
2867 				 * Cleared by umount2_engine.
2868 				 */
2869 				VFS_HOLD(vfsp);
2870 
2871 				(void) umount2_engine(vfsp, flag,
2872 				    kcred, FALSE);
2873 
2874 				goto check_done;
2875 			}
2876 		}
2877 
2878 		e = net->net_root;
2879 		if (e)
2880 			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2881 
2882 		while (e) {
2883 			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2884 				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2885 				if (e->ne_child) {
2886 					e = e->ne_child;
2887 					e->ne_state =
2888 					    NFS4_EPHEMERAL_VISIT_CHILD;
2889 				}
2890 
2891 				continue;
2892 			} else if (e->ne_state ==
2893 			    NFS4_EPHEMERAL_VISIT_SIBLING) {
2894 				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2895 				if (e->ne_peer) {
2896 					e = e->ne_peer;
2897 					e->ne_state =
2898 					    NFS4_EPHEMERAL_VISIT_CHILD;
2899 				}
2900 
2901 				continue;
2902 			} else if (e->ne_state ==
2903 			    NFS4_EPHEMERAL_CHILD_ERROR) {
2904 				prior = e->ne_prior;
2905 
2906 				/*
2907 				 * If a child reported an error, do
2908 				 * not bother trying to unmount.
2909 				 *
2910 				 * If your prior node is a parent,
2911 				 * pass the error up such that they
2912 				 * also do not try to unmount.
2913 				 *
2914 				 * However, if your prior is a sibling,
2915 				 * let them try to unmount if they can.
2916 				 */
2917 				if (prior) {
2918 					if (prior->ne_child == e)
2919 						prior->ne_state |=
2920 						    NFS4_EPHEMERAL_CHILD_ERROR;
2921 					else
2922 						prior->ne_state |=
2923 						    NFS4_EPHEMERAL_PEER_ERROR;
2924 				}
2925 
2926 				/*
2927 				 * Clear the error and if needed, process peers.
2928 				 *
2929 				 * Once we mask out the error, we know whether
2930 				 * or we have to process another node.
2931 				 */
2932 				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2933 				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2934 					e = prior;
2935 
2936 				continue;
2937 			} else if (e->ne_state ==
2938 			    NFS4_EPHEMERAL_PEER_ERROR) {
2939 				prior = e->ne_prior;
2940 
2941 				if (prior) {
2942 					if (prior->ne_child == e)
2943 						prior->ne_state =
2944 						    NFS4_EPHEMERAL_CHILD_ERROR;
2945 					else
2946 						prior->ne_state =
2947 						    NFS4_EPHEMERAL_PEER_ERROR;
2948 				}
2949 
2950 				/*
2951 				 * Clear the error from this node and do the
2952 				 * correct processing.
2953 				 */
2954 				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2955 				continue;
2956 			}
2957 
2958 			prior = e->ne_prior;
2959 			e->ne_state = NFS4_EPHEMERAL_OK;
2960 
2961 			/*
2962 			 * It must be the case that we need to process
2963 			 * this node.
2964 			 */
2965 			if (!time_check ||
2966 			    now - e->ne_ref_time > e->ne_mount_to) {
2967 				mi = e->ne_mount;
2968 				vfsp = mi->mi_vfsp;
2969 
2970 				/*
2971 				 * Cleared by umount2_engine.
2972 				 */
2973 				VFS_HOLD(vfsp);
2974 
2975 				/*
2976 				 * Note that we effectively work down to the
2977 				 * leaf nodes first, try to unmount them,
2978 				 * then work our way back up into the leaf
2979 				 * nodes.
2980 				 *
2981 				 * Also note that we deal with a lot of
2982 				 * complexity by sharing the work with
2983 				 * the manual unmount code.
2984 				 */
2985 				nfs4_ephemeral_record_umount(vfsp, flag,
2986 				    e, prior);
2987 			}
2988 
2989 			e = prior;
2990 		}
2991 
2992 check_done:
2993 
2994 		/*
2995 		 * At this point we are done processing this tree.
2996 		 *
2997 		 * If the tree is invalid and we were the only reference
2998 		 * to it, then we push it on the local linked list
2999 		 * to remove it at the end. We avoid that action now
3000 		 * to keep the tree processing going along at a fair clip.
3001 		 *
3002 		 * Else, even if we were the only reference, we
3003 		 * allow it to be reused as needed.
3004 		 */
3005 		mutex_enter(&net->net_cnt_lock);
3006 		nfs4_ephemeral_tree_decr(net);
3007 		if (net->net_refcnt == 0 &&
3008 		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
3009 			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3010 			mutex_exit(&net->net_cnt_lock);
3011 			mutex_exit(&net->net_tree_lock);
3012 
3013 			if (prev)
3014 				prev->net_next = net->net_next;
3015 			else
3016 				ntg->ntg_forest = net->net_next;
3017 
3018 			net->net_next = harvest;
3019 			harvest = net;
3020 			continue;
3021 		}
3022 
3023 		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3024 		mutex_exit(&net->net_cnt_lock);
3025 		mutex_exit(&net->net_tree_lock);
3026 
3027 		prev = net;
3028 	}
3029 	mutex_exit(&ntg->ntg_forest_lock);
3030 
3031 	for (net = harvest; net != NULL; net = next) {
3032 		next = net->net_next;
3033 
3034 		mutex_destroy(&net->net_tree_lock);
3035 		mutex_destroy(&net->net_cnt_lock);
3036 		kmem_free(net, sizeof (*net));
3037 	}
3038 }
3039 
3040 /*
3041  * This is the thread which decides when the harvesting
3042  * can proceed and when to kill it off for this zone.
3043  */
3044 static void
3045 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
3046 {
3047 	clock_t		timeleft;
3048 	zone_t		*zone = curproc->p_zone;
3049 
3050 	for (;;) {
3051 		timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
3052 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
3053 
3054 		/*
3055 		 * zone is exiting...
3056 		 */
3057 		if (timeleft != -1) {
3058 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
3059 			zthread_exit();
3060 			/* NOTREACHED */
3061 		}
3062 
3063 		/*
3064 		 * Only bother scanning if there is potential
3065 		 * work to be done.
3066 		 */
3067 		if (ntg->ntg_forest == NULL)
3068 			continue;
3069 
3070 		/*
3071 		 * Now scan the list and get rid of everything which
3072 		 * is old.
3073 		 */
3074 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
3075 	}
3076 
3077 	/* NOTREACHED */
3078 }
3079 
3080 /*
3081  * The zone specific glue needed to start the unmount harvester.
3082  *
3083  * Note that we want to avoid holding the mutex as long as possible,
3084  * hence the multiple checks.
3085  *
3086  * The caller should avoid us getting down here in the first
3087  * place.
3088  */
3089 static void
3090 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
3091 {
3092 	/*
3093 	 * It got started before we got here...
3094 	 */
3095 	if (ntg->ntg_thread_started)
3096 		return;
3097 
3098 	mutex_enter(&nfs4_ephemeral_thread_lock);
3099 
3100 	if (ntg->ntg_thread_started) {
3101 		mutex_exit(&nfs4_ephemeral_thread_lock);
3102 		return;
3103 	}
3104 
3105 	/*
3106 	 * Start the unmounter harvester thread for this zone.
3107 	 */
3108 	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
3109 	    ntg, 0, minclsyspri);
3110 
3111 	ntg->ntg_thread_started = TRUE;
3112 	mutex_exit(&nfs4_ephemeral_thread_lock);
3113 }
3114 
3115 /*ARGSUSED*/
3116 static void *
3117 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
3118 {
3119 	nfs4_trigger_globals_t	*ntg;
3120 
3121 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
3122 	ntg->ntg_thread_started = FALSE;
3123 
3124 	/*
3125 	 * This is the default....
3126 	 */
3127 	ntg->ntg_mount_to = nfs4_trigger_thread_timer;
3128 
3129 	mutex_init(&ntg->ntg_forest_lock, NULL,
3130 	    MUTEX_DEFAULT, NULL);
3131 
3132 	return (ntg);
3133 }
3134 
3135 /*
3136  * Try a nice gentle walk down the forest and convince
3137  * all of the trees to gracefully give it up.
3138  */
3139 /*ARGSUSED*/
3140 static void
3141 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
3142 {
3143 	nfs4_trigger_globals_t	*ntg = arg;
3144 
3145 	if (!ntg)
3146 		return;
3147 
3148 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
3149 }
3150 
3151 /*
3152  * Race along the forest and rip all of the trees out by
3153  * their rootballs!
3154  */
3155 /*ARGSUSED*/
3156 static void
3157 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
3158 {
3159 	nfs4_trigger_globals_t	*ntg = arg;
3160 
3161 	if (!ntg)
3162 		return;
3163 
3164 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
3165 
3166 	mutex_destroy(&ntg->ntg_forest_lock);
3167 	kmem_free(ntg, sizeof (*ntg));
3168 }
3169 
3170 /*
3171  * This is the zone independent cleanup needed for
3172  * emphemeral mount processing.
3173  */
3174 void
3175 nfs4_ephemeral_fini(void)
3176 {
3177 	(void) zone_key_delete(nfs4_ephemeral_key);
3178 	mutex_destroy(&nfs4_ephemeral_thread_lock);
3179 }
3180 
3181 /*
3182  * This is the zone independent initialization needed for
3183  * emphemeral mount processing.
3184  */
3185 void
3186 nfs4_ephemeral_init(void)
3187 {
3188 	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
3189 	    NULL);
3190 
3191 	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
3192 	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
3193 }
3194 
3195 /*
3196  * nfssys() calls this function to set the per-zone
3197  * value of mount_to to drive when an ephemeral mount is
3198  * timed out. Each mount will grab a copy of this value
3199  * when mounted.
3200  */
3201 void
3202 nfs4_ephemeral_set_mount_to(uint_t mount_to)
3203 {
3204 	nfs4_trigger_globals_t	*ntg;
3205 	zone_t			*zone = curproc->p_zone;
3206 
3207 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
3208 
3209 	ntg->ntg_mount_to = mount_to;
3210 }
3211 
3212 /*
3213  * Walk the list of v4 mount options; if they are currently set in vfsp,
3214  * append them to a new comma-separated mount option string, and return it.
3215  *
3216  * Caller should free by calling nfs4_trigger_destroy_mntopts().
3217  */
3218 static char *
3219 nfs4_trigger_create_mntopts(vfs_t *vfsp)
3220 {
3221 	uint_t i;
3222 	char *mntopts;
3223 	struct vfssw *vswp;
3224 	mntopts_t *optproto;
3225 
3226 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
3227 
3228 	/* get the list of applicable mount options for v4; locks *vswp */
3229 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
3230 	optproto = &vswp->vsw_optproto;
3231 
3232 	for (i = 0; i < optproto->mo_count; i++) {
3233 		struct mntopt *mop = &optproto->mo_list[i];
3234 
3235 		if (mop->mo_flags & MO_EMPTY)
3236 			continue;
3237 
3238 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
3239 			kmem_free(mntopts, MAX_MNTOPT_STR);
3240 			vfs_unrefvfssw(vswp);
3241 			return (NULL);
3242 		}
3243 	}
3244 
3245 	vfs_unrefvfssw(vswp);
3246 
3247 	/*
3248 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
3249 	 * and it may only be passed via MS_OPTIONSTR, so we
3250 	 * must handle it here.
3251 	 *
3252 	 * Ideally, it would be in the list, but NFS does not specify its
3253 	 * own opt proto list, it uses instead the default one. Since
3254 	 * not all filesystems support extended attrs, it would not be
3255 	 * appropriate to add it there.
3256 	 */
3257 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
3258 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
3259 		kmem_free(mntopts, MAX_MNTOPT_STR);
3260 		return (NULL);
3261 	}
3262 
3263 	return (mntopts);
3264 }
3265 
3266 static void
3267 nfs4_trigger_destroy_mntopts(char *mntopts)
3268 {
3269 	if (mntopts)
3270 		kmem_free(mntopts, MAX_MNTOPT_STR);
3271 }
3272 
3273 /*
3274  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
3275  */
3276 static int
3277 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
3278 {
3279 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
3280 		return (EINVAL);
3281 
3282 	if (vfs_optionisset(vfsp, optname, NULL)) {
3283 		size_t mntoptslen = strlen(mntopts);
3284 		size_t optnamelen = strlen(optname);
3285 
3286 		/* +1 for ',', +1 for NUL */
3287 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
3288 			return (EOVERFLOW);
3289 
3290 		/* first or subsequent mount option? */
3291 		if (*mntopts != '\0')
3292 			(void) strcat(mntopts, ",");
3293 
3294 		(void) strcat(mntopts, optname);
3295 	}
3296 
3297 	return (0);
3298 }
3299 
3300 static enum clnt_stat
3301 nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
3302 {
3303 	int retries;
3304 	uint_t max_msgsize;
3305 	enum clnt_stat status;
3306 	CLIENT *cl;
3307 	struct timeval timeout;
3308 
3309 	/* as per recov_newserver() */
3310 	max_msgsize = 0;
3311 	retries = 1;
3312 	timeout.tv_sec = 2;
3313 	timeout.tv_usec = 0;
3314 
3315 	if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
3316 	    max_msgsize, retries, CRED(), &cl) != 0)
3317 		return (RPC_FAILED);
3318 
3319 	if (nointr)
3320 		cl->cl_nosignal = TRUE;
3321 	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
3322 	    timeout);
3323 	if (nointr)
3324 		cl->cl_nosignal = FALSE;
3325 
3326 	AUTH_DESTROY(cl->cl_auth);
3327 	CLNT_DESTROY(cl);
3328 
3329 	return (status);
3330 }
3331 
3332 static enum clnt_stat
3333 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
3334 {
3335 	return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));
3336 }
3337