xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c (revision e753f464d28e02e23aa93bd7d51d39fc56f79897)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
29  * triggered from a "stub" rnode via a special set of vnodeops.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/vfs_opreg.h>
40 #include <sys/file.h>
41 #include <sys/filio.h>
42 #include <sys/uio.h>
43 #include <sys/buf.h>
44 #include <sys/mman.h>
45 #include <sys/pathname.h>
46 #include <sys/dirent.h>
47 #include <sys/debug.h>
48 #include <sys/vmsystm.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/swap.h>
52 #include <sys/errno.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/kmem.h>
56 #include <sys/mount.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathconf.h>
59 #include <sys/utsname.h>
60 #include <sys/dnlc.h>
61 #include <sys/acl.h>
62 #include <sys/systeminfo.h>
63 #include <sys/policy.h>
64 #include <sys/sdt.h>
65 #include <sys/list.h>
66 #include <sys/stat.h>
67 #include <sys/mntent.h>
68 #include <sys/priv.h>
69 
70 #include <rpc/types.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73 
74 #include <nfs/nfs.h>
75 #include <nfs/nfs_clnt.h>
76 #include <nfs/nfs_acl.h>
77 #include <nfs/lm.h>
78 #include <nfs/nfs4.h>
79 #include <nfs/nfs4_kprot.h>
80 #include <nfs/rnode4.h>
81 #include <nfs/nfs4_clnt.h>
82 #include <nfs/nfsid_map.h>
83 #include <nfs/nfs4_idmap_impl.h>
84 
85 #include <vm/hat.h>
86 #include <vm/as.h>
87 #include <vm/page.h>
88 #include <vm/pvn.h>
89 #include <vm/seg.h>
90 #include <vm/seg_map.h>
91 #include <vm/seg_kpm.h>
92 #include <vm/seg_vn.h>
93 
94 #include <fs/fs_subr.h>
95 
96 #include <sys/ddi.h>
97 #include <sys/int_fmtio.h>
98 
99 #include <sys/sunddi.h>
100 
101 #include <sys/priv_names.h>
102 
103 extern zone_key_t	nfs4clnt_zone_key;
104 extern zone_key_t	nfsidmap_zone_key;
105 
/*
 * The automatic unmounter thread stuff!
 *
 * NOTE(review): presumably the period of the harvester thread started by
 * nfs4_ephemeral_start_harvester(); its use is not visible in this part
 * of the file -- confirm.
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */
110 
/*
 * Just a default....
 *
 * NOTE(review): presumably the initial value of ntg_mount_to in
 * nfs4_trigger_globals_t; the units are not stated here -- confirm.
 */
static uint_t nfs4_trigger_mount_to = 240;
115 
/*
 * Bookkeeping for ephemeral mounts. nfs4_trigger_mount() obtains its
 * instance with zone_getspecific(nfs4_ephemeral_key, zone).
 */
typedef struct nfs4_trigger_globals {
	/* held while ntg_forest is read or modified */
	kmutex_t		ntg_forest_lock;
	/* NOTE(review): presumably a mount timeout -- confirm */
	uint_t			ntg_mount_to;
	/* NOTE(review): presumably set once the harvester runs -- confirm */
	int			ntg_thread_started;
	/* head of the list of ephemeral trees, chained through net_next */
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;
122 
123 kmutex_t	nfs4_ephemeral_thread_lock;
124 
125 zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;
126 
127 static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
128 
129 /*
130  * Used for ephemeral mounts; contains data either duplicated from
131  * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
132  *
133  * It's intended that this structure is used solely for ephemeral
134  * mount-type specific data, for passing this data to
135  * nfs4_trigger_nargs_create().
136  */
typedef struct ephemeral_servinfo {
	/* server name; copied into the domount_args_t host list */
	char			*esi_hostname;
	char			*esi_netname;
	char			*esi_path;
	/* NOTE(review): presumably the size of esi_path -- confirm */
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;
147 
148 /*
149  * Collect together the mount-type specific and generic data args.
150  */
typedef struct domount_args {
	/* details of the one server that will actually be contacted */
	ephemeral_servinfo_t	*dma_esi;
	/* MAXPATHLEN-byte buffer of host names */
	char			*dma_hostlist; /* comma-sep. for RO failover */
	/* head of the nfs_args list, chained via nfs_ext_u.nfs_extB.next */
	struct nfs_args		*dma_nargs;
} domount_args_t;
156 
157 
158 /*
159  * The vnode ops functions for a trigger stub vnode
160  */
161 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
162 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
163     caller_context_t *);
164 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
165     caller_context_t *);
166 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
167     caller_context_t *);
168 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
169     caller_context_t *);
170 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
171     struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
172     int *, pathname_t *);
173 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
174     enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
175     vsecattr_t *);
176 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
177     int);
178 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
179     caller_context_t *, int);
180 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
181     cred_t *, caller_context_t *, int);
182 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
183     vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
184 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
185     caller_context_t *, int);
186 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
187     cred_t *, caller_context_t *, int);
188 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
189 
190 /*
191  * Regular NFSv4 vnodeops that we need to reference directly
192  */
193 extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
194 		    caller_context_t *);
195 extern void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
196 extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
197 extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
198 extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
199 		    struct pathname *, int, vnode_t *, cred_t *,
200 		    caller_context_t *, int *, pathname_t *);
201 extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
202 		    caller_context_t *);
203 extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
204 		    caller_context_t *);
205 extern int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
206 extern int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
207 
208 static int	nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
209 static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
210     cred_t *, vnode_t **);
211 static domount_args_t  *nfs4_trigger_domount_args_create(vnode_t *, cred_t *);
212 static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
213     vnode_t *vp);
214 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
215     cred_t *);
216 static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
217 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
218     servinfo4_t *);
219 static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
220     cred_t *);
221 static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
222     ephemeral_servinfo_t *);
223 static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
224 static char	*nfs4_trigger_create_mntopts(vfs_t *);
225 static void	nfs4_trigger_destroy_mntopts(char *);
226 static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
227 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
228 static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
229     struct netbuf *, int);
230 
231 extern int	umount2_engine(vfs_t *, int, cred_t *, int);
232 
233 vnodeops_t *nfs4_trigger_vnodeops;
234 
235 /*
236  * These are the vnodeops that we must define for stub vnodes.
237  *
238  *
239  * Many of the VOPs defined for NFSv4 do not need to be defined here,
240  * for various reasons. This will result in the VFS default function being
241  * used:
242  *
243  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
244  *   lost the reference to the stub vnode, meaning these should not be called:
245  *       close, read, write, ioctl, readdir, seek.
246  *
247  * - These VOPs are meaningless for vnodes without data pages. Since the
248  *   stub vnode is of type VDIR, these should not be called:
249  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
250  *
251  * - These VOPs are otherwise not applicable, and should not be called:
252  *       dump, setsecattr.
253  *
254  *
255  * These VOPs we do not want to define, but nor do we want the VFS default
256  * action. Instead, we specify the VFS error function, with fs_error(), but
257  * note that fs_error() is not actually called. Instead it results in the
258  * use of the error function defined for the particular VOP, in vn_ops_table[]:
259  *
260  * -   frlock, dispose, shrlock.
261  *
262  *
263  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
264  * NOTE: if any of these ops involve an OTW call with the stub FH, then
265  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
266  * to protect the security data in the servinfo4_t for the "parent"
267  * filesystem that contains the stub.
268  *
269  * - These VOPs should not trigger a mount, so that "ls -l" does not:
270  *       pathconf, getsecattr.
271  *
272  * - These VOPs would not make sense to trigger:
273  *       inactive, rwlock, rwunlock, fid, realvp.
274  */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	/*
	 * Ops implemented in this file. All of them go through
	 * nfs4_trigger_mount(), except that getattr only does so when
	 * ATTR_TRIGGER is set and lookup does not for "..".
	 */
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	/* Ops handed straight to the regular NFSv4 implementation */
	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	/* Ops that must fail: select the per-VOP error function */
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	/* table terminator */
	NULL, NULL
};
302 
303 static void
304 nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
305 {
306 	ASSERT(mutex_owned(&net->net_cnt_lock));
307 	net->net_refcnt++;
308 	ASSERT(net->net_refcnt != 0);
309 }
310 
311 static void
312 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
313 {
314 	mutex_enter(&net->net_cnt_lock);
315 	nfs4_ephemeral_tree_incr(net);
316 	mutex_exit(&net->net_cnt_lock);
317 }
318 
319 /*
320  * We need a safe way to decrement the refcnt whilst the
321  * lock is being held.
322  */
323 static void
324 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
325 {
326 	ASSERT(mutex_owned(&net->net_cnt_lock));
327 	ASSERT(net->net_refcnt != 0);
328 	net->net_refcnt--;
329 }
330 
331 static void
332 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
333 {
334 	mutex_enter(&net->net_cnt_lock);
335 	nfs4_ephemeral_tree_decr(net);
336 	mutex_exit(&net->net_cnt_lock);
337 }
338 
339 /*
340  * Trigger ops for stub vnodes; for mirror mounts, etc.
341  *
342  * The general idea is that a "triggering" op will first call
343  * nfs4_trigger_mount(), which will find out whether a mount has already
344  * been triggered.
345  *
346  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
347  * of the covering vfs.
348  *
349  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
350  * and again set newvp, as above.
351  *
352  * The triggering op may then re-issue the VOP by calling it on newvp.
353  *
354  * Note that some ops may perform custom action, and may or may not need
355  * to trigger a mount.
356  *
357  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
358  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
359  * and that would just recurse. Instead, we call the v4 op directly,
360  * by name.  This is OK, since we know that the vnode is for NFSv4,
361  * otherwise it couldn't be a stub.
362  *
363  */
364 
365 static int
366 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
367 {
368 	int error;
369 	vnode_t *newvp;
370 
371 	error = nfs4_trigger_mount(*vpp, cr, &newvp);
372 	if (error)
373 		return (error);
374 
375 	/* Release the stub vnode, as we're losing the reference to it */
376 	VN_RELE(*vpp);
377 
378 	/* Give the caller the root vnode of the newly-mounted fs */
379 	*vpp = newvp;
380 
381 	/* return with VN_HELD(newvp) */
382 	return (VOP_OPEN(vpp, flag, cr, ct));
383 }
384 
385 void
386 nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
387 {
388 	uint_t mask;
389 	timespec_t now;
390 
391 	/*
392 	 * Set some attributes here for referrals.
393 	 */
394 	mask = vap->va_mask;
395 	bzero(vap, sizeof (struct vattr));
396 	vap->va_mask	= mask;
397 	vap->va_uid	= 0;
398 	vap->va_gid	= 0;
399 	vap->va_nlink	= 1;
400 	vap->va_size	= 1;
401 	gethrestime(&now);
402 	vap->va_atime	= now;
403 	vap->va_mtime	= now;
404 	vap->va_ctime	= now;
405 	vap->va_type	= VDIR;
406 	vap->va_mode	= 0555;
407 	vap->va_fsid	= vp->v_vfsp->vfs_dev;
408 	vap->va_rdev	= 0;
409 	vap->va_blksize	= MAXBSIZE;
410 	vap->va_nblocks	= 1;
411 	vap->va_seq	= 0;
412 }
413 
414 /*
415  * For the majority of cases, nfs4_trigger_getattr() will not trigger
416  * a mount. However, if ATTR_TRIGGER is set, we are being informed
417  * that we need to force the mount before we attempt to determine
418  * the attributes. The intent is an atomic operation for security
419  * testing.
420  *
421  * If we're not triggering a mount, we can still inquire about the
422  * actual attributes from the server in the mirror mount case,
423  * and will return manufactured attributes for a referral (see
424  * the 'create' branch of find_referral_stubvp()).
425  */
426 static int
427 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
428     caller_context_t *ct)
429 {
430 	int error;
431 
432 	if (flags & ATTR_TRIGGER) {
433 		vnode_t	*newvp;
434 
435 		error = nfs4_trigger_mount(vp, cr, &newvp);
436 		if (error)
437 			return (error);
438 
439 		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
440 		VN_RELE(newvp);
441 
442 	} else if (RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
443 
444 		error = nfs4_getattr(vp, vap, flags, cr, ct);
445 
446 	} else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
447 
448 		nfs4_fake_attrs(vp, vap);
449 		error = 0;
450 	}
451 
452 	return (error);
453 }
454 
455 static int
456 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
457 		caller_context_t *ct)
458 {
459 	int error;
460 	vnode_t *newvp;
461 
462 	error = nfs4_trigger_mount(vp, cr, &newvp);
463 	if (error)
464 		return (error);
465 
466 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
467 	VN_RELE(newvp);
468 
469 	return (error);
470 }
471 
472 static int
473 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
474     caller_context_t *ct)
475 {
476 	int error;
477 	vnode_t *newvp;
478 
479 	error = nfs4_trigger_mount(vp, cr, &newvp);
480 	if (error)
481 		return (error);
482 
483 	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
484 	VN_RELE(newvp);
485 
486 	return (error);
487 }
488 
489 static int
490 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
491     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
492     caller_context_t *ct, int *deflags, pathname_t *rpnp)
493 {
494 	int error;
495 	vnode_t *newdvp;
496 	rnode4_t *drp = VTOR4(dvp);
497 
498 	ASSERT(RP_ISSTUB(drp));
499 
500 	/*
501 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
502 	 * that up. Instead, pass onto the regular op, regardless of whether
503 	 * we've triggered a mount.
504 	 */
505 	if (strcmp(nm, "..") == 0)
506 		if (RP_ISSTUB_MIRRORMOUNT(drp)) {
507 			return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
508 			    ct, deflags, rpnp));
509 		} else if (RP_ISSTUB_REFERRAL(drp)) {
510 			/* Return the parent vnode */
511 			return (vtodv(dvp, vpp, cr, TRUE));
512 		}
513 
514 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
515 	if (error)
516 		return (error);
517 
518 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
519 	    deflags, rpnp);
520 	VN_RELE(newdvp);
521 
522 	return (error);
523 }
524 
525 static int
526 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
527     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
528     int flags, caller_context_t *ct, vsecattr_t *vsecp)
529 {
530 	int error;
531 	vnode_t *newdvp;
532 
533 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
534 	if (error)
535 		return (error);
536 
537 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
538 	    flags, ct, vsecp);
539 	VN_RELE(newdvp);
540 
541 	return (error);
542 }
543 
544 static int
545 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
546     int flags)
547 {
548 	int error;
549 	vnode_t *newdvp;
550 
551 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
552 	if (error)
553 		return (error);
554 
555 	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
556 	VN_RELE(newdvp);
557 
558 	return (error);
559 }
560 
561 static int
562 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
563     caller_context_t *ct, int flags)
564 {
565 	int error;
566 	vnode_t *newtdvp;
567 
568 	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
569 	if (error)
570 		return (error);
571 
572 	/*
573 	 * We don't check whether svp is a stub. Let the NFSv4 code
574 	 * detect that error, and return accordingly.
575 	 */
576 	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
577 	VN_RELE(newtdvp);
578 
579 	return (error);
580 }
581 
582 static int
583 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
584     cred_t *cr, caller_context_t *ct, int flags)
585 {
586 	int error;
587 	vnode_t *newsdvp;
588 	rnode4_t *tdrp = VTOR4(tdvp);
589 
590 	/*
591 	 * We know that sdvp is a stub, otherwise we would not be here.
592 	 *
593 	 * If tdvp is also be a stub, there are two possibilities: it
594 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
595 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
596 	 *
597 	 * In the former case, just trigger sdvp, and treat tdvp as
598 	 * though it were not a stub.
599 	 *
600 	 * In the latter case, it might be a different stub for the
601 	 * same server fs as sdvp, or for a different server fs.
602 	 * Regardless, from the client perspective this would still
603 	 * be a cross-filesystem rename, and should not be allowed,
604 	 * so return EXDEV, without triggering either mount.
605 	 */
606 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
607 		return (EXDEV);
608 
609 	error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
610 	if (error)
611 		return (error);
612 
613 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
614 
615 	VN_RELE(newsdvp);
616 
617 	return (error);
618 }
619 
620 /* ARGSUSED */
621 static int
622 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
623     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
624 {
625 	int error;
626 	vnode_t *newdvp;
627 
628 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
629 	if (error)
630 		return (error);
631 
632 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
633 	VN_RELE(newdvp);
634 
635 	return (error);
636 }
637 
638 static int
639 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
640     caller_context_t *ct, int flags)
641 {
642 	int error;
643 	vnode_t *newdvp;
644 
645 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
646 	if (error)
647 		return (error);
648 
649 	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
650 	VN_RELE(newdvp);
651 
652 	return (error);
653 }
654 
655 static int
656 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
657     cred_t *cr, caller_context_t *ct, int flags)
658 {
659 	int error;
660 	vnode_t *newdvp;
661 
662 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
663 	if (error)
664 		return (error);
665 
666 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
667 	VN_RELE(newdvp);
668 
669 	return (error);
670 }
671 
672 static int
673 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
674     caller_context_t *ct)
675 {
676 	int error;
677 	vnode_t *newvp;
678 
679 	error = nfs4_trigger_mount(vp, cr, &newvp);
680 	if (error)
681 		return (error);
682 
683 	error = VOP_READLINK(newvp, uiop, cr, ct);
684 	VN_RELE(newvp);
685 
686 	return (error);
687 }
688 
689 /* end of trigger vnode ops */
690 
691 /*
692  * See if the mount has already been done by another caller.
693  */
694 static int
695 nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
696     bool_t *was_mounted, vfs_t **vfsp)
697 {
698 	int		error;
699 	mntinfo4_t	*mi = VTOMI4(vp);
700 
701 	*was_mounted = FALSE;
702 
703 	error = vn_vfsrlock_wait(vp);
704 	if (error)
705 		return (error);
706 
707 	*vfsp = vn_mountedvfs(vp);
708 	if (*vfsp != NULL) {
709 		/* the mount has already occurred */
710 		error = VFS_ROOT(*vfsp, newvpp);
711 		if (!error) {
712 			/* need to update the reference time  */
713 			mutex_enter(&mi->mi_lock);
714 			if (mi->mi_ephemeral)
715 				mi->mi_ephemeral->ne_ref_time =
716 				    gethrestime_sec();
717 			mutex_exit(&mi->mi_lock);
718 
719 			*was_mounted = TRUE;
720 		}
721 	}
722 
723 	vn_vfsunlock(vp);
724 	return (0);
725 }
726 
727 /*
728  * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
729  *
730  * The mount may have already occurred, via another thread. If not,
731  * assemble the location information - which may require fetching - and
732  * perform the mount.
733  *
734  * Sets newvp to be the root of the fs that is now covering vp. Note
735  * that we return with VN_HELD(*newvp).
736  *
737  * The caller is responsible for passing the VOP onto the covering fs.
738  */
739 static int
740 nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
741 {
742 	int			 error;
743 	vfs_t			*vfsp;
744 	rnode4_t		*rp = VTOR4(vp);
745 	mntinfo4_t		*mi = VTOMI4(vp);
746 	domount_args_t		*dma;
747 
748 	nfs4_ephemeral_tree_t	*net;
749 
750 	bool_t			must_unlock = FALSE;
751 	bool_t			is_building = FALSE;
752 	bool_t			was_mounted = FALSE;
753 
754 	cred_t			*mcred = NULL;
755 
756 	nfs4_trigger_globals_t	*ntg;
757 
758 	zone_t			*zone = curproc->p_zone;
759 
760 	ASSERT(RP_ISSTUB(rp));
761 
762 	*newvpp = NULL;
763 
764 	/*
765 	 * Has the mount already occurred?
766 	 */
767 	error = nfs4_trigger_mounted_already(vp, newvpp,
768 	    &was_mounted, &vfsp);
769 	if (error || was_mounted)
770 		goto done;
771 
772 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
773 	ASSERT(ntg != NULL);
774 
775 	mutex_enter(&mi->mi_lock);
776 
777 	/*
778 	 * We need to lock down the ephemeral tree.
779 	 */
780 	if (mi->mi_ephemeral_tree == NULL) {
781 		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
782 		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
783 		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
784 		net->net_refcnt = 1;
785 		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
786 		is_building = TRUE;
787 
788 		/*
789 		 * We need to add it to the zone specific list for
790 		 * automatic unmounting and harvesting of deadwood.
791 		 */
792 		mutex_enter(&ntg->ntg_forest_lock);
793 		if (ntg->ntg_forest != NULL)
794 			net->net_next = ntg->ntg_forest;
795 		ntg->ntg_forest = net;
796 		mutex_exit(&ntg->ntg_forest_lock);
797 
798 		/*
799 		 * No lock order confusion with mi_lock because no
800 		 * other node could have grabbed net_tree_lock.
801 		 */
802 		mutex_enter(&net->net_tree_lock);
803 		mi->mi_ephemeral_tree = net;
804 		net->net_mount = mi;
805 		mutex_exit(&mi->mi_lock);
806 
807 		MI4_HOLD(mi);
808 		VFS_HOLD(mi->mi_vfsp);
809 	} else {
810 		net = mi->mi_ephemeral_tree;
811 		nfs4_ephemeral_tree_hold(net);
812 
813 		mutex_exit(&mi->mi_lock);
814 
815 		mutex_enter(&net->net_tree_lock);
816 
817 		/*
818 		 * We can only procede if the tree is neither locked
819 		 * nor being torn down.
820 		 */
821 		mutex_enter(&net->net_cnt_lock);
822 		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
823 			nfs4_ephemeral_tree_decr(net);
824 			mutex_exit(&net->net_cnt_lock);
825 			mutex_exit(&net->net_tree_lock);
826 
827 			return (EIO);
828 		}
829 		mutex_exit(&net->net_cnt_lock);
830 	}
831 
832 	mutex_enter(&net->net_cnt_lock);
833 	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
834 	mutex_exit(&net->net_cnt_lock);
835 
836 	must_unlock = TRUE;
837 
838 	dma = nfs4_trigger_domount_args_create(vp, cr);
839 	if (dma == NULL) {
840 		error = EINVAL;
841 		goto done;
842 	}
843 
844 	/*
845 	 * Note that since we define mirror mounts to work
846 	 * for any user, we simply extend the privileges of
847 	 * the user's credentials to allow the mount to
848 	 * proceed.
849 	 */
850 	mcred = crdup(cr);
851 	if (mcred == NULL) {
852 		error = EINVAL;
853 		goto done;
854 	}
855 
856 	crset_zone_privall(mcred);
857 	if (is_system_labeled())
858 		(void) setpflags(NET_MAC_AWARE, 1, mcred);
859 
860 	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
861 	nfs4_trigger_domount_args_destroy(dma, vp);
862 
863 	DTRACE_PROBE2(nfs4clnt__func__referral__mount,
864 	    vnode_t *, vp, int, error);
865 
866 	crfree(mcred);
867 
868 done:
869 
870 	if (must_unlock) {
871 		mutex_enter(&net->net_cnt_lock);
872 		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
873 
874 		/*
875 		 * REFCNT: If we are the root of the tree, then we need
876 		 * to keep a reference because we malloced the tree and
877 		 * this is where we tied it to our mntinfo.
878 		 *
879 		 * If we are not the root of the tree, then our tie to
880 		 * the mntinfo occured elsewhere and we need to
881 		 * decrement the reference to the tree.
882 		 */
883 		if (is_building)
884 			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
885 		else
886 			nfs4_ephemeral_tree_decr(net);
887 		mutex_exit(&net->net_cnt_lock);
888 
889 		mutex_exit(&net->net_tree_lock);
890 	}
891 
892 	if (!error && (newvpp == NULL || *newvpp == NULL))
893 		error = ENOSYS;
894 
895 	return (error);
896 }
897 
898 /*
899  * Collect together both the generic & mount-type specific args.
900  */
901 static domount_args_t *
902 nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr)
903 {
904 	int nointr;
905 	char *hostlist;
906 	servinfo4_t *svp;
907 	struct nfs_args *nargs, *nargs_head;
908 	enum clnt_stat status;
909 	ephemeral_servinfo_t *esi, *esi_first;
910 	domount_args_t *dma;
911 	mntinfo4_t *mi = VTOMI4(vp);
912 
913 	nointr = !(mi->mi_flags & MI4_INT);
914 	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
915 
916 	svp = mi->mi_curr_serv;
917 	/* check if the current server is responding */
918 	status = nfs4_trigger_ping_server(svp, nointr);
919 	if (status == RPC_SUCCESS) {
920 		esi_first = nfs4_trigger_esi_create(vp, svp, cr);
921 		if (esi_first == NULL) {
922 			kmem_free(hostlist, MAXPATHLEN);
923 			return (NULL);
924 		}
925 
926 		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
927 
928 		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
929 	} else {
930 		/* current server did not respond */
931 		esi_first = NULL;
932 		nargs_head = NULL;
933 	}
934 	nargs = nargs_head;
935 
936 	/*
937 	 * NFS RO failover.
938 	 *
939 	 * If we have multiple servinfo4 structures, linked via sv_next,
940 	 * we must create one nfs_args for each, linking the nfs_args via
941 	 * nfs_ext_u.nfs_extB.next.
942 	 *
943 	 * We need to build a corresponding esi for each, too, but that is
944 	 * used solely for building nfs_args, and may be immediately
945 	 * discarded, as domount() requires the info from just one esi,
946 	 * but all the nfs_args.
947 	 *
948 	 * Currently, the NFS mount code will hang if not all servers
949 	 * requested are available. To avoid that, we need to ping each
950 	 * server, here, and remove it from the list if it is not
951 	 * responding. This has the side-effect of that server then
952 	 * being permanently unavailable for this failover mount, even if
953 	 * it recovers. That's unfortunate, but the best we can do until
954 	 * the mount code path is fixed.
955 	 */
956 
957 	/*
958 	 * If the current server was down, loop indefinitely until we find
959 	 * at least one responsive server.
960 	 */
961 	do {
962 		/* no locking needed for sv_next; it is only set at fs mount */
963 		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
964 			struct nfs_args *next;
965 
966 			/*
967 			 * nargs_head: the head of the nfs_args list
968 			 * nargs: the current tail of the list
969 			 * next: the newly-created element to be added
970 			 */
971 
972 			/*
973 			 * We've already tried the current server, above;
974 			 * if it was responding, we have already included it
975 			 * and it may now be ignored.
976 			 *
977 			 * Otherwise, try it again, since it may now have
978 			 * recovered.
979 			 */
980 			if (svp == mi->mi_curr_serv && esi_first != NULL)
981 				continue;
982 
983 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
984 			if (svp->sv_flags & SV4_NOTINUSE) {
985 				nfs_rw_exit(&svp->sv_lock);
986 				continue;
987 			}
988 			nfs_rw_exit(&svp->sv_lock);
989 
990 			/* check if the server is responding */
991 			status = nfs4_trigger_ping_server(svp, nointr);
992 			/* if the server did not respond, ignore it */
993 			if (status != RPC_SUCCESS)
994 				continue;
995 
996 			esi = nfs4_trigger_esi_create(vp, svp, cr);
997 			if (esi == NULL)
998 				continue;
999 
1000 			/*
1001 			 * If the original current server (mi_curr_serv)
1002 			 * was down when when we first tried it,
1003 			 * (i.e. esi_first == NULL),
1004 			 * we select this new server (svp) to be the server
1005 			 * that we will actually contact (esi_first).
1006 			 *
1007 			 * Note that it's possible that mi_curr_serv == svp,
1008 			 * if that mi_curr_serv was down but has now recovered.
1009 			 */
1010 			next = nfs4_trigger_nargs_create(mi, svp, esi);
1011 			if (esi_first == NULL) {
1012 				ASSERT(nargs == NULL);
1013 				ASSERT(nargs_head == NULL);
1014 				nargs_head = next;
1015 				esi_first = esi;
1016 				(void) strlcpy(hostlist,
1017 				    esi_first->esi_hostname, MAXPATHLEN);
1018 			} else {
1019 				ASSERT(nargs_head != NULL);
1020 				nargs->nfs_ext_u.nfs_extB.next = next;
1021 				(void) strlcat(hostlist, ",", MAXPATHLEN);
1022 				(void) strlcat(hostlist, esi->esi_hostname,
1023 				    MAXPATHLEN);
1024 				/* esi was only needed for hostname & nargs */
1025 				nfs4_trigger_esi_destroy(esi, vp);
1026 			}
1027 
1028 			nargs = next;
1029 		}
1030 
1031 		/* if we've had no response at all, wait a second */
1032 		if (esi_first == NULL)
1033 			delay(drv_usectohz(1000000));
1034 
1035 	} while (esi_first == NULL);
1036 	ASSERT(nargs_head != NULL);
1037 
1038 	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
1039 	dma->dma_esi = esi_first;
1040 	dma->dma_hostlist = hostlist;
1041 	dma->dma_nargs = nargs_head;
1042 
1043 	return (dma);
1044 }
1045 
1046 static void
1047 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
1048 {
1049 	if (dma != NULL) {
1050 		if (dma->dma_esi != NULL && vp != NULL)
1051 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
1052 
1053 		if (dma->dma_hostlist != NULL)
1054 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
1055 
1056 		if (dma->dma_nargs != NULL) {
1057 			struct nfs_args *nargs = dma->dma_nargs;
1058 
1059 			do {
1060 				struct nfs_args *next =
1061 				    nargs->nfs_ext_u.nfs_extB.next;
1062 
1063 				nfs4_trigger_nargs_destroy(nargs);
1064 				nargs = next;
1065 			} while (nargs != NULL);
1066 		}
1067 
1068 		kmem_free(dma, sizeof (domount_args_t));
1069 	}
1070 }
1071 
1072 /*
1073  * The ephemeral_servinfo_t struct contains basic information we will need to
1074  * perform the mount. Whilst the structure is generic across different
1075  * types of ephemeral mount, the way we gather its contents differs.
1076  */
1077 static ephemeral_servinfo_t *
1078 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
1079 {
1080 	ephemeral_servinfo_t *esi;
1081 	rnode4_t *rp = VTOR4(vp);
1082 
1083 	ASSERT(RP_ISSTUB(rp));
1084 
1085 	/* Call the ephemeral type-specific routine */
1086 	if (RP_ISSTUB_MIRRORMOUNT(rp))
1087 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
1088 	else if (RP_ISSTUB_REFERRAL(rp))
1089 		esi = nfs4_trigger_esi_create_referral(vp, cr);
1090 	else
1091 		esi = NULL;
1092 	return (esi);
1093 }
1094 
1095 static void
1096 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1097 {
1098 	rnode4_t *rp = VTOR4(vp);
1099 
1100 	ASSERT(RP_ISSTUB(rp));
1101 
1102 	/* Currently, no need for an ephemeral type-specific routine */
1103 
1104 	/*
1105 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
1106 	 * and will be handled by nfs4_trigger_nargs_destroy().
1107 	 * We need only free the structure itself.
1108 	 */
1109 	if (esi != NULL)
1110 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
1111 }
1112 
1113 /*
1114  * Some of this may turn out to be common with other ephemeral types,
1115  * in which case it should be moved to nfs4_trigger_esi_create(), or a
1116  * common function called.
1117  */
1118 
1119 /*
1120  * Mirror mounts case - should have all data available
1121  */
1122 static ephemeral_servinfo_t *
1123 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
1124 {
1125 	char			*stubpath;
1126 	struct knetconfig	*sikncp, *svkncp;
1127 	struct netbuf		*bufp;
1128 	ephemeral_servinfo_t	*esi;
1129 
1130 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1131 
1132 	/* initially set to be our type of ephemeral mount; may be added to */
1133 	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
1134 
1135 	/*
1136 	 * We're copying info from the stub rnode's servinfo4, but
1137 	 * we must create new copies, not pointers, since this information
1138 	 * is to be associated with the new mount, which will be
1139 	 * unmounted (and its structures freed) separately
1140 	 */
1141 
1142 	/*
1143 	 * Sizes passed to kmem_[z]alloc here must match those freed
1144 	 * in nfs4_free_args()
1145 	 */
1146 
1147 	/*
1148 	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
1149 	 * is difficult to avoid: as we need to read svp to calculate the
1150 	 * sizes to be allocated.
1151 	 */
1152 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1153 
1154 	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1155 	(void) strcat(esi->esi_hostname, svp->sv_hostname);
1156 
1157 	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1158 	bufp = esi->esi_addr;
1159 	bufp->len = svp->sv_addr.len;
1160 	bufp->maxlen = svp->sv_addr.maxlen;
1161 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1162 	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1163 
1164 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1165 	sikncp = esi->esi_knconf;
1166 	svkncp = svp->sv_knconf;
1167 	sikncp->knc_semantics = svkncp->knc_semantics;
1168 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1169 	(void) strcat((char *)sikncp->knc_protofmly,
1170 	    (char *)svkncp->knc_protofmly);
1171 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1172 	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1173 	sikncp->knc_rdev = svkncp->knc_rdev;
1174 
1175 	/*
1176 	 * Used when AUTH_DH is negotiated.
1177 	 *
1178 	 * This is ephemeral mount-type specific, since it contains the
1179 	 * server's time-sync syncaddr.
1180 	 */
1181 	if (svp->sv_dhsec) {
1182 		struct netbuf *bufp;
1183 		sec_data_t *sdata;
1184 		dh_k4_clntdata_t *data;
1185 
1186 		sdata = svp->sv_dhsec;
1187 		data = (dh_k4_clntdata_t *)sdata->data;
1188 		ASSERT(sdata->rpcflavor == AUTH_DH);
1189 
1190 		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1191 		bufp->len = data->syncaddr.len;
1192 		bufp->maxlen = data->syncaddr.maxlen;
1193 		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1194 		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1195 		esi->esi_syncaddr = bufp;
1196 
1197 		if (data->netname != NULL) {
1198 			int nmlen = data->netnamelen;
1199 
1200 			/*
1201 			 * We need to copy from a dh_k4_clntdata_t
1202 			 * netname/netnamelen pair to a NUL-terminated
1203 			 * netname string suitable for putting in nfs_args,
1204 			 * where the latter has no netnamelen field.
1205 			 */
1206 			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1207 			bcopy(data->netname, esi->esi_netname, nmlen);
1208 		}
1209 	} else {
1210 		esi->esi_syncaddr = NULL;
1211 		esi->esi_netname = NULL;
1212 	}
1213 
1214 	stubpath = fn_path(VTOSV(vp)->sv_name);
1215 	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1216 	ASSERT(*stubpath == '.');
1217 	stubpath += 1;
1218 
1219 	/* for nfs_args->fh */
1220 	esi->esi_path_len = strlen(stubpath) + 1;
1221 	if (strcmp(svp->sv_path, "/") != 0)
1222 		esi->esi_path_len += strlen(svp->sv_path);
1223 	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1224 	if (strcmp(svp->sv_path, "/") != 0)
1225 		(void) strcat(esi->esi_path, svp->sv_path);
1226 	(void) strcat(esi->esi_path, stubpath);
1227 
1228 	stubpath -= 1;
1229 	/* stubpath allocated by fn_path() */
1230 	kmem_free(stubpath, strlen(stubpath) + 1);
1231 
1232 	nfs_rw_exit(&svp->sv_lock);
1233 
1234 	return (esi);
1235 }
1236 
1237 /*
1238  * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to
1239  * get network information required to do the mount call.
1240  */
1241 int
1242 nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
1243 {
1244 	door_arg_t	door_args;
1245 	door_handle_t	dh;
1246 	XDR		xdr;
1247 	refd_door_args_t *xdr_argsp;
1248 	refd_door_res_t  *orig_resp;
1249 	k_sigset_t	smask;
1250 	int		xdr_len = 0;
1251 	int 		res_len = 16; /* length of an ip adress */
1252 	int		orig_reslen = res_len;
1253 	int		error = 0;
1254 	struct nfsidmap_globals *nig;
1255 
1256 	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
1257 		return (ECONNREFUSED);
1258 
1259 	nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
1260 	ASSERT(nig != NULL);
1261 
1262 	mutex_enter(&nig->nfsidmap_daemon_lock);
1263 	dh = nig->nfsidmap_daemon_dh;
1264 	if (dh == NULL) {
1265 		mutex_exit(&nig->nfsidmap_daemon_lock);
1266 		cmn_err(CE_NOTE,
1267 		    "nfs4_callmapid: nfsmapid daemon not " \
1268 		    "running unable to resolve host name\n");
1269 		return (EINVAL);
1270 	}
1271 	door_ki_hold(dh);
1272 	mutex_exit(&nig->nfsidmap_daemon_lock);
1273 
1274 	xdr_len = xdr_sizeof(&(xdr_utf8string), server);
1275 
1276 	xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
1277 	xdr_argsp->xdr_len = xdr_len;
1278 	xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;
1279 
1280 	xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
1281 	    xdr_len, XDR_ENCODE);
1282 
1283 	if (!xdr_utf8string(&xdr, server)) {
1284 		kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1285 		door_ki_rele(dh);
1286 		return (1);
1287 	}
1288 
1289 	if (orig_reslen)
1290 		orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);
1291 
1292 	door_args.data_ptr = (char *)xdr_argsp;
1293 	door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
1294 	door_args.desc_ptr = NULL;
1295 	door_args.desc_num = 0;
1296 	door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
1297 	door_args.rsize = res_len;
1298 
1299 	sigintr(&smask, 1);
1300 	error = door_ki_upcall(dh, &door_args);
1301 	sigunintr(&smask);
1302 
1303 	door_ki_rele(dh);
1304 
1305 	kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1306 	if (error) {
1307 		kmem_free(orig_resp, orig_reslen);
1308 		/*
1309 		 * There is no door to connect to. The referral daemon
1310 		 * must not be running yet.
1311 		 */
1312 		cmn_err(CE_WARN,
1313 		    "nfsmapid not running cannot resolve host name");
1314 		goto out;
1315 	}
1316 
1317 	/*
1318 	 * If the results buffer passed back are not the same as
1319 	 * what was sent free the old buffer and use the new one.
1320 	 */
1321 	if (orig_resp && orig_reslen) {
1322 		refd_door_res_t *door_resp;
1323 
1324 		door_resp = (refd_door_res_t *)door_args.rbuf;
1325 		if ((void *)door_args.rbuf != orig_resp)
1326 			kmem_free(orig_resp, orig_reslen);
1327 		if (door_resp->res_status == 0) {
1328 			xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
1329 			    door_resp->xdr_len, XDR_DECODE);
1330 			bzero(resp, sizeof (struct nfs_fsl_info));
1331 			if (!xdr_nfs_fsl_info(&xdr, resp)) {
1332 				DTRACE_PROBE2(
1333 				    nfs4clnt__debug__referral__upcall__xdrfail,
1334 				    struct nfs_fsl_info *, resp,
1335 				    char *, "nfs4_callmapid");
1336 				error = EINVAL;
1337 			}
1338 		} else {
1339 			DTRACE_PROBE2(
1340 			    nfs4clnt__debug__referral__upcall__badstatus,
1341 			    int, door_resp->res_status,
1342 			    char *, "nfs4_callmapid");
1343 			error = door_resp->res_status;
1344 		}
1345 		kmem_free(door_args.rbuf, door_args.rsize);
1346 	}
1347 out:
1348 	DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
1349 	    char *, server, int, error);
1350 	return (error);
1351 }
1352 
1353 /*
1354  * Fetches the fs_locations attribute. Typically called
1355  * from a Replication/Migration/Referrals/Mirror-mount context
1356  *
1357  * Fills in the attributes in garp. The caller is assumed
1358  * to have allocated memory for garp.
1359  *
1360  * lock: if set do not lock s_recovlock and mi_recovlock mutex,
1361  *	 it's already done by caller. Otherwise lock these mutexes
1362  *	 before doing the rfs4call().
1363  *
1364  * Returns
1365  * 	1	 for success
1366  * 	0	 for failure
1367  */
1368 int
1369 nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
1370     cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
1371 {
1372 	COMPOUND4args_clnt args;
1373 	COMPOUND4res_clnt res;
1374 	nfs_argop4 *argop;
1375 	int argoplist_size = 3 * sizeof (nfs_argop4);
1376 	nfs4_server_t *sp = NULL;
1377 	int doqueue = 1;
1378 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1379 	int retval = 1;
1380 	struct nfs4_clnt *nfscl;
1381 
1382 	if (lock == TRUE)
1383 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1384 	else
1385 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
1386 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
1387 
1388 	sp = find_nfs4_server(mi);
1389 	if (lock == TRUE)
1390 		nfs_rw_exit(&mi->mi_recovlock);
1391 
1392 	if (sp != NULL)
1393 		mutex_exit(&sp->s_lock);
1394 
1395 	if (lock == TRUE) {
1396 		if (sp != NULL)
1397 			(void) nfs_rw_enter_sig(&sp->s_recovlock,
1398 			    RW_WRITER, 0);
1399 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1400 	} else {
1401 		if (sp != NULL) {
1402 			ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
1403 			    nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
1404 		}
1405 	}
1406 
1407 	/*
1408 	 * Do we want to do the setup for recovery here?
1409 	 *
1410 	 * We know that the server responded to a null ping a very
1411 	 * short time ago, and we know that we intend to do a
1412 	 * single stateless operation - we want to fetch attributes,
1413 	 * so we know we can't encounter errors about state.  If
1414 	 * something goes wrong with the GETATTR, like not being
1415 	 * able to get a response from the server or getting any
1416 	 * kind of FH error, we should fail the mount.
1417 	 *
1418 	 * We may want to re-visited this at a later time.
1419 	 */
1420 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
1421 
1422 	args.ctag = TAG_GETATTR_FSLOCATION;
1423 	/* PUTFH LOOKUP GETATTR */
1424 	args.array_len = 3;
1425 	args.array = argop;
1426 
1427 	/* 0. putfh file */
1428 	argop[0].argop = OP_CPUTFH;
1429 	argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1430 
1431 	/* 1. lookup name, can't be dotdot */
1432 	argop[1].argop = OP_CLOOKUP;
1433 	argop[1].nfs_argop4_u.opclookup.cname = nm;
1434 
1435 	/* 2. file attrs */
1436 	argop[2].argop = OP_GETATTR;
1437 	argop[2].nfs_argop4_u.opgetattr.attr_request =
1438 	    FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
1439 	    FATTR4_MOUNTED_ON_FILEID_MASK;
1440 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
1441 
1442 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1443 
1444 	if (lock == TRUE) {
1445 		nfs_rw_exit(&mi->mi_recovlock);
1446 		if (sp != NULL)
1447 			nfs_rw_exit(&sp->s_recovlock);
1448 	}
1449 
1450 	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1451 	nfscl->nfscl_stat.referrals.value.ui64++;
1452 	DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
1453 	    nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);
1454 
1455 	if (e.error != 0) {
1456 		if (sp != NULL)
1457 			nfs4_server_rele(sp);
1458 		kmem_free(argop, argoplist_size);
1459 		return (0);
1460 	}
1461 
1462 	/*
1463 	 * Check for all possible error conditions.
1464 	 * For valid replies without an ops array or for illegal
1465 	 * replies, return a failure.
1466 	 */
1467 	if (res.status != NFS4_OK || res.array_len < 3 ||
1468 	    res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
1469 		retval = 0;
1470 		goto exit;
1471 	}
1472 
1473 	/*
1474 	 * There isn't much value in putting the attributes
1475 	 * in the attr cache since fs_locations4 aren't
1476 	 * encountered very frequently, so just make them
1477 	 * available to the caller.
1478 	 */
1479 	*garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;
1480 
1481 	DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
1482 	    nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");
1483 
1484 	/* No fs_locations? -- return a failure */
1485 	if (garp->n4g_ext_res == NULL ||
1486 	    garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
1487 		retval = 0;
1488 		goto exit;
1489 	}
1490 
1491 	if (!garp->n4g_fsid_valid)
1492 		retval = 0;
1493 
1494 exit:
1495 	if (retval == 0) {
1496 		/* the call was ok but failed validating the call results */
1497 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1498 	} else {
1499 		ASSERT(callres != NULL);
1500 		*callres = res;
1501 	}
1502 
1503 	if (sp != NULL)
1504 		nfs4_server_rele(sp);
1505 	kmem_free(argop, argoplist_size);
1506 	return (retval);
1507 }
1508 
1509 /* tunable to disable referral mounts */
1510 int nfs4_no_referrals = 0;
1511 
1512 /*
1513  * Returns NULL if the vnode cannot be created or found.
1514  */
1515 vnode_t *
1516 find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
1517 {
1518 	nfs_fh4 *stub_fh, *dfh;
1519 	nfs4_sharedfh_t *sfhp;
1520 	char *newfhval;
1521 	vnode_t *vp = NULL;
1522 	fattr4_mounted_on_fileid mnt_on_fileid;
1523 	nfs4_ga_res_t garp;
1524 	mntinfo4_t *mi;
1525 	COMPOUND4res_clnt callres;
1526 	hrtime_t t;
1527 
1528 	if (nfs4_no_referrals)
1529 		return (NULL);
1530 
1531 	/*
1532 	 * Get the mounted_on_fileid, unique on that server::fsid
1533 	 */
1534 	mi = VTOMI4(dvp);
1535 	if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
1536 	    &garp, &callres, FALSE) == 0)
1537 		return (NULL);
1538 	mnt_on_fileid = garp.n4g_mon_fid;
1539 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1540 
1541 	/*
1542 	 * Build a fake filehandle from the dir FH and the mounted_on_fileid
1543 	 */
1544 	dfh = &VTOR4(dvp)->r_fh->sfh_fh;
1545 	stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
1546 	stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
1547 	    sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
1548 	newfhval = stub_fh->nfs_fh4_val;
1549 
1550 	/* copy directory's file handle */
1551 	bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
1552 	stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
1553 	newfhval = newfhval + dfh->nfs_fh4_len;
1554 
1555 	/* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
1556 	bcopy((char *)&mnt_on_fileid, newfhval,
1557 	    sizeof (fattr4_mounted_on_fileid));
1558 	stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);
1559 
1560 	sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
1561 	kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
1562 	    sizeof (fattr4_mounted_on_fileid));
1563 	kmem_free(stub_fh, sizeof (nfs_fh4));
1564 	if (sfhp == NULL)
1565 		return (NULL);
1566 
1567 	t = gethrtime();
1568 	garp.n4g_va.va_type = VDIR;
1569 	vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
1570 	    cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
1571 
1572 	if (vp != NULL)
1573 		vp->v_type = VDIR;
1574 
1575 	sfh4_rele(&sfhp);
1576 	return (vp);
1577 }
1578 
1579 int
1580 nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1581 {
1582 	vnode_t *nvp;
1583 	rnode4_t *rp;
1584 
1585 	if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
1586 		return (EINVAL);
1587 
1588 	rp = VTOR4(nvp);
1589 	mutex_enter(&rp->r_statelock);
1590 	r4_stub_referral(rp);
1591 	mutex_exit(&rp->r_statelock);
1592 	dnlc_enter(dvp, nm, nvp);
1593 
1594 	if (*vpp != NULL)
1595 		VN_RELE(*vpp);	/* no longer need this vnode */
1596 
1597 	*vpp = nvp;
1598 
1599 	return (0);
1600 }
1601 
1602 /*
1603  * Fetch the location information and resolve the new server.
1604  * Caller needs to free up the XDR data which is returned.
1605  * Input: mount info, shared filehandle, nodename
1606  * Return: Index to the result or Error(-1)
1607  * Output: FsLocations Info, Resolved Server Info.
1608  */
1609 int
1610 nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
1611     char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
1612     struct nfs_fsl_info *fsloc)
1613 {
1614 	fs_location4 *fsp;
1615 	struct nfs_fsl_info nfsfsloc;
1616 	int ret, i, error;
1617 	nfs4_ga_res_t garp;
1618 	COMPOUND4res_clnt callres;
1619 	struct knetconfig *knc;
1620 
1621 	ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
1622 	if (ret == 0)
1623 		return (-1);
1624 
1625 	/*
1626 	 * As a lame attempt to figuring out if we're
1627 	 * handling a migration event or a referral,
1628 	 * look for rnodes with this fsid in the rnode
1629 	 * cache.
1630 	 *
1631 	 * If we can find one or more such rnodes, it
1632 	 * means we're handling a migration event and
1633 	 * we want to bail out in that case.
1634 	 */
1635 	if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
1636 		DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
1637 		    mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
1638 		    char *, "nfs4_process_referral");
1639 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1640 		return (-1);
1641 	}
1642 
1643 	/*
1644 	 * Find the first responsive server to mount.  When we find
1645 	 * one, fsp will point to it.
1646 	 */
1647 	for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {
1648 
1649 		fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
1650 		if (fsp->server_len == 0 || fsp->server_val == NULL)
1651 			continue;
1652 
1653 		error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
1654 		if (error != 0)
1655 			continue;
1656 
1657 		error = nfs4_ping_server_common(nfsfsloc.knconf,
1658 		    nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
1659 		if (error == RPC_SUCCESS)
1660 			break;
1661 
1662 		DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
1663 		    sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
1664 		    char *, "nfs4_process_referral");
1665 
1666 		(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1667 	}
1668 	knc = nfsfsloc.knconf;
1669 	if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
1670 	    (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
1671 		DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
1672 		    nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
1673 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1674 		return (-1);
1675 	}
1676 
1677 	/* Send the results back */
1678 	*fsloc = nfsfsloc;
1679 	*grp = garp;
1680 	*res = callres;
1681 	return (i);
1682 }
1683 
1684 /*
1685  * Referrals case - need to fetch referral data and then upcall to
1686  * user-level to get complete mount data.
1687  */
1688 static ephemeral_servinfo_t *
1689 nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
1690 {
1691 	struct knetconfig	*sikncp, *svkncp;
1692 	struct netbuf		*bufp;
1693 	ephemeral_servinfo_t	*esi;
1694 	vnode_t			*dvp;
1695 	rnode4_t		*drp;
1696 	fs_location4		*fsp;
1697 	struct nfs_fsl_info	nfsfsloc;
1698 	nfs4_ga_res_t		garp;
1699 	char			*p;
1700 	char			fn[MAXNAMELEN];
1701 	int			i, index = -1;
1702 	mntinfo4_t		*mi;
1703 	COMPOUND4res_clnt	callres;
1704 
1705 	/*
1706 	 * If we're passed in a stub vnode that
1707 	 * isn't a "referral" stub, bail out
1708 	 * and return a failure
1709 	 */
1710 	if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
1711 		return (NULL);
1712 
1713 	if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
1714 		return (NULL);
1715 
1716 	drp = VTOR4(dvp);
1717 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
1718 		VN_RELE(dvp);
1719 		return (NULL);
1720 	}
1721 
1722 	if (vtoname(vp, fn, MAXNAMELEN) != 0) {
1723 		nfs_rw_exit(&drp->r_rwlock);
1724 		VN_RELE(dvp);
1725 		return (NULL);
1726 	}
1727 
1728 	mi = VTOMI4(dvp);
1729 	index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
1730 	    &garp, &callres, &nfsfsloc);
1731 	nfs_rw_exit(&drp->r_rwlock);
1732 	VN_RELE(dvp);
1733 	if (index < 0)
1734 		return (NULL);
1735 
1736 	fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1737 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1738 
1739 	/* initially set to be our type of ephemeral mount; may be added to */
1740 	esi->esi_mount_flags = NFSMNT_REFERRAL;
1741 
1742 	esi->esi_hostname =
1743 	    kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
1744 	bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
1745 	    fsp->server_val->utf8string_len);
1746 	esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';
1747 
1748 	bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
1749 	bufp->len = nfsfsloc.addr->len;
1750 	bufp->maxlen = nfsfsloc.addr->maxlen;
1751 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1752 	bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
1753 	esi->esi_addr = bufp;
1754 
1755 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1756 	sikncp = esi->esi_knconf;
1757 
1758 	DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
1759 	    struct nfs_fsl_info *, &nfsfsloc,
1760 	    char *, "nfs4_trigger_esi_create_referral");
1761 
1762 	svkncp = nfsfsloc.knconf;
1763 	sikncp->knc_semantics = svkncp->knc_semantics;
1764 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1765 	(void) strlcat((char *)sikncp->knc_protofmly,
1766 	    (char *)svkncp->knc_protofmly, KNC_STRSIZE);
1767 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1768 	(void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
1769 	    KNC_STRSIZE);
1770 	sikncp->knc_rdev = svkncp->knc_rdev;
1771 
1772 	DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
1773 	    struct knetconfig *, sikncp,
1774 	    char *, "nfs4_trigger_esi_create_referral");
1775 
1776 	esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
1777 	bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
1778 	esi->esi_syncaddr = NULL;
1779 
1780 	esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1781 	esi->esi_path_len = MAXPATHLEN;
1782 	*p++ = '/';
1783 	for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1784 		component4 *comp;
1785 
1786 		comp = &fsp->rootpath.pathname4_val[i];
1787 		/* If no space, null the string and bail */
1788 		if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
1789 			goto err;
1790 		bcopy(comp->utf8string_val, p, comp->utf8string_len);
1791 		p += comp->utf8string_len;
1792 		*p++ = '/';
1793 	}
1794 	if (fsp->rootpath.pathname4_len != 0)
1795 		*(p - 1) = '\0';
1796 	else
1797 		*p = '\0';
1798 	p = esi->esi_path;
1799 	esi->esi_path = strdup(p);
1800 	esi->esi_path_len = strlen(p) + 1;
1801 	kmem_free(p, MAXPATHLEN);
1802 
1803 	/* Allocated in nfs4_process_referral() */
1804 	(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1805 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1806 
1807 	return (esi);
1808 err:
1809 	kmem_free(esi->esi_path, esi->esi_path_len);
1810 	kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
1811 	kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
1812 	kmem_free(esi->esi_addr, sizeof (struct netbuf));
1813 	kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
1814 	kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
1815 	kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
1816 	kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
1817 	kmem_free(esi, sizeof (ephemeral_servinfo_t));
1818 	(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1819 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1820 	return (NULL);
1821 }
1822 
1823 /*
1824  * Assemble the args, and call the generic VFS mount function to
1825  * finally perform the ephemeral mount.
1826  */
1827 static int
1828 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1829     cred_t *cr, vnode_t **newvpp)
1830 {
1831 	struct mounta	*uap;
1832 	char		*mntpt, *orig_path, *path;
1833 	const char	*orig_mntpt;
1834 	int		retval;
1835 	int		mntpt_len;
1836 	int		spec_len;
1837 	zone_t		*zone = curproc->p_zone;
1838 	bool_t		has_leading_slash;
1839 	int		i;
1840 
1841 	vfs_t			*stubvfsp = stubvp->v_vfsp;
1842 	ephemeral_servinfo_t	*esi = dma->dma_esi;
1843 	struct nfs_args		*nargs = dma->dma_nargs;
1844 
1845 	/* first, construct the mount point for the ephemeral mount */
1846 	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1847 	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1848 
1849 	if (*orig_path == '.')
1850 		orig_path++;
1851 
1852 	/*
1853 	 * Get rid of zone's root path
1854 	 */
1855 	if (zone != global_zone) {
1856 		/*
1857 		 * -1 for trailing '/' and -1 for EOS.
1858 		 */
1859 		if (strncmp(zone->zone_rootpath, orig_mntpt,
1860 		    zone->zone_rootpathlen - 1) == 0) {
1861 			orig_mntpt += (zone->zone_rootpathlen - 2);
1862 		}
1863 	}
1864 
1865 	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1866 	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1867 	(void) strcat(mntpt, orig_mntpt);
1868 	(void) strcat(mntpt, orig_path);
1869 
1870 	kmem_free(path, strlen(path) + 1);
1871 	path = esi->esi_path;
1872 	if (*path == '.')
1873 		path++;
1874 	if (path[0] == '/' && path[1] == '/')
1875 		path++;
1876 	has_leading_slash = (*path == '/');
1877 
1878 	spec_len = strlen(dma->dma_hostlist);
1879 	spec_len += strlen(path);
1880 
1881 	/* We are going to have to add this in */
1882 	if (!has_leading_slash)
1883 		spec_len++;
1884 
1885 	/* We need to get the ':' for dma_hostlist:esi_path */
1886 	spec_len++;
1887 
1888 	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1889 	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1890 	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1891 	    has_leading_slash ? "" : "/", path);
1892 
1893 	uap->dir = mntpt;
1894 
1895 	uap->flags = MS_SYSSPACE | MS_DATA;
1896 	/* fstype-independent mount options not covered elsewhere */
1897 	/* copy parent's mount(1M) "-m" flag */
1898 	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1899 		uap->flags |= MS_NOMNTTAB;
1900 
1901 	uap->fstype = MNTTYPE_NFS4;
1902 	uap->dataptr = (char *)nargs;
1903 	/* not needed for MS_SYSSPACE */
1904 	uap->datalen = 0;
1905 
1906 	/* use optptr to pass in extra mount options */
1907 	uap->flags |= MS_OPTIONSTR;
1908 	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1909 	if (uap->optptr == NULL) {
1910 		retval = EINVAL;
1911 		goto done;
1912 	}
1913 
1914 	/* domount() expects us to count the trailing NUL */
1915 	uap->optlen = strlen(uap->optptr) + 1;
1916 
1917 	/*
1918 	 * If we get EBUSY, we try again once to see if we can perform
1919 	 * the mount. We do this because of a spurious race condition.
1920 	 */
1921 	for (i = 0; i < 2; i++) {
1922 		int	error;
1923 		bool_t	was_mounted;
1924 
1925 		retval = domount(NULL, uap, stubvp, cr, vfsp);
1926 		if (retval == 0) {
1927 			retval = VFS_ROOT(*vfsp, newvpp);
1928 			VFS_RELE(*vfsp);
1929 			break;
1930 		} else if (retval != EBUSY) {
1931 			break;
1932 		}
1933 
1934 		/*
1935 		 * We might find it mounted by the other racer...
1936 		 */
1937 		error = nfs4_trigger_mounted_already(stubvp,
1938 		    newvpp, &was_mounted, vfsp);
1939 		if (error) {
1940 			goto done;
1941 		} else if (was_mounted) {
1942 			retval = 0;
1943 			break;
1944 		}
1945 	}
1946 
1947 done:
1948 	if (uap->optptr)
1949 		nfs4_trigger_destroy_mntopts(uap->optptr);
1950 
1951 	kmem_free(uap->spec, spec_len + 1);
1952 	kmem_free(uap, sizeof (struct mounta));
1953 	kmem_free(mntpt, mntpt_len + 1);
1954 
1955 	return (retval);
1956 }
1957 
1958 /*
1959  * Build an nfs_args structure for passing to domount().
1960  *
1961  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1962  * generic data - common to all ephemeral mount types - is read directly
1963  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1964  */
1965 static struct nfs_args *
1966 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1967     ephemeral_servinfo_t *esi)
1968 {
1969 	sec_data_t *secdata;
1970 	struct nfs_args *nargs;
1971 
1972 	/* setup the nfs args */
1973 	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1974 
1975 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1976 
1977 	nargs->addr = esi->esi_addr;
1978 
1979 	/* for AUTH_DH by negotiation */
1980 	if (esi->esi_syncaddr || esi->esi_netname) {
1981 		nargs->flags |= NFSMNT_SECURE;
1982 		nargs->syncaddr = esi->esi_syncaddr;
1983 		nargs->netname = esi->esi_netname;
1984 	}
1985 
1986 	nargs->flags |= NFSMNT_KNCONF;
1987 	nargs->knconf = esi->esi_knconf;
1988 	nargs->flags |= NFSMNT_HOSTNAME;
1989 	nargs->hostname = esi->esi_hostname;
1990 	nargs->fh = esi->esi_path;
1991 
1992 	/* general mount settings, all copied from parent mount */
1993 	mutex_enter(&mi->mi_lock);
1994 
1995 	if (!(mi->mi_flags & MI4_HARD))
1996 		nargs->flags |= NFSMNT_SOFT;
1997 
1998 	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
1999 	    NFSMNT_RETRANS;
2000 	nargs->wsize = mi->mi_stsize;
2001 	nargs->rsize = mi->mi_tsize;
2002 	nargs->timeo = mi->mi_timeo;
2003 	nargs->retrans = mi->mi_retrans;
2004 
2005 	if (mi->mi_flags & MI4_INT)
2006 		nargs->flags |= NFSMNT_INT;
2007 	if (mi->mi_flags & MI4_NOAC)
2008 		nargs->flags |= NFSMNT_NOAC;
2009 
2010 	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
2011 	    NFSMNT_ACDIRMAX;
2012 	nargs->acregmin = HR2SEC(mi->mi_acregmin);
2013 	nargs->acregmax = HR2SEC(mi->mi_acregmax);
2014 	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
2015 	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
2016 
2017 	/* add any specific flags for this type of ephemeral mount */
2018 	nargs->flags |= esi->esi_mount_flags;
2019 
2020 	if (mi->mi_flags & MI4_NOCTO)
2021 		nargs->flags |= NFSMNT_NOCTO;
2022 	if (mi->mi_flags & MI4_GRPID)
2023 		nargs->flags |= NFSMNT_GRPID;
2024 	if (mi->mi_flags & MI4_LLOCK)
2025 		nargs->flags |= NFSMNT_LLOCK;
2026 	if (mi->mi_flags & MI4_NOPRINT)
2027 		nargs->flags |= NFSMNT_NOPRINT;
2028 	if (mi->mi_flags & MI4_DIRECTIO)
2029 		nargs->flags |= NFSMNT_DIRECTIO;
2030 	if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
2031 		nargs->flags |= NFSMNT_PUBLIC;
2032 
2033 	/* Do some referral-specific option tweaking */
2034 	if (nargs->flags & NFSMNT_REFERRAL) {
2035 		nargs->flags &= ~NFSMNT_DORDMA;
2036 		nargs->flags |= NFSMNT_TRYRDMA;
2037 	}
2038 
2039 	mutex_exit(&mi->mi_lock);
2040 
2041 	/*
2042 	 * Security data & negotiation policy.
2043 	 *
2044 	 * For mirror mounts, we need to preserve the parent mount's
2045 	 * preference for security negotiation, translating SV4_TRYSECDEFAULT
2046 	 * to NFSMNT_SECDEFAULT if present.
2047 	 *
2048 	 * For referrals, we always want security negotiation and will
2049 	 * set NFSMNT_SECDEFAULT and we will not copy current secdata.
2050 	 * The reason is that we can't negotiate down from a parent's
2051 	 * Kerberos flavor to AUTH_SYS.
2052 	 *
2053 	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
2054 	 * security flavour was requested, with data in sv_secdata, and that
2055 	 * no negotiation should occur. If this specified flavour fails, that's
2056 	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
2057 	 *
2058 	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
2059 	 * default flavour, in sv_secdata, but then negotiate a new flavour.
2060 	 * Possible flavours are recorded in an array in sv_secinfo, with
2061 	 * currently in-use flavour pointed to by sv_currsec.
2062 	 *
2063 	 * If sv_currsec is set, i.e. if negotiation has already occurred,
2064 	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
2065 	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
2066 	 */
2067 	if (nargs->flags & NFSMNT_REFERRAL) {
2068 		/* enable negotiation for referral mount */
2069 		nargs->flags |= NFSMNT_SECDEFAULT;
2070 		secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
2071 		secdata->secmod = secdata->rpcflavor = AUTH_SYS;
2072 		secdata->data = NULL;
2073 	} else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
2074 		/* enable negotiation for mirror mount */
2075 		nargs->flags |= NFSMNT_SECDEFAULT;
2076 
2077 		/*
2078 		 * As a starting point for negotiation, copy parent
2079 		 * mount's negotiated flavour (sv_currsec) if available,
2080 		 * or its passed-in flavour (sv_secdata) if not.
2081 		 */
2082 		if (svp->sv_currsec != NULL)
2083 			secdata = copy_sec_data(svp->sv_currsec);
2084 		else if (svp->sv_secdata != NULL)
2085 			secdata = copy_sec_data(svp->sv_secdata);
2086 		else
2087 			secdata = NULL;
2088 	} else {
2089 		/* do not enable negotiation; copy parent's passed-in flavour */
2090 		if (svp->sv_secdata != NULL)
2091 			secdata = copy_sec_data(svp->sv_secdata);
2092 		else
2093 			secdata = NULL;
2094 	}
2095 
2096 	nfs_rw_exit(&svp->sv_lock);
2097 
2098 	nargs->flags |= NFSMNT_NEWARGS;
2099 	nargs->nfs_args_ext = NFS_ARGS_EXTB;
2100 	nargs->nfs_ext_u.nfs_extB.secdata = secdata;
2101 
2102 	/* for NFS RO failover; caller will set if necessary */
2103 	nargs->nfs_ext_u.nfs_extB.next = NULL;
2104 
2105 	return (nargs);
2106 }
2107 
2108 static void
2109 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
2110 {
2111 	/*
2112 	 * Either the mount failed, in which case the data is not needed, or
2113 	 * nfs4_mount() has either taken copies of what it needs or,
2114 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
2115 	 * whereby nfs4_free_args() will ignore it.
2116 	 */
2117 	nfs4_free_args(nargs);
2118 	kmem_free(nargs, sizeof (struct nfs_args));
2119 }
2120 
2121 /*
2122  * When we finally get into the mounting, we need to add this
2123  * node to the ephemeral tree.
2124  *
2125  * This is called from nfs4_mount().
2126  */
2127 int
2128 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
2129 {
2130 	mntinfo4_t		*mi_parent;
2131 	nfs4_ephemeral_t	*eph;
2132 	nfs4_ephemeral_tree_t	*net;
2133 
2134 	nfs4_ephemeral_t	*prior;
2135 	nfs4_ephemeral_t	*child;
2136 
2137 	nfs4_ephemeral_t	*peer;
2138 
2139 	nfs4_trigger_globals_t	*ntg;
2140 	zone_t			*zone = curproc->p_zone;
2141 
2142 	int			rc = 0;
2143 
2144 	mi_parent = VTOMI4(mvp);
2145 
2146 	/*
2147 	 * Get this before grabbing anything else!
2148 	 */
2149 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2150 	if (!ntg->ntg_thread_started) {
2151 		nfs4_ephemeral_start_harvester(ntg);
2152 	}
2153 
2154 	mutex_enter(&mi_parent->mi_lock);
2155 	mutex_enter(&mi->mi_lock);
2156 
2157 	net = mi->mi_ephemeral_tree =
2158 	    mi_parent->mi_ephemeral_tree;
2159 
2160 	/*
2161 	 * If the mi_ephemeral_tree is NULL, then it
2162 	 * means that either the harvester or a manual
2163 	 * umount has cleared the tree out right before
2164 	 * we got here.
2165 	 *
2166 	 * There is nothing we can do here, so return
2167 	 * to the caller and let them decide whether they
2168 	 * try again.
2169 	 */
2170 	if (net == NULL) {
2171 		mutex_exit(&mi->mi_lock);
2172 		mutex_exit(&mi_parent->mi_lock);
2173 
2174 		return (EBUSY);
2175 	}
2176 
2177 	/*
2178 	 * We've just tied the mntinfo to the tree, so
2179 	 * now we bump the refcnt and hold it there until
2180 	 * this mntinfo is removed from the tree.
2181 	 */
2182 	nfs4_ephemeral_tree_hold(net);
2183 
2184 	/*
2185 	 * We need to tack together the ephemeral mount
2186 	 * with this new mntinfo.
2187 	 */
2188 	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
2189 	eph->ne_mount = mi;
2190 	MI4_HOLD(mi);
2191 	VFS_HOLD(mi->mi_vfsp);
2192 	eph->ne_ref_time = gethrestime_sec();
2193 
2194 	/*
2195 	 * We need to tell the ephemeral mount when
2196 	 * to time out.
2197 	 */
2198 	eph->ne_mount_to = ntg->ntg_mount_to;
2199 
2200 	mi->mi_ephemeral = eph;
2201 
2202 	/*
2203 	 * If the enclosing mntinfo4 is also ephemeral,
2204 	 * then we need to point to its enclosing parent.
2205 	 * Else the enclosing mntinfo4 is the enclosing parent.
2206 	 *
2207 	 * We also need to weave this ephemeral node
2208 	 * into the tree.
2209 	 */
2210 	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
2211 		/*
2212 		 * We need to decide if we are
2213 		 * the root node of this branch
2214 		 * or if we are a sibling of this
2215 		 * branch.
2216 		 */
2217 		prior = mi_parent->mi_ephemeral;
2218 		if (prior == NULL) {
2219 			/*
2220 			 * Race condition, clean up, and
2221 			 * let caller handle mntinfo.
2222 			 */
2223 			mi->mi_flags &= ~MI4_EPHEMERAL;
2224 			mi->mi_ephemeral = NULL;
2225 			kmem_free(eph, sizeof (*eph));
2226 			VFS_RELE(mi->mi_vfsp);
2227 			MI4_RELE(mi);
2228 			nfs4_ephemeral_tree_rele(net);
2229 			rc = EBUSY;
2230 		} else {
2231 			if (prior->ne_child == NULL) {
2232 				prior->ne_child = eph;
2233 			} else {
2234 				child = prior->ne_child;
2235 
2236 				prior->ne_child = eph;
2237 				eph->ne_peer = child;
2238 
2239 				child->ne_prior = eph;
2240 			}
2241 
2242 			eph->ne_prior = prior;
2243 		}
2244 	} else {
2245 		/*
2246 		 * The parent mntinfo4 is the non-ephemeral
2247 		 * root of the ephemeral tree. We
2248 		 * need to decide if we are the root
2249 		 * node of that tree or if we are a
2250 		 * sibling of the root node.
2251 		 *
2252 		 * We are the root if there is no
2253 		 * other node.
2254 		 */
2255 		if (net->net_root == NULL) {
2256 			net->net_root = eph;
2257 		} else {
2258 			eph->ne_peer = peer = net->net_root;
2259 			ASSERT(peer != NULL);
2260 			net->net_root = eph;
2261 
2262 			peer->ne_prior = eph;
2263 		}
2264 
2265 		eph->ne_prior = NULL;
2266 	}
2267 
2268 	mutex_exit(&mi->mi_lock);
2269 	mutex_exit(&mi_parent->mi_lock);
2270 
2271 	return (rc);
2272 }
2273 
2274 /*
2275  * Commit the changes to the ephemeral tree for removing this node.
2276  */
2277 static void
2278 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
2279 {
2280 	nfs4_ephemeral_t	*e = eph;
2281 	nfs4_ephemeral_t	*peer;
2282 	nfs4_ephemeral_t	*prior;
2283 
2284 	peer = eph->ne_peer;
2285 	prior = e->ne_prior;
2286 
2287 	/*
2288 	 * If this branch root was not the
2289 	 * tree root, then we need to fix back pointers.
2290 	 */
2291 	if (prior) {
2292 		if (prior->ne_child == e) {
2293 			prior->ne_child = peer;
2294 		} else {
2295 			prior->ne_peer = peer;
2296 		}
2297 
2298 		if (peer)
2299 			peer->ne_prior = prior;
2300 	} else if (peer) {
2301 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
2302 		peer->ne_prior = NULL;
2303 	} else {
2304 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
2305 	}
2306 }
2307 
2308 /*
2309  * We want to avoid recursion at all costs. So we need to
2310  * unroll the tree. We do this by a depth first traversal to
2311  * leaf nodes. We blast away the leaf and work our way back
2312  * up and down the tree.
2313  */
2314 static int
2315 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
2316     int isTreeRoot, int flag, cred_t *cr)
2317 {
2318 	nfs4_ephemeral_t	*e = eph;
2319 	nfs4_ephemeral_t	*prior;
2320 	mntinfo4_t		*mi;
2321 	vfs_t			*vfsp;
2322 	int			error;
2323 
2324 	/*
2325 	 * We use the loop while unrolling the ephemeral tree.
2326 	 */
2327 	for (;;) {
2328 		/*
2329 		 * First we walk down the child.
2330 		 */
2331 		if (e->ne_child) {
2332 			prior = e;
2333 			e = e->ne_child;
2334 			continue;
2335 		}
2336 
2337 		/*
2338 		 * If we are the root of the branch we are removing,
2339 		 * we end it here. But if the branch is the root of
2340 		 * the tree, we have to forge on. We do not consider
2341 		 * the peer list for the root because while it may
2342 		 * be okay to remove, it is both extra work and a
2343 		 * potential for a false-positive error to stall the
2344 		 * unmount attempt.
2345 		 */
2346 		if (e == eph && isTreeRoot == FALSE)
2347 			return (0);
2348 
2349 		/*
2350 		 * Next we walk down the peer list.
2351 		 */
2352 		if (e->ne_peer) {
2353 			prior = e;
2354 			e = e->ne_peer;
2355 			continue;
2356 		}
2357 
2358 		/*
2359 		 * We can only remove the node passed in by the
2360 		 * caller if it is the root of the ephemeral tree.
2361 		 * Otherwise, the caller will remove it.
2362 		 */
2363 		if (e == eph && isTreeRoot == FALSE)
2364 			return (0);
2365 
2366 		/*
2367 		 * Okay, we have a leaf node, time
2368 		 * to prune it!
2369 		 *
2370 		 * Note that prior can only be NULL if
2371 		 * and only if it is the root of the
2372 		 * ephemeral tree.
2373 		 */
2374 		prior = e->ne_prior;
2375 
2376 		mi = e->ne_mount;
2377 		mutex_enter(&mi->mi_lock);
2378 		vfsp = mi->mi_vfsp;
2379 		ASSERT(vfsp != NULL);
2380 
2381 		/*
2382 		 * Cleared by umount2_engine.
2383 		 */
2384 		VFS_HOLD(vfsp);
2385 
2386 		/*
2387 		 * Inform nfs4_unmount to not recursively
2388 		 * descend into this node's children when it
2389 		 * gets processed.
2390 		 */
2391 		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
2392 		mutex_exit(&mi->mi_lock);
2393 
2394 		error = umount2_engine(vfsp, flag, cr, FALSE);
2395 		if (error) {
2396 			/*
2397 			 * We need to reenable nfs4_unmount's ability
2398 			 * to recursively descend on this node.
2399 			 */
2400 			mutex_enter(&mi->mi_lock);
2401 			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
2402 			mutex_exit(&mi->mi_lock);
2403 
2404 			return (error);
2405 		}
2406 
2407 		/*
2408 		 * If we are the current node, we do not want to
2409 		 * touch anything else. At this point, the only
2410 		 * way the current node can have survived to here
2411 		 * is if it is the root of the ephemeral tree and
2412 		 * we are unmounting the enclosing mntinfo4.
2413 		 */
2414 		if (e == eph) {
2415 			ASSERT(prior == NULL);
2416 			return (0);
2417 		}
2418 
2419 		/*
2420 		 * Stitch up the prior node. Note that since
2421 		 * we have handled the root of the tree, prior
2422 		 * must be non-NULL.
2423 		 */
2424 		ASSERT(prior != NULL);
2425 		if (prior->ne_child == e) {
2426 			prior->ne_child = NULL;
2427 		} else {
2428 			ASSERT(prior->ne_peer == e);
2429 
2430 			prior->ne_peer = NULL;
2431 		}
2432 
2433 		e = prior;
2434 	}
2435 
2436 	/* NOTREACHED */
2437 }
2438 
2439 /*
2440  * Common code to safely release net_cnt_lock and net_tree_lock
2441  */
2442 void
2443 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
2444     nfs4_ephemeral_tree_t **pnet)
2445 {
2446 	nfs4_ephemeral_tree_t	*net = *pnet;
2447 
2448 	if (*pmust_unlock) {
2449 		mutex_enter(&net->net_cnt_lock);
2450 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
2451 		mutex_exit(&net->net_cnt_lock);
2452 
2453 		mutex_exit(&net->net_tree_lock);
2454 
2455 		*pmust_unlock = FALSE;
2456 	}
2457 }
2458 
2459 /*
2460  * While we may have removed any child or sibling nodes of this
2461  * ephemeral node, we can not nuke it until we know that there
2462  * were no actived vnodes on it. This will do that final
2463  * work once we know it is not busy.
2464  */
2465 void
2466 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
2467     nfs4_ephemeral_tree_t **pnet)
2468 {
2469 	/*
2470 	 * Now we need to get rid of the ephemeral data if it exists.
2471 	 */
2472 	mutex_enter(&mi->mi_lock);
2473 	if (mi->mi_ephemeral) {
2474 		/*
2475 		 * If we are the root node of an ephemeral branch
2476 		 * which is being removed, then we need to fixup
2477 		 * pointers into and out of the node.
2478 		 */
2479 		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
2480 			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
2481 
2482 		nfs4_ephemeral_tree_rele(*pnet);
2483 		ASSERT(mi->mi_ephemeral != NULL);
2484 
2485 		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
2486 		mi->mi_ephemeral = NULL;
2487 		VFS_RELE(mi->mi_vfsp);
2488 		MI4_RELE(mi);
2489 	}
2490 	mutex_exit(&mi->mi_lock);
2491 
2492 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2493 }
2494 
2495 /*
2496  * Unmount an ephemeral node.
2497  *
2498  * Note that if this code fails, then it must unlock.
2499  *
2500  * If it succeeds, then the caller must be prepared to do so.
2501  */
2502 int
2503 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
2504     bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
2505 {
2506 	int			error = 0;
2507 	nfs4_ephemeral_t	*eph;
2508 	nfs4_ephemeral_tree_t	*net;
2509 	int			is_derooting = FALSE;
2510 	int			is_recursed = FALSE;
2511 	int			was_locked = FALSE;
2512 
2513 	/*
2514 	 * Make sure to set the default state for cleaning
2515 	 * up the tree in the caller (and on the way out).
2516 	 */
2517 	*pmust_unlock = FALSE;
2518 
2519 	/*
2520 	 * The active vnodes on this file system may be ephemeral
2521 	 * children. We need to check for and try to unmount them
2522 	 * here. If any can not be unmounted, we are going
2523 	 * to return EBUSY.
2524 	 */
2525 	mutex_enter(&mi->mi_lock);
2526 
2527 	/*
2528 	 * If an ephemeral tree, we need to check to see if
2529 	 * the lock is already held. If it is, then we need
2530 	 * to see if we are being called as a result of
2531 	 * the recursive removal of some node of the tree or
2532 	 * if we are another attempt to remove the tree.
2533 	 *
2534 	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
2535 	 * node. mi_ephemeral being non-NULL also does this.
2536 	 *
2537 	 * mi_ephemeral_tree being non-NULL is sufficient
2538 	 * to also indicate either it is an ephemeral node
2539 	 * or the enclosing mntinfo4.
2540 	 *
2541 	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
2542 	 * when we delete the ephemeral node and need to
2543 	 * differentiate from an ephemeral node and the
2544 	 * enclosing root node.
2545 	 */
2546 	*pnet = net = mi->mi_ephemeral_tree;
2547 	if (net == NULL) {
2548 		mutex_exit(&mi->mi_lock);
2549 		return (0);
2550 	}
2551 
2552 	eph = mi->mi_ephemeral;
2553 	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
2554 	is_derooting = (eph == NULL);
2555 
2556 	mutex_enter(&net->net_cnt_lock);
2557 
2558 	/*
2559 	 * If this is not recursion, then we need to
2560 	 * check to see if a harvester thread has
2561 	 * already grabbed the lock.
2562 	 *
2563 	 * After we exit this branch, we may not
2564 	 * blindly return, we need to jump to
2565 	 * is_busy!
2566 	 */
2567 	if (!is_recursed) {
2568 		if (net->net_status &
2569 		    NFS4_EPHEMERAL_TREE_LOCKED) {
2570 			/*
2571 			 * If the tree is locked, we need
2572 			 * to decide whether we are the
2573 			 * harvester or some explicit call
2574 			 * for a umount. The only way that
2575 			 * we are the harvester is if
2576 			 * MS_SYSSPACE is set.
2577 			 *
2578 			 * We only let the harvester through
2579 			 * at this point.
2580 			 *
2581 			 * We return EBUSY so that the
2582 			 * caller knows something is
2583 			 * going on. Note that by that
2584 			 * time, the umount in the other
2585 			 * thread may have already occured.
2586 			 */
2587 			if (!(flag & MS_SYSSPACE)) {
2588 				mutex_exit(&net->net_cnt_lock);
2589 				mutex_exit(&mi->mi_lock);
2590 
2591 				return (EBUSY);
2592 			}
2593 
2594 			was_locked = TRUE;
2595 		}
2596 	}
2597 
2598 	mutex_exit(&net->net_cnt_lock);
2599 	mutex_exit(&mi->mi_lock);
2600 
2601 	/*
2602 	 * If we are not the harvester, we need to check
2603 	 * to see if we need to grab the tree lock.
2604 	 */
2605 	if (was_locked == FALSE) {
2606 		/*
2607 		 * If we grab the lock, it means that no other
2608 		 * operation is working on the tree. If we don't
2609 		 * grab it, we need to decide if this is because
2610 		 * we are a recursive call or a new operation.
2611 		 */
2612 		if (mutex_tryenter(&net->net_tree_lock)) {
2613 			*pmust_unlock = TRUE;
2614 		} else {
2615 			/*
2616 			 * If we are a recursive call, we can
2617 			 * proceed without the lock.
2618 			 * Otherwise we have to wait until
2619 			 * the lock becomes free.
2620 			 */
2621 			if (!is_recursed) {
2622 				mutex_enter(&net->net_cnt_lock);
2623 				if (net->net_status &
2624 				    (NFS4_EPHEMERAL_TREE_DEROOTING
2625 				    | NFS4_EPHEMERAL_TREE_INVALID)) {
2626 					mutex_exit(&net->net_cnt_lock);
2627 					goto is_busy;
2628 				}
2629 				mutex_exit(&net->net_cnt_lock);
2630 
2631 				/*
2632 				 * We can't hold any other locks whilst
2633 				 * we wait on this to free up.
2634 				 */
2635 				mutex_enter(&net->net_tree_lock);
2636 
2637 				/*
2638 				 * Note that while mi->mi_ephemeral
2639 				 * may change and thus we have to
2640 				 * update eph, it is the case that
2641 				 * we have tied down net and
2642 				 * do not care if mi->mi_ephemeral_tree
2643 				 * has changed.
2644 				 */
2645 				mutex_enter(&mi->mi_lock);
2646 				eph = mi->mi_ephemeral;
2647 				mutex_exit(&mi->mi_lock);
2648 
2649 				/*
2650 				 * Okay, we need to see if either the
2651 				 * tree got nuked or the current node
2652 				 * got nuked. Both of which will cause
2653 				 * an error.
2654 				 *
2655 				 * Note that a subsequent retry of the
2656 				 * umount shall work.
2657 				 */
2658 				mutex_enter(&net->net_cnt_lock);
2659 				if (net->net_status &
2660 				    NFS4_EPHEMERAL_TREE_INVALID ||
2661 				    (!is_derooting && eph == NULL)) {
2662 					mutex_exit(&net->net_cnt_lock);
2663 					mutex_exit(&net->net_tree_lock);
2664 					goto is_busy;
2665 				}
2666 				mutex_exit(&net->net_cnt_lock);
2667 				*pmust_unlock = TRUE;
2668 			}
2669 		}
2670 	}
2671 
2672 	/*
2673 	 * Only once we have grabbed the lock can we mark what we
2674 	 * are planning on doing to the ephemeral tree.
2675 	 */
2676 	if (*pmust_unlock) {
2677 		mutex_enter(&net->net_cnt_lock);
2678 		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
2679 
2680 		/*
2681 		 * Check to see if we are nuking the root.
2682 		 */
2683 		if (is_derooting)
2684 			net->net_status |=
2685 			    NFS4_EPHEMERAL_TREE_DEROOTING;
2686 		mutex_exit(&net->net_cnt_lock);
2687 	}
2688 
2689 	if (!is_derooting) {
2690 		/*
2691 		 * Only work on children if the caller has not already
2692 		 * done so.
2693 		 */
2694 		if (!is_recursed) {
2695 			ASSERT(eph != NULL);
2696 
2697 			error = nfs4_ephemeral_unmount_engine(eph,
2698 			    FALSE, flag, cr);
2699 			if (error)
2700 				goto is_busy;
2701 		}
2702 	} else {
2703 		eph = net->net_root;
2704 
2705 		/*
2706 		 * Only work if there is something there.
2707 		 */
2708 		if (eph) {
2709 			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
2710 			    flag, cr);
2711 			if (error) {
2712 				mutex_enter(&net->net_cnt_lock);
2713 				net->net_status &=
2714 				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
2715 				mutex_exit(&net->net_cnt_lock);
2716 				goto is_busy;
2717 			}
2718 
2719 			/*
2720 			 * Nothing else which goes wrong will
2721 			 * invalidate the blowing away of the
2722 			 * ephmeral tree.
2723 			 */
2724 			net->net_root = NULL;
2725 		}
2726 
2727 		/*
2728 		 * We have derooted and we have caused the tree to be
2729 		 * invalidated.
2730 		 */
2731 		mutex_enter(&net->net_cnt_lock);
2732 		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
2733 		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
2734 		DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
2735 		    uint_t, net->net_refcnt);
2736 
2737 		/*
2738 		 * We will not finalize this node, so safe to
2739 		 * release it.
2740 		 */
2741 		nfs4_ephemeral_tree_decr(net);
2742 		mutex_exit(&net->net_cnt_lock);
2743 
2744 		if (was_locked == FALSE)
2745 			mutex_exit(&net->net_tree_lock);
2746 
2747 		/*
2748 		 * We have just blown away any notation of this
2749 		 * tree being locked or having a refcnt.
2750 		 * We can't let the caller try to clean things up.
2751 		 */
2752 		*pmust_unlock = FALSE;
2753 
2754 		/*
2755 		 * At this point, the tree should no longer be
2756 		 * associated with the mntinfo4. We need to pull
2757 		 * it off there and let the harvester take
2758 		 * care of it once the refcnt drops.
2759 		 */
2760 		mutex_enter(&mi->mi_lock);
2761 		mi->mi_ephemeral_tree = NULL;
2762 		mutex_exit(&mi->mi_lock);
2763 	}
2764 
2765 	return (0);
2766 
2767 is_busy:
2768 
2769 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2770 
2771 	return (error);
2772 }
2773 
2774 /*
2775  * Do the umount and record any error in the parent.
2776  */
2777 static void
2778 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2779     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2780 {
2781 	int	error;
2782 
2783 	/*
2784 	 * Only act on if the fs is still mounted.
2785 	 */
2786 	if (vfsp == NULL)
2787 		return;
2788 
2789 	error = umount2_engine(vfsp, flag, kcred, FALSE);
2790 	if (error) {
2791 		if (prior) {
2792 			if (prior->ne_child == e)
2793 				prior->ne_state |=
2794 				    NFS4_EPHEMERAL_CHILD_ERROR;
2795 			else
2796 				prior->ne_state |=
2797 				    NFS4_EPHEMERAL_PEER_ERROR;
2798 		}
2799 	}
2800 }
2801 
2802 /*
2803  * For each tree in the forest (where the forest is in
2804  * effect all of the ephemeral trees for this zone),
2805  * scan to see if a node can be unmounted. Note that
2806  * unlike nfs4_ephemeral_unmount_engine(), we do
2807  * not process the current node before children or
2808  * siblings. I.e., if a node can be unmounted, we
2809  * do not recursively check to see if the nodes
2810  * hanging off of it can also be unmounted.
2811  *
2812  * Instead, we delve down deep to try and remove the
2813  * children first. Then, because we share code with
2814  * nfs4_ephemeral_unmount_engine(), we will try
2815  * them again. This could be a performance issue in
2816  * the future.
2817  *
2818  * Also note that unlike nfs4_ephemeral_unmount_engine(),
2819  * we do not halt on an error. We will not remove the
2820  * current node, but we will keep on trying to remove
2821  * the others.
2822  *
2823  * force indicates that we want the unmount to occur
2824  * even if there is something blocking it.
2825  *
2826  * time_check indicates that we want to see if the
2827  * mount has expired past mount_to or not. Typically
2828  * we want to do this and only on a shutdown of the
2829  * zone would we want to ignore the check.
2830  */
2831 static void
2832 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
2833     bool_t force, bool_t time_check)
2834 {
2835 	nfs4_ephemeral_tree_t	*net;
2836 	nfs4_ephemeral_tree_t	*prev = NULL;
2837 	nfs4_ephemeral_tree_t	*next;
2838 	nfs4_ephemeral_t	*e;
2839 	nfs4_ephemeral_t	*prior;
2840 	time_t			now = gethrestime_sec();
2841 
2842 	nfs4_ephemeral_tree_t	*harvest = NULL;
2843 
2844 	int			flag;
2845 
2846 	mntinfo4_t		*mi;
2847 	vfs_t			*vfsp;
2848 
2849 	if (force)
2850 		flag = MS_FORCE | MS_SYSSPACE;
2851 	else
2852 		flag = MS_SYSSPACE;
2853 
2854 	mutex_enter(&ntg->ntg_forest_lock);
2855 	for (net = ntg->ntg_forest; net != NULL; net = next) {
2856 		next = net->net_next;
2857 
2858 		nfs4_ephemeral_tree_hold(net);
2859 
2860 		mutex_enter(&net->net_tree_lock);
2861 
2862 		/*
2863 		 * Let the unmount code know that the
2864 		 * tree is already locked!
2865 		 */
2866 		mutex_enter(&net->net_cnt_lock);
2867 		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2868 		mutex_exit(&net->net_cnt_lock);
2869 
2870 		/*
2871 		 * If the intent is force all ephemeral nodes to
2872 		 * be unmounted in this zone, we can short circuit a
2873 		 * lot of tree traversal and simply zap the root node.
2874 		 */
2875 		if (force) {
2876 			if (net->net_root) {
2877 				mi = net->net_root->ne_mount;
2878 
2879 				vfsp = mi->mi_vfsp;
2880 				ASSERT(vfsp != NULL);
2881 
2882 				/*
2883 				 * Cleared by umount2_engine.
2884 				 */
2885 				VFS_HOLD(vfsp);
2886 
2887 				(void) umount2_engine(vfsp, flag,
2888 				    kcred, FALSE);
2889 
2890 				goto check_done;
2891 			}
2892 		}
2893 
2894 		e = net->net_root;
2895 		if (e)
2896 			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2897 
2898 		while (e) {
2899 			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2900 				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2901 				if (e->ne_child) {
2902 					e = e->ne_child;
2903 					e->ne_state =
2904 					    NFS4_EPHEMERAL_VISIT_CHILD;
2905 				}
2906 
2907 				continue;
2908 			} else if (e->ne_state ==
2909 			    NFS4_EPHEMERAL_VISIT_SIBLING) {
2910 				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2911 				if (e->ne_peer) {
2912 					e = e->ne_peer;
2913 					e->ne_state =
2914 					    NFS4_EPHEMERAL_VISIT_CHILD;
2915 				}
2916 
2917 				continue;
2918 			} else if (e->ne_state ==
2919 			    NFS4_EPHEMERAL_CHILD_ERROR) {
2920 				prior = e->ne_prior;
2921 
2922 				/*
2923 				 * If a child reported an error, do
2924 				 * not bother trying to unmount.
2925 				 *
2926 				 * If your prior node is a parent,
2927 				 * pass the error up such that they
2928 				 * also do not try to unmount.
2929 				 *
2930 				 * However, if your prior is a sibling,
2931 				 * let them try to unmount if they can.
2932 				 */
2933 				if (prior) {
2934 					if (prior->ne_child == e)
2935 						prior->ne_state |=
2936 						    NFS4_EPHEMERAL_CHILD_ERROR;
2937 					else
2938 						prior->ne_state |=
2939 						    NFS4_EPHEMERAL_PEER_ERROR;
2940 				}
2941 
2942 				/*
2943 				 * Clear the error and if needed, process peers.
2944 				 *
2945 				 * Once we mask out the error, we know whether
2946 				 * or we have to process another node.
2947 				 */
2948 				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2949 				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2950 					e = prior;
2951 
2952 				continue;
2953 			} else if (e->ne_state ==
2954 			    NFS4_EPHEMERAL_PEER_ERROR) {
2955 				prior = e->ne_prior;
2956 
2957 				if (prior) {
2958 					if (prior->ne_child == e)
2959 						prior->ne_state =
2960 						    NFS4_EPHEMERAL_CHILD_ERROR;
2961 					else
2962 						prior->ne_state =
2963 						    NFS4_EPHEMERAL_PEER_ERROR;
2964 				}
2965 
2966 				/*
2967 				 * Clear the error from this node and do the
2968 				 * correct processing.
2969 				 */
2970 				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2971 				continue;
2972 			}
2973 
2974 			prior = e->ne_prior;
2975 			e->ne_state = NFS4_EPHEMERAL_OK;
2976 
2977 			/*
2978 			 * It must be the case that we need to process
2979 			 * this node.
2980 			 */
2981 			if (!time_check ||
2982 			    now - e->ne_ref_time > e->ne_mount_to) {
2983 				mi = e->ne_mount;
2984 				vfsp = mi->mi_vfsp;
2985 
2986 				/*
2987 				 * Cleared by umount2_engine.
2988 				 */
2989 				if (vfsp != NULL)
2990 					VFS_HOLD(vfsp);
2991 
2992 				/*
2993 				 * Note that we effectively work down to the
2994 				 * leaf nodes first, try to unmount them,
2995 				 * then work our way back up into the leaf
2996 				 * nodes.
2997 				 *
2998 				 * Also note that we deal with a lot of
2999 				 * complexity by sharing the work with
3000 				 * the manual unmount code.
3001 				 */
3002 				nfs4_ephemeral_record_umount(vfsp, flag,
3003 				    e, prior);
3004 			}
3005 
3006 			e = prior;
3007 		}
3008 
3009 check_done:
3010 
3011 		/*
3012 		 * At this point we are done processing this tree.
3013 		 *
3014 		 * If the tree is invalid and we were the only reference
3015 		 * to it, then we push it on the local linked list
3016 		 * to remove it at the end. We avoid that action now
3017 		 * to keep the tree processing going along at a fair clip.
3018 		 *
3019 		 * Else, even if we were the only reference, we
3020 		 * allow it to be reused as needed.
3021 		 */
3022 		mutex_enter(&net->net_cnt_lock);
3023 		nfs4_ephemeral_tree_decr(net);
3024 		if (net->net_refcnt == 0 &&
3025 		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
3026 			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3027 			mutex_exit(&net->net_cnt_lock);
3028 			mutex_exit(&net->net_tree_lock);
3029 
3030 			if (prev)
3031 				prev->net_next = net->net_next;
3032 			else
3033 				ntg->ntg_forest = net->net_next;
3034 
3035 			net->net_next = harvest;
3036 			harvest = net;
3037 
3038 			VFS_RELE(net->net_mount->mi_vfsp);
3039 			MI4_RELE(net->net_mount);
3040 
3041 			continue;
3042 		}
3043 
3044 		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3045 		mutex_exit(&net->net_cnt_lock);
3046 		mutex_exit(&net->net_tree_lock);
3047 
3048 		prev = net;
3049 	}
3050 	mutex_exit(&ntg->ntg_forest_lock);
3051 
3052 	for (net = harvest; net != NULL; net = next) {
3053 		next = net->net_next;
3054 
3055 		mutex_destroy(&net->net_tree_lock);
3056 		mutex_destroy(&net->net_cnt_lock);
3057 		kmem_free(net, sizeof (*net));
3058 	}
3059 }
3060 
3061 /*
3062  * This is the thread which decides when the harvesting
3063  * can proceed and when to kill it off for this zone.
3064  */
3065 static void
3066 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
3067 {
3068 	clock_t		timeleft;
3069 	zone_t		*zone = curproc->p_zone;
3070 
3071 	for (;;) {
3072 		timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
3073 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
3074 
3075 		/*
3076 		 * zone is exiting...
3077 		 */
3078 		if (timeleft != -1) {
3079 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
3080 			zthread_exit();
3081 			/* NOTREACHED */
3082 		}
3083 
3084 		/*
3085 		 * Only bother scanning if there is potential
3086 		 * work to be done.
3087 		 */
3088 		if (ntg->ntg_forest == NULL)
3089 			continue;
3090 
3091 		/*
3092 		 * Now scan the list and get rid of everything which
3093 		 * is old.
3094 		 */
3095 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
3096 	}
3097 
3098 	/* NOTREACHED */
3099 }
3100 
3101 /*
3102  * The zone specific glue needed to start the unmount harvester.
3103  *
3104  * Note that we want to avoid holding the mutex as long as possible,
3105  * hence the multiple checks.
3106  *
3107  * The caller should avoid us getting down here in the first
3108  * place.
3109  */
3110 static void
3111 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
3112 {
3113 	/*
3114 	 * It got started before we got here...
3115 	 */
3116 	if (ntg->ntg_thread_started)
3117 		return;
3118 
3119 	mutex_enter(&nfs4_ephemeral_thread_lock);
3120 
3121 	if (ntg->ntg_thread_started) {
3122 		mutex_exit(&nfs4_ephemeral_thread_lock);
3123 		return;
3124 	}
3125 
3126 	/*
3127 	 * Start the unmounter harvester thread for this zone.
3128 	 */
3129 	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
3130 	    ntg, 0, minclsyspri);
3131 
3132 	ntg->ntg_thread_started = TRUE;
3133 	mutex_exit(&nfs4_ephemeral_thread_lock);
3134 }
3135 
3136 /*ARGSUSED*/
3137 static void *
3138 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
3139 {
3140 	nfs4_trigger_globals_t	*ntg;
3141 
3142 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
3143 	ntg->ntg_thread_started = FALSE;
3144 
3145 	/*
3146 	 * This is the default....
3147 	 */
3148 	ntg->ntg_mount_to = nfs4_trigger_thread_timer;
3149 
3150 	mutex_init(&ntg->ntg_forest_lock, NULL,
3151 	    MUTEX_DEFAULT, NULL);
3152 
3153 	return (ntg);
3154 }
3155 
3156 /*
3157  * Try a nice gentle walk down the forest and convince
3158  * all of the trees to gracefully give it up.
3159  */
3160 /*ARGSUSED*/
3161 static void
3162 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
3163 {
3164 	nfs4_trigger_globals_t	*ntg = arg;
3165 
3166 	if (!ntg)
3167 		return;
3168 
3169 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
3170 }
3171 
3172 /*
3173  * Race along the forest and rip all of the trees out by
3174  * their rootballs!
3175  */
3176 /*ARGSUSED*/
3177 static void
3178 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
3179 {
3180 	nfs4_trigger_globals_t	*ntg = arg;
3181 
3182 	if (!ntg)
3183 		return;
3184 
3185 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
3186 
3187 	mutex_destroy(&ntg->ntg_forest_lock);
3188 	kmem_free(ntg, sizeof (*ntg));
3189 }
3190 
3191 /*
3192  * This is the zone independent cleanup needed for
3193  * emphemeral mount processing.
3194  */
3195 void
3196 nfs4_ephemeral_fini(void)
3197 {
3198 	(void) zone_key_delete(nfs4_ephemeral_key);
3199 	mutex_destroy(&nfs4_ephemeral_thread_lock);
3200 }
3201 
3202 /*
3203  * This is the zone independent initialization needed for
3204  * emphemeral mount processing.
3205  */
3206 void
3207 nfs4_ephemeral_init(void)
3208 {
3209 	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
3210 	    NULL);
3211 
3212 	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
3213 	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
3214 }
3215 
3216 /*
3217  * nfssys() calls this function to set the per-zone
3218  * value of mount_to to drive when an ephemeral mount is
3219  * timed out. Each mount will grab a copy of this value
3220  * when mounted.
3221  */
3222 void
3223 nfs4_ephemeral_set_mount_to(uint_t mount_to)
3224 {
3225 	nfs4_trigger_globals_t	*ntg;
3226 	zone_t			*zone = curproc->p_zone;
3227 
3228 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
3229 
3230 	ntg->ntg_mount_to = mount_to;
3231 }
3232 
3233 /*
3234  * Walk the list of v4 mount options; if they are currently set in vfsp,
3235  * append them to a new comma-separated mount option string, and return it.
3236  *
3237  * Caller should free by calling nfs4_trigger_destroy_mntopts().
3238  */
3239 static char *
3240 nfs4_trigger_create_mntopts(vfs_t *vfsp)
3241 {
3242 	uint_t i;
3243 	char *mntopts;
3244 	struct vfssw *vswp;
3245 	mntopts_t *optproto;
3246 
3247 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
3248 
3249 	/* get the list of applicable mount options for v4; locks *vswp */
3250 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
3251 	optproto = &vswp->vsw_optproto;
3252 
3253 	for (i = 0; i < optproto->mo_count; i++) {
3254 		struct mntopt *mop = &optproto->mo_list[i];
3255 
3256 		if (mop->mo_flags & MO_EMPTY)
3257 			continue;
3258 
3259 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
3260 			kmem_free(mntopts, MAX_MNTOPT_STR);
3261 			vfs_unrefvfssw(vswp);
3262 			return (NULL);
3263 		}
3264 	}
3265 
3266 	vfs_unrefvfssw(vswp);
3267 
3268 	/*
3269 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
3270 	 * and it may only be passed via MS_OPTIONSTR, so we
3271 	 * must handle it here.
3272 	 *
3273 	 * Ideally, it would be in the list, but NFS does not specify its
3274 	 * own opt proto list, it uses instead the default one. Since
3275 	 * not all filesystems support extended attrs, it would not be
3276 	 * appropriate to add it there.
3277 	 */
3278 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
3279 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
3280 		kmem_free(mntopts, MAX_MNTOPT_STR);
3281 		return (NULL);
3282 	}
3283 
3284 	return (mntopts);
3285 }
3286 
3287 static void
3288 nfs4_trigger_destroy_mntopts(char *mntopts)
3289 {
3290 	if (mntopts)
3291 		kmem_free(mntopts, MAX_MNTOPT_STR);
3292 }
3293 
3294 /*
3295  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
3296  */
3297 static int
3298 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
3299 {
3300 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
3301 		return (EINVAL);
3302 
3303 	if (vfs_optionisset(vfsp, optname, NULL)) {
3304 		size_t mntoptslen = strlen(mntopts);
3305 		size_t optnamelen = strlen(optname);
3306 
3307 		/* +1 for ',', +1 for NUL */
3308 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
3309 			return (EOVERFLOW);
3310 
3311 		/* first or subsequent mount option? */
3312 		if (*mntopts != '\0')
3313 			(void) strcat(mntopts, ",");
3314 
3315 		(void) strcat(mntopts, optname);
3316 	}
3317 
3318 	return (0);
3319 }
3320 
3321 static enum clnt_stat
3322 nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
3323 {
3324 	int retries;
3325 	uint_t max_msgsize;
3326 	enum clnt_stat status;
3327 	CLIENT *cl;
3328 	struct timeval timeout;
3329 
3330 	/* as per recov_newserver() */
3331 	max_msgsize = 0;
3332 	retries = 1;
3333 	timeout.tv_sec = 2;
3334 	timeout.tv_usec = 0;
3335 
3336 	if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
3337 	    max_msgsize, retries, CRED(), &cl) != 0)
3338 		return (RPC_FAILED);
3339 
3340 	if (nointr)
3341 		cl->cl_nosignal = TRUE;
3342 	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
3343 	    timeout);
3344 	if (nointr)
3345 		cl->cl_nosignal = FALSE;
3346 
3347 	AUTH_DESTROY(cl->cl_auth);
3348 	CLNT_DESTROY(cl);
3349 
3350 	return (status);
3351 }
3352 
3353 static enum clnt_stat
3354 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
3355 {
3356 	return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));
3357 }
3358