xref: /titanic_51/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c (revision 7991db1f01ba66ffcf3c089dee6c47c88ea9248c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
29  * triggered from a "stub" rnode via a special set of vnodeops.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/vfs_opreg.h>
40 #include <sys/file.h>
41 #include <sys/filio.h>
42 #include <sys/uio.h>
43 #include <sys/buf.h>
44 #include <sys/mman.h>
45 #include <sys/pathname.h>
46 #include <sys/dirent.h>
47 #include <sys/debug.h>
48 #include <sys/vmsystm.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/swap.h>
52 #include <sys/errno.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/kmem.h>
56 #include <sys/mount.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathconf.h>
59 #include <sys/utsname.h>
60 #include <sys/dnlc.h>
61 #include <sys/acl.h>
62 #include <sys/systeminfo.h>
63 #include <sys/policy.h>
64 #include <sys/sdt.h>
65 #include <sys/list.h>
66 #include <sys/stat.h>
67 #include <sys/mntent.h>
68 #include <sys/priv.h>
69 
70 #include <rpc/types.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73 
74 #include <nfs/nfs.h>
75 #include <nfs/nfs_clnt.h>
76 #include <nfs/nfs_acl.h>
77 #include <nfs/lm.h>
78 #include <nfs/nfs4.h>
79 #include <nfs/nfs4_kprot.h>
80 #include <nfs/rnode4.h>
81 #include <nfs/nfs4_clnt.h>
82 #include <nfs/nfsid_map.h>
83 #include <nfs/nfs4_idmap_impl.h>
84 
85 #include <vm/hat.h>
86 #include <vm/as.h>
87 #include <vm/page.h>
88 #include <vm/pvn.h>
89 #include <vm/seg.h>
90 #include <vm/seg_map.h>
91 #include <vm/seg_kpm.h>
92 #include <vm/seg_vn.h>
93 
94 #include <fs/fs_subr.h>
95 
96 #include <sys/ddi.h>
97 #include <sys/int_fmtio.h>
98 
99 #include <sys/sunddi.h>
100 
101 #include <sys/priv_names.h>
102 
103 extern zone_key_t	nfs4clnt_zone_key;
104 extern zone_key_t	nfsidmap_zone_key;
105 
/*
 * The automatic unmounter thread stuff!
 *
 * NOTE(review): the consumer of this timer is outside this part of the
 * file; presumably it is the harvester thread's wake-up interval - confirm.
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Just a default....
 *
 * NOTE(review): presumably the initial value for ntg_mount_to below; the
 * units are not stated anywhere in this part of the file - confirm.
 */
static uint_t nfs4_trigger_mount_to = 240;

/*
 * Zone-specific state for ephemeral mounts. nfs4_trigger_mount() obtains
 * it with zone_getspecific(nfs4_ephemeral_key, zone).
 */
typedef struct nfs4_trigger_globals {
	/* held while a new tree is linked onto ntg_forest */
	kmutex_t		ntg_forest_lock;
	/* NOTE(review): not referenced in this part of the file */
	uint_t			ntg_mount_to;
	/* NOTE(review): not referenced in this part of the file */
	int			ntg_thread_started;
	/* head of the list of ephemeral trees, chained through net_next */
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;

kmutex_t	nfs4_ephemeral_thread_lock;

/* Zone key under which the nfs4_trigger_globals_t is looked up. */
zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
128 
129 /*
130  * Used for ephemeral mounts; contains data either duplicated from
131  * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
132  *
133  * It's intended that this structure is used solely for ephemeral
134  * mount-type specific data, for passing this data to
135  * nfs4_trigger_nargs_create().
136  */
typedef struct ephemeral_servinfo {
	/*
	 * Server host name; nfs4_trigger_domount_args_create() copies it
	 * into the comma-separated host list.
	 */
	char			*esi_hostname;
	/*
	 * NOTE(review): the remaining members are filled in and consumed
	 * outside this part of the file; their meaning is only suggested
	 * by their names - confirm against nfs4_trigger_esi_create*() and
	 * nfs4_trigger_nargs_create().
	 */
	char			*esi_netname;
	char			*esi_path;
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;
147 
148 /*
149  * Collect together the mount-type specific and generic data args.
150  */
typedef struct domount_args {
	/* esi for the one server selected to be contacted for the mount */
	ephemeral_servinfo_t	*dma_esi;
	/* MAXPATHLEN-sized buffer; freed with that size on destroy */
	char			*dma_hostlist; /* comma-sep. for RO failover */
	/* head of the nfs_args list, linked via nfs_ext_u.nfs_extB.next */
	struct nfs_args		*dma_nargs;
} domount_args_t;
156 
157 
158 /*
159  * The vnode ops functions for a trigger stub vnode
160  */
161 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
162 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
163     caller_context_t *);
164 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
165     caller_context_t *);
166 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
167     caller_context_t *);
168 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
169     caller_context_t *);
170 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
171     struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
172     int *, pathname_t *);
173 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
174     enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
175     vsecattr_t *);
176 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
177     int);
178 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
179     caller_context_t *, int);
180 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
181     cred_t *, caller_context_t *, int);
182 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
183     vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
184 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
185     caller_context_t *, int);
186 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
187     cred_t *, caller_context_t *, int);
188 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
189 
190 /*
191  * Regular NFSv4 vnodeops that we need to reference directly
192  */
193 extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
194 		    caller_context_t *);
195 extern void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
196 extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
197 extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
198 extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
199 		    struct pathname *, int, vnode_t *, cred_t *,
200 		    caller_context_t *, int *, pathname_t *);
201 extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
202 		    caller_context_t *);
203 extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
204 		    caller_context_t *);
205 extern int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
206 extern int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
207 
208 static int	nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
209 static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
210     cred_t *, vnode_t **);
211 static int 	nfs4_trigger_domount_args_create(vnode_t *, cred_t *,
212     domount_args_t **dmap);
213 static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
214     vnode_t *vp);
215 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
216     cred_t *);
217 static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
218 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
219     servinfo4_t *);
220 static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
221     cred_t *);
222 static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
223     ephemeral_servinfo_t *);
224 static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
225 static char	*nfs4_trigger_create_mntopts(vfs_t *);
226 static void	nfs4_trigger_destroy_mntopts(char *);
227 static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
228 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
229 static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
230     struct netbuf *, int);
231 
232 extern int	umount2_engine(vfs_t *, int, cred_t *, int);
233 
234 vnodeops_t *nfs4_trigger_vnodeops;
235 
236 /*
237  * These are the vnodeops that we must define for stub vnodes.
238  *
239  *
240  * Many of the VOPs defined for NFSv4 do not need to be defined here,
241  * for various reasons. This will result in the VFS default function being
242  * used:
243  *
244  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
245  *   lost the reference to the stub vnode, meaning these should not be called:
246  *       close, read, write, ioctl, readdir, seek.
247  *
248  * - These VOPs are meaningless for vnodes without data pages. Since the
249  *   stub vnode is of type VDIR, these should not be called:
250  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
251  *
252  * - These VOPs are otherwise not applicable, and should not be called:
253  *       dump, setsecattr.
254  *
255  *
256  * These VOPs we do not want to define, but nor do we want the VFS default
257  * action. Instead, we specify the VFS error function, with fs_error(), but
258  * note that fs_error() is not actually called. Instead it results in the
259  * use of the error function defined for the particular VOP, in vn_ops_table[]:
260  *
261  * -   frlock, dispose, shrlock.
262  *
263  *
264  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
265  * NOTE: if any of these ops involve an OTW call with the stub FH, then
266  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
267  * to protect the security data in the servinfo4_t for the "parent"
268  * filesystem that contains the stub.
269  *
270  * - These VOPs should not trigger a mount, so that "ls -l" does not:
271  *       pathconf, getsecattr.
272  *
273  * - These VOPs would not make sense to trigger:
274  *       inactive, rwlock, rwunlock, fid, realvp.
275  */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	/* Stub-specific ops implemented in this file (nfs4_trigger_*). */
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	/* Regular NFSv4 ops, used directly on the stub: no mount trigger. */
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	/*
	 * fs_error is not itself called; it selects the error function
	 * defined for the particular VOP in vn_ops_table[].
	 */
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	/* terminator */
	NULL, NULL
};
303 
/*
 * Take one reference on the ephemeral tree. The caller must already hold
 * net_cnt_lock; use nfs4_ephemeral_tree_hold() when it is not held.
 */
static void
nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	net->net_refcnt++;
	/* a zero count after the increment would mean the counter wrapped */
	ASSERT(net->net_refcnt != 0);
}
311 
312 static void
313 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
314 {
315 	mutex_enter(&net->net_cnt_lock);
316 	nfs4_ephemeral_tree_incr(net);
317 	mutex_exit(&net->net_cnt_lock);
318 }
319 
/*
 * We need a safe way to decrement the refcnt whilst the
 * lock is being held.
 *
 * Drops one reference on the ephemeral tree. The caller must already hold
 * net_cnt_lock; use nfs4_ephemeral_tree_rele() when it is not held.
 * Nothing is freed here, even if the count reaches zero.
 */
static void
nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	/* catch an unbalanced release before the count goes below zero */
	ASSERT(net->net_refcnt != 0);
	net->net_refcnt--;
}
331 
332 static void
333 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
334 {
335 	mutex_enter(&net->net_cnt_lock);
336 	nfs4_ephemeral_tree_decr(net);
337 	mutex_exit(&net->net_cnt_lock);
338 }
339 
340 /*
341  * Trigger ops for stub vnodes; for mirror mounts, etc.
342  *
343  * The general idea is that a "triggering" op will first call
344  * nfs4_trigger_mount(), which will find out whether a mount has already
345  * been triggered.
346  *
347  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
348  * of the covering vfs.
349  *
350  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
351  * and again set newvp, as above.
352  *
353  * The triggering op may then re-issue the VOP by calling it on newvp.
354  *
355  * Note that some ops may perform custom action, and may or may not need
356  * to trigger a mount.
357  *
358  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
359  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
360  * and that would just recurse. Instead, we call the v4 op directly,
361  * by name.  This is OK, since we know that the vnode is for NFSv4,
362  * otherwise it couldn't be a stub.
363  *
364  */
365 
366 static int
367 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
368 {
369 	int error;
370 	vnode_t *newvp;
371 
372 	error = nfs4_trigger_mount(*vpp, cr, &newvp);
373 	if (error)
374 		return (error);
375 
376 	/* Release the stub vnode, as we're losing the reference to it */
377 	VN_RELE(*vpp);
378 
379 	/* Give the caller the root vnode of the newly-mounted fs */
380 	*vpp = newvp;
381 
382 	/* return with VN_HELD(newvp) */
383 	return (VOP_OPEN(vpp, flag, cr, ct));
384 }
385 
386 void
387 nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
388 {
389 	uint_t mask;
390 	timespec_t now;
391 
392 	/*
393 	 * Set some attributes here for referrals.
394 	 */
395 	mask = vap->va_mask;
396 	bzero(vap, sizeof (struct vattr));
397 	vap->va_mask	= mask;
398 	vap->va_uid	= 0;
399 	vap->va_gid	= 0;
400 	vap->va_nlink	= 1;
401 	vap->va_size	= 1;
402 	gethrestime(&now);
403 	vap->va_atime	= now;
404 	vap->va_mtime	= now;
405 	vap->va_ctime	= now;
406 	vap->va_type	= VDIR;
407 	vap->va_mode	= 0555;
408 	vap->va_fsid	= vp->v_vfsp->vfs_dev;
409 	vap->va_rdev	= 0;
410 	vap->va_blksize	= MAXBSIZE;
411 	vap->va_nblocks	= 1;
412 	vap->va_seq	= 0;
413 }
414 
415 /*
416  * For the majority of cases, nfs4_trigger_getattr() will not trigger
417  * a mount. However, if ATTR_TRIGGER is set, we are being informed
418  * that we need to force the mount before we attempt to determine
419  * the attributes. The intent is an atomic operation for security
420  * testing.
421  *
422  * If we're not triggering a mount, we can still inquire about the
423  * actual attributes from the server in the mirror mount case,
424  * and will return manufactured attributes for a referral (see
425  * the 'create' branch of find_referral_stubvp()).
426  */
427 static int
428 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
429     caller_context_t *ct)
430 {
431 	int error;
432 
433 	if (flags & ATTR_TRIGGER || RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
434 		vnode_t	*newvp;
435 
436 		error = nfs4_trigger_mount(vp, cr, &newvp);
437 		if (error)
438 			return (error);
439 
440 		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
441 		VN_RELE(newvp);
442 	} else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
443 
444 		nfs4_fake_attrs(vp, vap);
445 		error = 0;
446 	}
447 
448 	return (error);
449 }
450 
451 static int
452 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
453 		caller_context_t *ct)
454 {
455 	int error;
456 	vnode_t *newvp;
457 
458 	error = nfs4_trigger_mount(vp, cr, &newvp);
459 	if (error)
460 		return (error);
461 
462 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
463 	VN_RELE(newvp);
464 
465 	return (error);
466 }
467 
468 static int
469 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
470     caller_context_t *ct)
471 {
472 	int error;
473 	vnode_t *newvp;
474 
475 	error = nfs4_trigger_mount(vp, cr, &newvp);
476 	if (error)
477 		return (error);
478 
479 	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
480 	VN_RELE(newvp);
481 
482 	return (error);
483 }
484 
485 static int
486 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
487     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
488     caller_context_t *ct, int *deflags, pathname_t *rpnp)
489 {
490 	int error;
491 	vnode_t *newdvp;
492 	rnode4_t *drp = VTOR4(dvp);
493 
494 	ASSERT(RP_ISSTUB(drp));
495 
496 	/*
497 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
498 	 * that up. Instead, pass onto the regular op, regardless of whether
499 	 * we've triggered a mount.
500 	 */
501 	if (strcmp(nm, "..") == 0)
502 		if (RP_ISSTUB_MIRRORMOUNT(drp)) {
503 			return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
504 			    ct, deflags, rpnp));
505 		} else if (RP_ISSTUB_REFERRAL(drp)) {
506 			/* Return the parent vnode */
507 			return (vtodv(dvp, vpp, cr, TRUE));
508 		}
509 
510 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
511 	if (error)
512 		return (error);
513 
514 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
515 	    deflags, rpnp);
516 	VN_RELE(newdvp);
517 
518 	return (error);
519 }
520 
521 static int
522 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
523     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
524     int flags, caller_context_t *ct, vsecattr_t *vsecp)
525 {
526 	int error;
527 	vnode_t *newdvp;
528 
529 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
530 	if (error)
531 		return (error);
532 
533 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
534 	    flags, ct, vsecp);
535 	VN_RELE(newdvp);
536 
537 	return (error);
538 }
539 
540 static int
541 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
542     int flags)
543 {
544 	int error;
545 	vnode_t *newdvp;
546 
547 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
548 	if (error)
549 		return (error);
550 
551 	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
552 	VN_RELE(newdvp);
553 
554 	return (error);
555 }
556 
557 static int
558 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
559     caller_context_t *ct, int flags)
560 {
561 	int error;
562 	vnode_t *newtdvp;
563 
564 	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
565 	if (error)
566 		return (error);
567 
568 	/*
569 	 * We don't check whether svp is a stub. Let the NFSv4 code
570 	 * detect that error, and return accordingly.
571 	 */
572 	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
573 	VN_RELE(newtdvp);
574 
575 	return (error);
576 }
577 
578 static int
579 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
580     cred_t *cr, caller_context_t *ct, int flags)
581 {
582 	int error;
583 	vnode_t *newsdvp;
584 	rnode4_t *tdrp = VTOR4(tdvp);
585 
586 	/*
587 	 * We know that sdvp is a stub, otherwise we would not be here.
588 	 *
589 	 * If tdvp is also be a stub, there are two possibilities: it
590 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
591 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
592 	 *
593 	 * In the former case, just trigger sdvp, and treat tdvp as
594 	 * though it were not a stub.
595 	 *
596 	 * In the latter case, it might be a different stub for the
597 	 * same server fs as sdvp, or for a different server fs.
598 	 * Regardless, from the client perspective this would still
599 	 * be a cross-filesystem rename, and should not be allowed,
600 	 * so return EXDEV, without triggering either mount.
601 	 */
602 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
603 		return (EXDEV);
604 
605 	error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
606 	if (error)
607 		return (error);
608 
609 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
610 
611 	VN_RELE(newsdvp);
612 
613 	return (error);
614 }
615 
616 /* ARGSUSED */
617 static int
618 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
619     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
620 {
621 	int error;
622 	vnode_t *newdvp;
623 
624 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
625 	if (error)
626 		return (error);
627 
628 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
629 	VN_RELE(newdvp);
630 
631 	return (error);
632 }
633 
634 static int
635 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
636     caller_context_t *ct, int flags)
637 {
638 	int error;
639 	vnode_t *newdvp;
640 
641 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
642 	if (error)
643 		return (error);
644 
645 	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
646 	VN_RELE(newdvp);
647 
648 	return (error);
649 }
650 
651 static int
652 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
653     cred_t *cr, caller_context_t *ct, int flags)
654 {
655 	int error;
656 	vnode_t *newdvp;
657 
658 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
659 	if (error)
660 		return (error);
661 
662 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
663 	VN_RELE(newdvp);
664 
665 	return (error);
666 }
667 
668 static int
669 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
670     caller_context_t *ct)
671 {
672 	int error;
673 	vnode_t *newvp;
674 
675 	error = nfs4_trigger_mount(vp, cr, &newvp);
676 	if (error)
677 		return (error);
678 
679 	error = VOP_READLINK(newvp, uiop, cr, ct);
680 	VN_RELE(newvp);
681 
682 	return (error);
683 }
684 
685 /* end of trigger vnode ops */
686 
687 /*
688  * See if the mount has already been done by another caller.
689  */
690 static int
691 nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
692     bool_t *was_mounted, vfs_t **vfsp)
693 {
694 	int		error;
695 	mntinfo4_t	*mi = VTOMI4(vp);
696 
697 	*was_mounted = FALSE;
698 
699 	error = vn_vfsrlock_wait(vp);
700 	if (error)
701 		return (error);
702 
703 	*vfsp = vn_mountedvfs(vp);
704 	if (*vfsp != NULL) {
705 		/* the mount has already occurred */
706 		error = VFS_ROOT(*vfsp, newvpp);
707 		if (!error) {
708 			/* need to update the reference time  */
709 			mutex_enter(&mi->mi_lock);
710 			if (mi->mi_ephemeral)
711 				mi->mi_ephemeral->ne_ref_time =
712 				    gethrestime_sec();
713 			mutex_exit(&mi->mi_lock);
714 
715 			*was_mounted = TRUE;
716 		}
717 	}
718 
719 	vn_vfsunlock(vp);
720 	return (0);
721 }
722 
/*
 * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
 *
 * The mount may have already occurred, via another thread. If not,
 * assemble the location information - which may require fetching - and
 * perform the mount.
 *
 * Sets newvp to be the root of the fs that is now covering vp. Note
 * that we return with VN_HELD(*newvp).
 *
 * The caller is responsible for passing the VOP onto the covering fs.
 *
 * Returns 0 on success. Returns EIO, without mounting, if the ephemeral
 * tree has any NFS4_EPHEMERAL_TREE_PROCESSING status bit set. Returns
 * ENOSYS if nothing failed but no root vnode was obtained. Otherwise
 * returns the error from the helper that failed. *newvpp is set to NULL
 * on entry, so it is NULL on any path that does not obtain a root vnode.
 */
static int
nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
{
	int			 error;
	vfs_t			*vfsp;
	rnode4_t		*rp = VTOR4(vp);
	mntinfo4_t		*mi = VTOMI4(vp);
	domount_args_t		*dma;

	nfs4_ephemeral_tree_t	*net;

	/* TRUE once net_tree_lock is held and MOUNTING has been set */
	bool_t			must_unlock = FALSE;
	/* TRUE if this call allocated the tree */
	bool_t			is_building = FALSE;
	bool_t			was_mounted = FALSE;

	cred_t			*mcred = NULL;

	nfs4_trigger_globals_t	*ntg;

	zone_t			*zone = curproc->p_zone;

	ASSERT(RP_ISSTUB(rp));

	*newvpp = NULL;

	/*
	 * Has the mount already occurred?
	 */
	error = nfs4_trigger_mounted_already(vp, newvpp,
	    &was_mounted, &vfsp);
	/* must_unlock is still FALSE here, so "done" touches no locks */
	if (error || was_mounted)
		goto done;

	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		/* No tree yet for this mntinfo4: create one, as its root */
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);

		MI4_HOLD(mi);
		VFS_HOLD(mi->mi_vfsp);
	} else {
		/*
		 * Existing tree: take a reference while mi_lock is still
		 * held, and only then drop mi_lock to wait for the tree lock.
		 */
		net = mi->mi_ephemeral_tree;
		nfs4_ephemeral_tree_hold(net);

		mutex_exit(&mi->mi_lock);

		mutex_enter(&net->net_tree_lock);

		/*
		 * We can only proceed if the tree is neither locked
		 * nor being torn down.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
			/*
			 * Undo the hold taken above and release both locks
			 * by hand; this return does not go through "done".
			 */
			nfs4_ephemeral_tree_decr(net);
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			return (EIO);
		}
		mutex_exit(&net->net_cnt_lock);
	}

	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	/* From here on every exit must go through "done" to unwind */
	must_unlock = TRUE;

	error = nfs4_trigger_domount_args_create(vp, cr, &dma);
	if (error)
		goto done;

	/*
	 * Note that since we define mirror mounts to work
	 * for any user, we simply extend the privileges of
	 * the user's credentials to allow the mount to
	 * proceed.
	 */
	mcred = crdup(cr);
	if (mcred == NULL) {
		error = EINVAL;
		nfs4_trigger_domount_args_destroy(dma, vp);
		goto done;
	}

	crset_zone_privall(mcred);
	if (is_system_labeled())
		(void) setpflags(NET_MAC_AWARE, 1, mcred);

	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
	/* the mount arguments are no longer needed, whatever the outcome */
	nfs4_trigger_domount_args_destroy(dma, vp);

	DTRACE_PROBE2(nfs4clnt__func__referral__mount,
	    vnode_t *, vp, int, error);

	crfree(mcred);

done:

	if (must_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;

		/*
		 * REFCNT: If we are the root of the tree, then we need
		 * to keep a reference because we malloced the tree and
		 * this is where we tied it to our mntinfo.
		 *
		 * If we are not the root of the tree, then our tie to
		 * the mntinfo occurred elsewhere and we need to
		 * decrement the reference to the tree.
		 */
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		else
			nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	/* "Success" without a root vnode is reported as an error */
	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}
892 
893 /*
894  * Collect together both the generic & mount-type specific args.
895  */
896 static int
897 nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr, domount_args_t **dmap)
898 {
899 	int nointr;
900 	char *hostlist;
901 	servinfo4_t *svp;
902 	struct nfs_args *nargs, *nargs_head;
903 	enum clnt_stat status;
904 	ephemeral_servinfo_t *esi, *esi_first;
905 	domount_args_t *dma;
906 	mntinfo4_t *mi = VTOMI4(vp);
907 
908 	nointr = !(mi->mi_flags & MI4_INT);
909 	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
910 
911 	svp = mi->mi_curr_serv;
912 	/* check if the current server is responding */
913 	status = nfs4_trigger_ping_server(svp, nointr);
914 	if (status == RPC_SUCCESS) {
915 		esi_first = nfs4_trigger_esi_create(vp, svp, cr);
916 		if (esi_first == NULL) {
917 			kmem_free(hostlist, MAXPATHLEN);
918 			return (EINVAL);
919 		}
920 
921 		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
922 
923 		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
924 	} else {
925 		/* current server did not respond */
926 		esi_first = NULL;
927 		nargs_head = NULL;
928 	}
929 	nargs = nargs_head;
930 
931 	/*
932 	 * NFS RO failover.
933 	 *
934 	 * If we have multiple servinfo4 structures, linked via sv_next,
935 	 * we must create one nfs_args for each, linking the nfs_args via
936 	 * nfs_ext_u.nfs_extB.next.
937 	 *
938 	 * We need to build a corresponding esi for each, too, but that is
939 	 * used solely for building nfs_args, and may be immediately
940 	 * discarded, as domount() requires the info from just one esi,
941 	 * but all the nfs_args.
942 	 *
943 	 * Currently, the NFS mount code will hang if not all servers
944 	 * requested are available. To avoid that, we need to ping each
945 	 * server, here, and remove it from the list if it is not
946 	 * responding. This has the side-effect of that server then
947 	 * being permanently unavailable for this failover mount, even if
948 	 * it recovers. That's unfortunate, but the best we can do until
949 	 * the mount code path is fixed.
950 	 */
951 
952 	/*
953 	 * If the current server was down, loop indefinitely until we find
954 	 * at least one responsive server.
955 	 */
956 	do {
957 		/* no locking needed for sv_next; it is only set at fs mount */
958 		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
959 			struct nfs_args *next;
960 
961 			/*
962 			 * nargs_head: the head of the nfs_args list
963 			 * nargs: the current tail of the list
964 			 * next: the newly-created element to be added
965 			 */
966 
967 			/*
968 			 * We've already tried the current server, above;
969 			 * if it was responding, we have already included it
970 			 * and it may now be ignored.
971 			 *
972 			 * Otherwise, try it again, since it may now have
973 			 * recovered.
974 			 */
975 			if (svp == mi->mi_curr_serv && esi_first != NULL)
976 				continue;
977 
978 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
979 			if (svp->sv_flags & SV4_NOTINUSE) {
980 				nfs_rw_exit(&svp->sv_lock);
981 				continue;
982 			}
983 			nfs_rw_exit(&svp->sv_lock);
984 
985 			/* check if the server is responding */
986 			status = nfs4_trigger_ping_server(svp, nointr);
987 			if (status == RPC_INTR) {
988 				kmem_free(hostlist, MAXPATHLEN);
989 				nfs4_trigger_esi_destroy(esi_first, vp);
990 				nargs = nargs_head;
991 				while (nargs != NULL) {
992 					next = nargs->nfs_ext_u.nfs_extB.next;
993 					nfs4_trigger_nargs_destroy(nargs);
994 					nargs = next;
995 				}
996 				return (EINTR);
997 			} else if (status != RPC_SUCCESS) {
998 				/* if the server did not respond, ignore it */
999 				continue;
1000 			}
1001 
1002 			esi = nfs4_trigger_esi_create(vp, svp, cr);
1003 			if (esi == NULL)
1004 				continue;
1005 
1006 			/*
1007 			 * If the original current server (mi_curr_serv)
1008 			 * was down when when we first tried it,
1009 			 * (i.e. esi_first == NULL),
1010 			 * we select this new server (svp) to be the server
1011 			 * that we will actually contact (esi_first).
1012 			 *
1013 			 * Note that it's possible that mi_curr_serv == svp,
1014 			 * if that mi_curr_serv was down but has now recovered.
1015 			 */
1016 			next = nfs4_trigger_nargs_create(mi, svp, esi);
1017 			if (esi_first == NULL) {
1018 				ASSERT(nargs == NULL);
1019 				ASSERT(nargs_head == NULL);
1020 				nargs_head = next;
1021 				esi_first = esi;
1022 				(void) strlcpy(hostlist,
1023 				    esi_first->esi_hostname, MAXPATHLEN);
1024 			} else {
1025 				ASSERT(nargs_head != NULL);
1026 				nargs->nfs_ext_u.nfs_extB.next = next;
1027 				(void) strlcat(hostlist, ",", MAXPATHLEN);
1028 				(void) strlcat(hostlist, esi->esi_hostname,
1029 				    MAXPATHLEN);
1030 				/* esi was only needed for hostname & nargs */
1031 				nfs4_trigger_esi_destroy(esi, vp);
1032 			}
1033 
1034 			nargs = next;
1035 		}
1036 
1037 		/* if we've had no response at all, wait a second */
1038 		if (esi_first == NULL)
1039 			delay(drv_usectohz(1000000));
1040 
1041 	} while (esi_first == NULL);
1042 	ASSERT(nargs_head != NULL);
1043 
1044 	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
1045 	dma->dma_esi = esi_first;
1046 	dma->dma_hostlist = hostlist;
1047 	dma->dma_nargs = nargs_head;
1048 	*dmap = dma;
1049 
1050 	return (0);
1051 }
1052 
1053 static void
1054 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
1055 {
1056 	if (dma != NULL) {
1057 		if (dma->dma_esi != NULL && vp != NULL)
1058 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
1059 
1060 		if (dma->dma_hostlist != NULL)
1061 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
1062 
1063 		if (dma->dma_nargs != NULL) {
1064 			struct nfs_args *nargs = dma->dma_nargs;
1065 
1066 			do {
1067 				struct nfs_args *next =
1068 				    nargs->nfs_ext_u.nfs_extB.next;
1069 
1070 				nfs4_trigger_nargs_destroy(nargs);
1071 				nargs = next;
1072 			} while (nargs != NULL);
1073 		}
1074 
1075 		kmem_free(dma, sizeof (domount_args_t));
1076 	}
1077 }
1078 
1079 /*
1080  * The ephemeral_servinfo_t struct contains basic information we will need to
1081  * perform the mount. Whilst the structure is generic across different
1082  * types of ephemeral mount, the way we gather its contents differs.
1083  */
1084 static ephemeral_servinfo_t *
1085 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
1086 {
1087 	ephemeral_servinfo_t *esi;
1088 	rnode4_t *rp = VTOR4(vp);
1089 
1090 	ASSERT(RP_ISSTUB(rp));
1091 
1092 	/* Call the ephemeral type-specific routine */
1093 	if (RP_ISSTUB_MIRRORMOUNT(rp))
1094 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
1095 	else if (RP_ISSTUB_REFERRAL(rp))
1096 		esi = nfs4_trigger_esi_create_referral(vp, cr);
1097 	else
1098 		esi = NULL;
1099 	return (esi);
1100 }
1101 
1102 static void
1103 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1104 {
1105 	rnode4_t *rp = VTOR4(vp);
1106 
1107 	ASSERT(RP_ISSTUB(rp));
1108 
1109 	/* Currently, no need for an ephemeral type-specific routine */
1110 
1111 	/*
1112 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
1113 	 * and will be handled by nfs4_trigger_nargs_destroy().
1114 	 * We need only free the structure itself.
1115 	 */
1116 	if (esi != NULL)
1117 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
1118 }
1119 
1120 /*
1121  * Some of this may turn out to be common with other ephemeral types,
1122  * in which case it should be moved to nfs4_trigger_esi_create(), or a
1123  * common function called.
1124  */
1125 
1126 /*
1127  * Mirror mounts case - should have all data available
1128  */
1129 static ephemeral_servinfo_t *
1130 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
1131 {
1132 	char			*stubpath;
1133 	struct knetconfig	*sikncp, *svkncp;
1134 	struct netbuf		*bufp;
1135 	ephemeral_servinfo_t	*esi;
1136 
1137 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1138 
1139 	/* initially set to be our type of ephemeral mount; may be added to */
1140 	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
1141 
1142 	/*
1143 	 * We're copying info from the stub rnode's servinfo4, but
1144 	 * we must create new copies, not pointers, since this information
1145 	 * is to be associated with the new mount, which will be
1146 	 * unmounted (and its structures freed) separately
1147 	 */
1148 
1149 	/*
1150 	 * Sizes passed to kmem_[z]alloc here must match those freed
1151 	 * in nfs4_free_args()
1152 	 */
1153 
1154 	/*
1155 	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
1156 	 * is difficult to avoid: as we need to read svp to calculate the
1157 	 * sizes to be allocated.
1158 	 */
1159 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1160 
1161 	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1162 	(void) strcat(esi->esi_hostname, svp->sv_hostname);
1163 
1164 	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1165 	bufp = esi->esi_addr;
1166 	bufp->len = svp->sv_addr.len;
1167 	bufp->maxlen = svp->sv_addr.maxlen;
1168 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1169 	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1170 
1171 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1172 	sikncp = esi->esi_knconf;
1173 	svkncp = svp->sv_knconf;
1174 	sikncp->knc_semantics = svkncp->knc_semantics;
1175 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1176 	(void) strcat((char *)sikncp->knc_protofmly,
1177 	    (char *)svkncp->knc_protofmly);
1178 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1179 	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1180 	sikncp->knc_rdev = svkncp->knc_rdev;
1181 
1182 	/*
1183 	 * Used when AUTH_DH is negotiated.
1184 	 *
1185 	 * This is ephemeral mount-type specific, since it contains the
1186 	 * server's time-sync syncaddr.
1187 	 */
1188 	if (svp->sv_dhsec) {
1189 		struct netbuf *bufp;
1190 		sec_data_t *sdata;
1191 		dh_k4_clntdata_t *data;
1192 
1193 		sdata = svp->sv_dhsec;
1194 		data = (dh_k4_clntdata_t *)sdata->data;
1195 		ASSERT(sdata->rpcflavor == AUTH_DH);
1196 
1197 		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1198 		bufp->len = data->syncaddr.len;
1199 		bufp->maxlen = data->syncaddr.maxlen;
1200 		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1201 		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1202 		esi->esi_syncaddr = bufp;
1203 
1204 		if (data->netname != NULL) {
1205 			int nmlen = data->netnamelen;
1206 
1207 			/*
1208 			 * We need to copy from a dh_k4_clntdata_t
1209 			 * netname/netnamelen pair to a NUL-terminated
1210 			 * netname string suitable for putting in nfs_args,
1211 			 * where the latter has no netnamelen field.
1212 			 */
1213 			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1214 			bcopy(data->netname, esi->esi_netname, nmlen);
1215 		}
1216 	} else {
1217 		esi->esi_syncaddr = NULL;
1218 		esi->esi_netname = NULL;
1219 	}
1220 
1221 	stubpath = fn_path(VTOSV(vp)->sv_name);
1222 	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1223 	ASSERT(*stubpath == '.');
1224 	stubpath += 1;
1225 
1226 	/* for nfs_args->fh */
1227 	esi->esi_path_len = strlen(stubpath) + 1;
1228 	if (strcmp(svp->sv_path, "/") != 0)
1229 		esi->esi_path_len += strlen(svp->sv_path);
1230 	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1231 	if (strcmp(svp->sv_path, "/") != 0)
1232 		(void) strcat(esi->esi_path, svp->sv_path);
1233 	(void) strcat(esi->esi_path, stubpath);
1234 
1235 	stubpath -= 1;
1236 	/* stubpath allocated by fn_path() */
1237 	kmem_free(stubpath, strlen(stubpath) + 1);
1238 
1239 	nfs_rw_exit(&svp->sv_lock);
1240 
1241 	return (esi);
1242 }
1243 
1244 /*
1245  * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to
1246  * get network information required to do the mount call.
1247  */
1248 int
1249 nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
1250 {
1251 	door_arg_t	door_args;
1252 	door_handle_t	dh;
1253 	XDR		xdr;
1254 	refd_door_args_t *xdr_argsp;
1255 	refd_door_res_t  *orig_resp;
1256 	k_sigset_t	smask;
1257 	int		xdr_len = 0;
1258 	int 		res_len = 16; /* length of an ip adress */
1259 	int		orig_reslen = res_len;
1260 	int		error = 0;
1261 	struct nfsidmap_globals *nig;
1262 
1263 	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
1264 		return (ECONNREFUSED);
1265 
1266 	nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
1267 	ASSERT(nig != NULL);
1268 
1269 	mutex_enter(&nig->nfsidmap_daemon_lock);
1270 	dh = nig->nfsidmap_daemon_dh;
1271 	if (dh == NULL) {
1272 		mutex_exit(&nig->nfsidmap_daemon_lock);
1273 		cmn_err(CE_NOTE,
1274 		    "nfs4_callmapid: nfsmapid daemon not " \
1275 		    "running unable to resolve host name\n");
1276 		return (EINVAL);
1277 	}
1278 	door_ki_hold(dh);
1279 	mutex_exit(&nig->nfsidmap_daemon_lock);
1280 
1281 	xdr_len = xdr_sizeof(&(xdr_utf8string), server);
1282 
1283 	xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
1284 	xdr_argsp->xdr_len = xdr_len;
1285 	xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;
1286 
1287 	xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
1288 	    xdr_len, XDR_ENCODE);
1289 
1290 	if (!xdr_utf8string(&xdr, server)) {
1291 		kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1292 		door_ki_rele(dh);
1293 		return (1);
1294 	}
1295 
1296 	if (orig_reslen)
1297 		orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);
1298 
1299 	door_args.data_ptr = (char *)xdr_argsp;
1300 	door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
1301 	door_args.desc_ptr = NULL;
1302 	door_args.desc_num = 0;
1303 	door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
1304 	door_args.rsize = res_len;
1305 
1306 	sigintr(&smask, 1);
1307 	error = door_ki_upcall(dh, &door_args);
1308 	sigunintr(&smask);
1309 
1310 	door_ki_rele(dh);
1311 
1312 	kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1313 	if (error) {
1314 		kmem_free(orig_resp, orig_reslen);
1315 		/*
1316 		 * There is no door to connect to. The referral daemon
1317 		 * must not be running yet.
1318 		 */
1319 		cmn_err(CE_WARN,
1320 		    "nfsmapid not running cannot resolve host name");
1321 		goto out;
1322 	}
1323 
1324 	/*
1325 	 * If the results buffer passed back are not the same as
1326 	 * what was sent free the old buffer and use the new one.
1327 	 */
1328 	if (orig_resp && orig_reslen) {
1329 		refd_door_res_t *door_resp;
1330 
1331 		door_resp = (refd_door_res_t *)door_args.rbuf;
1332 		if ((void *)door_args.rbuf != orig_resp)
1333 			kmem_free(orig_resp, orig_reslen);
1334 		if (door_resp->res_status == 0) {
1335 			xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
1336 			    door_resp->xdr_len, XDR_DECODE);
1337 			bzero(resp, sizeof (struct nfs_fsl_info));
1338 			if (!xdr_nfs_fsl_info(&xdr, resp)) {
1339 				DTRACE_PROBE2(
1340 				    nfs4clnt__debug__referral__upcall__xdrfail,
1341 				    struct nfs_fsl_info *, resp,
1342 				    char *, "nfs4_callmapid");
1343 				error = EINVAL;
1344 			}
1345 		} else {
1346 			DTRACE_PROBE2(
1347 			    nfs4clnt__debug__referral__upcall__badstatus,
1348 			    int, door_resp->res_status,
1349 			    char *, "nfs4_callmapid");
1350 			error = door_resp->res_status;
1351 		}
1352 		kmem_free(door_args.rbuf, door_args.rsize);
1353 	}
1354 out:
1355 	DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
1356 	    char *, server, int, error);
1357 	return (error);
1358 }
1359 
1360 /*
1361  * Fetches the fs_locations attribute. Typically called
1362  * from a Replication/Migration/Referrals/Mirror-mount context
1363  *
1364  * Fills in the attributes in garp. The caller is assumed
1365  * to have allocated memory for garp.
1366  *
1367  * lock: if set do not lock s_recovlock and mi_recovlock mutex,
1368  *	 it's already done by caller. Otherwise lock these mutexes
1369  *	 before doing the rfs4call().
1370  *
1371  * Returns
1372  * 	1	 for success
1373  * 	0	 for failure
1374  */
1375 int
1376 nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
1377     cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
1378 {
1379 	COMPOUND4args_clnt args;
1380 	COMPOUND4res_clnt res;
1381 	nfs_argop4 *argop;
1382 	int argoplist_size = 3 * sizeof (nfs_argop4);
1383 	nfs4_server_t *sp = NULL;
1384 	int doqueue = 1;
1385 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1386 	int retval = 1;
1387 	struct nfs4_clnt *nfscl;
1388 
1389 	if (lock == TRUE)
1390 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1391 	else
1392 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
1393 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
1394 
1395 	sp = find_nfs4_server(mi);
1396 	if (lock == TRUE)
1397 		nfs_rw_exit(&mi->mi_recovlock);
1398 
1399 	if (sp != NULL)
1400 		mutex_exit(&sp->s_lock);
1401 
1402 	if (lock == TRUE) {
1403 		if (sp != NULL)
1404 			(void) nfs_rw_enter_sig(&sp->s_recovlock,
1405 			    RW_WRITER, 0);
1406 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1407 	} else {
1408 		if (sp != NULL) {
1409 			ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
1410 			    nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
1411 		}
1412 	}
1413 
1414 	/*
1415 	 * Do we want to do the setup for recovery here?
1416 	 *
1417 	 * We know that the server responded to a null ping a very
1418 	 * short time ago, and we know that we intend to do a
1419 	 * single stateless operation - we want to fetch attributes,
1420 	 * so we know we can't encounter errors about state.  If
1421 	 * something goes wrong with the GETATTR, like not being
1422 	 * able to get a response from the server or getting any
1423 	 * kind of FH error, we should fail the mount.
1424 	 *
1425 	 * We may want to re-visited this at a later time.
1426 	 */
1427 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
1428 
1429 	args.ctag = TAG_GETATTR_FSLOCATION;
1430 	/* PUTFH LOOKUP GETATTR */
1431 	args.array_len = 3;
1432 	args.array = argop;
1433 
1434 	/* 0. putfh file */
1435 	argop[0].argop = OP_CPUTFH;
1436 	argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1437 
1438 	/* 1. lookup name, can't be dotdot */
1439 	argop[1].argop = OP_CLOOKUP;
1440 	argop[1].nfs_argop4_u.opclookup.cname = nm;
1441 
1442 	/* 2. file attrs */
1443 	argop[2].argop = OP_GETATTR;
1444 	argop[2].nfs_argop4_u.opgetattr.attr_request =
1445 	    FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
1446 	    FATTR4_MOUNTED_ON_FILEID_MASK;
1447 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
1448 
1449 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1450 
1451 	if (lock == TRUE) {
1452 		nfs_rw_exit(&mi->mi_recovlock);
1453 		if (sp != NULL)
1454 			nfs_rw_exit(&sp->s_recovlock);
1455 	}
1456 
1457 	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1458 	nfscl->nfscl_stat.referrals.value.ui64++;
1459 	DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
1460 	    nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);
1461 
1462 	if (e.error != 0) {
1463 		if (sp != NULL)
1464 			nfs4_server_rele(sp);
1465 		kmem_free(argop, argoplist_size);
1466 		return (0);
1467 	}
1468 
1469 	/*
1470 	 * Check for all possible error conditions.
1471 	 * For valid replies without an ops array or for illegal
1472 	 * replies, return a failure.
1473 	 */
1474 	if (res.status != NFS4_OK || res.array_len < 3 ||
1475 	    res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
1476 		retval = 0;
1477 		goto exit;
1478 	}
1479 
1480 	/*
1481 	 * There isn't much value in putting the attributes
1482 	 * in the attr cache since fs_locations4 aren't
1483 	 * encountered very frequently, so just make them
1484 	 * available to the caller.
1485 	 */
1486 	*garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;
1487 
1488 	DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
1489 	    nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");
1490 
1491 	/* No fs_locations? -- return a failure */
1492 	if (garp->n4g_ext_res == NULL ||
1493 	    garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
1494 		retval = 0;
1495 		goto exit;
1496 	}
1497 
1498 	if (!garp->n4g_fsid_valid)
1499 		retval = 0;
1500 
1501 exit:
1502 	if (retval == 0) {
1503 		/* the call was ok but failed validating the call results */
1504 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1505 	} else {
1506 		ASSERT(callres != NULL);
1507 		*callres = res;
1508 	}
1509 
1510 	if (sp != NULL)
1511 		nfs4_server_rele(sp);
1512 	kmem_free(argop, argoplist_size);
1513 	return (retval);
1514 }
1515 
/*
 * Tunable to disable referral mounts: when non-zero,
 * find_referral_stubvp() returns NULL without doing any work.
 */
int nfs4_no_referrals = 0;
1518 
1519 /*
1520  * Returns NULL if the vnode cannot be created or found.
1521  */
1522 vnode_t *
1523 find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
1524 {
1525 	nfs_fh4 *stub_fh, *dfh;
1526 	nfs4_sharedfh_t *sfhp;
1527 	char *newfhval;
1528 	vnode_t *vp = NULL;
1529 	fattr4_mounted_on_fileid mnt_on_fileid;
1530 	nfs4_ga_res_t garp;
1531 	mntinfo4_t *mi;
1532 	COMPOUND4res_clnt callres;
1533 	hrtime_t t;
1534 
1535 	if (nfs4_no_referrals)
1536 		return (NULL);
1537 
1538 	/*
1539 	 * Get the mounted_on_fileid, unique on that server::fsid
1540 	 */
1541 	mi = VTOMI4(dvp);
1542 	if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
1543 	    &garp, &callres, FALSE) == 0)
1544 		return (NULL);
1545 	mnt_on_fileid = garp.n4g_mon_fid;
1546 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1547 
1548 	/*
1549 	 * Build a fake filehandle from the dir FH and the mounted_on_fileid
1550 	 */
1551 	dfh = &VTOR4(dvp)->r_fh->sfh_fh;
1552 	stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
1553 	stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
1554 	    sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
1555 	newfhval = stub_fh->nfs_fh4_val;
1556 
1557 	/* copy directory's file handle */
1558 	bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
1559 	stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
1560 	newfhval = newfhval + dfh->nfs_fh4_len;
1561 
1562 	/* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
1563 	bcopy((char *)&mnt_on_fileid, newfhval,
1564 	    sizeof (fattr4_mounted_on_fileid));
1565 	stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);
1566 
1567 	sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
1568 	kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
1569 	    sizeof (fattr4_mounted_on_fileid));
1570 	kmem_free(stub_fh, sizeof (nfs_fh4));
1571 	if (sfhp == NULL)
1572 		return (NULL);
1573 
1574 	t = gethrtime();
1575 	garp.n4g_va.va_type = VDIR;
1576 	vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
1577 	    cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
1578 
1579 	if (vp != NULL)
1580 		vp->v_type = VDIR;
1581 
1582 	sfh4_rele(&sfhp);
1583 	return (vp);
1584 }
1585 
1586 int
1587 nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1588 {
1589 	vnode_t *nvp;
1590 	rnode4_t *rp;
1591 
1592 	if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
1593 		return (EINVAL);
1594 
1595 	rp = VTOR4(nvp);
1596 	mutex_enter(&rp->r_statelock);
1597 	r4_stub_referral(rp);
1598 	mutex_exit(&rp->r_statelock);
1599 	dnlc_enter(dvp, nm, nvp);
1600 
1601 	if (*vpp != NULL)
1602 		VN_RELE(*vpp);	/* no longer need this vnode */
1603 
1604 	*vpp = nvp;
1605 
1606 	return (0);
1607 }
1608 
1609 /*
1610  * Fetch the location information and resolve the new server.
1611  * Caller needs to free up the XDR data which is returned.
1612  * Input: mount info, shared filehandle, nodename
1613  * Return: Index to the result or Error(-1)
1614  * Output: FsLocations Info, Resolved Server Info.
1615  */
1616 int
1617 nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
1618     char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
1619     struct nfs_fsl_info *fsloc)
1620 {
1621 	fs_location4 *fsp;
1622 	struct nfs_fsl_info nfsfsloc;
1623 	int ret, i, error;
1624 	nfs4_ga_res_t garp;
1625 	COMPOUND4res_clnt callres;
1626 	struct knetconfig *knc;
1627 
1628 	ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
1629 	if (ret == 0)
1630 		return (-1);
1631 
1632 	/*
1633 	 * As a lame attempt to figuring out if we're
1634 	 * handling a migration event or a referral,
1635 	 * look for rnodes with this fsid in the rnode
1636 	 * cache.
1637 	 *
1638 	 * If we can find one or more such rnodes, it
1639 	 * means we're handling a migration event and
1640 	 * we want to bail out in that case.
1641 	 */
1642 	if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
1643 		DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
1644 		    mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
1645 		    char *, "nfs4_process_referral");
1646 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1647 		return (-1);
1648 	}
1649 
1650 	/*
1651 	 * Find the first responsive server to mount.  When we find
1652 	 * one, fsp will point to it.
1653 	 */
1654 	for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {
1655 
1656 		fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
1657 		if (fsp->server_len == 0 || fsp->server_val == NULL)
1658 			continue;
1659 
1660 		error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
1661 		if (error != 0)
1662 			continue;
1663 
1664 		error = nfs4_ping_server_common(nfsfsloc.knconf,
1665 		    nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
1666 		if (error == RPC_SUCCESS)
1667 			break;
1668 
1669 		DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
1670 		    sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
1671 		    char *, "nfs4_process_referral");
1672 
1673 		(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1674 	}
1675 	knc = nfsfsloc.knconf;
1676 	if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
1677 	    (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
1678 		DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
1679 		    nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
1680 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1681 		return (-1);
1682 	}
1683 
1684 	/* Send the results back */
1685 	*fsloc = nfsfsloc;
1686 	*grp = garp;
1687 	*res = callres;
1688 	return (i);
1689 }
1690 
1691 /*
1692  * Referrals case - need to fetch referral data and then upcall to
1693  * user-level to get complete mount data.
1694  */
1695 static ephemeral_servinfo_t *
1696 nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
1697 {
1698 	struct knetconfig	*sikncp, *svkncp;
1699 	struct netbuf		*bufp;
1700 	ephemeral_servinfo_t	*esi;
1701 	vnode_t			*dvp;
1702 	rnode4_t		*drp;
1703 	fs_location4		*fsp;
1704 	struct nfs_fsl_info	nfsfsloc;
1705 	nfs4_ga_res_t		garp;
1706 	char			*p;
1707 	char			fn[MAXNAMELEN];
1708 	int			i, index = -1;
1709 	mntinfo4_t		*mi;
1710 	COMPOUND4res_clnt	callres;
1711 
1712 	/*
1713 	 * If we're passed in a stub vnode that
1714 	 * isn't a "referral" stub, bail out
1715 	 * and return a failure
1716 	 */
1717 	if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
1718 		return (NULL);
1719 
1720 	if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
1721 		return (NULL);
1722 
1723 	drp = VTOR4(dvp);
1724 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
1725 		VN_RELE(dvp);
1726 		return (NULL);
1727 	}
1728 
1729 	if (vtoname(vp, fn, MAXNAMELEN) != 0) {
1730 		nfs_rw_exit(&drp->r_rwlock);
1731 		VN_RELE(dvp);
1732 		return (NULL);
1733 	}
1734 
1735 	mi = VTOMI4(dvp);
1736 	index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
1737 	    &garp, &callres, &nfsfsloc);
1738 	nfs_rw_exit(&drp->r_rwlock);
1739 	VN_RELE(dvp);
1740 	if (index < 0)
1741 		return (NULL);
1742 
1743 	fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1744 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1745 
1746 	/* initially set to be our type of ephemeral mount; may be added to */
1747 	esi->esi_mount_flags = NFSMNT_REFERRAL;
1748 
1749 	esi->esi_hostname =
1750 	    kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
1751 	bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
1752 	    fsp->server_val->utf8string_len);
1753 	esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';
1754 
1755 	bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
1756 	bufp->len = nfsfsloc.addr->len;
1757 	bufp->maxlen = nfsfsloc.addr->maxlen;
1758 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1759 	bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
1760 	esi->esi_addr = bufp;
1761 
1762 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1763 	sikncp = esi->esi_knconf;
1764 
1765 	DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
1766 	    struct nfs_fsl_info *, &nfsfsloc,
1767 	    char *, "nfs4_trigger_esi_create_referral");
1768 
1769 	svkncp = nfsfsloc.knconf;
1770 	sikncp->knc_semantics = svkncp->knc_semantics;
1771 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1772 	(void) strlcat((char *)sikncp->knc_protofmly,
1773 	    (char *)svkncp->knc_protofmly, KNC_STRSIZE);
1774 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1775 	(void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
1776 	    KNC_STRSIZE);
1777 	sikncp->knc_rdev = svkncp->knc_rdev;
1778 
1779 	DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
1780 	    struct knetconfig *, sikncp,
1781 	    char *, "nfs4_trigger_esi_create_referral");
1782 
1783 	esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
1784 	bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
1785 	esi->esi_syncaddr = NULL;
1786 
1787 	esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1788 	esi->esi_path_len = MAXPATHLEN;
1789 	*p++ = '/';
1790 	for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1791 		component4 *comp;
1792 
1793 		comp = &fsp->rootpath.pathname4_val[i];
1794 		/* If no space, null the string and bail */
1795 		if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
1796 			goto err;
1797 		bcopy(comp->utf8string_val, p, comp->utf8string_len);
1798 		p += comp->utf8string_len;
1799 		*p++ = '/';
1800 	}
1801 	if (fsp->rootpath.pathname4_len != 0)
1802 		*(p - 1) = '\0';
1803 	else
1804 		*p = '\0';
1805 	p = esi->esi_path;
1806 	esi->esi_path = strdup(p);
1807 	esi->esi_path_len = strlen(p) + 1;
1808 	kmem_free(p, MAXPATHLEN);
1809 
1810 	/* Allocated in nfs4_process_referral() */
1811 	(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1812 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1813 
1814 	return (esi);
1815 err:
1816 	kmem_free(esi->esi_path, esi->esi_path_len);
1817 	kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
1818 	kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
1819 	kmem_free(esi->esi_addr, sizeof (struct netbuf));
1820 	kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
1821 	kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
1822 	kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
1823 	kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
1824 	kmem_free(esi, sizeof (ephemeral_servinfo_t));
1825 	(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1826 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1827 	return (NULL);
1828 }
1829 
1830 /*
1831  * Assemble the args, and call the generic VFS mount function to
1832  * finally perform the ephemeral mount.
1833  */
1834 static int
1835 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1836     cred_t *cr, vnode_t **newvpp)
1837 {
1838 	struct mounta	*uap;
1839 	char		*mntpt, *orig_path, *path;
1840 	const char	*orig_mntpt;
1841 	int		retval;
1842 	int		mntpt_len;
1843 	int		spec_len;
1844 	zone_t		*zone = curproc->p_zone;
1845 	bool_t		has_leading_slash;
1846 	int		i;
1847 
1848 	vfs_t			*stubvfsp = stubvp->v_vfsp;
1849 	ephemeral_servinfo_t	*esi = dma->dma_esi;
1850 	struct nfs_args		*nargs = dma->dma_nargs;
1851 
1852 	/* first, construct the mount point for the ephemeral mount */
1853 	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1854 	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1855 
1856 	if (*orig_path == '.')
1857 		orig_path++;
1858 
1859 	/*
1860 	 * Get rid of zone's root path
1861 	 */
1862 	if (zone != global_zone) {
1863 		/*
1864 		 * -1 for trailing '/' and -1 for EOS.
1865 		 */
1866 		if (strncmp(zone->zone_rootpath, orig_mntpt,
1867 		    zone->zone_rootpathlen - 1) == 0) {
1868 			orig_mntpt += (zone->zone_rootpathlen - 2);
1869 		}
1870 	}
1871 
1872 	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1873 	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1874 	(void) strcat(mntpt, orig_mntpt);
1875 	(void) strcat(mntpt, orig_path);
1876 
1877 	kmem_free(path, strlen(path) + 1);
1878 	path = esi->esi_path;
1879 	if (*path == '.')
1880 		path++;
1881 	if (path[0] == '/' && path[1] == '/')
1882 		path++;
1883 	has_leading_slash = (*path == '/');
1884 
1885 	spec_len = strlen(dma->dma_hostlist);
1886 	spec_len += strlen(path);
1887 
1888 	/* We are going to have to add this in */
1889 	if (!has_leading_slash)
1890 		spec_len++;
1891 
1892 	/* We need to get the ':' for dma_hostlist:esi_path */
1893 	spec_len++;
1894 
1895 	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1896 	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1897 	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1898 	    has_leading_slash ? "" : "/", path);
1899 
1900 	uap->dir = mntpt;
1901 
1902 	uap->flags = MS_SYSSPACE | MS_DATA;
1903 	/* fstype-independent mount options not covered elsewhere */
1904 	/* copy parent's mount(1M) "-m" flag */
1905 	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1906 		uap->flags |= MS_NOMNTTAB;
1907 
1908 	uap->fstype = MNTTYPE_NFS4;
1909 	uap->dataptr = (char *)nargs;
1910 	/* not needed for MS_SYSSPACE */
1911 	uap->datalen = 0;
1912 
1913 	/* use optptr to pass in extra mount options */
1914 	uap->flags |= MS_OPTIONSTR;
1915 	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1916 	if (uap->optptr == NULL) {
1917 		retval = EINVAL;
1918 		goto done;
1919 	}
1920 
1921 	/* domount() expects us to count the trailing NUL */
1922 	uap->optlen = strlen(uap->optptr) + 1;
1923 
1924 	/*
1925 	 * If we get EBUSY, we try again once to see if we can perform
1926 	 * the mount. We do this because of a spurious race condition.
1927 	 */
1928 	for (i = 0; i < 2; i++) {
1929 		int	error;
1930 		bool_t	was_mounted;
1931 
1932 		retval = domount(NULL, uap, stubvp, cr, vfsp);
1933 		if (retval == 0) {
1934 			retval = VFS_ROOT(*vfsp, newvpp);
1935 			VFS_RELE(*vfsp);
1936 			break;
1937 		} else if (retval != EBUSY) {
1938 			break;
1939 		}
1940 
1941 		/*
1942 		 * We might find it mounted by the other racer...
1943 		 */
1944 		error = nfs4_trigger_mounted_already(stubvp,
1945 		    newvpp, &was_mounted, vfsp);
1946 		if (error) {
1947 			goto done;
1948 		} else if (was_mounted) {
1949 			retval = 0;
1950 			break;
1951 		}
1952 	}
1953 
1954 done:
1955 	if (uap->optptr)
1956 		nfs4_trigger_destroy_mntopts(uap->optptr);
1957 
1958 	kmem_free(uap->spec, spec_len + 1);
1959 	kmem_free(uap, sizeof (struct mounta));
1960 	kmem_free(mntpt, mntpt_len + 1);
1961 
1962 	return (retval);
1963 }
1964 
1965 /*
1966  * Build an nfs_args structure for passing to domount().
1967  *
1968  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1969  * generic data - common to all ephemeral mount types - is read directly
1970  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1971  */
1972 static struct nfs_args *
1973 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1974     ephemeral_servinfo_t *esi)
1975 {
1976 	sec_data_t *secdata;
1977 	struct nfs_args *nargs;
1978 
1979 	/* setup the nfs args */
1980 	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1981 
1982 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1983 
1984 	nargs->addr = esi->esi_addr;
1985 
1986 	/* for AUTH_DH by negotiation */
1987 	if (esi->esi_syncaddr || esi->esi_netname) {
1988 		nargs->flags |= NFSMNT_SECURE;
1989 		nargs->syncaddr = esi->esi_syncaddr;
1990 		nargs->netname = esi->esi_netname;
1991 	}
1992 
1993 	nargs->flags |= NFSMNT_KNCONF;
1994 	nargs->knconf = esi->esi_knconf;
1995 	nargs->flags |= NFSMNT_HOSTNAME;
1996 	nargs->hostname = esi->esi_hostname;
1997 	nargs->fh = esi->esi_path;
1998 
1999 	/* general mount settings, all copied from parent mount */
2000 	mutex_enter(&mi->mi_lock);
2001 
2002 	if (!(mi->mi_flags & MI4_HARD))
2003 		nargs->flags |= NFSMNT_SOFT;
2004 
2005 	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
2006 	    NFSMNT_RETRANS;
2007 	nargs->wsize = mi->mi_stsize;
2008 	nargs->rsize = mi->mi_tsize;
2009 	nargs->timeo = mi->mi_timeo;
2010 	nargs->retrans = mi->mi_retrans;
2011 
2012 	if (mi->mi_flags & MI4_INT)
2013 		nargs->flags |= NFSMNT_INT;
2014 	if (mi->mi_flags & MI4_NOAC)
2015 		nargs->flags |= NFSMNT_NOAC;
2016 
2017 	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
2018 	    NFSMNT_ACDIRMAX;
2019 	nargs->acregmin = HR2SEC(mi->mi_acregmin);
2020 	nargs->acregmax = HR2SEC(mi->mi_acregmax);
2021 	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
2022 	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
2023 
2024 	/* add any specific flags for this type of ephemeral mount */
2025 	nargs->flags |= esi->esi_mount_flags;
2026 
2027 	if (mi->mi_flags & MI4_NOCTO)
2028 		nargs->flags |= NFSMNT_NOCTO;
2029 	if (mi->mi_flags & MI4_GRPID)
2030 		nargs->flags |= NFSMNT_GRPID;
2031 	if (mi->mi_flags & MI4_LLOCK)
2032 		nargs->flags |= NFSMNT_LLOCK;
2033 	if (mi->mi_flags & MI4_NOPRINT)
2034 		nargs->flags |= NFSMNT_NOPRINT;
2035 	if (mi->mi_flags & MI4_DIRECTIO)
2036 		nargs->flags |= NFSMNT_DIRECTIO;
2037 	if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
2038 		nargs->flags |= NFSMNT_PUBLIC;
2039 
2040 	/* Do some referral-specific option tweaking */
2041 	if (nargs->flags & NFSMNT_REFERRAL) {
2042 		nargs->flags &= ~NFSMNT_DORDMA;
2043 		nargs->flags |= NFSMNT_TRYRDMA;
2044 	}
2045 
2046 	mutex_exit(&mi->mi_lock);
2047 
2048 	/*
2049 	 * Security data & negotiation policy.
2050 	 *
2051 	 * For mirror mounts, we need to preserve the parent mount's
2052 	 * preference for security negotiation, translating SV4_TRYSECDEFAULT
2053 	 * to NFSMNT_SECDEFAULT if present.
2054 	 *
2055 	 * For referrals, we always want security negotiation and will
2056 	 * set NFSMNT_SECDEFAULT and we will not copy current secdata.
2057 	 * The reason is that we can't negotiate down from a parent's
2058 	 * Kerberos flavor to AUTH_SYS.
2059 	 *
2060 	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
2061 	 * security flavour was requested, with data in sv_secdata, and that
2062 	 * no negotiation should occur. If this specified flavour fails, that's
2063 	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
2064 	 *
2065 	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
2066 	 * default flavour, in sv_secdata, but then negotiate a new flavour.
2067 	 * Possible flavours are recorded in an array in sv_secinfo, with
2068 	 * currently in-use flavour pointed to by sv_currsec.
2069 	 *
2070 	 * If sv_currsec is set, i.e. if negotiation has already occurred,
2071 	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
2072 	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
2073 	 */
2074 	if (nargs->flags & NFSMNT_REFERRAL) {
2075 		/* enable negotiation for referral mount */
2076 		nargs->flags |= NFSMNT_SECDEFAULT;
2077 		secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
2078 		secdata->secmod = secdata->rpcflavor = AUTH_SYS;
2079 		secdata->data = NULL;
2080 	} else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
2081 		/* enable negotiation for mirror mount */
2082 		nargs->flags |= NFSMNT_SECDEFAULT;
2083 
2084 		/*
2085 		 * As a starting point for negotiation, copy parent
2086 		 * mount's negotiated flavour (sv_currsec) if available,
2087 		 * or its passed-in flavour (sv_secdata) if not.
2088 		 */
2089 		if (svp->sv_currsec != NULL)
2090 			secdata = copy_sec_data(svp->sv_currsec);
2091 		else if (svp->sv_secdata != NULL)
2092 			secdata = copy_sec_data(svp->sv_secdata);
2093 		else
2094 			secdata = NULL;
2095 	} else {
2096 		/* do not enable negotiation; copy parent's passed-in flavour */
2097 		if (svp->sv_secdata != NULL)
2098 			secdata = copy_sec_data(svp->sv_secdata);
2099 		else
2100 			secdata = NULL;
2101 	}
2102 
2103 	nfs_rw_exit(&svp->sv_lock);
2104 
2105 	nargs->flags |= NFSMNT_NEWARGS;
2106 	nargs->nfs_args_ext = NFS_ARGS_EXTB;
2107 	nargs->nfs_ext_u.nfs_extB.secdata = secdata;
2108 
2109 	/* for NFS RO failover; caller will set if necessary */
2110 	nargs->nfs_ext_u.nfs_extB.next = NULL;
2111 
2112 	return (nargs);
2113 }
2114 
2115 static void
2116 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
2117 {
2118 	/*
2119 	 * Either the mount failed, in which case the data is not needed, or
2120 	 * nfs4_mount() has either taken copies of what it needs or,
2121 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
2122 	 * whereby nfs4_free_args() will ignore it.
2123 	 */
2124 	nfs4_free_args(nargs);
2125 	kmem_free(nargs, sizeof (struct nfs_args));
2126 }
2127 
2128 /*
2129  * When we finally get into the mounting, we need to add this
2130  * node to the ephemeral tree.
2131  *
2132  * This is called from nfs4_mount().
2133  */
2134 int
2135 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
2136 {
2137 	mntinfo4_t		*mi_parent;
2138 	nfs4_ephemeral_t	*eph;
2139 	nfs4_ephemeral_tree_t	*net;
2140 
2141 	nfs4_ephemeral_t	*prior;
2142 	nfs4_ephemeral_t	*child;
2143 
2144 	nfs4_ephemeral_t	*peer;
2145 
2146 	nfs4_trigger_globals_t	*ntg;
2147 	zone_t			*zone = curproc->p_zone;
2148 
2149 	int			rc = 0;
2150 
2151 	mi_parent = VTOMI4(mvp);
2152 
2153 	/*
2154 	 * Get this before grabbing anything else!
2155 	 */
2156 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2157 	if (!ntg->ntg_thread_started) {
2158 		nfs4_ephemeral_start_harvester(ntg);
2159 	}
2160 
2161 	mutex_enter(&mi_parent->mi_lock);
2162 	mutex_enter(&mi->mi_lock);
2163 
2164 	net = mi->mi_ephemeral_tree =
2165 	    mi_parent->mi_ephemeral_tree;
2166 
2167 	/*
2168 	 * If the mi_ephemeral_tree is NULL, then it
2169 	 * means that either the harvester or a manual
2170 	 * umount has cleared the tree out right before
2171 	 * we got here.
2172 	 *
2173 	 * There is nothing we can do here, so return
2174 	 * to the caller and let them decide whether they
2175 	 * try again.
2176 	 */
2177 	if (net == NULL) {
2178 		mutex_exit(&mi->mi_lock);
2179 		mutex_exit(&mi_parent->mi_lock);
2180 
2181 		return (EBUSY);
2182 	}
2183 
2184 	/*
2185 	 * We've just tied the mntinfo to the tree, so
2186 	 * now we bump the refcnt and hold it there until
2187 	 * this mntinfo is removed from the tree.
2188 	 */
2189 	nfs4_ephemeral_tree_hold(net);
2190 
2191 	/*
2192 	 * We need to tack together the ephemeral mount
2193 	 * with this new mntinfo.
2194 	 */
2195 	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
2196 	eph->ne_mount = mi;
2197 	MI4_HOLD(mi);
2198 	VFS_HOLD(mi->mi_vfsp);
2199 	eph->ne_ref_time = gethrestime_sec();
2200 
2201 	/*
2202 	 * We need to tell the ephemeral mount when
2203 	 * to time out.
2204 	 */
2205 	eph->ne_mount_to = ntg->ntg_mount_to;
2206 
2207 	mi->mi_ephemeral = eph;
2208 
2209 	/*
2210 	 * If the enclosing mntinfo4 is also ephemeral,
2211 	 * then we need to point to its enclosing parent.
2212 	 * Else the enclosing mntinfo4 is the enclosing parent.
2213 	 *
2214 	 * We also need to weave this ephemeral node
2215 	 * into the tree.
2216 	 */
2217 	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
2218 		/*
2219 		 * We need to decide if we are
2220 		 * the root node of this branch
2221 		 * or if we are a sibling of this
2222 		 * branch.
2223 		 */
2224 		prior = mi_parent->mi_ephemeral;
2225 		if (prior == NULL) {
2226 			/*
2227 			 * Race condition, clean up, and
2228 			 * let caller handle mntinfo.
2229 			 */
2230 			mi->mi_flags &= ~MI4_EPHEMERAL;
2231 			mi->mi_ephemeral = NULL;
2232 			kmem_free(eph, sizeof (*eph));
2233 			VFS_RELE(mi->mi_vfsp);
2234 			MI4_RELE(mi);
2235 			nfs4_ephemeral_tree_rele(net);
2236 			rc = EBUSY;
2237 		} else {
2238 			if (prior->ne_child == NULL) {
2239 				prior->ne_child = eph;
2240 			} else {
2241 				child = prior->ne_child;
2242 
2243 				prior->ne_child = eph;
2244 				eph->ne_peer = child;
2245 
2246 				child->ne_prior = eph;
2247 			}
2248 
2249 			eph->ne_prior = prior;
2250 		}
2251 	} else {
2252 		/*
2253 		 * The parent mntinfo4 is the non-ephemeral
2254 		 * root of the ephemeral tree. We
2255 		 * need to decide if we are the root
2256 		 * node of that tree or if we are a
2257 		 * sibling of the root node.
2258 		 *
2259 		 * We are the root if there is no
2260 		 * other node.
2261 		 */
2262 		if (net->net_root == NULL) {
2263 			net->net_root = eph;
2264 		} else {
2265 			eph->ne_peer = peer = net->net_root;
2266 			ASSERT(peer != NULL);
2267 			net->net_root = eph;
2268 
2269 			peer->ne_prior = eph;
2270 		}
2271 
2272 		eph->ne_prior = NULL;
2273 	}
2274 
2275 	mutex_exit(&mi->mi_lock);
2276 	mutex_exit(&mi_parent->mi_lock);
2277 
2278 	return (rc);
2279 }
2280 
2281 /*
2282  * Commit the changes to the ephemeral tree for removing this node.
2283  */
2284 static void
2285 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
2286 {
2287 	nfs4_ephemeral_t	*e = eph;
2288 	nfs4_ephemeral_t	*peer;
2289 	nfs4_ephemeral_t	*prior;
2290 
2291 	peer = eph->ne_peer;
2292 	prior = e->ne_prior;
2293 
2294 	/*
2295 	 * If this branch root was not the
2296 	 * tree root, then we need to fix back pointers.
2297 	 */
2298 	if (prior) {
2299 		if (prior->ne_child == e) {
2300 			prior->ne_child = peer;
2301 		} else {
2302 			prior->ne_peer = peer;
2303 		}
2304 
2305 		if (peer)
2306 			peer->ne_prior = prior;
2307 	} else if (peer) {
2308 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
2309 		peer->ne_prior = NULL;
2310 	} else {
2311 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
2312 	}
2313 }
2314 
2315 /*
2316  * We want to avoid recursion at all costs. So we need to
2317  * unroll the tree. We do this by a depth first traversal to
2318  * leaf nodes. We blast away the leaf and work our way back
2319  * up and down the tree.
2320  */
2321 static int
2322 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
2323     int isTreeRoot, int flag, cred_t *cr)
2324 {
2325 	nfs4_ephemeral_t	*e = eph;
2326 	nfs4_ephemeral_t	*prior;
2327 	mntinfo4_t		*mi;
2328 	vfs_t			*vfsp;
2329 	int			error;
2330 
2331 	/*
2332 	 * We use the loop while unrolling the ephemeral tree.
2333 	 */
2334 	for (;;) {
2335 		/*
2336 		 * First we walk down the child.
2337 		 */
2338 		if (e->ne_child) {
2339 			prior = e;
2340 			e = e->ne_child;
2341 			continue;
2342 		}
2343 
2344 		/*
2345 		 * If we are the root of the branch we are removing,
2346 		 * we end it here. But if the branch is the root of
2347 		 * the tree, we have to forge on. We do not consider
2348 		 * the peer list for the root because while it may
2349 		 * be okay to remove, it is both extra work and a
2350 		 * potential for a false-positive error to stall the
2351 		 * unmount attempt.
2352 		 */
2353 		if (e == eph && isTreeRoot == FALSE)
2354 			return (0);
2355 
2356 		/*
2357 		 * Next we walk down the peer list.
2358 		 */
2359 		if (e->ne_peer) {
2360 			prior = e;
2361 			e = e->ne_peer;
2362 			continue;
2363 		}
2364 
2365 		/*
2366 		 * We can only remove the node passed in by the
2367 		 * caller if it is the root of the ephemeral tree.
2368 		 * Otherwise, the caller will remove it.
2369 		 */
2370 		if (e == eph && isTreeRoot == FALSE)
2371 			return (0);
2372 
2373 		/*
2374 		 * Okay, we have a leaf node, time
2375 		 * to prune it!
2376 		 *
2377 		 * Note that prior can only be NULL if
2378 		 * and only if it is the root of the
2379 		 * ephemeral tree.
2380 		 */
2381 		prior = e->ne_prior;
2382 
2383 		mi = e->ne_mount;
2384 		mutex_enter(&mi->mi_lock);
2385 		vfsp = mi->mi_vfsp;
2386 		ASSERT(vfsp != NULL);
2387 
2388 		/*
2389 		 * Cleared by umount2_engine.
2390 		 */
2391 		VFS_HOLD(vfsp);
2392 
2393 		/*
2394 		 * Inform nfs4_unmount to not recursively
2395 		 * descend into this node's children when it
2396 		 * gets processed.
2397 		 */
2398 		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
2399 		mutex_exit(&mi->mi_lock);
2400 
2401 		error = umount2_engine(vfsp, flag, cr, FALSE);
2402 		if (error) {
2403 			/*
2404 			 * We need to reenable nfs4_unmount's ability
2405 			 * to recursively descend on this node.
2406 			 */
2407 			mutex_enter(&mi->mi_lock);
2408 			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
2409 			mutex_exit(&mi->mi_lock);
2410 
2411 			return (error);
2412 		}
2413 
2414 		/*
2415 		 * If we are the current node, we do not want to
2416 		 * touch anything else. At this point, the only
2417 		 * way the current node can have survived to here
2418 		 * is if it is the root of the ephemeral tree and
2419 		 * we are unmounting the enclosing mntinfo4.
2420 		 */
2421 		if (e == eph) {
2422 			ASSERT(prior == NULL);
2423 			return (0);
2424 		}
2425 
2426 		/*
2427 		 * Stitch up the prior node. Note that since
2428 		 * we have handled the root of the tree, prior
2429 		 * must be non-NULL.
2430 		 */
2431 		ASSERT(prior != NULL);
2432 		if (prior->ne_child == e) {
2433 			prior->ne_child = NULL;
2434 		} else {
2435 			ASSERT(prior->ne_peer == e);
2436 
2437 			prior->ne_peer = NULL;
2438 		}
2439 
2440 		e = prior;
2441 	}
2442 
2443 	/* NOTREACHED */
2444 }
2445 
2446 /*
2447  * Common code to safely release net_cnt_lock and net_tree_lock
2448  */
2449 void
2450 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
2451     nfs4_ephemeral_tree_t **pnet)
2452 {
2453 	nfs4_ephemeral_tree_t	*net = *pnet;
2454 
2455 	if (*pmust_unlock) {
2456 		mutex_enter(&net->net_cnt_lock);
2457 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
2458 		mutex_exit(&net->net_cnt_lock);
2459 
2460 		mutex_exit(&net->net_tree_lock);
2461 
2462 		*pmust_unlock = FALSE;
2463 	}
2464 }
2465 
2466 /*
2467  * While we may have removed any child or sibling nodes of this
2468  * ephemeral node, we can not nuke it until we know that there
2469  * were no actived vnodes on it. This will do that final
2470  * work once we know it is not busy.
2471  */
2472 void
2473 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
2474     nfs4_ephemeral_tree_t **pnet)
2475 {
2476 	/*
2477 	 * Now we need to get rid of the ephemeral data if it exists.
2478 	 */
2479 	mutex_enter(&mi->mi_lock);
2480 	if (mi->mi_ephemeral) {
2481 		/*
2482 		 * If we are the root node of an ephemeral branch
2483 		 * which is being removed, then we need to fixup
2484 		 * pointers into and out of the node.
2485 		 */
2486 		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
2487 			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
2488 
2489 		nfs4_ephemeral_tree_rele(*pnet);
2490 		ASSERT(mi->mi_ephemeral != NULL);
2491 
2492 		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
2493 		mi->mi_ephemeral = NULL;
2494 		VFS_RELE(mi->mi_vfsp);
2495 		MI4_RELE(mi);
2496 	}
2497 	mutex_exit(&mi->mi_lock);
2498 
2499 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2500 }
2501 
2502 /*
2503  * Unmount an ephemeral node.
2504  *
2505  * Note that if this code fails, then it must unlock.
2506  *
2507  * If it succeeds, then the caller must be prepared to do so.
2508  */
2509 int
2510 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
2511     bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
2512 {
2513 	int			error = 0;
2514 	nfs4_ephemeral_t	*eph;
2515 	nfs4_ephemeral_tree_t	*net;
2516 	int			is_derooting = FALSE;
2517 	int			is_recursed = FALSE;
2518 	int			was_locked = FALSE;
2519 
2520 	/*
2521 	 * Make sure to set the default state for cleaning
2522 	 * up the tree in the caller (and on the way out).
2523 	 */
2524 	*pmust_unlock = FALSE;
2525 
2526 	/*
2527 	 * The active vnodes on this file system may be ephemeral
2528 	 * children. We need to check for and try to unmount them
2529 	 * here. If any can not be unmounted, we are going
2530 	 * to return EBUSY.
2531 	 */
2532 	mutex_enter(&mi->mi_lock);
2533 
2534 	/*
2535 	 * If an ephemeral tree, we need to check to see if
2536 	 * the lock is already held. If it is, then we need
2537 	 * to see if we are being called as a result of
2538 	 * the recursive removal of some node of the tree or
2539 	 * if we are another attempt to remove the tree.
2540 	 *
2541 	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
2542 	 * node. mi_ephemeral being non-NULL also does this.
2543 	 *
2544 	 * mi_ephemeral_tree being non-NULL is sufficient
2545 	 * to also indicate either it is an ephemeral node
2546 	 * or the enclosing mntinfo4.
2547 	 *
2548 	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
2549 	 * when we delete the ephemeral node and need to
2550 	 * differentiate from an ephemeral node and the
2551 	 * enclosing root node.
2552 	 */
2553 	*pnet = net = mi->mi_ephemeral_tree;
2554 	if (net == NULL) {
2555 		mutex_exit(&mi->mi_lock);
2556 		return (0);
2557 	}
2558 
2559 	eph = mi->mi_ephemeral;
2560 	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
2561 	is_derooting = (eph == NULL);
2562 
2563 	mutex_enter(&net->net_cnt_lock);
2564 
2565 	/*
2566 	 * If this is not recursion, then we need to
2567 	 * check to see if a harvester thread has
2568 	 * already grabbed the lock.
2569 	 *
2570 	 * After we exit this branch, we may not
2571 	 * blindly return, we need to jump to
2572 	 * is_busy!
2573 	 */
2574 	if (!is_recursed) {
2575 		if (net->net_status &
2576 		    NFS4_EPHEMERAL_TREE_LOCKED) {
2577 			/*
2578 			 * If the tree is locked, we need
2579 			 * to decide whether we are the
2580 			 * harvester or some explicit call
2581 			 * for a umount. The only way that
2582 			 * we are the harvester is if
2583 			 * MS_SYSSPACE is set.
2584 			 *
2585 			 * We only let the harvester through
2586 			 * at this point.
2587 			 *
2588 			 * We return EBUSY so that the
2589 			 * caller knows something is
2590 			 * going on. Note that by that
2591 			 * time, the umount in the other
2592 			 * thread may have already occured.
2593 			 */
2594 			if (!(flag & MS_SYSSPACE)) {
2595 				mutex_exit(&net->net_cnt_lock);
2596 				mutex_exit(&mi->mi_lock);
2597 
2598 				return (EBUSY);
2599 			}
2600 
2601 			was_locked = TRUE;
2602 		}
2603 	}
2604 
2605 	mutex_exit(&net->net_cnt_lock);
2606 	mutex_exit(&mi->mi_lock);
2607 
2608 	/*
2609 	 * If we are not the harvester, we need to check
2610 	 * to see if we need to grab the tree lock.
2611 	 */
2612 	if (was_locked == FALSE) {
2613 		/*
2614 		 * If we grab the lock, it means that no other
2615 		 * operation is working on the tree. If we don't
2616 		 * grab it, we need to decide if this is because
2617 		 * we are a recursive call or a new operation.
2618 		 */
2619 		if (mutex_tryenter(&net->net_tree_lock)) {
2620 			*pmust_unlock = TRUE;
2621 		} else {
2622 			/*
2623 			 * If we are a recursive call, we can
2624 			 * proceed without the lock.
2625 			 * Otherwise we have to wait until
2626 			 * the lock becomes free.
2627 			 */
2628 			if (!is_recursed) {
2629 				mutex_enter(&net->net_cnt_lock);
2630 				if (net->net_status &
2631 				    (NFS4_EPHEMERAL_TREE_DEROOTING
2632 				    | NFS4_EPHEMERAL_TREE_INVALID)) {
2633 					mutex_exit(&net->net_cnt_lock);
2634 					goto is_busy;
2635 				}
2636 				mutex_exit(&net->net_cnt_lock);
2637 
2638 				/*
2639 				 * We can't hold any other locks whilst
2640 				 * we wait on this to free up.
2641 				 */
2642 				mutex_enter(&net->net_tree_lock);
2643 
2644 				/*
2645 				 * Note that while mi->mi_ephemeral
2646 				 * may change and thus we have to
2647 				 * update eph, it is the case that
2648 				 * we have tied down net and
2649 				 * do not care if mi->mi_ephemeral_tree
2650 				 * has changed.
2651 				 */
2652 				mutex_enter(&mi->mi_lock);
2653 				eph = mi->mi_ephemeral;
2654 				mutex_exit(&mi->mi_lock);
2655 
2656 				/*
2657 				 * Okay, we need to see if either the
2658 				 * tree got nuked or the current node
2659 				 * got nuked. Both of which will cause
2660 				 * an error.
2661 				 *
2662 				 * Note that a subsequent retry of the
2663 				 * umount shall work.
2664 				 */
2665 				mutex_enter(&net->net_cnt_lock);
2666 				if (net->net_status &
2667 				    NFS4_EPHEMERAL_TREE_INVALID ||
2668 				    (!is_derooting && eph == NULL)) {
2669 					mutex_exit(&net->net_cnt_lock);
2670 					mutex_exit(&net->net_tree_lock);
2671 					goto is_busy;
2672 				}
2673 				mutex_exit(&net->net_cnt_lock);
2674 				*pmust_unlock = TRUE;
2675 			}
2676 		}
2677 	}
2678 
2679 	/*
2680 	 * Only once we have grabbed the lock can we mark what we
2681 	 * are planning on doing to the ephemeral tree.
2682 	 */
2683 	if (*pmust_unlock) {
2684 		mutex_enter(&net->net_cnt_lock);
2685 		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
2686 
2687 		/*
2688 		 * Check to see if we are nuking the root.
2689 		 */
2690 		if (is_derooting)
2691 			net->net_status |=
2692 			    NFS4_EPHEMERAL_TREE_DEROOTING;
2693 		mutex_exit(&net->net_cnt_lock);
2694 	}
2695 
2696 	if (!is_derooting) {
2697 		/*
2698 		 * Only work on children if the caller has not already
2699 		 * done so.
2700 		 */
2701 		if (!is_recursed) {
2702 			ASSERT(eph != NULL);
2703 
2704 			error = nfs4_ephemeral_unmount_engine(eph,
2705 			    FALSE, flag, cr);
2706 			if (error)
2707 				goto is_busy;
2708 		}
2709 	} else {
2710 		eph = net->net_root;
2711 
2712 		/*
2713 		 * Only work if there is something there.
2714 		 */
2715 		if (eph) {
2716 			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
2717 			    flag, cr);
2718 			if (error) {
2719 				mutex_enter(&net->net_cnt_lock);
2720 				net->net_status &=
2721 				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
2722 				mutex_exit(&net->net_cnt_lock);
2723 				goto is_busy;
2724 			}
2725 
2726 			/*
2727 			 * Nothing else which goes wrong will
2728 			 * invalidate the blowing away of the
2729 			 * ephmeral tree.
2730 			 */
2731 			net->net_root = NULL;
2732 		}
2733 
2734 		/*
2735 		 * We have derooted and we have caused the tree to be
2736 		 * invalidated.
2737 		 */
2738 		mutex_enter(&net->net_cnt_lock);
2739 		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
2740 		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
2741 		DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
2742 		    uint_t, net->net_refcnt);
2743 
2744 		/*
2745 		 * We will not finalize this node, so safe to
2746 		 * release it.
2747 		 */
2748 		nfs4_ephemeral_tree_decr(net);
2749 		mutex_exit(&net->net_cnt_lock);
2750 
2751 		if (was_locked == FALSE)
2752 			mutex_exit(&net->net_tree_lock);
2753 
2754 		/*
2755 		 * We have just blown away any notation of this
2756 		 * tree being locked or having a refcnt.
2757 		 * We can't let the caller try to clean things up.
2758 		 */
2759 		*pmust_unlock = FALSE;
2760 
2761 		/*
2762 		 * At this point, the tree should no longer be
2763 		 * associated with the mntinfo4. We need to pull
2764 		 * it off there and let the harvester take
2765 		 * care of it once the refcnt drops.
2766 		 */
2767 		mutex_enter(&mi->mi_lock);
2768 		mi->mi_ephemeral_tree = NULL;
2769 		mutex_exit(&mi->mi_lock);
2770 	}
2771 
2772 	return (0);
2773 
2774 is_busy:
2775 
2776 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2777 
2778 	return (error);
2779 }
2780 
2781 /*
2782  * Do the umount and record any error in the parent.
2783  */
2784 static void
2785 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2786     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2787 {
2788 	int	error;
2789 
2790 	/*
2791 	 * Only act on if the fs is still mounted.
2792 	 */
2793 	if (vfsp == NULL)
2794 		return;
2795 
2796 	error = umount2_engine(vfsp, flag, kcred, FALSE);
2797 	if (error) {
2798 		if (prior) {
2799 			if (prior->ne_child == e)
2800 				prior->ne_state |=
2801 				    NFS4_EPHEMERAL_CHILD_ERROR;
2802 			else
2803 				prior->ne_state |=
2804 				    NFS4_EPHEMERAL_PEER_ERROR;
2805 		}
2806 	}
2807 }
2808 
2809 /*
2810  * For each tree in the forest (where the forest is in
2811  * effect all of the ephemeral trees for this zone),
2812  * scan to see if a node can be unmounted. Note that
2813  * unlike nfs4_ephemeral_unmount_engine(), we do
2814  * not process the current node before children or
2815  * siblings. I.e., if a node can be unmounted, we
2816  * do not recursively check to see if the nodes
2817  * hanging off of it can also be unmounted.
2818  *
2819  * Instead, we delve down deep to try and remove the
2820  * children first. Then, because we share code with
2821  * nfs4_ephemeral_unmount_engine(), we will try
2822  * them again. This could be a performance issue in
2823  * the future.
2824  *
2825  * Also note that unlike nfs4_ephemeral_unmount_engine(),
2826  * we do not halt on an error. We will not remove the
2827  * current node, but we will keep on trying to remove
2828  * the others.
2829  *
 * ntg supplies the forest (ntg_forest) and the lock that
 * protects the list (ntg_forest_lock), which is held for
 * the whole scan.
 *
2830  * force indicates that we want the unmount to occur
2831  * even if there is something blocking it.
2832  *
2833  * time_check indicates that we want to see if the
2834  * mount has expired past mount_to or not. Typically
2835  * we want to do this and only on a shutdown of the
2836  * zone would we want to ignore the check.
 *
 * Trees found to be invalid and unreferenced are unlinked
 * from the forest during the scan and destroyed at the end,
 * after ntg_forest_lock has been dropped.
2837  */
2838 static void
2839 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
2840     bool_t force, bool_t time_check)
2841 {
2842 	nfs4_ephemeral_tree_t	*net;
2843 	nfs4_ephemeral_tree_t	*prev = NULL;
2844 	nfs4_ephemeral_tree_t	*next;
2845 	nfs4_ephemeral_t	*e;
2846 	nfs4_ephemeral_t	*prior;
2847 	time_t			now = gethrestime_sec();
2848 
	/* private list of trees to destroy once the scan is over */
2849 	nfs4_ephemeral_tree_t	*harvest = NULL;
2850 
2851 	int			flag;
2852 
2853 	mntinfo4_t		*mi;
2854 	vfs_t			*vfsp;
2855 
	/*
	 * MS_SYSSPACE is always set: it is how nfs4_ephemeral_umount()
	 * tells the harvester apart from an explicit umount.
	 */
2856 	if (force)
2857 		flag = MS_FORCE | MS_SYSSPACE;
2858 	else
2859 		flag = MS_SYSSPACE;
2860 
2861 	mutex_enter(&ntg->ntg_forest_lock);
2862 	for (net = ntg->ntg_forest; net != NULL; net = next) {
		/* fetched up front because net may be unlinked below */
2863 		next = net->net_next;
2864 
2865 		nfs4_ephemeral_tree_hold(net);
2866 
2867 		mutex_enter(&net->net_tree_lock);
2868 
2869 		/*
2870 		 * Let the unmount code know that the
2871 		 * tree is already locked!
2872 		 */
2873 		mutex_enter(&net->net_cnt_lock);
2874 		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2875 		mutex_exit(&net->net_cnt_lock);
2876 
2877 		/*
2878 		 * If the intent is force all ephemeral nodes to
2879 		 * be unmounted in this zone, we can short circuit a
2880 		 * lot of tree traversal and simply zap the root node.
2881 		 */
2882 		if (force) {
2883 			if (net->net_root) {
2884 				mi = net->net_root->ne_mount;
2885 
2886 				vfsp = mi->mi_vfsp;
2887 				ASSERT(vfsp != NULL);
2888 
2889 				/*
2890 				 * Cleared by umount2_engine.
2891 				 */
2892 				VFS_HOLD(vfsp);
2893 
				/* best effort: the result is ignored */
2894 				(void) umount2_engine(vfsp, flag,
2895 				    kcred, FALSE);
2896 
2897 				goto check_done;
2898 			}
2899 		}
2900 
		/*
		 * Non-recursive walk driven by ne_state. A node goes
		 * VISIT_CHILD -> VISIT_SIBLING -> PROCESS_ME, descending
		 * into its child and then its peer on the way; once both
		 * have been handled the node itself is considered for
		 * unmounting and the walk steps back to ne_prior.
		 */
2901 		e = net->net_root;
2902 		if (e)
2903 			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2904 
2905 		while (e) {
2906 			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2907 				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2908 				if (e->ne_child) {
2909 					e = e->ne_child;
2910 					e->ne_state =
2911 					    NFS4_EPHEMERAL_VISIT_CHILD;
2912 				}
2913 
2914 				continue;
2915 			} else if (e->ne_state ==
2916 			    NFS4_EPHEMERAL_VISIT_SIBLING) {
2917 				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2918 				if (e->ne_peer) {
2919 					e = e->ne_peer;
2920 					e->ne_state =
2921 					    NFS4_EPHEMERAL_VISIT_CHILD;
2922 				}
2923 
2924 				continue;
2925 			} else if (e->ne_state ==
2926 			    NFS4_EPHEMERAL_CHILD_ERROR) {
2927 				prior = e->ne_prior;
2928 
2929 				/*
2930 				 * If a child reported an error, do
2931 				 * not bother trying to unmount.
2932 				 *
2933 				 * If your prior node is a parent,
2934 				 * pass the error up such that they
2935 				 * also do not try to unmount.
2936 				 *
2937 				 * However, if your prior is a sibling,
2938 				 * let them try to unmount if they can.
2939 				 */
2940 				if (prior) {
2941 					if (prior->ne_child == e)
2942 						prior->ne_state |=
2943 						    NFS4_EPHEMERAL_CHILD_ERROR;
2944 					else
2945 						prior->ne_state |=
2946 						    NFS4_EPHEMERAL_PEER_ERROR;
2947 				}
2948 
2949 				/*
2950 				 * Clear the error and if needed, process peers.
2951 				 *
2952 				 * Once we mask out the error, we know whether
2953 				 * or not we have to process another node.
2954 				 */
2955 				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2956 				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2957 					e = prior;
2958 
2959 				continue;
2960 			} else if (e->ne_state ==
2961 			    NFS4_EPHEMERAL_PEER_ERROR) {
2962 				prior = e->ne_prior;
2963 
				/*
				 * NOTE(review): this branch overwrites
				 * prior->ne_state with '=' whereas the
				 * CHILD_ERROR branch above (and
				 * nfs4_ephemeral_record_umount()) OR the
				 * flag in with '|='. Confirm that dropping
				 * prior's existing state is intended.
				 */
2964 				if (prior) {
2965 					if (prior->ne_child == e)
2966 						prior->ne_state =
2967 						    NFS4_EPHEMERAL_CHILD_ERROR;
2968 					else
2969 						prior->ne_state =
2970 						    NFS4_EPHEMERAL_PEER_ERROR;
2971 				}
2972 
2973 				/*
2974 				 * Clear the error from this node and do the
2975 				 * correct processing.
2976 				 */
2977 				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2978 				continue;
2979 			}
2980 
			/*
			 * prior is read before any unmount is attempted;
			 * the walk resumes from it afterwards.
			 */
2981 			prior = e->ne_prior;
2982 			e->ne_state = NFS4_EPHEMERAL_OK;
2983 
2984 			/*
2985 			 * It must be the case that we need to process
2986 			 * this node.
2987 			 */
2988 			if (!time_check ||
2989 			    now - e->ne_ref_time > e->ne_mount_to) {
2990 				mi = e->ne_mount;
2991 				vfsp = mi->mi_vfsp;
2992 
2993 				/*
2994 				 * Cleared by umount2_engine.
2995 				 */
2996 				if (vfsp != NULL)
2997 					VFS_HOLD(vfsp);
2998 
2999 				/*
3000 				 * Note that we effectively work down to the
3001 				 * leaf nodes first, try to unmount them,
3002 				 * then work our way back up into the leaf
3003 				 * nodes.
3004 				 *
3005 				 * Also note that we deal with a lot of
3006 				 * complexity by sharing the work with
3007 				 * the manual unmount code.
3008 				 */
3009 				nfs4_ephemeral_record_umount(vfsp, flag,
3010 				    e, prior);
3011 			}
3012 
3013 			e = prior;
3014 		}
3015 
3016 check_done:
3017 
3018 		/*
3019 		 * At this point we are done processing this tree.
3020 		 *
3021 		 * If the tree is invalid and we were the only reference
3022 		 * to it, then we push it on the local linked list
3023 		 * to remove it at the end. We avoid that action now
3024 		 * to keep the tree processing going along at a fair clip.
3025 		 *
3026 		 * Else, even if we were the only reference, we
3027 		 * allow it to be reused as needed.
3028 		 */
3029 		mutex_enter(&net->net_cnt_lock);
3030 		nfs4_ephemeral_tree_decr(net);
3031 		if (net->net_refcnt == 0 &&
3032 		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
3033 			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3034 			mutex_exit(&net->net_cnt_lock);
3035 			mutex_exit(&net->net_tree_lock);
3036 
			/* unlink net from the forest ... */
3037 			if (prev)
3038 				prev->net_next = net->net_next;
3039 			else
3040 				ntg->ntg_forest = net->net_next;
3041 
			/* ... and push it on the private harvest list */
3042 			net->net_next = harvest;
3043 			harvest = net;
3044 
3045 			VFS_RELE(net->net_mount->mi_vfsp);
3046 			MI4_RELE(net->net_mount);
3047 
			/* prev is left alone: net is no longer in the list */
3048 			continue;
3049 		}
3050 
3051 		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3052 		mutex_exit(&net->net_cnt_lock);
3053 		mutex_exit(&net->net_tree_lock);
3054 
3055 		prev = net;
3056 	}
3057 	mutex_exit(&ntg->ntg_forest_lock);
3058 
	/*
	 * Everything on the harvest list was unlinked from ntg_forest
	 * above; tear those trees down now that the forest lock is free.
	 */
3059 	for (net = harvest; net != NULL; net = next) {
3060 		next = net->net_next;
3061 
3062 		mutex_destroy(&net->net_tree_lock);
3063 		mutex_destroy(&net->net_cnt_lock);
3064 		kmem_free(net, sizeof (*net));
3065 	}
3066 }
3067 
3068 /*
3069  * This is the thread which decides when the harvesting
3070  * can proceed and when to kill it off for this zone.
3071  */
3072 static void
3073 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
3074 {
3075 	clock_t		timeleft;
3076 	zone_t		*zone = curproc->p_zone;
3077 
3078 	for (;;) {
3079 		timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
3080 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
3081 
3082 		/*
3083 		 * zone is exiting...
3084 		 */
3085 		if (timeleft != -1) {
3086 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
3087 			zthread_exit();
3088 			/* NOTREACHED */
3089 		}
3090 
3091 		/*
3092 		 * Only bother scanning if there is potential
3093 		 * work to be done.
3094 		 */
3095 		if (ntg->ntg_forest == NULL)
3096 			continue;
3097 
3098 		/*
3099 		 * Now scan the list and get rid of everything which
3100 		 * is old.
3101 		 */
3102 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
3103 	}
3104 
3105 	/* NOTREACHED */
3106 }
3107 
3108 /*
3109  * The zone specific glue needed to start the unmount harvester.
3110  *
3111  * Note that we want to avoid holding the mutex as long as possible,
3112  * hence the multiple checks.
3113  *
3114  * The caller should avoid us getting down here in the first
3115  * place.
3116  */
3117 static void
3118 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
3119 {
3120 	/*
3121 	 * It got started before we got here...
3122 	 */
3123 	if (ntg->ntg_thread_started)
3124 		return;
3125 
3126 	mutex_enter(&nfs4_ephemeral_thread_lock);
3127 
3128 	if (ntg->ntg_thread_started) {
3129 		mutex_exit(&nfs4_ephemeral_thread_lock);
3130 		return;
3131 	}
3132 
3133 	/*
3134 	 * Start the unmounter harvester thread for this zone.
3135 	 */
3136 	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
3137 	    ntg, 0, minclsyspri);
3138 
3139 	ntg->ntg_thread_started = TRUE;
3140 	mutex_exit(&nfs4_ephemeral_thread_lock);
3141 }
3142 
3143 /*ARGSUSED*/
3144 static void *
3145 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
3146 {
3147 	nfs4_trigger_globals_t	*ntg;
3148 
3149 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
3150 	ntg->ntg_thread_started = FALSE;
3151 
3152 	/*
3153 	 * This is the default....
3154 	 */
3155 	ntg->ntg_mount_to = nfs4_trigger_thread_timer;
3156 
3157 	mutex_init(&ntg->ntg_forest_lock, NULL,
3158 	    MUTEX_DEFAULT, NULL);
3159 
3160 	return (ntg);
3161 }
3162 
3163 /*
3164  * Try a nice gentle walk down the forest and convince
3165  * all of the trees to gracefully give it up.
3166  */
3167 /*ARGSUSED*/
3168 static void
3169 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
3170 {
3171 	nfs4_trigger_globals_t	*ntg = arg;
3172 
3173 	if (!ntg)
3174 		return;
3175 
3176 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
3177 }
3178 
3179 /*
3180  * Race along the forest and rip all of the trees out by
3181  * their rootballs!
3182  */
3183 /*ARGSUSED*/
3184 static void
3185 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
3186 {
3187 	nfs4_trigger_globals_t	*ntg = arg;
3188 
3189 	if (!ntg)
3190 		return;
3191 
3192 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
3193 
3194 	mutex_destroy(&ntg->ntg_forest_lock);
3195 	kmem_free(ntg, sizeof (*ntg));
3196 }
3197 
3198 /*
3199  * This is the zone independent cleanup needed for
3200  * emphemeral mount processing.
3201  */
3202 void
3203 nfs4_ephemeral_fini(void)
3204 {
3205 	(void) zone_key_delete(nfs4_ephemeral_key);
3206 	mutex_destroy(&nfs4_ephemeral_thread_lock);
3207 }
3208 
3209 /*
3210  * This is the zone independent initialization needed for
3211  * emphemeral mount processing.
3212  */
3213 void
3214 nfs4_ephemeral_init(void)
3215 {
3216 	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
3217 	    NULL);
3218 
3219 	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
3220 	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
3221 }
3222 
3223 /*
3224  * nfssys() calls this function to set the per-zone
3225  * value of mount_to to drive when an ephemeral mount is
3226  * timed out. Each mount will grab a copy of this value
3227  * when mounted.
3228  */
3229 void
3230 nfs4_ephemeral_set_mount_to(uint_t mount_to)
3231 {
3232 	nfs4_trigger_globals_t	*ntg;
3233 	zone_t			*zone = curproc->p_zone;
3234 
3235 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
3236 
3237 	ntg->ntg_mount_to = mount_to;
3238 }
3239 
3240 /*
3241  * Walk the list of v4 mount options; if they are currently set in vfsp,
3242  * append them to a new comma-separated mount option string, and return it.
3243  *
3244  * Caller should free by calling nfs4_trigger_destroy_mntopts().
3245  */
3246 static char *
3247 nfs4_trigger_create_mntopts(vfs_t *vfsp)
3248 {
3249 	uint_t i;
3250 	char *mntopts;
3251 	struct vfssw *vswp;
3252 	mntopts_t *optproto;
3253 
3254 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
3255 
3256 	/* get the list of applicable mount options for v4; locks *vswp */
3257 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
3258 	optproto = &vswp->vsw_optproto;
3259 
3260 	for (i = 0; i < optproto->mo_count; i++) {
3261 		struct mntopt *mop = &optproto->mo_list[i];
3262 
3263 		if (mop->mo_flags & MO_EMPTY)
3264 			continue;
3265 
3266 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
3267 			kmem_free(mntopts, MAX_MNTOPT_STR);
3268 			vfs_unrefvfssw(vswp);
3269 			return (NULL);
3270 		}
3271 	}
3272 
3273 	vfs_unrefvfssw(vswp);
3274 
3275 	/*
3276 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
3277 	 * and it may only be passed via MS_OPTIONSTR, so we
3278 	 * must handle it here.
3279 	 *
3280 	 * Ideally, it would be in the list, but NFS does not specify its
3281 	 * own opt proto list, it uses instead the default one. Since
3282 	 * not all filesystems support extended attrs, it would not be
3283 	 * appropriate to add it there.
3284 	 */
3285 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
3286 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
3287 		kmem_free(mntopts, MAX_MNTOPT_STR);
3288 		return (NULL);
3289 	}
3290 
3291 	return (mntopts);
3292 }
3293 
3294 static void
3295 nfs4_trigger_destroy_mntopts(char *mntopts)
3296 {
3297 	if (mntopts)
3298 		kmem_free(mntopts, MAX_MNTOPT_STR);
3299 }
3300 
3301 /*
3302  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
3303  */
3304 static int
3305 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
3306 {
3307 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
3308 		return (EINVAL);
3309 
3310 	if (vfs_optionisset(vfsp, optname, NULL)) {
3311 		size_t mntoptslen = strlen(mntopts);
3312 		size_t optnamelen = strlen(optname);
3313 
3314 		/* +1 for ',', +1 for NUL */
3315 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
3316 			return (EOVERFLOW);
3317 
3318 		/* first or subsequent mount option? */
3319 		if (*mntopts != '\0')
3320 			(void) strcat(mntopts, ",");
3321 
3322 		(void) strcat(mntopts, optname);
3323 	}
3324 
3325 	return (0);
3326 }
3327 
3328 static enum clnt_stat
3329 nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
3330 {
3331 	int retries;
3332 	uint_t max_msgsize;
3333 	enum clnt_stat status;
3334 	CLIENT *cl;
3335 	struct timeval timeout;
3336 
3337 	/* as per recov_newserver() */
3338 	max_msgsize = 0;
3339 	retries = 1;
3340 	timeout.tv_sec = 2;
3341 	timeout.tv_usec = 0;
3342 
3343 	if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
3344 	    max_msgsize, retries, CRED(), &cl) != 0)
3345 		return (RPC_FAILED);
3346 
3347 	if (nointr)
3348 		cl->cl_nosignal = TRUE;
3349 	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
3350 	    timeout);
3351 	if (nointr)
3352 		cl->cl_nosignal = FALSE;
3353 
3354 	AUTH_DESTROY(cl->cl_auth);
3355 	CLNT_DESTROY(cl);
3356 
3357 	return (status);
3358 }
3359 
3360 static enum clnt_stat
3361 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
3362 {
3363 	return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));
3364 }
3365