xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c (revision 45ede40b2394db7967e59f19288fae9b62efd4aa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
29  * triggered from a "stub" rnode via a special set of vnodeops.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/vfs_opreg.h>
40 #include <sys/file.h>
41 #include <sys/filio.h>
42 #include <sys/uio.h>
43 #include <sys/buf.h>
44 #include <sys/mman.h>
45 #include <sys/pathname.h>
46 #include <sys/dirent.h>
47 #include <sys/debug.h>
48 #include <sys/vmsystm.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/swap.h>
52 #include <sys/errno.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/kmem.h>
56 #include <sys/mount.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathconf.h>
59 #include <sys/utsname.h>
60 #include <sys/dnlc.h>
61 #include <sys/acl.h>
62 #include <sys/systeminfo.h>
63 #include <sys/policy.h>
64 #include <sys/sdt.h>
65 #include <sys/list.h>
66 #include <sys/stat.h>
67 #include <sys/mntent.h>
68 #include <sys/priv.h>
69 
70 #include <rpc/types.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73 
74 #include <nfs/nfs.h>
75 #include <nfs/nfs_clnt.h>
76 #include <nfs/nfs_acl.h>
77 #include <nfs/lm.h>
78 #include <nfs/nfs4.h>
79 #include <nfs/nfs4_kprot.h>
80 #include <nfs/rnode4.h>
81 #include <nfs/nfs4_clnt.h>
82 #include <nfs/nfsid_map.h>
83 #include <nfs/nfs4_idmap_impl.h>
84 
85 #include <vm/hat.h>
86 #include <vm/as.h>
87 #include <vm/page.h>
88 #include <vm/pvn.h>
89 #include <vm/seg.h>
90 #include <vm/seg_map.h>
91 #include <vm/seg_kpm.h>
92 #include <vm/seg_vn.h>
93 
94 #include <fs/fs_subr.h>
95 
96 #include <sys/ddi.h>
97 #include <sys/int_fmtio.h>
98 
99 #include <sys/sunddi.h>
100 
101 #include <sys/priv_names.h>
102 
103 extern zone_key_t	nfs4clnt_zone_key;
104 extern zone_key_t	nfsidmap_zone_key;
105 
106 /*
107  * The automatic unmounter thread stuff!
108  */
109 static int nfs4_trigger_thread_timer = 20;	/* in seconds */
110 
111 /*
112  * Just a default....
113  */
114 static uint_t nfs4_trigger_mount_to = 240;
115 
/*
 * Per-zone state for ephemeral mounts. nfs4_trigger_mount() obtains it
 * with zone_getspecific(nfs4_ephemeral_key, zone).
 */
116 typedef struct nfs4_trigger_globals {
/* held by nfs4_trigger_mount() while linking a new tree into ntg_forest */
117 	kmutex_t		ntg_forest_lock;
/* NOTE(review): presumably the mount timeout, cf. nfs4_trigger_mount_to */
118 	uint_t			ntg_mount_to;
/* NOTE(review): presumably set once the harvester thread runs - confirm */
119 	int			ntg_thread_started;
/* head of the list of ephemeral trees, chained through net_next */
120 	nfs4_ephemeral_tree_t	*ntg_forest;
121 } nfs4_trigger_globals_t;
122 
123 kmutex_t	nfs4_ephemeral_thread_lock;
124 
125 zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;
126 
127 static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
128 
129 /*
130  * Used for ephemeral mounts; contains data either duplicated from
131  * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
132  *
133  * It's intended that this structure is used solely for ephemeral
134  * mount-type specific data, for passing this data to
135  * nfs4_trigger_nargs_create().
136  */
137 typedef struct ephemeral_servinfo {
/* server name; copied into the comma-separated mount hostlist */
138 	char			*esi_hostname;
139 	char			*esi_netname;
140 	char			*esi_path;
/* NOTE(review): presumably the allocated size of esi_path - confirm */
141 	int			esi_path_len;
142 	int			esi_mount_flags;
143 	struct netbuf		*esi_addr;
144 	struct netbuf		*esi_syncaddr;
145 	struct knetconfig	*esi_knconf;
146 } ephemeral_servinfo_t;
147 
148 /*
149  * Collect together the mount-type specific and generic data args.
150  */
151 typedef struct domount_args {
/* esi of the first responsive server; freed with the domount_args_t */
152 	ephemeral_servinfo_t	*dma_esi;
/* MAXPATHLEN-byte buffer, allocated and freed with that fixed size */
153 	char			*dma_hostlist; /* comma-sep. for RO failover */
/* head of the nfs_args list, chained via nfs_ext_u.nfs_extB.next */
154 	struct nfs_args		*dma_nargs;
155 } domount_args_t;
156 
157 
158 /*
159  * The vnode ops functions for a trigger stub vnode
160  */
161 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
162 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
163     caller_context_t *);
164 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
165     caller_context_t *);
166 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
167     caller_context_t *);
168 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
169     caller_context_t *);
170 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
171     struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
172     int *, pathname_t *);
173 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
174     enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
175     vsecattr_t *);
176 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
177     int);
178 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
179     caller_context_t *, int);
180 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
181     cred_t *, caller_context_t *, int);
182 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
183     vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
184 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
185     caller_context_t *, int);
186 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
187     cred_t *, caller_context_t *, int);
188 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
189 
190 /*
191  * Regular NFSv4 vnodeops that we need to reference directly
192  */
193 extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
194 		    caller_context_t *);
195 extern void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
196 extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
197 extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
198 extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
199 		    struct pathname *, int, vnode_t *, cred_t *,
200 		    caller_context_t *, int *, pathname_t *);
201 extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
202 		    caller_context_t *);
203 extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
204 		    caller_context_t *);
205 extern int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
206 extern int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
207 
208 static int	nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
209 static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
210     cred_t *, vnode_t **);
211 static int 	nfs4_trigger_domount_args_create(vnode_t *, cred_t *,
212     domount_args_t **dmap);
213 static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
214     vnode_t *vp);
215 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
216     cred_t *);
217 static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
218 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
219     servinfo4_t *);
220 static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
221     cred_t *);
222 static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
223     ephemeral_servinfo_t *);
224 static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
225 static char	*nfs4_trigger_create_mntopts(vfs_t *);
226 static void	nfs4_trigger_destroy_mntopts(char *);
227 static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
228 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
229 static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
230     struct netbuf *, int);
231 
232 extern int	umount2_engine(vfs_t *, int, cred_t *, int);
233 
234 vnodeops_t *nfs4_trigger_vnodeops;
235 
236 /*
237  * These are the vnodeops that we must define for stub vnodes.
238  *
239  *
240  * Many of the VOPs defined for NFSv4 do not need to be defined here,
241  * for various reasons. This will result in the VFS default function being
242  * used:
243  *
244  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
245  *   lost the reference to the stub vnode, meaning these should not be called:
246  *       close, read, write, ioctl, readdir, seek.
247  *
248  * - These VOPs are meaningless for vnodes without data pages. Since the
249  *   stub vnode is of type VDIR, these should not be called:
250  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
251  *
252  * - These VOPs are otherwise not applicable, and should not be called:
253  *       dump, setsecattr.
254  *
255  *
256  * These VOPs we do not want to define, but nor do we want the VFS default
257  * action. Instead, we specify the VFS error function, with fs_error(), but
258  * note that fs_error() is not actually called. Instead it results in the
259  * use of the error function defined for the particular VOP, in vn_ops_table[]:
260  *
261  * -   frlock, dispose, shrlock.
262  *
263  *
264  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
265  * NOTE: if any of these ops involve an OTW call with the stub FH, then
266  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
267  * to protect the security data in the servinfo4_t for the "parent"
268  * filesystem that contains the stub.
269  *
270  * - These VOPs should not trigger a mount, so that "ls -l" does not:
271  *       pathconf, getsecattr.
272  *
273  * - These VOPs would not make sense to trigger:
274  *       inactive, rwlock, rwunlock, fid, realvp.
275  */
/*
 * Template for the stub-vnode ops vector; terminated by a NULL, NULL pair.
 */
276 const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
/* ops handled by the nfs4_trigger_*() wrappers defined in this file */
277 	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
/* getattr only forces a mount when called with ATTR_TRIGGER */
278 	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
279 	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
280 	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
281 	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
282 	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
283 	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
284 	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
285 	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
286 	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
287 	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
288 	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
289 	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
/* ops routed straight to the regular NFSv4 implementation */
290 	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
291 	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
292 	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
293 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
294 	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
295 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
296 	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
/* ops that select the per-VOP error function rather than the default */
297 	VOPNAME_FRLOCK,		{ .error = fs_error },
298 	VOPNAME_DISPOSE,	{ .error = fs_error },
299 	VOPNAME_SHRLOCK,	{ .error = fs_error },
300 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
301 	NULL, NULL
302 };
303 
304 static void
305 nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
306 {
307 	ASSERT(mutex_owned(&net->net_cnt_lock));
308 	net->net_refcnt++;
309 	ASSERT(net->net_refcnt != 0);
310 }
311 
312 static void
313 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
314 {
315 	mutex_enter(&net->net_cnt_lock);
316 	nfs4_ephemeral_tree_incr(net);
317 	mutex_exit(&net->net_cnt_lock);
318 }
319 
320 /*
321  * We need a safe way to decrement the refcnt whilst the
322  * lock is being held.
323  */
324 static void
325 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
326 {
327 	ASSERT(mutex_owned(&net->net_cnt_lock));
328 	ASSERT(net->net_refcnt != 0);
329 	net->net_refcnt--;
330 }
331 
332 static void
333 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
334 {
335 	mutex_enter(&net->net_cnt_lock);
336 	nfs4_ephemeral_tree_decr(net);
337 	mutex_exit(&net->net_cnt_lock);
338 }
339 
340 /*
341  * Trigger ops for stub vnodes; for mirror mounts, etc.
342  *
343  * The general idea is that a "triggering" op will first call
344  * nfs4_trigger_mount(), which will find out whether a mount has already
345  * been triggered.
346  *
347  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
348  * of the covering vfs.
349  *
350  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
351  * and again set newvp, as above.
352  *
353  * The triggering op may then re-issue the VOP by calling it on newvp.
354  *
355  * Note that some ops may perform custom action, and may or may not need
356  * to trigger a mount.
357  *
358  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
359  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
360  * and that would just recurse. Instead, we call the v4 op directly,
361  * by name.  This is OK, since we know that the vnode is for NFSv4,
362  * otherwise it couldn't be a stub.
363  *
364  */
365 
366 static int
367 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
368 {
369 	int error;
370 	vnode_t *newvp;
371 
372 	error = nfs4_trigger_mount(*vpp, cr, &newvp);
373 	if (error)
374 		return (error);
375 
376 	/* Release the stub vnode, as we're losing the reference to it */
377 	VN_RELE(*vpp);
378 
379 	/* Give the caller the root vnode of the newly-mounted fs */
380 	*vpp = newvp;
381 
382 	/* return with VN_HELD(newvp) */
383 	return (VOP_OPEN(vpp, flag, cr, ct));
384 }
385 
386 void
387 nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
388 {
389 	uint_t mask;
390 	timespec_t now;
391 
392 	/*
393 	 * Set some attributes here for referrals.
394 	 */
395 	mask = vap->va_mask;
396 	bzero(vap, sizeof (struct vattr));
397 	vap->va_mask	= mask;
398 	vap->va_uid	= 0;
399 	vap->va_gid	= 0;
400 	vap->va_nlink	= 1;
401 	vap->va_size	= 1;
402 	gethrestime(&now);
403 	vap->va_atime	= now;
404 	vap->va_mtime	= now;
405 	vap->va_ctime	= now;
406 	vap->va_type	= VDIR;
407 	vap->va_mode	= 0555;
408 	vap->va_fsid	= vp->v_vfsp->vfs_dev;
409 	vap->va_rdev	= 0;
410 	vap->va_blksize	= MAXBSIZE;
411 	vap->va_nblocks	= 1;
412 	vap->va_seq	= 0;
413 }
414 
415 /*
416  * For the majority of cases, nfs4_trigger_getattr() will not trigger
417  * a mount. However, if ATTR_TRIGGER is set, we are being informed
418  * that we need to force the mount before we attempt to determine
419  * the attributes. The intent is an atomic operation for security
420  * testing.
421  *
422  * If we're not triggering a mount, we can still inquire about the
423  * actual attributes from the server in the mirror mount case,
424  * and will return manufactured attributes for a referral (see
425  * the 'create' branch of find_referral_stubvp()).
426  */
427 static int
428 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
429     caller_context_t *ct)
430 {
431 	int error;
432 
433 	if (flags & ATTR_TRIGGER) {
434 		vnode_t	*newvp;
435 
436 		error = nfs4_trigger_mount(vp, cr, &newvp);
437 		if (error)
438 			return (error);
439 
440 		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
441 		VN_RELE(newvp);
442 
443 	} else if (RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
444 
445 		error = nfs4_getattr(vp, vap, flags, cr, ct);
446 
447 	} else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
448 
449 		nfs4_fake_attrs(vp, vap);
450 		error = 0;
451 	}
452 
453 	return (error);
454 }
455 
456 static int
457 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
458     caller_context_t *ct)
459 {
460 	int error;
461 	vnode_t *newvp;
462 
463 	error = nfs4_trigger_mount(vp, cr, &newvp);
464 	if (error)
465 		return (error);
466 
467 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
468 	VN_RELE(newvp);
469 
470 	return (error);
471 }
472 
473 static int
474 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
475     caller_context_t *ct)
476 {
477 	int error;
478 	vnode_t *newvp;
479 
480 	error = nfs4_trigger_mount(vp, cr, &newvp);
481 	if (error)
482 		return (error);
483 
484 	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
485 	VN_RELE(newvp);
486 
487 	return (error);
488 }
489 
490 static int
491 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
492     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
493     caller_context_t *ct, int *deflags, pathname_t *rpnp)
494 {
495 	int error;
496 	vnode_t *newdvp;
497 	rnode4_t *drp = VTOR4(dvp);
498 
499 	ASSERT(RP_ISSTUB(drp));
500 
501 	/*
502 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
503 	 * that up. Instead, pass onto the regular op, regardless of whether
504 	 * we've triggered a mount.
505 	 */
506 	if (strcmp(nm, "..") == 0)
507 		if (RP_ISSTUB_MIRRORMOUNT(drp)) {
508 			return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
509 			    ct, deflags, rpnp));
510 		} else if (RP_ISSTUB_REFERRAL(drp)) {
511 			/* Return the parent vnode */
512 			return (vtodv(dvp, vpp, cr, TRUE));
513 		}
514 
515 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
516 	if (error)
517 		return (error);
518 
519 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
520 	    deflags, rpnp);
521 	VN_RELE(newdvp);
522 
523 	return (error);
524 }
525 
526 static int
527 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
528     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
529     int flags, caller_context_t *ct, vsecattr_t *vsecp)
530 {
531 	int error;
532 	vnode_t *newdvp;
533 
534 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
535 	if (error)
536 		return (error);
537 
538 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
539 	    flags, ct, vsecp);
540 	VN_RELE(newdvp);
541 
542 	return (error);
543 }
544 
545 static int
546 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
547     int flags)
548 {
549 	int error;
550 	vnode_t *newdvp;
551 
552 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
553 	if (error)
554 		return (error);
555 
556 	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
557 	VN_RELE(newdvp);
558 
559 	return (error);
560 }
561 
562 static int
563 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
564     caller_context_t *ct, int flags)
565 {
566 	int error;
567 	vnode_t *newtdvp;
568 
569 	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
570 	if (error)
571 		return (error);
572 
573 	/*
574 	 * We don't check whether svp is a stub. Let the NFSv4 code
575 	 * detect that error, and return accordingly.
576 	 */
577 	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
578 	VN_RELE(newtdvp);
579 
580 	return (error);
581 }
582 
583 static int
584 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
585     cred_t *cr, caller_context_t *ct, int flags)
586 {
587 	int error;
588 	vnode_t *newsdvp;
589 	rnode4_t *tdrp = VTOR4(tdvp);
590 
591 	/*
592 	 * We know that sdvp is a stub, otherwise we would not be here.
593 	 *
594 	 * If tdvp is also be a stub, there are two possibilities: it
595 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
596 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
597 	 *
598 	 * In the former case, just trigger sdvp, and treat tdvp as
599 	 * though it were not a stub.
600 	 *
601 	 * In the latter case, it might be a different stub for the
602 	 * same server fs as sdvp, or for a different server fs.
603 	 * Regardless, from the client perspective this would still
604 	 * be a cross-filesystem rename, and should not be allowed,
605 	 * so return EXDEV, without triggering either mount.
606 	 */
607 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
608 		return (EXDEV);
609 
610 	error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
611 	if (error)
612 		return (error);
613 
614 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
615 
616 	VN_RELE(newsdvp);
617 
618 	return (error);
619 }
620 
621 /* ARGSUSED */
622 static int
623 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
624     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
625 {
626 	int error;
627 	vnode_t *newdvp;
628 
629 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
630 	if (error)
631 		return (error);
632 
633 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
634 	VN_RELE(newdvp);
635 
636 	return (error);
637 }
638 
639 static int
640 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
641     caller_context_t *ct, int flags)
642 {
643 	int error;
644 	vnode_t *newdvp;
645 
646 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
647 	if (error)
648 		return (error);
649 
650 	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
651 	VN_RELE(newdvp);
652 
653 	return (error);
654 }
655 
656 static int
657 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
658     cred_t *cr, caller_context_t *ct, int flags)
659 {
660 	int error;
661 	vnode_t *newdvp;
662 
663 	error = nfs4_trigger_mount(dvp, cr, &newdvp);
664 	if (error)
665 		return (error);
666 
667 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
668 	VN_RELE(newdvp);
669 
670 	return (error);
671 }
672 
673 static int
674 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
675     caller_context_t *ct)
676 {
677 	int error;
678 	vnode_t *newvp;
679 
680 	error = nfs4_trigger_mount(vp, cr, &newvp);
681 	if (error)
682 		return (error);
683 
684 	error = VOP_READLINK(newvp, uiop, cr, ct);
685 	VN_RELE(newvp);
686 
687 	return (error);
688 }
689 
690 /* end of trigger vnode ops */
691 
692 /*
693  * See if the mount has already been done by another caller.
694  */
695 static int
696 nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
697     bool_t *was_mounted, vfs_t **vfsp)
698 {
699 	int		error;
700 	mntinfo4_t	*mi = VTOMI4(vp);
701 
702 	*was_mounted = FALSE;
703 
704 	error = vn_vfsrlock_wait(vp);
705 	if (error)
706 		return (error);
707 
708 	*vfsp = vn_mountedvfs(vp);
709 	if (*vfsp != NULL) {
710 		/* the mount has already occurred */
711 		error = VFS_ROOT(*vfsp, newvpp);
712 		if (!error) {
713 			/* need to update the reference time  */
714 			mutex_enter(&mi->mi_lock);
715 			if (mi->mi_ephemeral)
716 				mi->mi_ephemeral->ne_ref_time =
717 				    gethrestime_sec();
718 			mutex_exit(&mi->mi_lock);
719 
720 			*was_mounted = TRUE;
721 		}
722 	}
723 
724 	vn_vfsunlock(vp);
725 	return (0);
726 }
727 
728 /*
729  * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
730  *
731  * The mount may have already occurred, via another thread. If not,
732  * assemble the location information - which may require fetching - and
733  * perform the mount.
734  *
735  * Sets newvp to be the root of the fs that is now covering vp. Note
736  * that we return with VN_HELD(*newvp).
737  *
738  * The caller is responsible for passing the VOP onto the covering fs.
739  */
740 static int
741 nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
742 {
743 	int			 error;
744 	vfs_t			*vfsp;
745 	rnode4_t		*rp = VTOR4(vp);
746 	mntinfo4_t		*mi = VTOMI4(vp);
747 	domount_args_t		*dma;
748 
749 	nfs4_ephemeral_tree_t	*net;
750 
751 	bool_t			must_unlock = FALSE;
752 	bool_t			is_building = FALSE;
753 	bool_t			was_mounted = FALSE;
754 
755 	cred_t			*mcred = NULL;
756 
757 	nfs4_trigger_globals_t	*ntg;
758 
759 	zone_t			*zone = curproc->p_zone;
760 
761 	ASSERT(RP_ISSTUB(rp));
762 
763 	*newvpp = NULL;
764 
765 	/*
766 	 * Has the mount already occurred?
767 	 */
768 	error = nfs4_trigger_mounted_already(vp, newvpp,
769 	    &was_mounted, &vfsp);
770 	if (error || was_mounted)
771 		goto done;
772 
773 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
774 	ASSERT(ntg != NULL);
775 
776 	mutex_enter(&mi->mi_lock);
777 
778 	/*
779 	 * We need to lock down the ephemeral tree.
780 	 */
781 	if (mi->mi_ephemeral_tree == NULL) {
782 		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
783 		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
784 		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
785 		net->net_refcnt = 1;
786 		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
787 		is_building = TRUE;
788 
789 		/*
790 		 * We need to add it to the zone specific list for
791 		 * automatic unmounting and harvesting of deadwood.
792 		 */
793 		mutex_enter(&ntg->ntg_forest_lock);
794 		if (ntg->ntg_forest != NULL)
795 			net->net_next = ntg->ntg_forest;
796 		ntg->ntg_forest = net;
797 		mutex_exit(&ntg->ntg_forest_lock);
798 
799 		/*
800 		 * No lock order confusion with mi_lock because no
801 		 * other node could have grabbed net_tree_lock.
802 		 */
803 		mutex_enter(&net->net_tree_lock);
804 		mi->mi_ephemeral_tree = net;
805 		net->net_mount = mi;
806 		mutex_exit(&mi->mi_lock);
807 
808 		MI4_HOLD(mi);
809 		VFS_HOLD(mi->mi_vfsp);
810 	} else {
811 		net = mi->mi_ephemeral_tree;
812 		nfs4_ephemeral_tree_hold(net);
813 
814 		mutex_exit(&mi->mi_lock);
815 
816 		mutex_enter(&net->net_tree_lock);
817 
818 		/*
819 		 * We can only procede if the tree is neither locked
820 		 * nor being torn down.
821 		 */
822 		mutex_enter(&net->net_cnt_lock);
823 		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
824 			nfs4_ephemeral_tree_decr(net);
825 			mutex_exit(&net->net_cnt_lock);
826 			mutex_exit(&net->net_tree_lock);
827 
828 			return (EIO);
829 		}
830 		mutex_exit(&net->net_cnt_lock);
831 	}
832 
833 	mutex_enter(&net->net_cnt_lock);
834 	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
835 	mutex_exit(&net->net_cnt_lock);
836 
837 	must_unlock = TRUE;
838 
839 	error = nfs4_trigger_domount_args_create(vp, cr, &dma);
840 	if (error)
841 		goto done;
842 
843 	/*
844 	 * Note that since we define mirror mounts to work
845 	 * for any user, we simply extend the privileges of
846 	 * the user's credentials to allow the mount to
847 	 * proceed.
848 	 */
849 	mcred = crdup(cr);
850 	if (mcred == NULL) {
851 		error = EINVAL;
852 		nfs4_trigger_domount_args_destroy(dma, vp);
853 		goto done;
854 	}
855 
856 	crset_zone_privall(mcred);
857 	if (is_system_labeled())
858 		(void) setpflags(NET_MAC_AWARE, 1, mcred);
859 
860 	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
861 	nfs4_trigger_domount_args_destroy(dma, vp);
862 
863 	DTRACE_PROBE2(nfs4clnt__func__referral__mount,
864 	    vnode_t *, vp, int, error);
865 
866 	crfree(mcred);
867 
868 done:
869 
870 	if (must_unlock) {
871 		mutex_enter(&net->net_cnt_lock);
872 		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
873 
874 		/*
875 		 * REFCNT: If we are the root of the tree, then we need
876 		 * to keep a reference because we malloced the tree and
877 		 * this is where we tied it to our mntinfo.
878 		 *
879 		 * If we are not the root of the tree, then our tie to
880 		 * the mntinfo occured elsewhere and we need to
881 		 * decrement the reference to the tree.
882 		 */
883 		if (is_building)
884 			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
885 		else
886 			nfs4_ephemeral_tree_decr(net);
887 		mutex_exit(&net->net_cnt_lock);
888 
889 		mutex_exit(&net->net_tree_lock);
890 	}
891 
892 	if (!error && (newvpp == NULL || *newvpp == NULL))
893 		error = ENOSYS;
894 
895 	return (error);
896 }
897 
898 /*
899  * Collect together both the generic & mount-type specific args.
900  */
901 static int
902 nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr, domount_args_t **dmap)
903 {
904 	int nointr;
905 	char *hostlist;
906 	servinfo4_t *svp;
907 	struct nfs_args *nargs, *nargs_head;
908 	enum clnt_stat status;
909 	ephemeral_servinfo_t *esi, *esi_first;
910 	domount_args_t *dma;
911 	mntinfo4_t *mi = VTOMI4(vp);
912 
913 	nointr = !(mi->mi_flags & MI4_INT);
914 	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
915 
916 	svp = mi->mi_curr_serv;
917 	/* check if the current server is responding */
918 	status = nfs4_trigger_ping_server(svp, nointr);
919 	if (status == RPC_SUCCESS) {
920 		esi_first = nfs4_trigger_esi_create(vp, svp, cr);
921 		if (esi_first == NULL) {
922 			kmem_free(hostlist, MAXPATHLEN);
923 			return (EINVAL);
924 		}
925 
926 		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
927 
928 		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
929 	} else {
930 		/* current server did not respond */
931 		esi_first = NULL;
932 		nargs_head = NULL;
933 	}
934 	nargs = nargs_head;
935 
936 	/*
937 	 * NFS RO failover.
938 	 *
939 	 * If we have multiple servinfo4 structures, linked via sv_next,
940 	 * we must create one nfs_args for each, linking the nfs_args via
941 	 * nfs_ext_u.nfs_extB.next.
942 	 *
943 	 * We need to build a corresponding esi for each, too, but that is
944 	 * used solely for building nfs_args, and may be immediately
945 	 * discarded, as domount() requires the info from just one esi,
946 	 * but all the nfs_args.
947 	 *
948 	 * Currently, the NFS mount code will hang if not all servers
949 	 * requested are available. To avoid that, we need to ping each
950 	 * server, here, and remove it from the list if it is not
951 	 * responding. This has the side-effect of that server then
952 	 * being permanently unavailable for this failover mount, even if
953 	 * it recovers. That's unfortunate, but the best we can do until
954 	 * the mount code path is fixed.
955 	 */
956 
957 	/*
958 	 * If the current server was down, loop indefinitely until we find
959 	 * at least one responsive server.
960 	 */
961 	do {
962 		/* no locking needed for sv_next; it is only set at fs mount */
963 		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
964 			struct nfs_args *next;
965 
966 			/*
967 			 * nargs_head: the head of the nfs_args list
968 			 * nargs: the current tail of the list
969 			 * next: the newly-created element to be added
970 			 */
971 
972 			/*
973 			 * We've already tried the current server, above;
974 			 * if it was responding, we have already included it
975 			 * and it may now be ignored.
976 			 *
977 			 * Otherwise, try it again, since it may now have
978 			 * recovered.
979 			 */
980 			if (svp == mi->mi_curr_serv && esi_first != NULL)
981 				continue;
982 
983 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
984 			if (svp->sv_flags & SV4_NOTINUSE) {
985 				nfs_rw_exit(&svp->sv_lock);
986 				continue;
987 			}
988 			nfs_rw_exit(&svp->sv_lock);
989 
990 			/* check if the server is responding */
991 			status = nfs4_trigger_ping_server(svp, nointr);
992 			if (status == RPC_INTR) {
993 				kmem_free(hostlist, MAXPATHLEN);
994 				nfs4_trigger_esi_destroy(esi_first, vp);
995 				nargs = nargs_head;
996 				while (nargs != NULL) {
997 					next = nargs->nfs_ext_u.nfs_extB.next;
998 					nfs4_trigger_nargs_destroy(nargs);
999 					nargs = next;
1000 				}
1001 				return (EINTR);
1002 			} else if (status != RPC_SUCCESS) {
1003 				/* if the server did not respond, ignore it */
1004 				continue;
1005 			}
1006 
1007 			esi = nfs4_trigger_esi_create(vp, svp, cr);
1008 			if (esi == NULL)
1009 				continue;
1010 
1011 			/*
1012 			 * If the original current server (mi_curr_serv)
1013 			 * was down when when we first tried it,
1014 			 * (i.e. esi_first == NULL),
1015 			 * we select this new server (svp) to be the server
1016 			 * that we will actually contact (esi_first).
1017 			 *
1018 			 * Note that it's possible that mi_curr_serv == svp,
1019 			 * if that mi_curr_serv was down but has now recovered.
1020 			 */
1021 			next = nfs4_trigger_nargs_create(mi, svp, esi);
1022 			if (esi_first == NULL) {
1023 				ASSERT(nargs == NULL);
1024 				ASSERT(nargs_head == NULL);
1025 				nargs_head = next;
1026 				esi_first = esi;
1027 				(void) strlcpy(hostlist,
1028 				    esi_first->esi_hostname, MAXPATHLEN);
1029 			} else {
1030 				ASSERT(nargs_head != NULL);
1031 				nargs->nfs_ext_u.nfs_extB.next = next;
1032 				(void) strlcat(hostlist, ",", MAXPATHLEN);
1033 				(void) strlcat(hostlist, esi->esi_hostname,
1034 				    MAXPATHLEN);
1035 				/* esi was only needed for hostname & nargs */
1036 				nfs4_trigger_esi_destroy(esi, vp);
1037 			}
1038 
1039 			nargs = next;
1040 		}
1041 
1042 		/* if we've had no response at all, wait a second */
1043 		if (esi_first == NULL)
1044 			delay(drv_usectohz(1000000));
1045 
1046 	} while (esi_first == NULL);
1047 	ASSERT(nargs_head != NULL);
1048 
1049 	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
1050 	dma->dma_esi = esi_first;
1051 	dma->dma_hostlist = hostlist;
1052 	dma->dma_nargs = nargs_head;
1053 	*dmap = dma;
1054 
1055 	return (0);
1056 }
1057 
1058 static void
1059 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
1060 {
1061 	if (dma != NULL) {
1062 		if (dma->dma_esi != NULL && vp != NULL)
1063 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
1064 
1065 		if (dma->dma_hostlist != NULL)
1066 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
1067 
1068 		if (dma->dma_nargs != NULL) {
1069 			struct nfs_args *nargs = dma->dma_nargs;
1070 
1071 			do {
1072 				struct nfs_args *next =
1073 				    nargs->nfs_ext_u.nfs_extB.next;
1074 
1075 				nfs4_trigger_nargs_destroy(nargs);
1076 				nargs = next;
1077 			} while (nargs != NULL);
1078 		}
1079 
1080 		kmem_free(dma, sizeof (domount_args_t));
1081 	}
1082 }
1083 
1084 /*
1085  * The ephemeral_servinfo_t struct contains basic information we will need to
1086  * perform the mount. Whilst the structure is generic across different
1087  * types of ephemeral mount, the way we gather its contents differs.
1088  */
1089 static ephemeral_servinfo_t *
1090 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
1091 {
1092 	ephemeral_servinfo_t *esi;
1093 	rnode4_t *rp = VTOR4(vp);
1094 
1095 	ASSERT(RP_ISSTUB(rp));
1096 
1097 	/* Call the ephemeral type-specific routine */
1098 	if (RP_ISSTUB_MIRRORMOUNT(rp))
1099 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
1100 	else if (RP_ISSTUB_REFERRAL(rp))
1101 		esi = nfs4_trigger_esi_create_referral(vp, cr);
1102 	else
1103 		esi = NULL;
1104 	return (esi);
1105 }
1106 
1107 static void
1108 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1109 {
1110 	rnode4_t *rp = VTOR4(vp);
1111 
1112 	ASSERT(RP_ISSTUB(rp));
1113 
1114 	/* Currently, no need for an ephemeral type-specific routine */
1115 
1116 	/*
1117 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
1118 	 * and will be handled by nfs4_trigger_nargs_destroy().
1119 	 * We need only free the structure itself.
1120 	 */
1121 	if (esi != NULL)
1122 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
1123 }
1124 
1125 /*
1126  * Some of this may turn out to be common with other ephemeral types,
1127  * in which case it should be moved to nfs4_trigger_esi_create(), or a
1128  * common function called.
1129  */
1130 
1131 /*
1132  * Mirror mounts case - should have all data available
1133  */
1134 static ephemeral_servinfo_t *
1135 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
1136 {
1137 	char			*stubpath;
1138 	struct knetconfig	*sikncp, *svkncp;
1139 	struct netbuf		*bufp;
1140 	ephemeral_servinfo_t	*esi;
1141 
1142 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1143 
1144 	/* initially set to be our type of ephemeral mount; may be added to */
1145 	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
1146 
1147 	/*
1148 	 * We're copying info from the stub rnode's servinfo4, but
1149 	 * we must create new copies, not pointers, since this information
1150 	 * is to be associated with the new mount, which will be
1151 	 * unmounted (and its structures freed) separately
1152 	 */
1153 
1154 	/*
1155 	 * Sizes passed to kmem_[z]alloc here must match those freed
1156 	 * in nfs4_free_args()
1157 	 */
1158 
1159 	/*
1160 	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
1161 	 * is difficult to avoid: as we need to read svp to calculate the
1162 	 * sizes to be allocated.
1163 	 */
1164 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1165 
1166 	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1167 	(void) strcat(esi->esi_hostname, svp->sv_hostname);
1168 
1169 	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1170 	bufp = esi->esi_addr;
1171 	bufp->len = svp->sv_addr.len;
1172 	bufp->maxlen = svp->sv_addr.maxlen;
1173 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1174 	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1175 
1176 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1177 	sikncp = esi->esi_knconf;
1178 	svkncp = svp->sv_knconf;
1179 	sikncp->knc_semantics = svkncp->knc_semantics;
1180 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1181 	(void) strcat((char *)sikncp->knc_protofmly,
1182 	    (char *)svkncp->knc_protofmly);
1183 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1184 	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1185 	sikncp->knc_rdev = svkncp->knc_rdev;
1186 
1187 	/*
1188 	 * Used when AUTH_DH is negotiated.
1189 	 *
1190 	 * This is ephemeral mount-type specific, since it contains the
1191 	 * server's time-sync syncaddr.
1192 	 */
1193 	if (svp->sv_dhsec) {
1194 		struct netbuf *bufp;
1195 		sec_data_t *sdata;
1196 		dh_k4_clntdata_t *data;
1197 
1198 		sdata = svp->sv_dhsec;
1199 		data = (dh_k4_clntdata_t *)sdata->data;
1200 		ASSERT(sdata->rpcflavor == AUTH_DH);
1201 
1202 		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1203 		bufp->len = data->syncaddr.len;
1204 		bufp->maxlen = data->syncaddr.maxlen;
1205 		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1206 		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1207 		esi->esi_syncaddr = bufp;
1208 
1209 		if (data->netname != NULL) {
1210 			int nmlen = data->netnamelen;
1211 
1212 			/*
1213 			 * We need to copy from a dh_k4_clntdata_t
1214 			 * netname/netnamelen pair to a NUL-terminated
1215 			 * netname string suitable for putting in nfs_args,
1216 			 * where the latter has no netnamelen field.
1217 			 */
1218 			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1219 			bcopy(data->netname, esi->esi_netname, nmlen);
1220 		}
1221 	} else {
1222 		esi->esi_syncaddr = NULL;
1223 		esi->esi_netname = NULL;
1224 	}
1225 
1226 	stubpath = fn_path(VTOSV(vp)->sv_name);
1227 	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1228 	ASSERT(*stubpath == '.');
1229 	stubpath += 1;
1230 
1231 	/* for nfs_args->fh */
1232 	esi->esi_path_len = strlen(stubpath) + 1;
1233 	if (strcmp(svp->sv_path, "/") != 0)
1234 		esi->esi_path_len += strlen(svp->sv_path);
1235 	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1236 	if (strcmp(svp->sv_path, "/") != 0)
1237 		(void) strcat(esi->esi_path, svp->sv_path);
1238 	(void) strcat(esi->esi_path, stubpath);
1239 
1240 	stubpath -= 1;
1241 	/* stubpath allocated by fn_path() */
1242 	kmem_free(stubpath, strlen(stubpath) + 1);
1243 
1244 	nfs_rw_exit(&svp->sv_lock);
1245 
1246 	return (esi);
1247 }
1248 
1249 /*
1250  * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to
1251  * get network information required to do the mount call.
1252  */
1253 int
1254 nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
1255 {
1256 	door_arg_t	door_args;
1257 	door_handle_t	dh;
1258 	XDR		xdr;
1259 	refd_door_args_t *xdr_argsp;
1260 	refd_door_res_t  *orig_resp;
1261 	k_sigset_t	smask;
1262 	int		xdr_len = 0;
1263 	int 		res_len = 16; /* length of an ip adress */
1264 	int		orig_reslen = res_len;
1265 	int		error = 0;
1266 	struct nfsidmap_globals *nig;
1267 
1268 	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
1269 		return (ECONNREFUSED);
1270 
1271 	nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
1272 	ASSERT(nig != NULL);
1273 
1274 	mutex_enter(&nig->nfsidmap_daemon_lock);
1275 	dh = nig->nfsidmap_daemon_dh;
1276 	if (dh == NULL) {
1277 		mutex_exit(&nig->nfsidmap_daemon_lock);
1278 		cmn_err(CE_NOTE,
1279 		    "nfs4_callmapid: nfsmapid daemon not " \
1280 		    "running unable to resolve host name\n");
1281 		return (EINVAL);
1282 	}
1283 	door_ki_hold(dh);
1284 	mutex_exit(&nig->nfsidmap_daemon_lock);
1285 
1286 	xdr_len = xdr_sizeof(&(xdr_utf8string), server);
1287 
1288 	xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
1289 	xdr_argsp->xdr_len = xdr_len;
1290 	xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;
1291 
1292 	xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
1293 	    xdr_len, XDR_ENCODE);
1294 
1295 	if (!xdr_utf8string(&xdr, server)) {
1296 		kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1297 		door_ki_rele(dh);
1298 		return (1);
1299 	}
1300 
1301 	if (orig_reslen)
1302 		orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);
1303 
1304 	door_args.data_ptr = (char *)xdr_argsp;
1305 	door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
1306 	door_args.desc_ptr = NULL;
1307 	door_args.desc_num = 0;
1308 	door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
1309 	door_args.rsize = res_len;
1310 
1311 	sigintr(&smask, 1);
1312 	error = door_ki_upcall(dh, &door_args);
1313 	sigunintr(&smask);
1314 
1315 	door_ki_rele(dh);
1316 
1317 	kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
1318 	if (error) {
1319 		kmem_free(orig_resp, orig_reslen);
1320 		/*
1321 		 * There is no door to connect to. The referral daemon
1322 		 * must not be running yet.
1323 		 */
1324 		cmn_err(CE_WARN,
1325 		    "nfsmapid not running cannot resolve host name");
1326 		goto out;
1327 	}
1328 
1329 	/*
1330 	 * If the results buffer passed back are not the same as
1331 	 * what was sent free the old buffer and use the new one.
1332 	 */
1333 	if (orig_resp && orig_reslen) {
1334 		refd_door_res_t *door_resp;
1335 
1336 		door_resp = (refd_door_res_t *)door_args.rbuf;
1337 		if ((void *)door_args.rbuf != orig_resp)
1338 			kmem_free(orig_resp, orig_reslen);
1339 		if (door_resp->res_status == 0) {
1340 			xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
1341 			    door_resp->xdr_len, XDR_DECODE);
1342 			bzero(resp, sizeof (struct nfs_fsl_info));
1343 			if (!xdr_nfs_fsl_info(&xdr, resp)) {
1344 				DTRACE_PROBE2(
1345 				    nfs4clnt__debug__referral__upcall__xdrfail,
1346 				    struct nfs_fsl_info *, resp,
1347 				    char *, "nfs4_callmapid");
1348 				error = EINVAL;
1349 			}
1350 		} else {
1351 			DTRACE_PROBE2(
1352 			    nfs4clnt__debug__referral__upcall__badstatus,
1353 			    int, door_resp->res_status,
1354 			    char *, "nfs4_callmapid");
1355 			error = door_resp->res_status;
1356 		}
1357 		kmem_free(door_args.rbuf, door_args.rsize);
1358 	}
1359 out:
1360 	DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
1361 	    char *, server, int, error);
1362 	return (error);
1363 }
1364 
1365 /*
1366  * Fetches the fs_locations attribute. Typically called
1367  * from a Replication/Migration/Referrals/Mirror-mount context
1368  *
1369  * Fills in the attributes in garp. The caller is assumed
1370  * to have allocated memory for garp.
1371  *
1372  * lock: if set do not lock s_recovlock and mi_recovlock mutex,
1373  *	 it's already done by caller. Otherwise lock these mutexes
1374  *	 before doing the rfs4call().
1375  *
1376  * Returns
1377  * 	1	 for success
1378  * 	0	 for failure
1379  */
1380 int
1381 nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
1382     cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
1383 {
1384 	COMPOUND4args_clnt args;
1385 	COMPOUND4res_clnt res;
1386 	nfs_argop4 *argop;
1387 	int argoplist_size = 3 * sizeof (nfs_argop4);
1388 	nfs4_server_t *sp = NULL;
1389 	int doqueue = 1;
1390 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1391 	int retval = 1;
1392 	struct nfs4_clnt *nfscl;
1393 
1394 	if (lock == TRUE)
1395 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1396 	else
1397 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
1398 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
1399 
1400 	sp = find_nfs4_server(mi);
1401 	if (lock == TRUE)
1402 		nfs_rw_exit(&mi->mi_recovlock);
1403 
1404 	if (sp != NULL)
1405 		mutex_exit(&sp->s_lock);
1406 
1407 	if (lock == TRUE) {
1408 		if (sp != NULL)
1409 			(void) nfs_rw_enter_sig(&sp->s_recovlock,
1410 			    RW_WRITER, 0);
1411 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1412 	} else {
1413 		if (sp != NULL) {
1414 			ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
1415 			    nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
1416 		}
1417 	}
1418 
1419 	/*
1420 	 * Do we want to do the setup for recovery here?
1421 	 *
1422 	 * We know that the server responded to a null ping a very
1423 	 * short time ago, and we know that we intend to do a
1424 	 * single stateless operation - we want to fetch attributes,
1425 	 * so we know we can't encounter errors about state.  If
1426 	 * something goes wrong with the GETATTR, like not being
1427 	 * able to get a response from the server or getting any
1428 	 * kind of FH error, we should fail the mount.
1429 	 *
1430 	 * We may want to re-visited this at a later time.
1431 	 */
1432 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
1433 
1434 	args.ctag = TAG_GETATTR_FSLOCATION;
1435 	/* PUTFH LOOKUP GETATTR */
1436 	args.array_len = 3;
1437 	args.array = argop;
1438 
1439 	/* 0. putfh file */
1440 	argop[0].argop = OP_CPUTFH;
1441 	argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1442 
1443 	/* 1. lookup name, can't be dotdot */
1444 	argop[1].argop = OP_CLOOKUP;
1445 	argop[1].nfs_argop4_u.opclookup.cname = nm;
1446 
1447 	/* 2. file attrs */
1448 	argop[2].argop = OP_GETATTR;
1449 	argop[2].nfs_argop4_u.opgetattr.attr_request =
1450 	    FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
1451 	    FATTR4_MOUNTED_ON_FILEID_MASK;
1452 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
1453 
1454 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1455 
1456 	if (lock == TRUE) {
1457 		nfs_rw_exit(&mi->mi_recovlock);
1458 		if (sp != NULL)
1459 			nfs_rw_exit(&sp->s_recovlock);
1460 	}
1461 
1462 	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1463 	nfscl->nfscl_stat.referrals.value.ui64++;
1464 	DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
1465 	    nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);
1466 
1467 	if (e.error != 0) {
1468 		if (sp != NULL)
1469 			nfs4_server_rele(sp);
1470 		kmem_free(argop, argoplist_size);
1471 		return (0);
1472 	}
1473 
1474 	/*
1475 	 * Check for all possible error conditions.
1476 	 * For valid replies without an ops array or for illegal
1477 	 * replies, return a failure.
1478 	 */
1479 	if (res.status != NFS4_OK || res.array_len < 3 ||
1480 	    res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
1481 		retval = 0;
1482 		goto exit;
1483 	}
1484 
1485 	/*
1486 	 * There isn't much value in putting the attributes
1487 	 * in the attr cache since fs_locations4 aren't
1488 	 * encountered very frequently, so just make them
1489 	 * available to the caller.
1490 	 */
1491 	*garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;
1492 
1493 	DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
1494 	    nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");
1495 
1496 	/* No fs_locations? -- return a failure */
1497 	if (garp->n4g_ext_res == NULL ||
1498 	    garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
1499 		retval = 0;
1500 		goto exit;
1501 	}
1502 
1503 	if (!garp->n4g_fsid_valid)
1504 		retval = 0;
1505 
1506 exit:
1507 	if (retval == 0) {
1508 		/* the call was ok but failed validating the call results */
1509 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1510 	} else {
1511 		ASSERT(callres != NULL);
1512 		*callres = res;
1513 	}
1514 
1515 	if (sp != NULL)
1516 		nfs4_server_rele(sp);
1517 	kmem_free(argop, argoplist_size);
1518 	return (retval);
1519 }
1520 
1521 /* tunable to disable referral mounts */
1522 int nfs4_no_referrals = 0;
1523 
1524 /*
1525  * Returns NULL if the vnode cannot be created or found.
1526  */
1527 vnode_t *
1528 find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
1529 {
1530 	nfs_fh4 *stub_fh, *dfh;
1531 	nfs4_sharedfh_t *sfhp;
1532 	char *newfhval;
1533 	vnode_t *vp = NULL;
1534 	fattr4_mounted_on_fileid mnt_on_fileid;
1535 	nfs4_ga_res_t garp;
1536 	mntinfo4_t *mi;
1537 	COMPOUND4res_clnt callres;
1538 	hrtime_t t;
1539 
1540 	if (nfs4_no_referrals)
1541 		return (NULL);
1542 
1543 	/*
1544 	 * Get the mounted_on_fileid, unique on that server::fsid
1545 	 */
1546 	mi = VTOMI4(dvp);
1547 	if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
1548 	    &garp, &callres, FALSE) == 0)
1549 		return (NULL);
1550 	mnt_on_fileid = garp.n4g_mon_fid;
1551 	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1552 
1553 	/*
1554 	 * Build a fake filehandle from the dir FH and the mounted_on_fileid
1555 	 */
1556 	dfh = &VTOR4(dvp)->r_fh->sfh_fh;
1557 	stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
1558 	stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
1559 	    sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
1560 	newfhval = stub_fh->nfs_fh4_val;
1561 
1562 	/* copy directory's file handle */
1563 	bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
1564 	stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
1565 	newfhval = newfhval + dfh->nfs_fh4_len;
1566 
1567 	/* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
1568 	bcopy((char *)&mnt_on_fileid, newfhval,
1569 	    sizeof (fattr4_mounted_on_fileid));
1570 	stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);
1571 
1572 	sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
1573 	kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
1574 	    sizeof (fattr4_mounted_on_fileid));
1575 	kmem_free(stub_fh, sizeof (nfs_fh4));
1576 	if (sfhp == NULL)
1577 		return (NULL);
1578 
1579 	t = gethrtime();
1580 	garp.n4g_va.va_type = VDIR;
1581 	vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
1582 	    cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
1583 
1584 	if (vp != NULL)
1585 		vp->v_type = VDIR;
1586 
1587 	sfh4_rele(&sfhp);
1588 	return (vp);
1589 }
1590 
1591 int
1592 nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1593 {
1594 	vnode_t *nvp;
1595 	rnode4_t *rp;
1596 
1597 	if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
1598 		return (EINVAL);
1599 
1600 	rp = VTOR4(nvp);
1601 	mutex_enter(&rp->r_statelock);
1602 	r4_stub_referral(rp);
1603 	mutex_exit(&rp->r_statelock);
1604 	dnlc_enter(dvp, nm, nvp);
1605 
1606 	if (*vpp != NULL)
1607 		VN_RELE(*vpp);	/* no longer need this vnode */
1608 
1609 	*vpp = nvp;
1610 
1611 	return (0);
1612 }
1613 
1614 /*
1615  * Fetch the location information and resolve the new server.
1616  * Caller needs to free up the XDR data which is returned.
1617  * Input: mount info, shared filehandle, nodename
1618  * Return: Index to the result or Error(-1)
1619  * Output: FsLocations Info, Resolved Server Info.
1620  */
1621 int
1622 nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
1623     char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
1624     struct nfs_fsl_info *fsloc)
1625 {
1626 	fs_location4 *fsp;
1627 	struct nfs_fsl_info nfsfsloc;
1628 	int ret, i, error;
1629 	nfs4_ga_res_t garp;
1630 	COMPOUND4res_clnt callres;
1631 	struct knetconfig *knc;
1632 
1633 	ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
1634 	if (ret == 0)
1635 		return (-1);
1636 
1637 	/*
1638 	 * As a lame attempt to figuring out if we're
1639 	 * handling a migration event or a referral,
1640 	 * look for rnodes with this fsid in the rnode
1641 	 * cache.
1642 	 *
1643 	 * If we can find one or more such rnodes, it
1644 	 * means we're handling a migration event and
1645 	 * we want to bail out in that case.
1646 	 */
1647 	if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
1648 		DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
1649 		    mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
1650 		    char *, "nfs4_process_referral");
1651 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1652 		return (-1);
1653 	}
1654 
1655 	/*
1656 	 * Find the first responsive server to mount.  When we find
1657 	 * one, fsp will point to it.
1658 	 */
1659 	for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {
1660 
1661 		fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
1662 		if (fsp->server_len == 0 || fsp->server_val == NULL)
1663 			continue;
1664 
1665 		error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
1666 		if (error != 0)
1667 			continue;
1668 
1669 		error = nfs4_ping_server_common(nfsfsloc.knconf,
1670 		    nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
1671 		if (error == RPC_SUCCESS)
1672 			break;
1673 
1674 		DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
1675 		    sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
1676 		    char *, "nfs4_process_referral");
1677 
1678 		xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1679 	}
1680 	knc = nfsfsloc.knconf;
1681 	if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
1682 	    (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
1683 		DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
1684 		    nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
1685 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1686 		return (-1);
1687 	}
1688 
1689 	/* Send the results back */
1690 	*fsloc = nfsfsloc;
1691 	*grp = garp;
1692 	*res = callres;
1693 	return (i);
1694 }
1695 
1696 /*
1697  * Referrals case - need to fetch referral data and then upcall to
1698  * user-level to get complete mount data.
1699  */
1700 static ephemeral_servinfo_t *
1701 nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
1702 {
1703 	struct knetconfig	*sikncp, *svkncp;
1704 	struct netbuf		*bufp;
1705 	ephemeral_servinfo_t	*esi;
1706 	vnode_t			*dvp;
1707 	rnode4_t		*drp;
1708 	fs_location4		*fsp;
1709 	struct nfs_fsl_info	nfsfsloc;
1710 	nfs4_ga_res_t		garp;
1711 	char			*p;
1712 	char			fn[MAXNAMELEN];
1713 	int			i, index = -1;
1714 	mntinfo4_t		*mi;
1715 	COMPOUND4res_clnt	callres;
1716 
1717 	/*
1718 	 * If we're passed in a stub vnode that
1719 	 * isn't a "referral" stub, bail out
1720 	 * and return a failure
1721 	 */
1722 	if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
1723 		return (NULL);
1724 
1725 	if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
1726 		return (NULL);
1727 
1728 	drp = VTOR4(dvp);
1729 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
1730 		VN_RELE(dvp);
1731 		return (NULL);
1732 	}
1733 
1734 	if (vtoname(vp, fn, MAXNAMELEN) != 0) {
1735 		nfs_rw_exit(&drp->r_rwlock);
1736 		VN_RELE(dvp);
1737 		return (NULL);
1738 	}
1739 
1740 	mi = VTOMI4(dvp);
1741 	index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
1742 	    &garp, &callres, &nfsfsloc);
1743 	nfs_rw_exit(&drp->r_rwlock);
1744 	VN_RELE(dvp);
1745 	if (index < 0)
1746 		return (NULL);
1747 
1748 	fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1749 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1750 
1751 	/* initially set to be our type of ephemeral mount; may be added to */
1752 	esi->esi_mount_flags = NFSMNT_REFERRAL;
1753 
1754 	esi->esi_hostname =
1755 	    kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
1756 	bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
1757 	    fsp->server_val->utf8string_len);
1758 	esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';
1759 
1760 	bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
1761 	bufp->len = nfsfsloc.addr->len;
1762 	bufp->maxlen = nfsfsloc.addr->maxlen;
1763 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1764 	bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
1765 	esi->esi_addr = bufp;
1766 
1767 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1768 	sikncp = esi->esi_knconf;
1769 
1770 	DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
1771 	    struct nfs_fsl_info *, &nfsfsloc,
1772 	    char *, "nfs4_trigger_esi_create_referral");
1773 
1774 	svkncp = nfsfsloc.knconf;
1775 	sikncp->knc_semantics = svkncp->knc_semantics;
1776 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1777 	(void) strlcat((char *)sikncp->knc_protofmly,
1778 	    (char *)svkncp->knc_protofmly, KNC_STRSIZE);
1779 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1780 	(void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
1781 	    KNC_STRSIZE);
1782 	sikncp->knc_rdev = svkncp->knc_rdev;
1783 
1784 	DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
1785 	    struct knetconfig *, sikncp,
1786 	    char *, "nfs4_trigger_esi_create_referral");
1787 
1788 	esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
1789 	bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
1790 	esi->esi_syncaddr = NULL;
1791 
1792 	esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1793 	esi->esi_path_len = MAXPATHLEN;
1794 	*p++ = '/';
1795 	for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1796 		component4 *comp;
1797 
1798 		comp = &fsp->rootpath.pathname4_val[i];
1799 		/* If no space, null the string and bail */
1800 		if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
1801 			goto err;
1802 		bcopy(comp->utf8string_val, p, comp->utf8string_len);
1803 		p += comp->utf8string_len;
1804 		*p++ = '/';
1805 	}
1806 	if (fsp->rootpath.pathname4_len != 0)
1807 		*(p - 1) = '\0';
1808 	else
1809 		*p = '\0';
1810 	p = esi->esi_path;
1811 	esi->esi_path = strdup(p);
1812 	esi->esi_path_len = strlen(p) + 1;
1813 	kmem_free(p, MAXPATHLEN);
1814 
1815 	/* Allocated in nfs4_process_referral() */
1816 	xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1817 	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1818 
1819 	return (esi);
1820 err:
1821 	kmem_free(esi->esi_path, esi->esi_path_len);
1822 	kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
1823 	kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
1824 	kmem_free(esi->esi_addr, sizeof (struct netbuf));
1825 	kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
1826 	kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
1827 	kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
1828 	kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
1829 	kmem_free(esi, sizeof (ephemeral_servinfo_t));
1830 	xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1831 	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1832 	return (NULL);
1833 }
1834 
1835 /*
1836  * Assemble the args, and call the generic VFS mount function to
1837  * finally perform the ephemeral mount.
1838  */
1839 static int
1840 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1841     cred_t *cr, vnode_t **newvpp)
1842 {
1843 	struct mounta	*uap;
1844 	char		*mntpt, *orig_path, *path;
1845 	const char	*orig_mntpt;
1846 	int		retval;
1847 	int		mntpt_len;
1848 	int		spec_len;
1849 	zone_t		*zone = curproc->p_zone;
1850 	bool_t		has_leading_slash;
1851 	int		i;
1852 
1853 	vfs_t			*stubvfsp = stubvp->v_vfsp;
1854 	ephemeral_servinfo_t	*esi = dma->dma_esi;
1855 	struct nfs_args		*nargs = dma->dma_nargs;
1856 
1857 	/* first, construct the mount point for the ephemeral mount */
1858 	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1859 	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1860 
1861 	if (*orig_path == '.')
1862 		orig_path++;
1863 
1864 	/*
1865 	 * Get rid of zone's root path
1866 	 */
1867 	if (zone != global_zone) {
1868 		/*
1869 		 * -1 for trailing '/' and -1 for EOS.
1870 		 */
1871 		if (strncmp(zone->zone_rootpath, orig_mntpt,
1872 		    zone->zone_rootpathlen - 1) == 0) {
1873 			orig_mntpt += (zone->zone_rootpathlen - 2);
1874 		}
1875 	}
1876 
1877 	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1878 	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1879 	(void) strcat(mntpt, orig_mntpt);
1880 	(void) strcat(mntpt, orig_path);
1881 
1882 	kmem_free(path, strlen(path) + 1);
1883 	path = esi->esi_path;
1884 	if (*path == '.')
1885 		path++;
1886 	if (path[0] == '/' && path[1] == '/')
1887 		path++;
1888 	has_leading_slash = (*path == '/');
1889 
1890 	spec_len = strlen(dma->dma_hostlist);
1891 	spec_len += strlen(path);
1892 
1893 	/* We are going to have to add this in */
1894 	if (!has_leading_slash)
1895 		spec_len++;
1896 
1897 	/* We need to get the ':' for dma_hostlist:esi_path */
1898 	spec_len++;
1899 
1900 	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1901 	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1902 	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1903 	    has_leading_slash ? "" : "/", path);
1904 
1905 	uap->dir = mntpt;
1906 
1907 	uap->flags = MS_SYSSPACE | MS_DATA;
1908 	/* fstype-independent mount options not covered elsewhere */
1909 	/* copy parent's mount(1M) "-m" flag */
1910 	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1911 		uap->flags |= MS_NOMNTTAB;
1912 
1913 	uap->fstype = MNTTYPE_NFS4;
1914 	uap->dataptr = (char *)nargs;
1915 	/* not needed for MS_SYSSPACE */
1916 	uap->datalen = 0;
1917 
1918 	/* use optptr to pass in extra mount options */
1919 	uap->flags |= MS_OPTIONSTR;
1920 	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1921 	if (uap->optptr == NULL) {
1922 		retval = EINVAL;
1923 		goto done;
1924 	}
1925 
1926 	/* domount() expects us to count the trailing NUL */
1927 	uap->optlen = strlen(uap->optptr) + 1;
1928 
1929 	/*
1930 	 * If we get EBUSY, we try again once to see if we can perform
1931 	 * the mount. We do this because of a spurious race condition.
1932 	 */
1933 	for (i = 0; i < 2; i++) {
1934 		int	error;
1935 		bool_t	was_mounted;
1936 
1937 		retval = domount(NULL, uap, stubvp, cr, vfsp);
1938 		if (retval == 0) {
1939 			retval = VFS_ROOT(*vfsp, newvpp);
1940 			VFS_RELE(*vfsp);
1941 			break;
1942 		} else if (retval != EBUSY) {
1943 			break;
1944 		}
1945 
1946 		/*
1947 		 * We might find it mounted by the other racer...
1948 		 */
1949 		error = nfs4_trigger_mounted_already(stubvp,
1950 		    newvpp, &was_mounted, vfsp);
1951 		if (error) {
1952 			goto done;
1953 		} else if (was_mounted) {
1954 			retval = 0;
1955 			break;
1956 		}
1957 	}
1958 
1959 done:
1960 	if (uap->optptr)
1961 		nfs4_trigger_destroy_mntopts(uap->optptr);
1962 
1963 	kmem_free(uap->spec, spec_len + 1);
1964 	kmem_free(uap, sizeof (struct mounta));
1965 	kmem_free(mntpt, mntpt_len + 1);
1966 
1967 	return (retval);
1968 }
1969 
1970 /*
1971  * Build an nfs_args structure for passing to domount().
1972  *
1973  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1974  * generic data - common to all ephemeral mount types - is read directly
1975  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1976  */
1977 static struct nfs_args *
1978 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1979     ephemeral_servinfo_t *esi)
1980 {
1981 	sec_data_t *secdata;
1982 	struct nfs_args *nargs;
1983 
1984 	/* setup the nfs args */
1985 	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1986 
1987 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1988 
1989 	nargs->addr = esi->esi_addr;
1990 
1991 	/* for AUTH_DH by negotiation */
1992 	if (esi->esi_syncaddr || esi->esi_netname) {
1993 		nargs->flags |= NFSMNT_SECURE;
1994 		nargs->syncaddr = esi->esi_syncaddr;
1995 		nargs->netname = esi->esi_netname;
1996 	}
1997 
1998 	nargs->flags |= NFSMNT_KNCONF;
1999 	nargs->knconf = esi->esi_knconf;
2000 	nargs->flags |= NFSMNT_HOSTNAME;
2001 	nargs->hostname = esi->esi_hostname;
2002 	nargs->fh = esi->esi_path;
2003 
2004 	/* general mount settings, all copied from parent mount */
2005 	mutex_enter(&mi->mi_lock);
2006 
2007 	if (!(mi->mi_flags & MI4_HARD))
2008 		nargs->flags |= NFSMNT_SOFT;
2009 
2010 	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
2011 	    NFSMNT_RETRANS;
2012 	nargs->wsize = mi->mi_stsize;
2013 	nargs->rsize = mi->mi_tsize;
2014 	nargs->timeo = mi->mi_timeo;
2015 	nargs->retrans = mi->mi_retrans;
2016 
2017 	if (mi->mi_flags & MI4_INT)
2018 		nargs->flags |= NFSMNT_INT;
2019 	if (mi->mi_flags & MI4_NOAC)
2020 		nargs->flags |= NFSMNT_NOAC;
2021 
2022 	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
2023 	    NFSMNT_ACDIRMAX;
2024 	nargs->acregmin = HR2SEC(mi->mi_acregmin);
2025 	nargs->acregmax = HR2SEC(mi->mi_acregmax);
2026 	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
2027 	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
2028 
2029 	/* add any specific flags for this type of ephemeral mount */
2030 	nargs->flags |= esi->esi_mount_flags;
2031 
2032 	if (mi->mi_flags & MI4_NOCTO)
2033 		nargs->flags |= NFSMNT_NOCTO;
2034 	if (mi->mi_flags & MI4_GRPID)
2035 		nargs->flags |= NFSMNT_GRPID;
2036 	if (mi->mi_flags & MI4_LLOCK)
2037 		nargs->flags |= NFSMNT_LLOCK;
2038 	if (mi->mi_flags & MI4_NOPRINT)
2039 		nargs->flags |= NFSMNT_NOPRINT;
2040 	if (mi->mi_flags & MI4_DIRECTIO)
2041 		nargs->flags |= NFSMNT_DIRECTIO;
2042 	if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
2043 		nargs->flags |= NFSMNT_PUBLIC;
2044 
2045 	/* Do some referral-specific option tweaking */
2046 	if (nargs->flags & NFSMNT_REFERRAL) {
2047 		nargs->flags &= ~NFSMNT_DORDMA;
2048 		nargs->flags |= NFSMNT_TRYRDMA;
2049 	}
2050 
2051 	mutex_exit(&mi->mi_lock);
2052 
2053 	/*
2054 	 * Security data & negotiation policy.
2055 	 *
2056 	 * For mirror mounts, we need to preserve the parent mount's
2057 	 * preference for security negotiation, translating SV4_TRYSECDEFAULT
2058 	 * to NFSMNT_SECDEFAULT if present.
2059 	 *
2060 	 * For referrals, we always want security negotiation and will
2061 	 * set NFSMNT_SECDEFAULT and we will not copy current secdata.
2062 	 * The reason is that we can't negotiate down from a parent's
2063 	 * Kerberos flavor to AUTH_SYS.
2064 	 *
2065 	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
2066 	 * security flavour was requested, with data in sv_secdata, and that
2067 	 * no negotiation should occur. If this specified flavour fails, that's
2068 	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
2069 	 *
2070 	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
2071 	 * default flavour, in sv_secdata, but then negotiate a new flavour.
2072 	 * Possible flavours are recorded in an array in sv_secinfo, with
2073 	 * currently in-use flavour pointed to by sv_currsec.
2074 	 *
2075 	 * If sv_currsec is set, i.e. if negotiation has already occurred,
2076 	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
2077 	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
2078 	 */
2079 	if (nargs->flags & NFSMNT_REFERRAL) {
2080 		/* enable negotiation for referral mount */
2081 		nargs->flags |= NFSMNT_SECDEFAULT;
2082 		secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
2083 		secdata->secmod = secdata->rpcflavor = AUTH_SYS;
2084 		secdata->data = NULL;
2085 	} else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
2086 		/* enable negotiation for mirror mount */
2087 		nargs->flags |= NFSMNT_SECDEFAULT;
2088 
2089 		/*
2090 		 * As a starting point for negotiation, copy parent
2091 		 * mount's negotiated flavour (sv_currsec) if available,
2092 		 * or its passed-in flavour (sv_secdata) if not.
2093 		 */
2094 		if (svp->sv_currsec != NULL)
2095 			secdata = copy_sec_data(svp->sv_currsec);
2096 		else if (svp->sv_secdata != NULL)
2097 			secdata = copy_sec_data(svp->sv_secdata);
2098 		else
2099 			secdata = NULL;
2100 	} else {
2101 		/* do not enable negotiation; copy parent's passed-in flavour */
2102 		if (svp->sv_secdata != NULL)
2103 			secdata = copy_sec_data(svp->sv_secdata);
2104 		else
2105 			secdata = NULL;
2106 	}
2107 
2108 	nfs_rw_exit(&svp->sv_lock);
2109 
2110 	nargs->flags |= NFSMNT_NEWARGS;
2111 	nargs->nfs_args_ext = NFS_ARGS_EXTB;
2112 	nargs->nfs_ext_u.nfs_extB.secdata = secdata;
2113 
2114 	/* for NFS RO failover; caller will set if necessary */
2115 	nargs->nfs_ext_u.nfs_extB.next = NULL;
2116 
2117 	return (nargs);
2118 }
2119 
2120 static void
2121 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
2122 {
2123 	/*
2124 	 * Either the mount failed, in which case the data is not needed, or
2125 	 * nfs4_mount() has either taken copies of what it needs or,
2126 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
2127 	 * whereby nfs4_free_args() will ignore it.
2128 	 */
2129 	nfs4_free_args(nargs);
2130 	kmem_free(nargs, sizeof (struct nfs_args));
2131 }
2132 
2133 /*
2134  * When we finally get into the mounting, we need to add this
2135  * node to the ephemeral tree.
2136  *
2137  * This is called from nfs4_mount().
2138  */
2139 int
2140 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
2141 {
2142 	mntinfo4_t		*mi_parent;
2143 	nfs4_ephemeral_t	*eph;
2144 	nfs4_ephemeral_tree_t	*net;
2145 
2146 	nfs4_ephemeral_t	*prior;
2147 	nfs4_ephemeral_t	*child;
2148 
2149 	nfs4_ephemeral_t	*peer;
2150 
2151 	nfs4_trigger_globals_t	*ntg;
2152 	zone_t			*zone = curproc->p_zone;
2153 
2154 	int			rc = 0;
2155 
2156 	mi_parent = VTOMI4(mvp);
2157 
2158 	/*
2159 	 * Get this before grabbing anything else!
2160 	 */
2161 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2162 	if (!ntg->ntg_thread_started) {
2163 		nfs4_ephemeral_start_harvester(ntg);
2164 	}
2165 
2166 	mutex_enter(&mi_parent->mi_lock);
2167 	mutex_enter(&mi->mi_lock);
2168 
2169 	net = mi->mi_ephemeral_tree =
2170 	    mi_parent->mi_ephemeral_tree;
2171 
2172 	/*
2173 	 * If the mi_ephemeral_tree is NULL, then it
2174 	 * means that either the harvester or a manual
2175 	 * umount has cleared the tree out right before
2176 	 * we got here.
2177 	 *
2178 	 * There is nothing we can do here, so return
2179 	 * to the caller and let them decide whether they
2180 	 * try again.
2181 	 */
2182 	if (net == NULL) {
2183 		mutex_exit(&mi->mi_lock);
2184 		mutex_exit(&mi_parent->mi_lock);
2185 
2186 		return (EBUSY);
2187 	}
2188 
2189 	/*
2190 	 * We've just tied the mntinfo to the tree, so
2191 	 * now we bump the refcnt and hold it there until
2192 	 * this mntinfo is removed from the tree.
2193 	 */
2194 	nfs4_ephemeral_tree_hold(net);
2195 
2196 	/*
2197 	 * We need to tack together the ephemeral mount
2198 	 * with this new mntinfo.
2199 	 */
2200 	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
2201 	eph->ne_mount = mi;
2202 	MI4_HOLD(mi);
2203 	VFS_HOLD(mi->mi_vfsp);
2204 	eph->ne_ref_time = gethrestime_sec();
2205 
2206 	/*
2207 	 * We need to tell the ephemeral mount when
2208 	 * to time out.
2209 	 */
2210 	eph->ne_mount_to = ntg->ntg_mount_to;
2211 
2212 	mi->mi_ephemeral = eph;
2213 
2214 	/*
2215 	 * If the enclosing mntinfo4 is also ephemeral,
2216 	 * then we need to point to its enclosing parent.
2217 	 * Else the enclosing mntinfo4 is the enclosing parent.
2218 	 *
2219 	 * We also need to weave this ephemeral node
2220 	 * into the tree.
2221 	 */
2222 	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
2223 		/*
2224 		 * We need to decide if we are
2225 		 * the root node of this branch
2226 		 * or if we are a sibling of this
2227 		 * branch.
2228 		 */
2229 		prior = mi_parent->mi_ephemeral;
2230 		if (prior == NULL) {
2231 			/*
2232 			 * Race condition, clean up, and
2233 			 * let caller handle mntinfo.
2234 			 */
2235 			mi->mi_flags &= ~MI4_EPHEMERAL;
2236 			mi->mi_ephemeral = NULL;
2237 			kmem_free(eph, sizeof (*eph));
2238 			VFS_RELE(mi->mi_vfsp);
2239 			MI4_RELE(mi);
2240 			nfs4_ephemeral_tree_rele(net);
2241 			rc = EBUSY;
2242 		} else {
2243 			if (prior->ne_child == NULL) {
2244 				prior->ne_child = eph;
2245 			} else {
2246 				child = prior->ne_child;
2247 
2248 				prior->ne_child = eph;
2249 				eph->ne_peer = child;
2250 
2251 				child->ne_prior = eph;
2252 			}
2253 
2254 			eph->ne_prior = prior;
2255 		}
2256 	} else {
2257 		/*
2258 		 * The parent mntinfo4 is the non-ephemeral
2259 		 * root of the ephemeral tree. We
2260 		 * need to decide if we are the root
2261 		 * node of that tree or if we are a
2262 		 * sibling of the root node.
2263 		 *
2264 		 * We are the root if there is no
2265 		 * other node.
2266 		 */
2267 		if (net->net_root == NULL) {
2268 			net->net_root = eph;
2269 		} else {
2270 			eph->ne_peer = peer = net->net_root;
2271 			ASSERT(peer != NULL);
2272 			net->net_root = eph;
2273 
2274 			peer->ne_prior = eph;
2275 		}
2276 
2277 		eph->ne_prior = NULL;
2278 	}
2279 
2280 	mutex_exit(&mi->mi_lock);
2281 	mutex_exit(&mi_parent->mi_lock);
2282 
2283 	return (rc);
2284 }
2285 
2286 /*
2287  * Commit the changes to the ephemeral tree for removing this node.
2288  */
2289 static void
2290 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
2291 {
2292 	nfs4_ephemeral_t	*e = eph;
2293 	nfs4_ephemeral_t	*peer;
2294 	nfs4_ephemeral_t	*prior;
2295 
2296 	peer = eph->ne_peer;
2297 	prior = e->ne_prior;
2298 
2299 	/*
2300 	 * If this branch root was not the
2301 	 * tree root, then we need to fix back pointers.
2302 	 */
2303 	if (prior) {
2304 		if (prior->ne_child == e) {
2305 			prior->ne_child = peer;
2306 		} else {
2307 			prior->ne_peer = peer;
2308 		}
2309 
2310 		if (peer)
2311 			peer->ne_prior = prior;
2312 	} else if (peer) {
2313 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
2314 		peer->ne_prior = NULL;
2315 	} else {
2316 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
2317 	}
2318 }
2319 
2320 /*
2321  * We want to avoid recursion at all costs. So we need to
2322  * unroll the tree. We do this by a depth first traversal to
2323  * leaf nodes. We blast away the leaf and work our way back
2324  * up and down the tree.
2325  */
2326 static int
2327 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
2328     int isTreeRoot, int flag, cred_t *cr)
2329 {
2330 	nfs4_ephemeral_t	*e = eph;
2331 	nfs4_ephemeral_t	*prior;
2332 	mntinfo4_t		*mi;
2333 	vfs_t			*vfsp;
2334 	int			error;
2335 
2336 	/*
2337 	 * We use the loop while unrolling the ephemeral tree.
2338 	 */
2339 	for (;;) {
2340 		/*
2341 		 * First we walk down the child.
2342 		 */
2343 		if (e->ne_child) {
2344 			prior = e;
2345 			e = e->ne_child;
2346 			continue;
2347 		}
2348 
2349 		/*
2350 		 * If we are the root of the branch we are removing,
2351 		 * we end it here. But if the branch is the root of
2352 		 * the tree, we have to forge on. We do not consider
2353 		 * the peer list for the root because while it may
2354 		 * be okay to remove, it is both extra work and a
2355 		 * potential for a false-positive error to stall the
2356 		 * unmount attempt.
2357 		 */
2358 		if (e == eph && isTreeRoot == FALSE)
2359 			return (0);
2360 
2361 		/*
2362 		 * Next we walk down the peer list.
2363 		 */
2364 		if (e->ne_peer) {
2365 			prior = e;
2366 			e = e->ne_peer;
2367 			continue;
2368 		}
2369 
2370 		/*
2371 		 * We can only remove the node passed in by the
2372 		 * caller if it is the root of the ephemeral tree.
2373 		 * Otherwise, the caller will remove it.
2374 		 */
2375 		if (e == eph && isTreeRoot == FALSE)
2376 			return (0);
2377 
2378 		/*
2379 		 * Okay, we have a leaf node, time
2380 		 * to prune it!
2381 		 *
2382 		 * Note that prior can only be NULL if
2383 		 * and only if it is the root of the
2384 		 * ephemeral tree.
2385 		 */
2386 		prior = e->ne_prior;
2387 
2388 		mi = e->ne_mount;
2389 		mutex_enter(&mi->mi_lock);
2390 		vfsp = mi->mi_vfsp;
2391 		ASSERT(vfsp != NULL);
2392 
2393 		/*
2394 		 * Cleared by umount2_engine.
2395 		 */
2396 		VFS_HOLD(vfsp);
2397 
2398 		/*
2399 		 * Inform nfs4_unmount to not recursively
2400 		 * descend into this node's children when it
2401 		 * gets processed.
2402 		 */
2403 		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
2404 		mutex_exit(&mi->mi_lock);
2405 
2406 		error = umount2_engine(vfsp, flag, cr, FALSE);
2407 		if (error) {
2408 			/*
2409 			 * We need to reenable nfs4_unmount's ability
2410 			 * to recursively descend on this node.
2411 			 */
2412 			mutex_enter(&mi->mi_lock);
2413 			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
2414 			mutex_exit(&mi->mi_lock);
2415 
2416 			return (error);
2417 		}
2418 
2419 		/*
2420 		 * If we are the current node, we do not want to
2421 		 * touch anything else. At this point, the only
2422 		 * way the current node can have survived to here
2423 		 * is if it is the root of the ephemeral tree and
2424 		 * we are unmounting the enclosing mntinfo4.
2425 		 */
2426 		if (e == eph) {
2427 			ASSERT(prior == NULL);
2428 			return (0);
2429 		}
2430 
2431 		/*
2432 		 * Stitch up the prior node. Note that since
2433 		 * we have handled the root of the tree, prior
2434 		 * must be non-NULL.
2435 		 */
2436 		ASSERT(prior != NULL);
2437 		if (prior->ne_child == e) {
2438 			prior->ne_child = NULL;
2439 		} else {
2440 			ASSERT(prior->ne_peer == e);
2441 
2442 			prior->ne_peer = NULL;
2443 		}
2444 
2445 		e = prior;
2446 	}
2447 
2448 	/* NOTREACHED */
2449 }
2450 
2451 /*
2452  * Common code to safely release net_cnt_lock and net_tree_lock
2453  */
2454 void
2455 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
2456     nfs4_ephemeral_tree_t **pnet)
2457 {
2458 	nfs4_ephemeral_tree_t	*net = *pnet;
2459 
2460 	if (*pmust_unlock) {
2461 		mutex_enter(&net->net_cnt_lock);
2462 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
2463 		mutex_exit(&net->net_cnt_lock);
2464 
2465 		mutex_exit(&net->net_tree_lock);
2466 
2467 		*pmust_unlock = FALSE;
2468 	}
2469 }
2470 
2471 /*
2472  * While we may have removed any child or sibling nodes of this
2473  * ephemeral node, we can not nuke it until we know that there
2474  * were no actived vnodes on it. This will do that final
2475  * work once we know it is not busy.
2476  */
2477 void
2478 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
2479     nfs4_ephemeral_tree_t **pnet)
2480 {
2481 	/*
2482 	 * Now we need to get rid of the ephemeral data if it exists.
2483 	 */
2484 	mutex_enter(&mi->mi_lock);
2485 	if (mi->mi_ephemeral) {
2486 		/*
2487 		 * If we are the root node of an ephemeral branch
2488 		 * which is being removed, then we need to fixup
2489 		 * pointers into and out of the node.
2490 		 */
2491 		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
2492 			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
2493 
2494 		nfs4_ephemeral_tree_rele(*pnet);
2495 		ASSERT(mi->mi_ephemeral != NULL);
2496 
2497 		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
2498 		mi->mi_ephemeral = NULL;
2499 		VFS_RELE(mi->mi_vfsp);
2500 		MI4_RELE(mi);
2501 	}
2502 	mutex_exit(&mi->mi_lock);
2503 
2504 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2505 }
2506 
2507 /*
2508  * Unmount an ephemeral node.
2509  *
2510  * Note that if this code fails, then it must unlock.
2511  *
2512  * If it succeeds, then the caller must be prepared to do so.
2513  */
2514 int
2515 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
2516     bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
2517 {
2518 	int			error = 0;
2519 	nfs4_ephemeral_t	*eph;
2520 	nfs4_ephemeral_tree_t	*net;
2521 	int			is_derooting = FALSE;
2522 	int			is_recursed = FALSE;
2523 	int			was_locked = FALSE;
2524 
2525 	/*
2526 	 * Make sure to set the default state for cleaning
2527 	 * up the tree in the caller (and on the way out).
2528 	 */
2529 	*pmust_unlock = FALSE;
2530 
2531 	/*
2532 	 * The active vnodes on this file system may be ephemeral
2533 	 * children. We need to check for and try to unmount them
2534 	 * here. If any can not be unmounted, we are going
2535 	 * to return EBUSY.
2536 	 */
2537 	mutex_enter(&mi->mi_lock);
2538 
2539 	/*
2540 	 * If an ephemeral tree, we need to check to see if
2541 	 * the lock is already held. If it is, then we need
2542 	 * to see if we are being called as a result of
2543 	 * the recursive removal of some node of the tree or
2544 	 * if we are another attempt to remove the tree.
2545 	 *
2546 	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
2547 	 * node. mi_ephemeral being non-NULL also does this.
2548 	 *
2549 	 * mi_ephemeral_tree being non-NULL is sufficient
2550 	 * to also indicate either it is an ephemeral node
2551 	 * or the enclosing mntinfo4.
2552 	 *
2553 	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
2554 	 * when we delete the ephemeral node and need to
2555 	 * differentiate from an ephemeral node and the
2556 	 * enclosing root node.
2557 	 */
2558 	*pnet = net = mi->mi_ephemeral_tree;
2559 	if (net == NULL) {
2560 		mutex_exit(&mi->mi_lock);
2561 		return (0);
2562 	}
2563 
2564 	eph = mi->mi_ephemeral;
2565 	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
2566 	is_derooting = (eph == NULL);
2567 
2568 	mutex_enter(&net->net_cnt_lock);
2569 
2570 	/*
2571 	 * If this is not recursion, then we need to
2572 	 * check to see if a harvester thread has
2573 	 * already grabbed the lock.
2574 	 *
2575 	 * After we exit this branch, we may not
2576 	 * blindly return, we need to jump to
2577 	 * is_busy!
2578 	 */
2579 	if (!is_recursed) {
2580 		if (net->net_status &
2581 		    NFS4_EPHEMERAL_TREE_LOCKED) {
2582 			/*
2583 			 * If the tree is locked, we need
2584 			 * to decide whether we are the
2585 			 * harvester or some explicit call
2586 			 * for a umount. The only way that
2587 			 * we are the harvester is if
2588 			 * MS_SYSSPACE is set.
2589 			 *
2590 			 * We only let the harvester through
2591 			 * at this point.
2592 			 *
2593 			 * We return EBUSY so that the
2594 			 * caller knows something is
2595 			 * going on. Note that by that
2596 			 * time, the umount in the other
2597 			 * thread may have already occured.
2598 			 */
2599 			if (!(flag & MS_SYSSPACE)) {
2600 				mutex_exit(&net->net_cnt_lock);
2601 				mutex_exit(&mi->mi_lock);
2602 
2603 				return (EBUSY);
2604 			}
2605 
2606 			was_locked = TRUE;
2607 		}
2608 	}
2609 
2610 	mutex_exit(&net->net_cnt_lock);
2611 	mutex_exit(&mi->mi_lock);
2612 
2613 	/*
2614 	 * If we are not the harvester, we need to check
2615 	 * to see if we need to grab the tree lock.
2616 	 */
2617 	if (was_locked == FALSE) {
2618 		/*
2619 		 * If we grab the lock, it means that no other
2620 		 * operation is working on the tree. If we don't
2621 		 * grab it, we need to decide if this is because
2622 		 * we are a recursive call or a new operation.
2623 		 */
2624 		if (mutex_tryenter(&net->net_tree_lock)) {
2625 			*pmust_unlock = TRUE;
2626 		} else {
2627 			/*
2628 			 * If we are a recursive call, we can
2629 			 * proceed without the lock.
2630 			 * Otherwise we have to wait until
2631 			 * the lock becomes free.
2632 			 */
2633 			if (!is_recursed) {
2634 				mutex_enter(&net->net_cnt_lock);
2635 				if (net->net_status &
2636 				    (NFS4_EPHEMERAL_TREE_DEROOTING
2637 				    | NFS4_EPHEMERAL_TREE_INVALID)) {
2638 					mutex_exit(&net->net_cnt_lock);
2639 					goto is_busy;
2640 				}
2641 				mutex_exit(&net->net_cnt_lock);
2642 
2643 				/*
2644 				 * We can't hold any other locks whilst
2645 				 * we wait on this to free up.
2646 				 */
2647 				mutex_enter(&net->net_tree_lock);
2648 
2649 				/*
2650 				 * Note that while mi->mi_ephemeral
2651 				 * may change and thus we have to
2652 				 * update eph, it is the case that
2653 				 * we have tied down net and
2654 				 * do not care if mi->mi_ephemeral_tree
2655 				 * has changed.
2656 				 */
2657 				mutex_enter(&mi->mi_lock);
2658 				eph = mi->mi_ephemeral;
2659 				mutex_exit(&mi->mi_lock);
2660 
2661 				/*
2662 				 * Okay, we need to see if either the
2663 				 * tree got nuked or the current node
2664 				 * got nuked. Both of which will cause
2665 				 * an error.
2666 				 *
2667 				 * Note that a subsequent retry of the
2668 				 * umount shall work.
2669 				 */
2670 				mutex_enter(&net->net_cnt_lock);
2671 				if (net->net_status &
2672 				    NFS4_EPHEMERAL_TREE_INVALID ||
2673 				    (!is_derooting && eph == NULL)) {
2674 					mutex_exit(&net->net_cnt_lock);
2675 					mutex_exit(&net->net_tree_lock);
2676 					goto is_busy;
2677 				}
2678 				mutex_exit(&net->net_cnt_lock);
2679 				*pmust_unlock = TRUE;
2680 			}
2681 		}
2682 	}
2683 
2684 	/*
2685 	 * Only once we have grabbed the lock can we mark what we
2686 	 * are planning on doing to the ephemeral tree.
2687 	 */
2688 	if (*pmust_unlock) {
2689 		mutex_enter(&net->net_cnt_lock);
2690 		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
2691 
2692 		/*
2693 		 * Check to see if we are nuking the root.
2694 		 */
2695 		if (is_derooting)
2696 			net->net_status |=
2697 			    NFS4_EPHEMERAL_TREE_DEROOTING;
2698 		mutex_exit(&net->net_cnt_lock);
2699 	}
2700 
2701 	if (!is_derooting) {
2702 		/*
2703 		 * Only work on children if the caller has not already
2704 		 * done so.
2705 		 */
2706 		if (!is_recursed) {
2707 			ASSERT(eph != NULL);
2708 
2709 			error = nfs4_ephemeral_unmount_engine(eph,
2710 			    FALSE, flag, cr);
2711 			if (error)
2712 				goto is_busy;
2713 		}
2714 	} else {
2715 		eph = net->net_root;
2716 
2717 		/*
2718 		 * Only work if there is something there.
2719 		 */
2720 		if (eph) {
2721 			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
2722 			    flag, cr);
2723 			if (error) {
2724 				mutex_enter(&net->net_cnt_lock);
2725 				net->net_status &=
2726 				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
2727 				mutex_exit(&net->net_cnt_lock);
2728 				goto is_busy;
2729 			}
2730 
2731 			/*
2732 			 * Nothing else which goes wrong will
2733 			 * invalidate the blowing away of the
2734 			 * ephmeral tree.
2735 			 */
2736 			net->net_root = NULL;
2737 		}
2738 
2739 		/*
2740 		 * We have derooted and we have caused the tree to be
2741 		 * invalidated.
2742 		 */
2743 		mutex_enter(&net->net_cnt_lock);
2744 		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
2745 		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
2746 		DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
2747 		    uint_t, net->net_refcnt);
2748 
2749 		/*
2750 		 * We will not finalize this node, so safe to
2751 		 * release it.
2752 		 */
2753 		nfs4_ephemeral_tree_decr(net);
2754 		mutex_exit(&net->net_cnt_lock);
2755 
2756 		if (was_locked == FALSE)
2757 			mutex_exit(&net->net_tree_lock);
2758 
2759 		/*
2760 		 * We have just blown away any notation of this
2761 		 * tree being locked or having a refcnt.
2762 		 * We can't let the caller try to clean things up.
2763 		 */
2764 		*pmust_unlock = FALSE;
2765 
2766 		/*
2767 		 * At this point, the tree should no longer be
2768 		 * associated with the mntinfo4. We need to pull
2769 		 * it off there and let the harvester take
2770 		 * care of it once the refcnt drops.
2771 		 */
2772 		mutex_enter(&mi->mi_lock);
2773 		mi->mi_ephemeral_tree = NULL;
2774 		mutex_exit(&mi->mi_lock);
2775 	}
2776 
2777 	return (0);
2778 
2779 is_busy:
2780 
2781 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2782 
2783 	return (error);
2784 }
2785 
2786 /*
2787  * Do the umount and record any error in the parent.
2788  */
2789 static void
2790 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2791     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2792 {
2793 	int	error;
2794 
2795 	/*
2796 	 * Only act on if the fs is still mounted.
2797 	 */
2798 	if (vfsp == NULL)
2799 		return;
2800 
2801 	error = umount2_engine(vfsp, flag, kcred, FALSE);
2802 	if (error) {
2803 		if (prior) {
2804 			if (prior->ne_child == e)
2805 				prior->ne_state |=
2806 				    NFS4_EPHEMERAL_CHILD_ERROR;
2807 			else
2808 				prior->ne_state |=
2809 				    NFS4_EPHEMERAL_PEER_ERROR;
2810 		}
2811 	}
2812 }
2813 
2814 /*
2815  * For each tree in the forest (where the forest is in
2816  * effect all of the ephemeral trees for this zone),
2817  * scan to see if a node can be unmounted. Note that
2818  * unlike nfs4_ephemeral_unmount_engine(), we do
2819  * not process the current node before children or
2820  * siblings. I.e., if a node can be unmounted, we
2821  * do not recursively check to see if the nodes
2822  * hanging off of it can also be unmounted.
2823  *
2824  * Instead, we delve down deep to try and remove the
2825  * children first. Then, because we share code with
2826  * nfs4_ephemeral_unmount_engine(), we will try
2827  * them again. This could be a performance issue in
2828  * the future.
2829  *
2830  * Also note that unlike nfs4_ephemeral_unmount_engine(),
2831  * we do not halt on an error. We will not remove the
2832  * current node, but we will keep on trying to remove
2833  * the others.
2834  *
2835  * force indicates that we want the unmount to occur
2836  * even if there is something blocking it.
2837  *
2838  * time_check indicates that we want to see if the
2839  * mount has expired past mount_to or not. Typically
2840  * we want to do this and only on a shutdown of the
2841  * zone would we want to ignore the check.
2842  */
2843 static void
2844 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
2845     bool_t force, bool_t time_check)
2846 {
2847 	nfs4_ephemeral_tree_t	*net;
2848 	nfs4_ephemeral_tree_t	*prev = NULL;
2849 	nfs4_ephemeral_tree_t	*next;
2850 	nfs4_ephemeral_t	*e;
2851 	nfs4_ephemeral_t	*prior;
2852 	time_t			now = gethrestime_sec();
2853 
2854 	nfs4_ephemeral_tree_t	*harvest = NULL;
2855 
2856 	int			flag;
2857 
2858 	mntinfo4_t		*mi;
2859 	vfs_t			*vfsp;
2860 
2861 	if (force)
2862 		flag = MS_FORCE | MS_SYSSPACE;
2863 	else
2864 		flag = MS_SYSSPACE;
2865 
2866 	mutex_enter(&ntg->ntg_forest_lock);
2867 	for (net = ntg->ntg_forest; net != NULL; net = next) {
2868 		next = net->net_next;
2869 
2870 		nfs4_ephemeral_tree_hold(net);
2871 
2872 		mutex_enter(&net->net_tree_lock);
2873 
2874 		/*
2875 		 * Let the unmount code know that the
2876 		 * tree is already locked!
2877 		 */
2878 		mutex_enter(&net->net_cnt_lock);
2879 		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2880 		mutex_exit(&net->net_cnt_lock);
2881 
2882 		/*
2883 		 * If the intent is force all ephemeral nodes to
2884 		 * be unmounted in this zone, we can short circuit a
2885 		 * lot of tree traversal and simply zap the root node.
2886 		 */
2887 		if (force) {
2888 			if (net->net_root) {
2889 				mi = net->net_root->ne_mount;
2890 
2891 				vfsp = mi->mi_vfsp;
2892 				ASSERT(vfsp != NULL);
2893 
2894 				/*
2895 				 * Cleared by umount2_engine.
2896 				 */
2897 				VFS_HOLD(vfsp);
2898 
2899 				(void) umount2_engine(vfsp, flag,
2900 				    kcred, FALSE);
2901 
2902 				goto check_done;
2903 			}
2904 		}
2905 
2906 		e = net->net_root;
2907 		if (e)
2908 			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2909 
2910 		while (e) {
2911 			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2912 				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2913 				if (e->ne_child) {
2914 					e = e->ne_child;
2915 					e->ne_state =
2916 					    NFS4_EPHEMERAL_VISIT_CHILD;
2917 				}
2918 
2919 				continue;
2920 			} else if (e->ne_state ==
2921 			    NFS4_EPHEMERAL_VISIT_SIBLING) {
2922 				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2923 				if (e->ne_peer) {
2924 					e = e->ne_peer;
2925 					e->ne_state =
2926 					    NFS4_EPHEMERAL_VISIT_CHILD;
2927 				}
2928 
2929 				continue;
2930 			} else if (e->ne_state ==
2931 			    NFS4_EPHEMERAL_CHILD_ERROR) {
2932 				prior = e->ne_prior;
2933 
2934 				/*
2935 				 * If a child reported an error, do
2936 				 * not bother trying to unmount.
2937 				 *
2938 				 * If your prior node is a parent,
2939 				 * pass the error up such that they
2940 				 * also do not try to unmount.
2941 				 *
2942 				 * However, if your prior is a sibling,
2943 				 * let them try to unmount if they can.
2944 				 */
2945 				if (prior) {
2946 					if (prior->ne_child == e)
2947 						prior->ne_state |=
2948 						    NFS4_EPHEMERAL_CHILD_ERROR;
2949 					else
2950 						prior->ne_state |=
2951 						    NFS4_EPHEMERAL_PEER_ERROR;
2952 				}
2953 
2954 				/*
2955 				 * Clear the error and if needed, process peers.
2956 				 *
2957 				 * Once we mask out the error, we know whether
2958 				 * or we have to process another node.
2959 				 */
2960 				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2961 				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2962 					e = prior;
2963 
2964 				continue;
2965 			} else if (e->ne_state ==
2966 			    NFS4_EPHEMERAL_PEER_ERROR) {
2967 				prior = e->ne_prior;
2968 
2969 				if (prior) {
2970 					if (prior->ne_child == e)
2971 						prior->ne_state =
2972 						    NFS4_EPHEMERAL_CHILD_ERROR;
2973 					else
2974 						prior->ne_state =
2975 						    NFS4_EPHEMERAL_PEER_ERROR;
2976 				}
2977 
2978 				/*
2979 				 * Clear the error from this node and do the
2980 				 * correct processing.
2981 				 */
2982 				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2983 				continue;
2984 			}
2985 
2986 			prior = e->ne_prior;
2987 			e->ne_state = NFS4_EPHEMERAL_OK;
2988 
2989 			/*
2990 			 * It must be the case that we need to process
2991 			 * this node.
2992 			 */
2993 			if (!time_check ||
2994 			    now - e->ne_ref_time > e->ne_mount_to) {
2995 				mi = e->ne_mount;
2996 				vfsp = mi->mi_vfsp;
2997 
2998 				/*
2999 				 * Cleared by umount2_engine.
3000 				 */
3001 				if (vfsp != NULL)
3002 					VFS_HOLD(vfsp);
3003 
3004 				/*
3005 				 * Note that we effectively work down to the
3006 				 * leaf nodes first, try to unmount them,
3007 				 * then work our way back up into the leaf
3008 				 * nodes.
3009 				 *
3010 				 * Also note that we deal with a lot of
3011 				 * complexity by sharing the work with
3012 				 * the manual unmount code.
3013 				 */
3014 				nfs4_ephemeral_record_umount(vfsp, flag,
3015 				    e, prior);
3016 			}
3017 
3018 			e = prior;
3019 		}
3020 
3021 check_done:
3022 
3023 		/*
3024 		 * At this point we are done processing this tree.
3025 		 *
3026 		 * If the tree is invalid and we were the only reference
3027 		 * to it, then we push it on the local linked list
3028 		 * to remove it at the end. We avoid that action now
3029 		 * to keep the tree processing going along at a fair clip.
3030 		 *
3031 		 * Else, even if we were the only reference, we
3032 		 * allow it to be reused as needed.
3033 		 */
3034 		mutex_enter(&net->net_cnt_lock);
3035 		nfs4_ephemeral_tree_decr(net);
3036 		if (net->net_refcnt == 0 &&
3037 		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
3038 			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3039 			mutex_exit(&net->net_cnt_lock);
3040 			mutex_exit(&net->net_tree_lock);
3041 
3042 			if (prev)
3043 				prev->net_next = net->net_next;
3044 			else
3045 				ntg->ntg_forest = net->net_next;
3046 
3047 			net->net_next = harvest;
3048 			harvest = net;
3049 
3050 			VFS_RELE(net->net_mount->mi_vfsp);
3051 			MI4_RELE(net->net_mount);
3052 
3053 			continue;
3054 		}
3055 
3056 		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3057 		mutex_exit(&net->net_cnt_lock);
3058 		mutex_exit(&net->net_tree_lock);
3059 
3060 		prev = net;
3061 	}
3062 	mutex_exit(&ntg->ntg_forest_lock);
3063 
3064 	for (net = harvest; net != NULL; net = next) {
3065 		next = net->net_next;
3066 
3067 		mutex_destroy(&net->net_tree_lock);
3068 		mutex_destroy(&net->net_cnt_lock);
3069 		kmem_free(net, sizeof (*net));
3070 	}
3071 }
3072 
3073 /*
3074  * This is the thread which decides when the harvesting
3075  * can proceed and when to kill it off for this zone.
3076  */
3077 static void
3078 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
3079 {
3080 	clock_t		timeleft;
3081 	zone_t		*zone = curproc->p_zone;
3082 
3083 	for (;;) {
3084 		timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
3085 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
3086 
3087 		/*
3088 		 * zone is exiting...
3089 		 */
3090 		if (timeleft != -1) {
3091 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
3092 			zthread_exit();
3093 			/* NOTREACHED */
3094 		}
3095 
3096 		/*
3097 		 * Only bother scanning if there is potential
3098 		 * work to be done.
3099 		 */
3100 		if (ntg->ntg_forest == NULL)
3101 			continue;
3102 
3103 		/*
3104 		 * Now scan the list and get rid of everything which
3105 		 * is old.
3106 		 */
3107 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
3108 	}
3109 
3110 	/* NOTREACHED */
3111 }
3112 
3113 /*
3114  * The zone specific glue needed to start the unmount harvester.
3115  *
3116  * Note that we want to avoid holding the mutex as long as possible,
3117  * hence the multiple checks.
3118  *
3119  * The caller should avoid us getting down here in the first
3120  * place.
3121  */
3122 static void
3123 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
3124 {
3125 	/*
3126 	 * It got started before we got here...
3127 	 */
3128 	if (ntg->ntg_thread_started)
3129 		return;
3130 
3131 	mutex_enter(&nfs4_ephemeral_thread_lock);
3132 
3133 	if (ntg->ntg_thread_started) {
3134 		mutex_exit(&nfs4_ephemeral_thread_lock);
3135 		return;
3136 	}
3137 
3138 	/*
3139 	 * Start the unmounter harvester thread for this zone.
3140 	 */
3141 	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
3142 	    ntg, 0, minclsyspri);
3143 
3144 	ntg->ntg_thread_started = TRUE;
3145 	mutex_exit(&nfs4_ephemeral_thread_lock);
3146 }
3147 
3148 /*ARGSUSED*/
3149 static void *
3150 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
3151 {
3152 	nfs4_trigger_globals_t	*ntg;
3153 
3154 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
3155 	ntg->ntg_thread_started = FALSE;
3156 
3157 	/*
3158 	 * This is the default....
3159 	 */
3160 	ntg->ntg_mount_to = nfs4_trigger_mount_to;
3161 
3162 	mutex_init(&ntg->ntg_forest_lock, NULL,
3163 	    MUTEX_DEFAULT, NULL);
3164 
3165 	return (ntg);
3166 }
3167 
3168 /*
3169  * Try a nice gentle walk down the forest and convince
3170  * all of the trees to gracefully give it up.
3171  */
3172 /*ARGSUSED*/
3173 static void
3174 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
3175 {
3176 	nfs4_trigger_globals_t	*ntg = arg;
3177 
3178 	if (!ntg)
3179 		return;
3180 
3181 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
3182 }
3183 
3184 /*
3185  * Race along the forest and rip all of the trees out by
3186  * their rootballs!
3187  */
3188 /*ARGSUSED*/
3189 static void
3190 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
3191 {
3192 	nfs4_trigger_globals_t	*ntg = arg;
3193 
3194 	if (!ntg)
3195 		return;
3196 
3197 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
3198 
3199 	mutex_destroy(&ntg->ntg_forest_lock);
3200 	kmem_free(ntg, sizeof (*ntg));
3201 }
3202 
3203 /*
3204  * This is the zone independent cleanup needed for
3205  * emphemeral mount processing.
3206  */
3207 void
3208 nfs4_ephemeral_fini(void)
3209 {
3210 	(void) zone_key_delete(nfs4_ephemeral_key);
3211 	mutex_destroy(&nfs4_ephemeral_thread_lock);
3212 }
3213 
3214 /*
3215  * This is the zone independent initialization needed for
3216  * emphemeral mount processing.
3217  */
3218 void
3219 nfs4_ephemeral_init(void)
3220 {
3221 	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
3222 	    NULL);
3223 
3224 	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
3225 	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
3226 }
3227 
3228 /*
3229  * nfssys() calls this function to set the per-zone
3230  * value of mount_to to drive when an ephemeral mount is
3231  * timed out. Each mount will grab a copy of this value
3232  * when mounted.
3233  */
3234 void
3235 nfs4_ephemeral_set_mount_to(uint_t mount_to)
3236 {
3237 	nfs4_trigger_globals_t	*ntg;
3238 	zone_t			*zone = curproc->p_zone;
3239 
3240 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
3241 
3242 	ntg->ntg_mount_to = mount_to;
3243 }
3244 
3245 /*
3246  * Walk the list of v4 mount options; if they are currently set in vfsp,
3247  * append them to a new comma-separated mount option string, and return it.
3248  *
3249  * Caller should free by calling nfs4_trigger_destroy_mntopts().
3250  */
3251 static char *
3252 nfs4_trigger_create_mntopts(vfs_t *vfsp)
3253 {
3254 	uint_t i;
3255 	char *mntopts;
3256 	struct vfssw *vswp;
3257 	mntopts_t *optproto;
3258 
3259 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
3260 
3261 	/* get the list of applicable mount options for v4; locks *vswp */
3262 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
3263 	optproto = &vswp->vsw_optproto;
3264 
3265 	for (i = 0; i < optproto->mo_count; i++) {
3266 		struct mntopt *mop = &optproto->mo_list[i];
3267 
3268 		if (mop->mo_flags & MO_EMPTY)
3269 			continue;
3270 
3271 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
3272 			kmem_free(mntopts, MAX_MNTOPT_STR);
3273 			vfs_unrefvfssw(vswp);
3274 			return (NULL);
3275 		}
3276 	}
3277 
3278 	vfs_unrefvfssw(vswp);
3279 
3280 	/*
3281 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
3282 	 * and it may only be passed via MS_OPTIONSTR, so we
3283 	 * must handle it here.
3284 	 *
3285 	 * Ideally, it would be in the list, but NFS does not specify its
3286 	 * own opt proto list, it uses instead the default one. Since
3287 	 * not all filesystems support extended attrs, it would not be
3288 	 * appropriate to add it there.
3289 	 */
3290 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
3291 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
3292 		kmem_free(mntopts, MAX_MNTOPT_STR);
3293 		return (NULL);
3294 	}
3295 
3296 	return (mntopts);
3297 }
3298 
3299 static void
3300 nfs4_trigger_destroy_mntopts(char *mntopts)
3301 {
3302 	if (mntopts)
3303 		kmem_free(mntopts, MAX_MNTOPT_STR);
3304 }
3305 
3306 /*
3307  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
3308  */
3309 static int
3310 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
3311 {
3312 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
3313 		return (EINVAL);
3314 
3315 	if (vfs_optionisset(vfsp, optname, NULL)) {
3316 		size_t mntoptslen = strlen(mntopts);
3317 		size_t optnamelen = strlen(optname);
3318 
3319 		/* +1 for ',', +1 for NUL */
3320 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
3321 			return (EOVERFLOW);
3322 
3323 		/* first or subsequent mount option? */
3324 		if (*mntopts != '\0')
3325 			(void) strcat(mntopts, ",");
3326 
3327 		(void) strcat(mntopts, optname);
3328 	}
3329 
3330 	return (0);
3331 }
3332 
3333 static enum clnt_stat
3334 nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
3335 {
3336 	int retries;
3337 	uint_t max_msgsize;
3338 	enum clnt_stat status;
3339 	CLIENT *cl;
3340 	struct timeval timeout;
3341 
3342 	/* as per recov_newserver() */
3343 	max_msgsize = 0;
3344 	retries = 1;
3345 	timeout.tv_sec = 2;
3346 	timeout.tv_usec = 0;
3347 
3348 	if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
3349 	    max_msgsize, retries, CRED(), &cl) != 0)
3350 		return (RPC_FAILED);
3351 
3352 	if (nointr)
3353 		cl->cl_nosignal = TRUE;
3354 	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
3355 	    timeout);
3356 	if (nointr)
3357 		cl->cl_nosignal = FALSE;
3358 
3359 	AUTH_DESTROY(cl->cl_auth);
3360 	CLNT_DESTROY(cl);
3361 
3362 	return (status);
3363 }
3364 
3365 static enum clnt_stat
3366 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
3367 {
3368 	return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));
3369 }
3370