xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c (revision bd211b8556ef6b18ebf137419bd5555d65271664)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
31  * triggered from a "stub" rnode via a special set of vnodeops.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/time.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/vfs_opreg.h>
42 #include <sys/file.h>
43 #include <sys/filio.h>
44 #include <sys/uio.h>
45 #include <sys/buf.h>
46 #include <sys/mman.h>
47 #include <sys/pathname.h>
48 #include <sys/dirent.h>
49 #include <sys/debug.h>
50 #include <sys/vmsystm.h>
51 #include <sys/fcntl.h>
52 #include <sys/flock.h>
53 #include <sys/swap.h>
54 #include <sys/errno.h>
55 #include <sys/strsubr.h>
56 #include <sys/sysmacros.h>
57 #include <sys/kmem.h>
58 #include <sys/mount.h>
59 #include <sys/cmn_err.h>
60 #include <sys/pathconf.h>
61 #include <sys/utsname.h>
62 #include <sys/dnlc.h>
63 #include <sys/acl.h>
64 #include <sys/systeminfo.h>
65 #include <sys/policy.h>
66 #include <sys/sdt.h>
67 #include <sys/list.h>
68 #include <sys/stat.h>
69 #include <sys/mntent.h>
70 
71 #include <rpc/types.h>
72 #include <rpc/auth.h>
73 #include <rpc/clnt.h>
74 
75 #include <nfs/nfs.h>
76 #include <nfs/nfs_clnt.h>
77 #include <nfs/nfs_acl.h>
78 #include <nfs/lm.h>
79 #include <nfs/nfs4.h>
80 #include <nfs/nfs4_kprot.h>
81 #include <nfs/rnode4.h>
82 #include <nfs/nfs4_clnt.h>
83 
84 #include <vm/hat.h>
85 #include <vm/as.h>
86 #include <vm/page.h>
87 #include <vm/pvn.h>
88 #include <vm/seg.h>
89 #include <vm/seg_map.h>
90 #include <vm/seg_kpm.h>
91 #include <vm/seg_vn.h>
92 
93 #include <fs/fs_subr.h>
94 
95 #include <sys/ddi.h>
96 #include <sys/int_fmtio.h>
97 
98 #include <sys/sunddi.h>
99 
/*
 * Timer value, in seconds, for the automatic unmounter thread.
 * NOTE(review): presumably the interval between harvester passes; the
 * consumer is not in this part of the file -- confirm.
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Default ephemeral mount timeout.
 * NOTE(review): presumably in seconds and used to seed ntg_mount_to in
 * nfs4_trigger_globals_t -- confirm against the zone setup code.
 */
static uint_t nfs4_trigger_mount_to = 240;
109 
/*
 * Per-zone ephemeral mount state.  nfs4_trigger_mount() fetches it with
 * zone_getspecific(nfs4_ephemeral_key, zone).
 */
typedef struct nfs4_trigger_globals {
	/* held while linking a new tree onto ntg_forest */
	kmutex_t		ntg_forest_lock;
	/* NOTE(review): presumably the per-zone mount timeout -- confirm */
	uint_t			ntg_mount_to;
	/* NOTE(review): presumably set once the harvester runs -- confirm */
	int			ntg_thread_started;
	/* head of the list of ephemeral trees, chained through net_next */
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;
116 
/*
 * NOTE(review): presumably serializes start-up of the harvester thread;
 * no user of this lock is visible in this part of the file -- confirm.
 */
kmutex_t	nfs4_ephemeral_thread_lock;

/*
 * Zone key under which each zone's nfs4_trigger_globals_t is stored;
 * see the zone_getspecific() call in nfs4_trigger_mount().
 */
zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

/* Starts the automatic unmounter ("harvester") for the given zone state. */
static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
122 
123 /*
124  * Used for ephemeral mounts; contains data either duplicated from
125  * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
126  *
127  * It's intended that this structure is used solely for ephemeral
128  * mount-type specific data, for passing this data to
129  * nfs4_trigger_nargs_create().
130  */
typedef struct ephemeral_servinfo {
	/* private copy of the server's sv_hostname */
	char			*esi_hostname;
	/* NOTE(review): presumably the AUTH_DH netname -- confirm */
	char			*esi_netname;
	/* NOTE(review): presumably the server path to mount -- confirm */
	char			*esi_path;
	/* NOTE(review): presumably the allocated size of esi_path -- confirm */
	int			esi_path_len;
	/* starts as NFSMNT_MIRRORMOUNT for mirror-mounts; may be added to */
	int			esi_mount_flags;
	/* private copy of the server's sv_addr */
	struct netbuf		*esi_addr;
	/* server time-sync address, used when AUTH_DH is negotiated */
	struct netbuf		*esi_syncaddr;
	/* private copy of the server's sv_knconf */
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;
141 
142 /*
143  * Collect together the mount-type specific and generic data args.
144  */
typedef struct domount_args {
	/* esi for the one server that will actually be contacted */
	ephemeral_servinfo_t	*dma_esi;
	/* MAXPATHLEN-sized buffer of responding server hostnames */
	char			*dma_hostlist; /* comma-sep. for RO failover */
	/* head of the nfs_args list, chained via nfs_ext_u.nfs_extB.next */
	struct nfs_args		*dma_nargs;
} domount_args_t;
150 
151 
/*
 * The vnode ops functions for a trigger stub vnode.  Apart from
 * nfs4_trigger_cmp(), each of these is installed through
 * nfs4_trigger_vnodeops_template[].
 */
static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
    caller_context_t *);
static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
    struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
    int *, pathname_t *);
static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
    enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
    vsecattr_t *);
static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
    int);
static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
    vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
    cred_t *, caller_context_t *, int);
/*
 * NOTE(review): nfs4_trigger_cmp() has no entry in
 * nfs4_trigger_vnodeops_template[]; confirm that it is defined and still
 * needed.
 */
static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
183 
/*
 * Regular NFSv4 vnodeops that we need to reference directly: they are
 * either installed as-is in nfs4_trigger_vnodeops_template[] or called
 * by name from the trigger ops (going through VOP_*() on a stub vnode
 * would simply re-enter the trigger ops).
 */
extern int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
		    caller_context_t *);
extern void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
extern int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
extern void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
extern int	nfs4_lookup(vnode_t *, char *, vnode_t **,
		    struct pathname *, int, vnode_t *, cred_t *,
		    caller_context_t *, int *, pathname_t *);
extern int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
		    caller_context_t *);
extern int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
		    caller_context_t *);
extern int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
extern int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
201 
/*
 * Helpers used to assemble the mount arguments and perform the
 * ephemeral mount on behalf of the trigger ops.
 */
static int	nfs4_trigger_mount(vnode_t *, vnode_t **);
static int	nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
    cred_t *);
static domount_args_t  *nfs4_trigger_domount_args_create(vnode_t *);
static void	nfs4_trigger_domount_args_destroy(domount_args_t *dma,
    vnode_t *vp);
static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *);
static void	nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
    servinfo4_t *);
static struct nfs_args 	*nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
    ephemeral_servinfo_t *);
static void	nfs4_trigger_nargs_destroy(struct nfs_args *);
static char	*nfs4_trigger_create_mntopts(vfs_t *);
static void	nfs4_trigger_destroy_mntopts(char *);
static int 	nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);

/* Defined outside this file. */
extern int	umount2_engine(vfs_t *, int, cred_t *, int);


/*
 * The vnodeops vector for stub vnodes.
 * NOTE(review): presumably built from nfs4_trigger_vnodeops_template[]
 * during client initialization, outside this part of the file -- confirm.
 */
vnodeops_t *nfs4_trigger_vnodeops;
224 
225 /*
226  * These are the vnodeops that we must define for stub vnodes.
227  *
228  *
229  * Many of the VOPs defined for NFSv4 do not need to be defined here,
230  * for various reasons. This will result in the VFS default function being
231  * used:
232  *
233  * - These VOPs require a previous VOP_OPEN to have occurred. That will have
234  *   lost the reference to the stub vnode, meaning these should not be called:
235  *       close, read, write, ioctl, readdir, seek.
236  *
237  * - These VOPs are meaningless for vnodes without data pages. Since the
238  *   stub vnode is of type VDIR, these should not be called:
239  *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
240  *
241  * - These VOPs are otherwise not applicable, and should not be called:
242  *       dump, setsecattr.
243  *
244  *
245  * These VOPs we do not want to define, but nor do we want the VFS default
246  * action. Instead, we specify the VFS error function, with fs_error(), but
247  * note that fs_error() is not actually called. Instead it results in the
248  * use of the error function defined for the particular VOP, in vn_ops_table[]:
249  *
250  * -   frlock, dispose, shrlock.
251  *
252  *
253  * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
254  * NOTE: if any of these ops involve an OTW call with the stub FH, then
255  * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
256  * to protect the security data in the servinfo4_t for the "parent"
257  * filesystem that contains the stub.
258  *
259  * - These VOPs should not trigger a mount, so that "ls -l" does not:
260  *       pathconf, getsecattr.
261  *
262  * - These VOPs would not make sense to trigger:
263  *       inactive, rwlock, rwunlock, fid, realvp.
264  */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	/* ops with a stub-specific implementation in this file */
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	/* ops handed straight to the regular NFSv4 implementation */
	VOPNAME_INACTIVE, 	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	/* ops that must fail rather than take the VFS default action */
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	/* end-of-table marker */
	NULL, NULL
};
292 
293 /*
294  * Trigger ops for stub vnodes; for mirror mounts, etc.
295  *
296  * The general idea is that a "triggering" op will first call
297  * nfs4_trigger_mount(), which will find out whether a mount has already
298  * been triggered.
299  *
300  * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
301  * of the covering vfs.
302  *
303  * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
304  * and again set newvp, as above.
305  *
306  * The triggering op may then re-issue the VOP by calling it on newvp.
307  *
308  * Note that some ops may perform custom action, and may or may not need
309  * to trigger a mount.
310  *
311  * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
312  * obviously can't do this with VOP_<whatever>, since it's a stub vnode
313  * and that would just recurse. Instead, we call the v4 op directly,
314  * by name.  This is OK, since we know that the vnode is for NFSv4,
315  * otherwise it couldn't be a stub.
316  *
317  */
318 
319 static int
320 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
321 {
322 	int error;
323 	vnode_t *newvp;
324 
325 	error = nfs4_trigger_mount(*vpp, &newvp);
326 	if (error)
327 		return (error);
328 
329 	/* Release the stub vnode, as we're losing the reference to it */
330 	VN_RELE(*vpp);
331 
332 	/* Give the caller the root vnode of the newly-mounted fs */
333 	*vpp = newvp;
334 
335 	/* return with VN_HELD(newvp) */
336 	return (VOP_OPEN(vpp, flag, cr, ct));
337 }
338 
339 /*
340  * For the majority of cases, nfs4_trigger_getattr() will not trigger
341  * a mount. However, if ATTR_TRIGGER is set, we are being informed
342  * that we need to force the mount before we attempt to determine
343  * the attributes. The intent is an atomic operation for security
344  * testing.
345  */
346 static int
347 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
348     caller_context_t *ct)
349 {
350 	int error;
351 
352 	if (flags & ATTR_TRIGGER) {
353 		vnode_t	*newvp;
354 
355 		error = nfs4_trigger_mount(vp, &newvp);
356 		if (error)
357 			return (error);
358 
359 		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
360 		VN_RELE(newvp);
361 	} else {
362 		error = nfs4_getattr(vp, vap, flags, cr, ct);
363 	}
364 
365 	return (error);
366 }
367 
368 static int
369 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
370 		caller_context_t *ct)
371 {
372 	int error;
373 	vnode_t *newvp;
374 
375 	error = nfs4_trigger_mount(vp, &newvp);
376 	if (error)
377 		return (error);
378 
379 	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
380 	VN_RELE(newvp);
381 
382 	return (error);
383 }
384 
385 static int
386 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
387     caller_context_t *ct)
388 {
389 	int error;
390 	vnode_t *newvp;
391 
392 	error = nfs4_trigger_mount(vp, &newvp);
393 	if (error)
394 		return (error);
395 
396 	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
397 	VN_RELE(newvp);
398 
399 	return (error);
400 }
401 
402 static int
403 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
404     struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
405     caller_context_t *ct, int *deflags, pathname_t *rpnp)
406 {
407 	int error;
408 	vnode_t *newdvp;
409 	rnode4_t *drp = VTOR4(dvp);
410 
411 	ASSERT(RP_ISSTUB(drp));
412 
413 	/* for now, we only support mirror-mounts */
414 	ASSERT(RP_ISSTUB_MIRRORMOUNT(drp));
415 
416 	/*
417 	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
418 	 * that up. Instead, pass onto the regular op, regardless of whether
419 	 * we've triggered a mount.
420 	 */
421 	if (strcmp(nm, "..") == 0)
422 		return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
423 		    ct, deflags, rpnp));
424 
425 	error = nfs4_trigger_mount(dvp, &newdvp);
426 	if (error)
427 		return (error);
428 
429 	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
430 	    deflags, rpnp);
431 	VN_RELE(newdvp);
432 
433 	return (error);
434 }
435 
436 static int
437 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
438     enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
439     int flags, caller_context_t *ct, vsecattr_t *vsecp)
440 {
441 	int error;
442 	vnode_t *newdvp;
443 
444 	error = nfs4_trigger_mount(dvp, &newdvp);
445 	if (error)
446 		return (error);
447 
448 	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
449 	    flags, ct, vsecp);
450 	VN_RELE(newdvp);
451 
452 	return (error);
453 }
454 
455 static int
456 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
457     int flags)
458 {
459 	int error;
460 	vnode_t *newdvp;
461 
462 	error = nfs4_trigger_mount(dvp, &newdvp);
463 	if (error)
464 		return (error);
465 
466 	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
467 	VN_RELE(newdvp);
468 
469 	return (error);
470 }
471 
472 static int
473 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
474     caller_context_t *ct, int flags)
475 {
476 	int error;
477 	vnode_t *newtdvp;
478 
479 	error = nfs4_trigger_mount(tdvp, &newtdvp);
480 	if (error)
481 		return (error);
482 
483 	/*
484 	 * We don't check whether svp is a stub. Let the NFSv4 code
485 	 * detect that error, and return accordingly.
486 	 */
487 	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
488 	VN_RELE(newtdvp);
489 
490 	return (error);
491 }
492 
493 static int
494 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
495     cred_t *cr, caller_context_t *ct, int flags)
496 {
497 	int error;
498 	vnode_t *newsdvp;
499 	rnode4_t *tdrp = VTOR4(tdvp);
500 
501 	/*
502 	 * We know that sdvp is a stub, otherwise we would not be here.
503 	 *
504 	 * If tdvp is also be a stub, there are two possibilities: it
505 	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
506 	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
507 	 *
508 	 * In the former case, just trigger sdvp, and treat tdvp as
509 	 * though it were not a stub.
510 	 *
511 	 * In the latter case, it might be a different stub for the
512 	 * same server fs as sdvp, or for a different server fs.
513 	 * Regardless, from the client perspective this would still
514 	 * be a cross-filesystem rename, and should not be allowed,
515 	 * so return EXDEV, without triggering either mount.
516 	 */
517 	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
518 		return (EXDEV);
519 
520 	error = nfs4_trigger_mount(sdvp, &newsdvp);
521 	if (error)
522 		return (error);
523 
524 	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
525 
526 	VN_RELE(newsdvp);
527 
528 	return (error);
529 }
530 
531 /* ARGSUSED */
532 static int
533 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
534     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
535 {
536 	int error;
537 	vnode_t *newdvp;
538 
539 	error = nfs4_trigger_mount(dvp, &newdvp);
540 	if (error)
541 		return (error);
542 
543 	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
544 	VN_RELE(newdvp);
545 
546 	return (error);
547 }
548 
549 static int
550 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
551     caller_context_t *ct, int flags)
552 {
553 	int error;
554 	vnode_t *newdvp;
555 
556 	error = nfs4_trigger_mount(dvp, &newdvp);
557 	if (error)
558 		return (error);
559 
560 	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
561 	VN_RELE(newdvp);
562 
563 	return (error);
564 }
565 
566 static int
567 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
568     cred_t *cr, caller_context_t *ct, int flags)
569 {
570 	int error;
571 	vnode_t *newdvp;
572 
573 	error = nfs4_trigger_mount(dvp, &newdvp);
574 	if (error)
575 		return (error);
576 
577 	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
578 	VN_RELE(newdvp);
579 
580 	return (error);
581 }
582 
583 static int
584 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
585     caller_context_t *ct)
586 {
587 	int error;
588 	vnode_t *newvp;
589 
590 	error = nfs4_trigger_mount(vp, &newvp);
591 	if (error)
592 		return (error);
593 
594 	error = VOP_READLINK(newvp, uiop, cr, ct);
595 	VN_RELE(newvp);
596 
597 	return (error);
598 }
599 
600 /* end of trigger vnode ops */
601 
602 
/*
 * Mount upon a trigger vnode; for mirror-mounts, etc.
 *
 * The mount may have already occurred, via another thread. If not,
 * assemble the location information - which may require fetching - and
 * perform the mount.
 *
 * Sets newvp to be the root of the fs that is now covering vp. Note
 * that we return with VN_HELD(*newvp).
 *
 * The caller is responsible for passing the VOP onto the covering fs.
 *
 * Arguments:
 *	vp	- the stub vnode (must satisfy RP_ISSTUB)
 *	newvpp	- out: root vnode of the covering fs; set to NULL on entry
 *
 * Returns 0 on success. Otherwise returns the error from
 * vn_vfsrlock_wait(), VFS_ROOT() or nfs4_trigger_domount(); EINVAL if
 * the mount arguments could not be assembled; or ENOSYS if no error was
 * reported but no root vnode was produced.
 */
static int
nfs4_trigger_mount(vnode_t *vp, vnode_t **newvpp)
{
	int			 error;
	vfs_t			*vfsp;
	rnode4_t		*rp = VTOR4(vp);
	mntinfo4_t		*mi = VTOMI4(vp);
	domount_args_t		*dma;

	nfs4_ephemeral_tree_t	*net;

	/* TRUE once net_tree_lock is held and a net_refcnt hold is owed */
	bool_t			must_unlock = FALSE;
	/* TRUE if this call created the ephemeral tree */
	bool_t			is_building = FALSE;

	cred_t			*zcred;

	nfs4_trigger_globals_t	*ntg;

	zone_t			*zone = curproc->p_zone;

	ASSERT(RP_ISSTUB(rp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));

	*newvpp = NULL;

	/*
	 * Has the mount already occurred?
	 */
	error = vn_vfsrlock_wait(vp);
	if (error)
		goto done;
	vfsp = vn_mountedvfs(vp);
	if (vfsp != NULL) {
		/* the mount has already occurred */
		error = VFS_ROOT(vfsp, newvpp);
		if (!error) {
			/* need to update the reference time  */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_ephemeral)
				mi->mi_ephemeral->ne_ref_time =
				    gethrestime_sec();
			mutex_exit(&mi->mi_lock);
		}

		vn_vfsunlock(vp);
		goto done;
	}
	vn_vfsunlock(vp);

	/* per-zone ephemeral mount state */
	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		/* first ephemeral mount under this mntinfo4: create the tree */
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		/* this hold is dropped again at "done" */
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);
	} else {
		/* the tree exists: take a hold, then wait for the tree lock */
		net = mi->mi_ephemeral_tree;
		mutex_exit(&mi->mi_lock);

		mutex_enter(&net->net_cnt_lock);
		net->net_refcnt++;
		mutex_exit(&net->net_cnt_lock);

		/*
		 * Note that we do not do any checks to
		 * see if the parent has been nuked.
		 * We count on the vfs layer having protected
		 * us from feet shooters.
		 */
		mutex_enter(&net->net_tree_lock);
	}

	/* flag the tree as having a mount in progress */
	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	/* from here on, "done" must undo the status, refcnt and tree lock */
	must_unlock = TRUE;

	/*
	 * NOTE(review): the "already mounted" check above is not repeated
	 * now that net_tree_lock is held. If another thread completed the
	 * mount on vp in the meantime, we still go on to call
	 * nfs4_trigger_domount(); confirm that path copes with a stub that
	 * is already covered.
	 */
	dma = nfs4_trigger_domount_args_create(vp);
	if (dma == NULL) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Need to be root for this call to make mount work.
	 * Note that since we define mirror mounts to work
	 * for any user, we allow the mount to proceed. And
	 * we realize that the server will perform security
	 * checks to make sure that the client is allowed
	 * access. Finally, once the mount takes place,
	 * directory permissions will ensure that the
	 * content is secure.
	 */
	zcred = zone_get_kcred(getzoneid());
	ASSERT(zcred != NULL);

	error = nfs4_trigger_domount(vp, dma, &vfsp, zcred);
	/* the args are only needed for the duration of the mount call */
	nfs4_trigger_domount_args_destroy(dma, vp);

	crfree(zcred);

	/* on success, hand back the root of the newly mounted fs */
	if (!error)
		error = VFS_ROOT(vfsp, newvpp);
done:
	if (must_unlock) {
		/* undo the status bits and the hold taken above */
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		net->net_refcnt--;
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	/*
	 * Never report success without a root vnode. newvpp itself has
	 * already been dereferenced above, so in practice only the
	 * *newvpp test can fire here.
	 */
	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}
766 
767 /*
768  * Collect together both the generic & mount-type specific args.
769  */
770 static domount_args_t *
771 nfs4_trigger_domount_args_create(vnode_t *vp)
772 {
773 	int nointr;
774 	char *hostlist;
775 	servinfo4_t *svp;
776 	struct nfs_args *nargs, *nargs_head;
777 	enum clnt_stat status;
778 	ephemeral_servinfo_t *esi, *esi_first;
779 	domount_args_t *dma;
780 	mntinfo4_t *mi = VTOMI4(vp);
781 
782 	nointr = !(mi->mi_flags & MI4_INT);
783 	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
784 
785 	svp = mi->mi_curr_serv;
786 	/* check if the current server is responding */
787 	status = nfs4_trigger_ping_server(svp, nointr);
788 	if (status == RPC_SUCCESS) {
789 		esi_first = nfs4_trigger_esi_create(vp, svp);
790 		if (esi_first == NULL) {
791 			kmem_free(hostlist, MAXPATHLEN);
792 			return (NULL);
793 		}
794 
795 		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
796 
797 		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
798 	} else {
799 		/* current server did not respond */
800 		esi_first = NULL;
801 		nargs_head = NULL;
802 	}
803 	nargs = nargs_head;
804 
805 	/*
806 	 * NFS RO failover.
807 	 *
808 	 * If we have multiple servinfo4 structures, linked via sv_next,
809 	 * we must create one nfs_args for each, linking the nfs_args via
810 	 * nfs_ext_u.nfs_extB.next.
811 	 *
812 	 * We need to build a corresponding esi for each, too, but that is
813 	 * used solely for building nfs_args, and may be immediately
814 	 * discarded, as domount() requires the info from just one esi,
815 	 * but all the nfs_args.
816 	 *
817 	 * Currently, the NFS mount code will hang if not all servers
818 	 * requested are available. To avoid that, we need to ping each
819 	 * server, here, and remove it from the list if it is not
820 	 * responding. This has the side-effect of that server then
821 	 * being permanently unavailable for this failover mount, even if
822 	 * it recovers. That's unfortunate, but the best we can do until
823 	 * the mount code path is fixed.
824 	 */
825 
826 	/*
827 	 * If the current server was down, loop indefinitely until we find
828 	 * at least one responsive server.
829 	 */
830 	do {
831 		/* no locking needed for sv_next; it is only set at fs mount */
832 		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
833 			struct nfs_args *next;
834 
835 			/*
836 			 * nargs_head: the head of the nfs_args list
837 			 * nargs: the current tail of the list
838 			 * next: the newly-created element to be added
839 			 */
840 
841 			/*
842 			 * We've already tried the current server, above;
843 			 * if it was responding, we have already included it
844 			 * and it may now be ignored.
845 			 *
846 			 * Otherwise, try it again, since it may now have
847 			 * recovered.
848 			 */
849 			if (svp == mi->mi_curr_serv && esi_first != NULL)
850 				continue;
851 
852 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
853 			if (svp->sv_flags & SV4_NOTINUSE) {
854 				nfs_rw_exit(&svp->sv_lock);
855 				continue;
856 			}
857 			nfs_rw_exit(&svp->sv_lock);
858 
859 			/* check if the server is responding */
860 			status = nfs4_trigger_ping_server(svp, nointr);
861 			/* if the server did not respond, ignore it */
862 			if (status != RPC_SUCCESS)
863 				continue;
864 
865 			esi = nfs4_trigger_esi_create(vp, svp);
866 			if (esi == NULL)
867 				continue;
868 
869 			/*
870 			 * If the original current server (mi_curr_serv)
871 			 * was down when when we first tried it,
872 			 * (i.e. esi_first == NULL),
873 			 * we select this new server (svp) to be the server
874 			 * that we will actually contact (esi_first).
875 			 *
876 			 * Note that it's possible that mi_curr_serv == svp,
877 			 * if that mi_curr_serv was down but has now recovered.
878 			 */
879 			next = nfs4_trigger_nargs_create(mi, svp, esi);
880 			if (esi_first == NULL) {
881 				ASSERT(nargs == NULL);
882 				ASSERT(nargs_head == NULL);
883 				nargs_head = next;
884 				esi_first = esi;
885 				(void) strlcpy(hostlist,
886 				    esi_first->esi_hostname, MAXPATHLEN);
887 			} else {
888 				ASSERT(nargs_head != NULL);
889 				nargs->nfs_ext_u.nfs_extB.next = next;
890 				(void) strlcat(hostlist, ",", MAXPATHLEN);
891 				(void) strlcat(hostlist, esi->esi_hostname,
892 				    MAXPATHLEN);
893 				/* esi was only needed for hostname & nargs */
894 				nfs4_trigger_esi_destroy(esi, vp);
895 			}
896 
897 			nargs = next;
898 		}
899 
900 		/* if we've had no response at all, wait a second */
901 		if (esi_first == NULL)
902 			delay(drv_usectohz(1000000));
903 
904 	} while (esi_first == NULL);
905 	ASSERT(nargs_head != NULL);
906 
907 	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
908 	dma->dma_esi = esi_first;
909 	dma->dma_hostlist = hostlist;
910 	dma->dma_nargs = nargs_head;
911 
912 	return (dma);
913 }
914 
915 static void
916 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
917 {
918 	if (dma != NULL) {
919 		if (dma->dma_esi != NULL && vp != NULL)
920 			nfs4_trigger_esi_destroy(dma->dma_esi, vp);
921 
922 		if (dma->dma_hostlist != NULL)
923 			kmem_free(dma->dma_hostlist, MAXPATHLEN);
924 
925 		if (dma->dma_nargs != NULL) {
926 			struct nfs_args *nargs = dma->dma_nargs;
927 
928 			do {
929 				struct nfs_args *next =
930 				    nargs->nfs_ext_u.nfs_extB.next;
931 
932 				nfs4_trigger_nargs_destroy(nargs);
933 				nargs = next;
934 			} while (nargs != NULL);
935 		}
936 
937 		kmem_free(dma, sizeof (domount_args_t));
938 	}
939 }
940 
941 /*
942  * The ephemeral_servinfo_t struct contains basic information we will need to
943  * perform the mount. Whilst the structure is generic across different
944  * types of ephemeral mount, the way we gather its contents differs.
945  */
946 static ephemeral_servinfo_t *
947 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp)
948 {
949 	ephemeral_servinfo_t *esi;
950 	rnode4_t *rp = VTOR4(vp);
951 
952 	ASSERT(RP_ISSTUB(rp));
953 
954 	/* Call the ephemeral type-specific routine */
955 	if (RP_ISSTUB_MIRRORMOUNT(rp))
956 		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
957 	else
958 		esi = NULL;
959 
960 	/* for now, we only support mirror-mounts */
961 	ASSERT(esi != NULL);
962 
963 	return (esi);
964 }
965 
966 static void
967 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
968 {
969 	rnode4_t *rp = VTOR4(vp);
970 
971 	ASSERT(RP_ISSTUB(rp));
972 
973 	/* for now, we only support mirror-mounts */
974 	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));
975 
976 	/* Currently, no need for an ephemeral type-specific routine */
977 
978 	/*
979 	 * The contents of ephemeral_servinfo_t goes into nfs_args,
980 	 * and will be handled by nfs4_trigger_nargs_destroy().
981 	 * We need only free the structure itself.
982 	 */
983 	if (esi != NULL)
984 		kmem_free(esi, sizeof (ephemeral_servinfo_t));
985 }
986 
987 /*
988  * Some of this may turn out to be common with other ephemeral types,
989  * in which case it should be moved to nfs4_trigger_esi_create(), or a
990  * common function called.
991  */
992 static ephemeral_servinfo_t *
993 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
994 {
995 	char			*stubpath;
996 	struct knetconfig	*sikncp, *svkncp;
997 	struct netbuf		*bufp;
998 	ephemeral_servinfo_t	*esi;
999 
1000 	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1001 
1002 	/* initially set to be our type of ephemeral mount; may be added to */
1003 	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;
1004 
1005 	/*
1006 	 * We're copying info from the stub rnode's servinfo4, but
1007 	 * we must create new copies, not pointers, since this information
1008 	 * is to be associated with the new mount, which will be
1009 	 * unmounted (and its structures freed) separately
1010 	 */
1011 
1012 	/*
1013 	 * Sizes passed to kmem_[z]alloc here must match those freed
1014 	 * in nfs4_free_args()
1015 	 */
1016 
1017 	/*
1018 	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
1019 	 * is difficult to avoid: as we need to read svp to calculate the
1020 	 * sizes to be allocated.
1021 	 */
1022 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1023 
1024 	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
1025 	(void) strcat(esi->esi_hostname, svp->sv_hostname);
1026 
1027 	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1028 	bufp = esi->esi_addr;
1029 	bufp->len = svp->sv_addr.len;
1030 	bufp->maxlen = svp->sv_addr.maxlen;
1031 	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1032 	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);
1033 
1034 	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1035 	sikncp = esi->esi_knconf;
1036 	svkncp = svp->sv_knconf;
1037 	sikncp->knc_semantics = svkncp->knc_semantics;
1038 	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1039 	(void) strcat((char *)sikncp->knc_protofmly,
1040 	    (char *)svkncp->knc_protofmly);
1041 	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1042 	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
1043 	sikncp->knc_rdev = svkncp->knc_rdev;
1044 
1045 	/*
1046 	 * Used when AUTH_DH is negotiated.
1047 	 *
1048 	 * This is ephemeral mount-type specific, since it contains the
1049 	 * server's time-sync syncaddr.
1050 	 */
1051 	if (svp->sv_dhsec) {
1052 		struct netbuf *bufp;
1053 		sec_data_t *sdata;
1054 		dh_k4_clntdata_t *data;
1055 
1056 		sdata = svp->sv_dhsec;
1057 		data = (dh_k4_clntdata_t *)sdata->data;
1058 		ASSERT(sdata->rpcflavor == AUTH_DH);
1059 
1060 		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
1061 		bufp->len = data->syncaddr.len;
1062 		bufp->maxlen = data->syncaddr.maxlen;
1063 		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1064 		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
1065 		esi->esi_syncaddr = bufp;
1066 
1067 		if (data->netname != NULL) {
1068 			int nmlen = data->netnamelen;
1069 
1070 			/*
1071 			 * We need to copy from a dh_k4_clntdata_t
1072 			 * netname/netnamelen pair to a NUL-terminated
1073 			 * netname string suitable for putting in nfs_args,
1074 			 * where the latter has no netnamelen field.
1075 			 */
1076 			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
1077 			bcopy(data->netname, esi->esi_netname, nmlen);
1078 		}
1079 	} else {
1080 		esi->esi_syncaddr = NULL;
1081 		esi->esi_netname = NULL;
1082 	}
1083 
1084 	stubpath = fn_path(VTOSV(vp)->sv_name);
1085 	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
1086 	ASSERT(*stubpath == '.');
1087 	stubpath += 1;
1088 
1089 	/* for nfs_args->fh */
1090 	esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1;
1091 	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
1092 	(void) strcat(esi->esi_path, svp->sv_path);
1093 	(void) strcat(esi->esi_path, stubpath);
1094 
1095 	stubpath -= 1;
1096 	/* stubpath allocated by fn_path() */
1097 	kmem_free(stubpath, strlen(stubpath) + 1);
1098 
1099 	nfs_rw_exit(&svp->sv_lock);
1100 
1101 	return (esi);
1102 }
1103 
1104 /*
1105  * Assemble the args, and call the generic VFS mount function to
1106  * finally perform the ephemeral mount.
1107  */
1108 static int
1109 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1110     cred_t *cr)
1111 {
1112 	struct mounta	*uap;
1113 	char		*mntpt, *orig_path, *path;
1114 	const char	*orig_mntpt;
1115 	int		retval;
1116 	int		mntpt_len;
1117 	int		spec_len;
1118 	zone_t		*zone = curproc->p_zone;
1119 	bool_t		has_leading_slash;
1120 
1121 	vfs_t			*stubvfsp = stubvp->v_vfsp;
1122 	ephemeral_servinfo_t	*esi = dma->dma_esi;
1123 	struct nfs_args		*nargs = dma->dma_nargs;
1124 
1125 	/* first, construct the mount point for the ephemeral mount */
1126 	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1127 	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1128 
1129 	if (*orig_path == '.')
1130 		orig_path++;
1131 
1132 	/*
1133 	 * Get rid of zone's root path
1134 	 */
1135 	if (zone != global_zone) {
1136 		/*
1137 		 * -1 for trailing '/' and -1 for EOS.
1138 		 */
1139 		if (strncmp(zone->zone_rootpath, orig_mntpt,
1140 		    zone->zone_rootpathlen - 1) == 0) {
1141 			orig_mntpt += (zone->zone_rootpathlen - 2);
1142 		}
1143 	}
1144 
1145 	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1146 	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1147 	(void) strcat(mntpt, orig_mntpt);
1148 	(void) strcat(mntpt, orig_path);
1149 
1150 	kmem_free(path, strlen(path) + 1);
1151 	path = esi->esi_path;
1152 	if (*path == '.')
1153 		path++;
1154 	if (path[0] == '/' && path[1] == '/')
1155 		path++;
1156 	has_leading_slash = (*path == '/');
1157 
1158 	spec_len = strlen(dma->dma_hostlist);
1159 	spec_len += strlen(path);
1160 
1161 	/* We are going to have to add this in */
1162 	if (!has_leading_slash)
1163 		spec_len++;
1164 
1165 	/* We need to get the ':' for dma_hostlist:esi_path */
1166 	spec_len++;
1167 
1168 	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1169 	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1170 	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1171 	    has_leading_slash ? "" : "/", path);
1172 
1173 	uap->dir = mntpt;
1174 
1175 	uap->flags = MS_SYSSPACE | MS_DATA;
1176 	/* fstype-independent mount options not covered elsewhere */
1177 	/* copy parent's mount(1M) "-m" flag */
1178 	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1179 		uap->flags |= MS_NOMNTTAB;
1180 
1181 	uap->fstype = MNTTYPE_NFS4;
1182 	uap->dataptr = (char *)nargs;
1183 	/* not needed for MS_SYSSPACE */
1184 	uap->datalen = 0;
1185 
1186 	/* use optptr to pass in extra mount options */
1187 	uap->flags |= MS_OPTIONSTR;
1188 	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1189 	if (uap->optptr == NULL) {
1190 		retval = EINVAL;
1191 		goto done;
1192 	}
1193 	/* domount() expects us to count the trailing NUL */
1194 	uap->optlen = strlen(uap->optptr) + 1;
1195 
1196 	retval = domount(NULL, uap, stubvp, cr, vfsp);
1197 	if (retval == 0)
1198 		VFS_RELE(*vfsp);
1199 done:
1200 	if (uap->optptr)
1201 		nfs4_trigger_destroy_mntopts(uap->optptr);
1202 
1203 	kmem_free(uap->spec, spec_len + 1);
1204 	kmem_free(uap, sizeof (struct mounta));
1205 	kmem_free(mntpt, mntpt_len + 1);
1206 
1207 	return (retval);
1208 }
1209 
1210 /*
1211  * Build an nfs_args structure for passing to domount().
1212  *
1213  * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1214  * generic data - common to all ephemeral mount types - is read directly
1215  * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1216  */
1217 static struct nfs_args *
1218 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1219     ephemeral_servinfo_t *esi)
1220 {
1221 	sec_data_t *secdata;
1222 	struct nfs_args *nargs;
1223 
1224 	/* setup the nfs args */
1225 	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1226 
1227 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1228 
1229 	nargs->addr = esi->esi_addr;
1230 
1231 	/* for AUTH_DH by negotiation */
1232 	if (esi->esi_syncaddr || esi->esi_netname) {
1233 		nargs->flags |= NFSMNT_SECURE;
1234 		nargs->syncaddr = esi->esi_syncaddr;
1235 		nargs->netname = esi->esi_netname;
1236 	}
1237 
1238 	nargs->flags |= NFSMNT_KNCONF;
1239 	nargs->knconf = esi->esi_knconf;
1240 	nargs->flags |= NFSMNT_HOSTNAME;
1241 	nargs->hostname = esi->esi_hostname;
1242 	nargs->fh = esi->esi_path;
1243 
1244 	/* general mount settings, all copied from parent mount */
1245 	mutex_enter(&mi->mi_lock);
1246 
1247 	if (!(mi->mi_flags & MI4_HARD))
1248 		nargs->flags |= NFSMNT_SOFT;
1249 
1250 	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
1251 	    NFSMNT_RETRANS;
1252 	nargs->wsize = mi->mi_stsize;
1253 	nargs->rsize = mi->mi_tsize;
1254 	nargs->timeo = mi->mi_timeo;
1255 	nargs->retrans = mi->mi_retrans;
1256 
1257 	if (mi->mi_flags & MI4_INT)
1258 		nargs->flags |= NFSMNT_INT;
1259 	if (mi->mi_flags & MI4_NOAC)
1260 		nargs->flags |= NFSMNT_NOAC;
1261 
1262 	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
1263 	    NFSMNT_ACDIRMAX;
1264 	nargs->acregmin = HR2SEC(mi->mi_acregmin);
1265 	nargs->acregmax = HR2SEC(mi->mi_acregmax);
1266 	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
1267 	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
1268 
1269 	if (mi->mi_flags & MI4_NOCTO)
1270 		nargs->flags |= NFSMNT_NOCTO;
1271 	if (mi->mi_flags & MI4_GRPID)
1272 		nargs->flags |= NFSMNT_GRPID;
1273 	if (mi->mi_flags & MI4_LLOCK)
1274 		nargs->flags |= NFSMNT_LLOCK;
1275 	if (mi->mi_flags & MI4_NOPRINT)
1276 		nargs->flags |= NFSMNT_NOPRINT;
1277 	if (mi->mi_flags & MI4_DIRECTIO)
1278 		nargs->flags |= NFSMNT_DIRECTIO;
1279 	if (mi->mi_flags & MI4_PUBLIC)
1280 		nargs->flags |= NFSMNT_PUBLIC;
1281 
1282 	mutex_exit(&mi->mi_lock);
1283 
1284 	/* add any specific flags for this type of ephemeral mount */
1285 	nargs->flags |= esi->esi_mount_flags;
1286 
1287 	/*
1288 	 * Security data & negotiation policy.
1289 	 *
1290 	 * We need to preserve the parent mount's preference for security
1291 	 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT.
1292 	 *
1293 	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
1294 	 * security flavour was requested, with data in sv_secdata, and that
1295 	 * no negotiation should occur. If this specified flavour fails, that's
1296 	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
1297 	 *
1298 	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
1299 	 * default flavour, in sv_secdata, but then negotiate a new flavour.
1300 	 * Possible flavours are recorded in an array in sv_secinfo, with
1301 	 * currently in-use flavour pointed to by sv_currsec.
1302 	 *
1303 	 * If sv_currsec is set, i.e. if negotiation has already occurred,
1304 	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
1305 	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
1306 	 */
1307 	if (svp->sv_flags & SV4_TRYSECDEFAULT) {
1308 		/* enable negotiation for ephemeral mount */
1309 		nargs->flags |= NFSMNT_SECDEFAULT;
1310 
1311 		/*
1312 		 * As a starting point for negotiation, copy parent
1313 		 * mount's negotiated flavour (sv_currsec) if available,
1314 		 * or its passed-in flavour (sv_secdata) if not.
1315 		 */
1316 		if (svp->sv_currsec != NULL)
1317 			secdata = copy_sec_data(svp->sv_currsec);
1318 		else if (svp->sv_secdata != NULL)
1319 			secdata = copy_sec_data(svp->sv_secdata);
1320 		else
1321 			secdata = NULL;
1322 	} else {
1323 		/* do not enable negotiation; copy parent's passed-in flavour */
1324 		if (svp->sv_secdata != NULL)
1325 			secdata = copy_sec_data(svp->sv_secdata);
1326 		else
1327 			secdata = NULL;
1328 	}
1329 
1330 	nfs_rw_exit(&svp->sv_lock);
1331 
1332 	nargs->flags |= NFSMNT_NEWARGS;
1333 	nargs->nfs_args_ext = NFS_ARGS_EXTB;
1334 	nargs->nfs_ext_u.nfs_extB.secdata = secdata;
1335 
1336 	/* for NFS RO failover; caller will set if necessary */
1337 	nargs->nfs_ext_u.nfs_extB.next = NULL;
1338 
1339 	return (nargs);
1340 }
1341 
1342 static void
1343 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
1344 {
1345 	/*
1346 	 * Either the mount failed, in which case the data is not needed, or
1347 	 * nfs4_mount() has either taken copies of what it needs or,
1348 	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
1349 	 * whereby nfs4_free_args() will ignore it.
1350 	 */
1351 	nfs4_free_args(nargs);
1352 	kmem_free(nargs, sizeof (struct nfs_args));
1353 }
1354 
1355 /*
1356  * When we finally get into the mounting, we need to add this
1357  * node to the ephemeral tree.
1358  *
1359  * This is called from nfs4_mount().
1360  */
1361 void
1362 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
1363 {
1364 	mntinfo4_t		*mi_parent;
1365 	nfs4_ephemeral_t	*eph;
1366 	nfs4_ephemeral_tree_t	*net;
1367 
1368 	nfs4_ephemeral_t	*prior;
1369 	nfs4_ephemeral_t	*child;
1370 
1371 	nfs4_ephemeral_t	*peer;
1372 
1373 	nfs4_trigger_globals_t	*ntg;
1374 	zone_t			*zone = curproc->p_zone;
1375 
1376 	mi_parent = VTOMI4(mvp);
1377 
1378 	/*
1379 	 * Get this before grabbing anything else!
1380 	 */
1381 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
1382 	if (!ntg->ntg_thread_started) {
1383 		nfs4_ephemeral_start_harvester(ntg);
1384 	}
1385 
1386 	mutex_enter(&mi_parent->mi_lock);
1387 	mutex_enter(&mi->mi_lock);
1388 
1389 	/*
1390 	 * We need to tack together the ephemeral mount
1391 	 * with this new mntinfo.
1392 	 */
1393 	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
1394 	eph->ne_mount = mi;
1395 	eph->ne_ref_time = gethrestime_sec();
1396 
1397 	/*
1398 	 * We need to tell the ephemeral mount when
1399 	 * to time out.
1400 	 */
1401 	eph->ne_mount_to = ntg->ntg_mount_to;
1402 
1403 	mi->mi_flags |= MI4_EPHEMERAL;
1404 	mi->mi_ephemeral = eph;
1405 
1406 	net = mi->mi_ephemeral_tree =
1407 	    mi_parent->mi_ephemeral_tree;
1408 	ASSERT(net != NULL);
1409 
1410 	/*
1411 	 * If the enclosing mntinfo4 is also ephemeral,
1412 	 * then we need to point to its enclosing parent.
1413 	 * Else the enclosing mntinfo4 is the enclosing parent.
1414 	 *
1415 	 * We also need to weave this ephemeral node
1416 	 * into the tree.
1417 	 */
1418 	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
1419 		/*
1420 		 * We need to decide if we are
1421 		 * the root node of this branch
1422 		 * or if we are a sibling of this
1423 		 * branch.
1424 		 */
1425 		prior = mi_parent->mi_ephemeral;
1426 		ASSERT(prior != NULL);
1427 		if (prior->ne_child == NULL) {
1428 			prior->ne_child = eph;
1429 		} else {
1430 			child = prior->ne_child;
1431 
1432 			prior->ne_child = eph;
1433 			eph->ne_peer = child;
1434 
1435 			child->ne_prior = eph;
1436 		}
1437 
1438 		eph->ne_prior = prior;
1439 	} else {
1440 		/*
1441 		 * The parent mntinfo4 is the non-ephemeral
1442 		 * root of the ephemeral tree. We
1443 		 * need to decide if we are the root
1444 		 * node of that tree or if we are a
1445 		 * sibling of the root node.
1446 		 *
1447 		 * We are the root if there is no
1448 		 * other node.
1449 		 */
1450 		if (net->net_root == NULL) {
1451 			net->net_root = eph;
1452 		} else {
1453 			eph->ne_peer = peer = net->net_root;
1454 			ASSERT(peer != NULL);
1455 			net->net_root = eph;
1456 
1457 			peer->ne_prior = eph;
1458 		}
1459 
1460 		eph->ne_prior = NULL;
1461 	}
1462 
1463 	mutex_exit(&mi->mi_lock);
1464 	mutex_exit(&mi_parent->mi_lock);
1465 }
1466 
1467 /*
1468  * Commit the changes to the ephemeral tree for removing this node.
1469  */
1470 static void
1471 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
1472 {
1473 	nfs4_ephemeral_t	*e = eph;
1474 	nfs4_ephemeral_t	*peer;
1475 	nfs4_ephemeral_t	*prior;
1476 
1477 	peer = eph->ne_peer;
1478 	prior = e->ne_prior;
1479 
1480 	/*
1481 	 * If this branch root was not the
1482 	 * tree root, then we need to fix back pointers.
1483 	 */
1484 	if (prior) {
1485 		if (prior->ne_child == e) {
1486 			prior->ne_child = peer;
1487 		} else {
1488 			prior->ne_peer = peer;
1489 		}
1490 
1491 		if (peer)
1492 			peer->ne_prior = prior;
1493 	} else if (peer) {
1494 		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
1495 		peer->ne_prior = NULL;
1496 	} else {
1497 		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
1498 	}
1499 }
1500 
1501 /*
1502  * We want to avoid recursion at all costs. So we need to
1503  * unroll the tree. We do this by a depth first traversal to
1504  * leaf nodes. We blast away the leaf and work our way back
1505  * up and down the tree.
1506  */
1507 static int
1508 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
1509     int isTreeRoot, int flag, cred_t *cr)
1510 {
1511 	nfs4_ephemeral_t	*e = eph;
1512 	nfs4_ephemeral_t	*prior;
1513 	mntinfo4_t		*mi;
1514 	vfs_t			*vfsp;
1515 	int			error;
1516 
1517 	/*
1518 	 * We use the loop while unrolling the ephemeral tree.
1519 	 */
1520 	for (;;) {
1521 		/*
1522 		 * First we walk down the child.
1523 		 */
1524 		if (e->ne_child) {
1525 			prior = e;
1526 			e = e->ne_child;
1527 			continue;
1528 		}
1529 
1530 		/*
1531 		 * If we are the root of the branch we are removing,
1532 		 * we end it here. But if the branch is the root of
1533 		 * the tree, we have to forge on. We do not consider
1534 		 * the peer list for the root because while it may
1535 		 * be okay to remove, it is both extra work and a
1536 		 * potential for a false-positive error to stall the
1537 		 * unmount attempt.
1538 		 */
1539 		if (e == eph && isTreeRoot == FALSE)
1540 			return (0);
1541 
1542 		/*
1543 		 * Next we walk down the peer list.
1544 		 */
1545 		if (e->ne_peer) {
1546 			prior = e;
1547 			e = e->ne_peer;
1548 			continue;
1549 		}
1550 
1551 		/*
1552 		 * We can only remove the node passed in by the
1553 		 * caller if it is the root of the ephemeral tree.
1554 		 * Otherwise, the caller will remove it.
1555 		 */
1556 		if (e == eph && isTreeRoot == FALSE)
1557 			return (0);
1558 
1559 		/*
1560 		 * Okay, we have a leaf node, time
1561 		 * to prune it!
1562 		 *
1563 		 * Note that prior can only be NULL if
1564 		 * and only if it is the root of the
1565 		 * ephemeral tree.
1566 		 */
1567 		prior = e->ne_prior;
1568 
1569 		mi = e->ne_mount;
1570 		mutex_enter(&mi->mi_lock);
1571 		vfsp = mi->mi_vfsp;
1572 
1573 		/*
1574 		 * Cleared by umount2_engine.
1575 		 */
1576 		VFS_HOLD(vfsp);
1577 
1578 		/*
1579 		 * Inform nfs4_unmount to not recursively
1580 		 * descend into this node's children when it
1581 		 * gets processed.
1582 		 */
1583 		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
1584 		mutex_exit(&mi->mi_lock);
1585 
1586 		error = umount2_engine(vfsp, flag, cr, FALSE);
1587 		if (error) {
1588 			/*
1589 			 * We need to reenable nfs4_unmount's ability
1590 			 * to recursively descend on this node.
1591 			 */
1592 			mutex_enter(&mi->mi_lock);
1593 			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
1594 			mutex_exit(&mi->mi_lock);
1595 
1596 			return (error);
1597 		}
1598 
1599 		/*
1600 		 * If we are the current node, we do not want to
1601 		 * touch anything else. At this point, the only
1602 		 * way the current node can have survived to here
1603 		 * is if it is the root of the ephemeral tree and
1604 		 * we are unmounting the enclosing mntinfo4.
1605 		 */
1606 		if (e == eph) {
1607 			ASSERT(prior == NULL);
1608 			return (0);
1609 		}
1610 
1611 		/*
1612 		 * Stitch up the prior node. Note that since
1613 		 * we have handled the root of the tree, prior
1614 		 * must be non-NULL.
1615 		 */
1616 		ASSERT(prior != NULL);
1617 		if (prior->ne_child == e) {
1618 			prior->ne_child = NULL;
1619 		} else {
1620 			ASSERT(prior->ne_peer == e);
1621 
1622 			prior->ne_peer = NULL;
1623 		}
1624 
1625 		e = prior;
1626 	}
1627 
1628 	/* NOTREACHED */
1629 }
1630 
1631 /*
1632  * Common code to safely release net_cnt_lock and net_tree_lock
1633  */
1634 void
1635 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
1636     nfs4_ephemeral_tree_t **pnet)
1637 {
1638 	nfs4_ephemeral_tree_t	*net = *pnet;
1639 
1640 	if (*pmust_unlock) {
1641 		mutex_enter(&net->net_cnt_lock);
1642 		net->net_refcnt--;
1643 		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
1644 		mutex_exit(&net->net_cnt_lock);
1645 
1646 		mutex_exit(&net->net_tree_lock);
1647 
1648 		*pmust_unlock = FALSE;
1649 	}
1650 }
1651 
1652 /*
1653  * While we may have removed any child or sibling nodes of this
1654  * ephemeral node, we can not nuke it until we know that there
1655  * were no actived vnodes on it. This will do that final
1656  * work once we know it is not busy.
1657  */
1658 void
1659 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
1660     nfs4_ephemeral_tree_t **pnet)
1661 {
1662 	/*
1663 	 * Now we need to get rid of the ephemeral data if it exists.
1664 	 */
1665 	mutex_enter(&mi->mi_lock);
1666 	if (mi->mi_ephemeral) {
1667 		/*
1668 		 * If we are the root node of an ephemeral branch
1669 		 * which is being removed, then we need to fixup
1670 		 * pointers into and out of the node.
1671 		 */
1672 		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
1673 			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
1674 
1675 		ASSERT(mi->mi_ephemeral != NULL);
1676 
1677 		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
1678 		mi->mi_ephemeral = NULL;
1679 	}
1680 	mutex_exit(&mi->mi_lock);
1681 
1682 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
1683 }
1684 
1685 /*
1686  * Unmount an ephemeral node.
1687  */
1688 int
1689 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
1690     bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
1691 {
1692 	int			error = 0;
1693 	nfs4_ephemeral_t	*eph;
1694 	nfs4_ephemeral_tree_t	*net;
1695 	int			is_derooting = FALSE;
1696 	int			is_recursed = FALSE;
1697 
1698 	/*
1699 	 * The active vnodes on this file system may be ephemeral
1700 	 * children. We need to check for and try to unmount them
1701 	 * here. If any can not be unmounted, we are going
1702 	 * to return EBUSY.
1703 	 */
1704 	mutex_enter(&mi->mi_lock);
1705 
1706 	/*
1707 	 * If an ephemeral tree, we need to check to see if
1708 	 * the lock is already held. If it is, then we need
1709 	 * to see if we are being called as a result of
1710 	 * the recursive removal of some node of the tree or
1711 	 * if we are another attempt to remove the tree.
1712 	 *
1713 	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
1714 	 * node. mi_ephemeral being non-NULL also does this.
1715 	 *
1716 	 * mi_ephemeral_tree being non-NULL is sufficient
1717 	 * to also indicate either it is an ephemeral node
1718 	 * or the enclosing mntinfo4.
1719 	 *
1720 	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
1721 	 * when we delete the ephemeral node and need to
1722 	 * differentiate from an ephemeral node and the
1723 	 * enclosing root node.
1724 	 */
1725 	*pnet = net = mi->mi_ephemeral_tree;
1726 	if (net == NULL) {
1727 		mutex_exit(&mi->mi_lock);
1728 		return (0);
1729 	}
1730 
1731 	eph = mi->mi_ephemeral;
1732 	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
1733 	is_derooting = (eph == NULL);
1734 
1735 	/*
1736 	 * If this is not recursion, then we need to
1737 	 * grab a ref count.
1738 	 *
1739 	 * But wait, we also do not want to do that
1740 	 * if a harvester thread has already grabbed
1741 	 * the lock.
1742 	 */
1743 	if (!is_recursed) {
1744 		mutex_enter(&net->net_cnt_lock);
1745 		if (net->net_status &
1746 		    NFS4_EPHEMERAL_TREE_LOCKED) {
1747 			mutex_exit(&net->net_cnt_lock);
1748 			mutex_exit(&mi->mi_lock);
1749 
1750 			/*
1751 			 * Someone is already working on
1752 			 * it. We need to back off and
1753 			 * let them proceed.
1754 			 *
1755 			 * We return EBUSY so that the
1756 			 * caller knows something is
1757 			 * going on. Note that by that
1758 			 * time, the umount in the other
1759 			 * thread may have already occured.
1760 			 */
1761 			return (EBUSY);
1762 		} else
1763 			net->net_refcnt++;
1764 		mutex_exit(&net->net_cnt_lock);
1765 	}
1766 	mutex_exit(&mi->mi_lock);
1767 
1768 	/*
1769 	 * If we grab the lock, it means that no other
1770 	 * operation is working on the tree. If we don't
1771 	 * grab it, we need to decide if this is because
1772 	 * we are a recursive call or a new operation.
1773 	 *
1774 	 * If we are a recursive call, we proceed without
1775 	 * the lock.
1776 	 *
1777 	 * Else we have to wait until the lock becomes free.
1778 	 */
1779 	if (!mutex_tryenter(&net->net_tree_lock)) {
1780 		if (!is_recursed) {
1781 			mutex_enter(&net->net_cnt_lock);
1782 			if (net->net_status &
1783 			    (NFS4_EPHEMERAL_TREE_DEROOTING
1784 			    | NFS4_EPHEMERAL_TREE_INVALID)) {
1785 				net->net_refcnt--;
1786 				mutex_exit(&net->net_cnt_lock);
1787 				goto is_busy;
1788 			}
1789 			mutex_exit(&net->net_cnt_lock);
1790 
1791 			/*
1792 			 * We can't hold any other locks whilst
1793 			 * we wait on this to free up.
1794 			 */
1795 			mutex_enter(&net->net_tree_lock);
1796 
1797 			/*
1798 			 * Note that while mi->mi_ephemeral
1799 			 * may change and thus we have to
1800 			 * update eph, it is the case that
1801 			 * we have tied down net and
1802 			 * do not care if mi->mi_ephemeral_tree
1803 			 * has changed.
1804 			 */
1805 			mutex_enter(&mi->mi_lock);
1806 			eph = mi->mi_ephemeral;
1807 			mutex_exit(&mi->mi_lock);
1808 
1809 			/*
1810 			 * Okay, we need to see if either the
1811 			 * tree got nuked or the current node
1812 			 * got nuked. Both of which will cause
1813 			 * an error.
1814 			 *
1815 			 * Note that a subsequent retry of the
1816 			 * umount shall work.
1817 			 */
1818 			mutex_enter(&net->net_cnt_lock);
1819 			if (net->net_status &
1820 			    NFS4_EPHEMERAL_TREE_INVALID ||
1821 			    (!is_derooting && eph == NULL)) {
1822 				net->net_refcnt--;
1823 				mutex_exit(&net->net_cnt_lock);
1824 				mutex_exit(&net->net_tree_lock);
1825 				goto is_busy;
1826 			}
1827 			mutex_exit(&net->net_cnt_lock);
1828 			*pmust_unlock = TRUE;
1829 		}
1830 	} else {
1831 		/*
1832 		 * If we grab it right away, everything must
1833 		 * be great!
1834 		 */
1835 		*pmust_unlock = TRUE;
1836 	}
1837 
1838 	/*
1839 	 * Only once we have grabbed the lock can we mark what we
1840 	 * are planning on doing to the ephemeral tree.
1841 	 */
1842 	if (*pmust_unlock) {
1843 		mutex_enter(&net->net_cnt_lock);
1844 		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
1845 
1846 		/*
1847 		 * Check to see if we are nuking the root.
1848 		 */
1849 		if (is_derooting)
1850 			net->net_status |=
1851 			    NFS4_EPHEMERAL_TREE_DEROOTING;
1852 		mutex_exit(&net->net_cnt_lock);
1853 	}
1854 
1855 	if (!is_derooting) {
1856 		/*
1857 		 * Only work on children if the caller has not already
1858 		 * done so.
1859 		 */
1860 		if (!is_recursed) {
1861 			ASSERT(eph != NULL);
1862 
1863 			error = nfs4_ephemeral_unmount_engine(eph,
1864 			    FALSE, flag, cr);
1865 			if (error)
1866 				goto is_busy;
1867 		}
1868 	} else {
1869 		eph = net->net_root;
1870 
1871 		/*
1872 		 * Only work if there is something there.
1873 		 */
1874 		if (eph) {
1875 			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
1876 			    flag, cr);
1877 			if (error) {
1878 				mutex_enter(&net->net_cnt_lock);
1879 				net->net_status &=
1880 				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
1881 				mutex_exit(&net->net_cnt_lock);
1882 				goto is_busy;
1883 			}
1884 
1885 			/*
1886 			 * Nothing else which goes wrong will
1887 			 * invalidate the blowing away of the
1888 			 * ephmeral tree.
1889 			 */
1890 			net->net_root = NULL;
1891 		}
1892 
1893 		/*
1894 		 * We have derooted and we have caused the tree to be
1895 		 * invalid.
1896 		 */
1897 		mutex_enter(&net->net_cnt_lock);
1898 		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
1899 		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
1900 		net->net_refcnt--;
1901 		mutex_exit(&net->net_cnt_lock);
1902 
1903 		/*
1904 		 * At this point, the tree should no
1905 		 * longer be associated with the
1906 		 * mntinfo4. We need to pull it off
1907 		 * there and let the harvester take
1908 		 * care of it once the refcnt drops.
1909 		 */
1910 		mutex_enter(&mi->mi_lock);
1911 		mi->mi_ephemeral_tree = NULL;
1912 		mutex_exit(&mi->mi_lock);
1913 	}
1914 
1915 	return (0);
1916 
1917 is_busy:
1918 
1919 	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
1920 
1921 	return (error);
1922 }
1923 
1924 /*
1925  * Do the umount and record any error in the parent.
1926  */
1927 static void
1928 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
1929     nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
1930 {
1931 	int	error;
1932 
1933 	error = umount2_engine(vfsp, flag, kcred, FALSE);
1934 	if (error) {
1935 		if (prior) {
1936 			if (prior->ne_child == e)
1937 				prior->ne_state |=
1938 				    NFS4_EPHEMERAL_CHILD_ERROR;
1939 			else
1940 				prior->ne_state |=
1941 				    NFS4_EPHEMERAL_PEER_ERROR;
1942 		}
1943 	}
1944 }
1945 
1946 /*
1947  * For each tree in the forest (where the forest is in
1948  * effect all of the ephemeral trees for this zone),
1949  * scan to see if a node can be unmounted. Note that
1950  * unlike nfs4_ephemeral_unmount_engine(), we do
1951  * not process the current node before children or
1952  * siblings. I.e., if a node can be unmounted, we
1953  * do not recursively check to see if the nodes
1954  * hanging off of it can also be unmounted.
1955  *
1956  * Instead, we delve down deep to try and remove the
1957  * children first. Then, because we share code with
1958  * nfs4_ephemeral_unmount_engine(), we will try
1959  * them again. This could be a performance issue in
1960  * the future.
1961  *
1962  * Also note that unlike nfs4_ephemeral_unmount_engine(),
1963  * we do not halt on an error. We will not remove the
1964  * current node, but we will keep on trying to remove
1965  * the others.
1966  *
1967  * force indicates that we want the unmount to occur
1968  * even if there is something blocking it.
1969  *
1970  * time_check indicates that we want to see if the
1971  * mount has expired past mount_to or not. Typically
1972  * we want to do this and only on a shutdown of the
1973  * zone would we want to ignore the check.
1974  */
1975 static void
1976 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
1977     bool_t force, bool_t time_check)
1978 {
1979 	nfs4_ephemeral_tree_t	*net;
1980 	nfs4_ephemeral_tree_t	*prev = NULL;
1981 	nfs4_ephemeral_tree_t	*next;
1982 	nfs4_ephemeral_t	*e;
1983 	nfs4_ephemeral_t	*prior;
1984 	time_t			now = gethrestime_sec();
1985 
1986 	nfs4_ephemeral_tree_t	*harvest = NULL;
1987 
1988 	int			flag;
1989 
1990 	mntinfo4_t		*mi;
1991 	vfs_t			*vfsp;
1992 
1993 	if (force)
1994 		flag = MS_FORCE;
1995 	else
1996 		flag = 0;
1997 
1998 	mutex_enter(&ntg->ntg_forest_lock);
1999 	for (net = ntg->ntg_forest; net != NULL; net = next) {
2000 		next = net->net_next;
2001 
2002 		mutex_enter(&net->net_cnt_lock);
2003 		net->net_refcnt++;
2004 		mutex_exit(&net->net_cnt_lock);
2005 
2006 		mutex_enter(&net->net_tree_lock);
2007 
2008 		/*
2009 		 * Let the unmount code know that the
2010 		 * tree is already locked!
2011 		 */
2012 		mutex_enter(&net->net_cnt_lock);
2013 		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2014 		mutex_exit(&net->net_cnt_lock);
2015 
2016 		/*
2017 		 * If the intent is force all ephemeral nodes to
2018 		 * be unmounted in this zone, we can short circuit a
2019 		 * lot of tree traversal and simply zap the root node.
2020 		 */
2021 		if (force) {
2022 			if (net->net_root) {
2023 				mi = net->net_root->ne_mount;
2024 				vfsp = mi->mi_vfsp;
2025 
2026 				/*
2027 				 * Cleared by umount2_engine.
2028 				 */
2029 				VFS_HOLD(vfsp);
2030 
2031 				(void) umount2_engine(vfsp, flag,
2032 				    kcred, FALSE);
2033 
2034 				goto check_done;
2035 			}
2036 		}
2037 
2038 		e = net->net_root;
2039 		if (e)
2040 			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2041 
2042 		while (e) {
2043 			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2044 				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2045 				if (e->ne_child) {
2046 					e = e->ne_child;
2047 					e->ne_state =
2048 					    NFS4_EPHEMERAL_VISIT_CHILD;
2049 				}
2050 
2051 				continue;
2052 			} else if (e->ne_state ==
2053 			    NFS4_EPHEMERAL_VISIT_SIBLING) {
2054 				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2055 				if (e->ne_peer) {
2056 					e = e->ne_peer;
2057 					e->ne_state =
2058 					    NFS4_EPHEMERAL_VISIT_CHILD;
2059 				}
2060 
2061 				continue;
2062 			} else if (e->ne_state ==
2063 			    NFS4_EPHEMERAL_CHILD_ERROR) {
2064 				prior = e->ne_prior;
2065 
2066 				/*
2067 				 * If a child reported an error, do
2068 				 * not bother trying to unmount.
2069 				 *
2070 				 * If your prior node is a parent,
2071 				 * pass the error up such that they
2072 				 * also do not try to unmount.
2073 				 *
2074 				 * However, if your prior is a sibling,
2075 				 * let them try to unmount if they can.
2076 				 */
2077 				if (prior) {
2078 					if (prior->ne_child == e)
2079 						prior->ne_state |=
2080 						    NFS4_EPHEMERAL_CHILD_ERROR;
2081 					else
2082 						prior->ne_state |=
2083 						    NFS4_EPHEMERAL_PEER_ERROR;
2084 				}
2085 
2086 				/*
2087 				 * Clear the error and if needed, process peers.
2088 				 *
2089 				 * Once we mask out the error, we know whether
2090 				 * or we have to process another node.
2091 				 */
2092 				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2093 				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2094 					e = prior;
2095 
2096 				continue;
2097 			} else if (e->ne_state ==
2098 			    NFS4_EPHEMERAL_PEER_ERROR) {
2099 				prior = e->ne_prior;
2100 
2101 				if (prior) {
2102 					if (prior->ne_child == e)
2103 						prior->ne_state =
2104 						    NFS4_EPHEMERAL_CHILD_ERROR;
2105 					else
2106 						prior->ne_state =
2107 						    NFS4_EPHEMERAL_PEER_ERROR;
2108 				}
2109 
2110 				/*
2111 				 * Clear the error from this node and do the
2112 				 * correct processing.
2113 				 */
2114 				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2115 				continue;
2116 			}
2117 
2118 			prior = e->ne_prior;
2119 			e->ne_state = NFS4_EPHEMERAL_OK;
2120 
2121 			/*
2122 			 * It must be the case that we need to process
2123 			 * this node.
2124 			 */
2125 			if (!time_check ||
2126 			    now - e->ne_ref_time > e->ne_mount_to) {
2127 				mi = e->ne_mount;
2128 				vfsp = mi->mi_vfsp;
2129 
2130 				/*
2131 				 * Cleared by umount2_engine.
2132 				 */
2133 				VFS_HOLD(vfsp);
2134 
2135 				/*
2136 				 * Note that we effectively work down to the
2137 				 * leaf nodes first, try to unmount them,
2138 				 * then work our way back up into the leaf
2139 				 * nodes.
2140 				 *
2141 				 * Also note that we deal with a lot of
2142 				 * complexity by sharing the work with
2143 				 * the manual unmount code.
2144 				 */
2145 				nfs4_ephemeral_record_umount(vfsp, flag,
2146 				    e, prior);
2147 			}
2148 
2149 			e = prior;
2150 		}
2151 
2152 check_done:
2153 
2154 		/*
2155 		 * Are we done with this tree?
2156 		 */
2157 		mutex_enter(&net->net_cnt_lock);
2158 		if (net->net_refcnt == 1 &&
2159 		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
2160 			net->net_refcnt--;
2161 			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
2162 			mutex_exit(&net->net_cnt_lock);
2163 			mutex_exit(&net->net_tree_lock);
2164 
2165 			if (prev)
2166 				prev->net_next = net->net_next;
2167 			else
2168 				ntg->ntg_forest = net->net_next;
2169 
2170 			net->net_next = harvest;
2171 			harvest = net;
2172 			continue;
2173 		}
2174 
2175 		net->net_refcnt--;
2176 		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
2177 		mutex_exit(&net->net_cnt_lock);
2178 		mutex_exit(&net->net_tree_lock);
2179 
2180 		prev = net;
2181 	}
2182 	mutex_exit(&ntg->ntg_forest_lock);
2183 
2184 	for (net = harvest; net != NULL; net = next) {
2185 		next = net->net_next;
2186 
2187 		mutex_destroy(&net->net_tree_lock);
2188 		mutex_destroy(&net->net_cnt_lock);
2189 		kmem_free(net, sizeof (*net));
2190 	}
2191 }
2192 
2193 /*
2194  * This is the thread which decides when the harvesting
2195  * can proceed and when to kill it off for this zone.
2196  */
2197 static void
2198 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
2199 {
2200 	clock_t		timeleft;
2201 	zone_t		*zone = curproc->p_zone;
2202 
2203 	for (;;) {
2204 		timeleft = zone_status_timedwait(zone, lbolt +
2205 		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
2206 
2207 		/*
2208 		 * zone is exiting...
2209 		 */
2210 		if (timeleft != -1) {
2211 			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
2212 			zthread_exit();
2213 			/* NOTREACHED */
2214 		}
2215 
2216 		/*
2217 		 * Only bother scanning if there is potential
2218 		 * work to be done.
2219 		 */
2220 		if (ntg->ntg_forest == NULL)
2221 			continue;
2222 
2223 		/*
2224 		 * Now scan the list and get rid of everything which
2225 		 * is old.
2226 		 */
2227 		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
2228 	}
2229 
2230 	/* NOTREACHED */
2231 }
2232 
2233 /*
2234  * The zone specific glue needed to start the unmount harvester.
2235  *
2236  * Note that we want to avoid holding the mutex as long as possible,
2237  * hence the multiple checks.
2238  *
2239  * The caller should avoid us getting down here in the first
2240  * place.
2241  */
2242 static void
2243 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
2244 {
2245 	/*
2246 	 * It got started before we got here...
2247 	 */
2248 	if (ntg->ntg_thread_started)
2249 		return;
2250 
2251 	mutex_enter(&nfs4_ephemeral_thread_lock);
2252 
2253 	if (ntg->ntg_thread_started) {
2254 		mutex_exit(&nfs4_ephemeral_thread_lock);
2255 		return;
2256 	}
2257 
2258 	/*
2259 	 * Start the unmounter harvester thread for this zone.
2260 	 */
2261 	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
2262 	    ntg, 0, minclsyspri);
2263 
2264 	ntg->ntg_thread_started = TRUE;
2265 	mutex_exit(&nfs4_ephemeral_thread_lock);
2266 }
2267 
2268 /*ARGSUSED*/
2269 static void *
2270 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
2271 {
2272 	nfs4_trigger_globals_t	*ntg;
2273 
2274 	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
2275 	ntg->ntg_thread_started = FALSE;
2276 
2277 	/*
2278 	 * This is the default....
2279 	 */
2280 	ntg->ntg_mount_to = nfs4_trigger_thread_timer;
2281 
2282 	mutex_init(&ntg->ntg_forest_lock, NULL,
2283 	    MUTEX_DEFAULT, NULL);
2284 
2285 	return (ntg);
2286 }
2287 
2288 /*
2289  * Try a nice gentle walk down the forest and convince
2290  * all of the trees to gracefully give it up.
2291  */
2292 /*ARGSUSED*/
2293 static void
2294 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
2295 {
2296 	nfs4_trigger_globals_t	*ntg = arg;
2297 
2298 	if (!ntg)
2299 		return;
2300 
2301 	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
2302 }
2303 
2304 /*
2305  * Race along the forest and rip all of the trees out by
2306  * their rootballs!
2307  */
2308 /*ARGSUSED*/
2309 static void
2310 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
2311 {
2312 	nfs4_trigger_globals_t	*ntg = arg;
2313 
2314 	if (!ntg)
2315 		return;
2316 
2317 	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
2318 
2319 	mutex_destroy(&ntg->ntg_forest_lock);
2320 	kmem_free(ntg, sizeof (*ntg));
2321 }
2322 
2323 /*
2324  * This is the zone independent cleanup needed for
2325  * emphemeral mount processing.
2326  */
2327 void
2328 nfs4_ephemeral_fini(void)
2329 {
2330 	(void) zone_key_delete(nfs4_ephemeral_key);
2331 	mutex_destroy(&nfs4_ephemeral_thread_lock);
2332 }
2333 
2334 /*
2335  * This is the zone independent initialization needed for
2336  * emphemeral mount processing.
2337  */
2338 void
2339 nfs4_ephemeral_init(void)
2340 {
2341 	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
2342 	    NULL);
2343 
2344 	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
2345 	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
2346 }
2347 
2348 /*
2349  * nfssys() calls this function to set the per-zone
2350  * value of mount_to to drive when an ephemeral mount is
2351  * timed out. Each mount will grab a copy of this value
2352  * when mounted.
2353  */
2354 void
2355 nfs4_ephemeral_set_mount_to(uint_t mount_to)
2356 {
2357 	nfs4_trigger_globals_t	*ntg;
2358 	zone_t			*zone = curproc->p_zone;
2359 
2360 	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2361 
2362 	ntg->ntg_mount_to = mount_to;
2363 }
2364 
2365 /*
2366  * Walk the list of v4 mount options; if they are currently set in vfsp,
2367  * append them to a new comma-separated mount option string, and return it.
2368  *
2369  * Caller should free by calling nfs4_trigger_destroy_mntopts().
2370  */
2371 static char *
2372 nfs4_trigger_create_mntopts(vfs_t *vfsp)
2373 {
2374 	uint_t i;
2375 	char *mntopts;
2376 	struct vfssw *vswp;
2377 	mntopts_t *optproto;
2378 
2379 	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
2380 
2381 	/* get the list of applicable mount options for v4; locks *vswp */
2382 	vswp = vfs_getvfssw(MNTTYPE_NFS4);
2383 	optproto = &vswp->vsw_optproto;
2384 
2385 	for (i = 0; i < optproto->mo_count; i++) {
2386 		struct mntopt *mop = &optproto->mo_list[i];
2387 
2388 		if (mop->mo_flags & MO_EMPTY)
2389 			continue;
2390 
2391 		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
2392 			kmem_free(mntopts, MAX_MNTOPT_STR);
2393 			vfs_unrefvfssw(vswp);
2394 			return (NULL);
2395 		}
2396 	}
2397 
2398 	vfs_unrefvfssw(vswp);
2399 
2400 	/*
2401 	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
2402 	 * and it may only be passed via MS_OPTIONSTR, so we
2403 	 * must handle it here.
2404 	 *
2405 	 * Ideally, it would be in the list, but NFS does not specify its
2406 	 * own opt proto list, it uses instead the default one. Since
2407 	 * not all filesystems support extended attrs, it would not be
2408 	 * appropriate to add it there.
2409 	 */
2410 	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
2411 	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
2412 		kmem_free(mntopts, MAX_MNTOPT_STR);
2413 		return (NULL);
2414 	}
2415 
2416 	return (mntopts);
2417 }
2418 
2419 static void
2420 nfs4_trigger_destroy_mntopts(char *mntopts)
2421 {
2422 	if (mntopts)
2423 		kmem_free(mntopts, MAX_MNTOPT_STR);
2424 }
2425 
2426 /*
2427  * Check a single mount option (optname). Add to mntopts if it is set in VFS.
2428  */
2429 static int
2430 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
2431 {
2432 	if (mntopts == NULL || optname == NULL || vfsp == NULL)
2433 		return (EINVAL);
2434 
2435 	if (vfs_optionisset(vfsp, optname, NULL)) {
2436 		size_t mntoptslen = strlen(mntopts);
2437 		size_t optnamelen = strlen(optname);
2438 
2439 		/* +1 for ',', +1 for NUL */
2440 		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
2441 			return (EOVERFLOW);
2442 
2443 		/* first or subsequent mount option? */
2444 		if (*mntopts != '\0')
2445 			(void) strcat(mntopts, ",");
2446 
2447 		(void) strcat(mntopts, optname);
2448 	}
2449 
2450 	return (0);
2451 }
2452 
2453 static enum clnt_stat
2454 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
2455 {
2456 	int retries, error;
2457 	uint_t max_msgsize;
2458 	enum clnt_stat status;
2459 	CLIENT *cl;
2460 	struct timeval timeout;
2461 
2462 	/* as per recov_newserver() */
2463 	max_msgsize = 0;
2464 	retries = 1;
2465 	timeout.tv_sec = 2;
2466 	timeout.tv_usec = 0;
2467 
2468 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM,
2469 	    NFS_V4, max_msgsize, retries, CRED(), &cl);
2470 	if (error)
2471 		return (RPC_FAILED);
2472 
2473 	if (nointr)
2474 		cl->cl_nosignal = TRUE;
2475 	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
2476 	    timeout);
2477 	if (nointr)
2478 		cl->cl_nosignal = FALSE;
2479 
2480 	AUTH_DESTROY(cl->cl_auth);
2481 	CLNT_DESTROY(cl);
2482 
2483 	return (status);
2484 }
2485