1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
29 * triggered from a "stub" rnode via a special set of vnodeops.
30 */
31
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/vfs_opreg.h>
40 #include <sys/file.h>
41 #include <sys/filio.h>
42 #include <sys/uio.h>
43 #include <sys/buf.h>
44 #include <sys/mman.h>
45 #include <sys/pathname.h>
46 #include <sys/dirent.h>
47 #include <sys/debug.h>
48 #include <sys/vmsystm.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/swap.h>
52 #include <sys/errno.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/kmem.h>
56 #include <sys/mount.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathconf.h>
59 #include <sys/utsname.h>
60 #include <sys/dnlc.h>
61 #include <sys/acl.h>
62 #include <sys/systeminfo.h>
63 #include <sys/policy.h>
64 #include <sys/sdt.h>
65 #include <sys/list.h>
66 #include <sys/stat.h>
67 #include <sys/mntent.h>
68 #include <sys/priv.h>
69
70 #include <rpc/types.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73
74 #include <nfs/nfs.h>
75 #include <nfs/nfs_clnt.h>
76 #include <nfs/nfs_acl.h>
77 #include <nfs/lm.h>
78 #include <nfs/nfs4.h>
79 #include <nfs/nfs4_kprot.h>
80 #include <nfs/rnode4.h>
81 #include <nfs/nfs4_clnt.h>
82 #include <nfs/nfsid_map.h>
83 #include <nfs/nfs4_idmap_impl.h>
84
85 #include <vm/hat.h>
86 #include <vm/as.h>
87 #include <vm/page.h>
88 #include <vm/pvn.h>
89 #include <vm/seg.h>
90 #include <vm/seg_map.h>
91 #include <vm/seg_kpm.h>
92 #include <vm/seg_vn.h>
93
94 #include <fs/fs_subr.h>
95
96 #include <sys/ddi.h>
97 #include <sys/int_fmtio.h>
98
99 #include <sys/sunddi.h>
100
101 #include <sys/priv_names.h>
102
103 extern zone_key_t nfs4clnt_zone_key;
104 extern zone_key_t nfsidmap_zone_key;
105
/*
 * The automatic unmounter thread stuff!
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Just a default....
 */
static uint_t nfs4_trigger_mount_to = 240;

/*
 * Per-zone state for ephemeral mounts: the list ("forest") of all
 * ephemeral trees in the zone, plus harvester-thread bookkeeping.
 */
typedef struct nfs4_trigger_globals {
	kmutex_t ntg_forest_lock;	/* protects ntg_forest list */
	uint_t ntg_mount_to;		/* mount timeout; presumably seconds, cf. nfs4_trigger_mount_to -- confirm */
	int ntg_thread_started;		/* non-zero once harvester thread is started -- TODO confirm */
	nfs4_ephemeral_tree_t *ntg_forest;	/* singly-linked via net_next */
} nfs4_trigger_globals_t;

kmutex_t nfs4_ephemeral_thread_lock;

zone_key_t nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;
126
127 static void nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);
128
/*
 * Used for ephemeral mounts; contains data either duplicated from
 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
 *
 * It's intended that this structure is used solely for ephemeral
 * mount-type specific data, for passing this data to
 * nfs4_trigger_nargs_create().
 */
typedef struct ephemeral_servinfo {
	char *esi_hostname;		/* server host name; also used to build dma_hostlist */
	char *esi_netname;
	char *esi_path;			/* server-side path of the fs to be mounted */
	int esi_path_len;		/* length of esi_path -- set by the esi_create routines */
	int esi_mount_flags;
	struct netbuf *esi_addr;	/* server transport address */
	struct netbuf *esi_syncaddr;
	struct knetconfig *esi_knconf;	/* transport configuration */
} ephemeral_servinfo_t;
147
/*
 * Collect together the mount-type specific and generic data args,
 * as assembled by nfs4_trigger_domount_args_create().
 */
typedef struct domount_args {
	ephemeral_servinfo_t *dma_esi;	/* esi of the server actually contacted */
	char *dma_hostlist;		/* comma-sep. for RO failover */
	struct nfs_args *dma_nargs;	/* list: one nfs_args per responsive server */
} domount_args_t;
156
157
158 /*
159 * The vnode ops functions for a trigger stub vnode
160 */
161 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
162 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
163 caller_context_t *);
164 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
165 caller_context_t *);
166 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
167 caller_context_t *);
168 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
169 caller_context_t *);
170 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
171 struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
172 int *, pathname_t *);
173 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
174 enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
175 vsecattr_t *);
176 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
177 int);
178 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
179 caller_context_t *, int);
180 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
181 cred_t *, caller_context_t *, int);
182 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
183 vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
184 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
185 caller_context_t *, int);
186 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
187 cred_t *, caller_context_t *, int);
188 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);
189
190 /*
191 * Regular NFSv4 vnodeops that we need to reference directly
192 */
193 extern int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
194 caller_context_t *);
195 extern void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
196 extern int nfs4_rwlock(vnode_t *, int, caller_context_t *);
197 extern void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
198 extern int nfs4_lookup(vnode_t *, char *, vnode_t **,
199 struct pathname *, int, vnode_t *, cred_t *,
200 caller_context_t *, int *, pathname_t *);
201 extern int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
202 caller_context_t *);
203 extern int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
204 caller_context_t *);
205 extern int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
206 extern int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
207
208 static int nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
209 static int nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
210 cred_t *, vnode_t **);
211 static int nfs4_trigger_domount_args_create(vnode_t *, cred_t *,
212 domount_args_t **dmap);
213 static void nfs4_trigger_domount_args_destroy(domount_args_t *dma,
214 vnode_t *vp);
215 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
216 cred_t *);
217 static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
218 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
219 servinfo4_t *);
220 static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
221 cred_t *);
222 static struct nfs_args *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
223 ephemeral_servinfo_t *);
224 static void nfs4_trigger_nargs_destroy(struct nfs_args *);
225 static char *nfs4_trigger_create_mntopts(vfs_t *);
226 static void nfs4_trigger_destroy_mntopts(char *);
227 static int nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
228 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
229 static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
230 struct netbuf *, int);
231
232 extern int umount2_engine(vfs_t *, int, cred_t *, int);
233
234 vnodeops_t *nfs4_trigger_vnodeops;
235
236 /*
237 * These are the vnodeops that we must define for stub vnodes.
238 *
239 *
240 * Many of the VOPs defined for NFSv4 do not need to be defined here,
241 * for various reasons. This will result in the VFS default function being
242 * used:
243 *
244 * - These VOPs require a previous VOP_OPEN to have occurred. That will have
245 * lost the reference to the stub vnode, meaning these should not be called:
246 * close, read, write, ioctl, readdir, seek.
247 *
248 * - These VOPs are meaningless for vnodes without data pages. Since the
249 * stub vnode is of type VDIR, these should not be called:
250 * space, getpage, putpage, map, addmap, delmap, pageio, fsync.
251 *
252 * - These VOPs are otherwise not applicable, and should not be called:
253 * dump, setsecattr.
254 *
255 *
256 * These VOPs we do not want to define, but nor do we want the VFS default
257 * action. Instead, we specify the VFS error function, with fs_error(), but
258 * note that fs_error() is not actually called. Instead it results in the
259 * use of the error function defined for the particular VOP, in vn_ops_table[]:
260 *
261 * - frlock, dispose, shrlock.
262 *
263 *
264 * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
265 * NOTE: if any of these ops involve an OTW call with the stub FH, then
266 * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
267 * to protect the security data in the servinfo4_t for the "parent"
268 * filesystem that contains the stub.
269 *
270 * - These VOPs should not trigger a mount, so that "ls -l" does not:
271 * pathconf, getsecattr.
272 *
273 * - These VOPs would not make sense to trigger:
274 * inactive, rwlock, rwunlock, fid, realvp.
275 */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	/* VOPs that trigger (or may trigger) the ephemeral mount */
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	/* VOPs passed straight through to the regular NFSv4 vnodeops */
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	/* VOPs routed to the per-VOP error function; see block comment above */
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
303
304 static void
nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t * net)305 nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
306 {
307 ASSERT(mutex_owned(&net->net_cnt_lock));
308 net->net_refcnt++;
309 ASSERT(net->net_refcnt != 0);
310 }
311
312 static void
nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t * net)313 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
314 {
315 mutex_enter(&net->net_cnt_lock);
316 nfs4_ephemeral_tree_incr(net);
317 mutex_exit(&net->net_cnt_lock);
318 }
319
320 /*
321 * We need a safe way to decrement the refcnt whilst the
322 * lock is being held.
323 */
324 static void
nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t * net)325 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
326 {
327 ASSERT(mutex_owned(&net->net_cnt_lock));
328 ASSERT(net->net_refcnt != 0);
329 net->net_refcnt--;
330 }
331
332 static void
nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t * net)333 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
334 {
335 mutex_enter(&net->net_cnt_lock);
336 nfs4_ephemeral_tree_decr(net);
337 mutex_exit(&net->net_cnt_lock);
338 }
339
340 /*
341 * Trigger ops for stub vnodes; for mirror mounts, etc.
342 *
343 * The general idea is that a "triggering" op will first call
344 * nfs4_trigger_mount(), which will find out whether a mount has already
345 * been triggered.
346 *
347 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
348 * of the covering vfs.
349 *
350 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
351 * and again set newvp, as above.
352 *
353 * The triggering op may then re-issue the VOP by calling it on newvp.
354 *
355 * Note that some ops may perform custom action, and may or may not need
356 * to trigger a mount.
357 *
358 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
359 * obviously can't do this with VOP_<whatever>, since it's a stub vnode
360 * and that would just recurse. Instead, we call the v4 op directly,
361 * by name. This is OK, since we know that the vnode is for NFSv4,
362 * otherwise it couldn't be a stub.
363 *
364 */
365
366 static int
nfs4_trigger_open(vnode_t ** vpp,int flag,cred_t * cr,caller_context_t * ct)367 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
368 {
369 int error;
370 vnode_t *newvp;
371
372 error = nfs4_trigger_mount(*vpp, cr, &newvp);
373 if (error)
374 return (error);
375
376 /* Release the stub vnode, as we're losing the reference to it */
377 VN_RELE(*vpp);
378
379 /* Give the caller the root vnode of the newly-mounted fs */
380 *vpp = newvp;
381
382 /* return with VN_HELD(newvp) */
383 return (VOP_OPEN(vpp, flag, cr, ct));
384 }
385
386 void
nfs4_fake_attrs(vnode_t * vp,struct vattr * vap)387 nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
388 {
389 uint_t mask;
390 timespec_t now;
391
392 /*
393 * Set some attributes here for referrals.
394 */
395 mask = vap->va_mask;
396 bzero(vap, sizeof (struct vattr));
397 vap->va_mask = mask;
398 vap->va_uid = 0;
399 vap->va_gid = 0;
400 vap->va_nlink = 1;
401 vap->va_size = 1;
402 gethrestime(&now);
403 vap->va_atime = now;
404 vap->va_mtime = now;
405 vap->va_ctime = now;
406 vap->va_type = VDIR;
407 vap->va_mode = 0555;
408 vap->va_fsid = vp->v_vfsp->vfs_dev;
409 vap->va_rdev = 0;
410 vap->va_blksize = MAXBSIZE;
411 vap->va_nblocks = 1;
412 vap->va_seq = 0;
413 }
414
415 /*
416 * For the majority of cases, nfs4_trigger_getattr() will not trigger
417 * a mount. However, if ATTR_TRIGGER is set, we are being informed
418 * that we need to force the mount before we attempt to determine
419 * the attributes. The intent is an atomic operation for security
420 * testing.
421 *
422 * If we're not triggering a mount, we can still inquire about the
423 * actual attributes from the server in the mirror mount case,
424 * and will return manufactured attributes for a referral (see
425 * the 'create' branch of find_referral_stubvp()).
426 */
427 static int
nfs4_trigger_getattr(vnode_t * vp,struct vattr * vap,int flags,cred_t * cr,caller_context_t * ct)428 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
429 caller_context_t *ct)
430 {
431 int error;
432
433 if (flags & ATTR_TRIGGER || RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {
434 vnode_t *newvp;
435
436 error = nfs4_trigger_mount(vp, cr, &newvp);
437 if (error)
438 return (error);
439
440 error = VOP_GETATTR(newvp, vap, flags, cr, ct);
441 VN_RELE(newvp);
442 } else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {
443
444 nfs4_fake_attrs(vp, vap);
445 error = 0;
446 }
447
448 return (error);
449 }
450
451 static int
nfs4_trigger_setattr(vnode_t * vp,struct vattr * vap,int flags,cred_t * cr,caller_context_t * ct)452 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
453 caller_context_t *ct)
454 {
455 int error;
456 vnode_t *newvp;
457
458 error = nfs4_trigger_mount(vp, cr, &newvp);
459 if (error)
460 return (error);
461
462 error = VOP_SETATTR(newvp, vap, flags, cr, ct);
463 VN_RELE(newvp);
464
465 return (error);
466 }
467
468 static int
nfs4_trigger_access(vnode_t * vp,int mode,int flags,cred_t * cr,caller_context_t * ct)469 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
470 caller_context_t *ct)
471 {
472 int error;
473 vnode_t *newvp;
474
475 error = nfs4_trigger_mount(vp, cr, &newvp);
476 if (error)
477 return (error);
478
479 error = VOP_ACCESS(newvp, mode, flags, cr, ct);
480 VN_RELE(newvp);
481
482 return (error);
483 }
484
485 static int
nfs4_trigger_lookup(vnode_t * dvp,char * nm,vnode_t ** vpp,struct pathname * pnp,int flags,vnode_t * rdir,cred_t * cr,caller_context_t * ct,int * deflags,pathname_t * rpnp)486 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
487 struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
488 caller_context_t *ct, int *deflags, pathname_t *rpnp)
489 {
490 int error;
491 vnode_t *newdvp;
492 rnode4_t *drp = VTOR4(dvp);
493
494 ASSERT(RP_ISSTUB(drp));
495
496 /*
497 * It's not legal to lookup ".." for an fs root, so we mustn't pass
498 * that up. Instead, pass onto the regular op, regardless of whether
499 * we've triggered a mount.
500 */
501 if (strcmp(nm, "..") == 0)
502 if (RP_ISSTUB_MIRRORMOUNT(drp)) {
503 return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
504 ct, deflags, rpnp));
505 } else if (RP_ISSTUB_REFERRAL(drp)) {
506 /* Return the parent vnode */
507 return (vtodv(dvp, vpp, cr, TRUE));
508 }
509
510 error = nfs4_trigger_mount(dvp, cr, &newdvp);
511 if (error)
512 return (error);
513
514 error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
515 deflags, rpnp);
516 VN_RELE(newdvp);
517
518 return (error);
519 }
520
521 static int
nfs4_trigger_create(vnode_t * dvp,char * nm,struct vattr * va,enum vcexcl exclusive,int mode,vnode_t ** vpp,cred_t * cr,int flags,caller_context_t * ct,vsecattr_t * vsecp)522 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
523 enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
524 int flags, caller_context_t *ct, vsecattr_t *vsecp)
525 {
526 int error;
527 vnode_t *newdvp;
528
529 error = nfs4_trigger_mount(dvp, cr, &newdvp);
530 if (error)
531 return (error);
532
533 error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
534 flags, ct, vsecp);
535 VN_RELE(newdvp);
536
537 return (error);
538 }
539
540 static int
nfs4_trigger_remove(vnode_t * dvp,char * nm,cred_t * cr,caller_context_t * ct,int flags)541 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
542 int flags)
543 {
544 int error;
545 vnode_t *newdvp;
546
547 error = nfs4_trigger_mount(dvp, cr, &newdvp);
548 if (error)
549 return (error);
550
551 error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
552 VN_RELE(newdvp);
553
554 return (error);
555 }
556
557 static int
nfs4_trigger_link(vnode_t * tdvp,vnode_t * svp,char * tnm,cred_t * cr,caller_context_t * ct,int flags)558 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
559 caller_context_t *ct, int flags)
560 {
561 int error;
562 vnode_t *newtdvp;
563
564 error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
565 if (error)
566 return (error);
567
568 /*
569 * We don't check whether svp is a stub. Let the NFSv4 code
570 * detect that error, and return accordingly.
571 */
572 error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
573 VN_RELE(newtdvp);
574
575 return (error);
576 }
577
578 static int
nfs4_trigger_rename(vnode_t * sdvp,char * snm,vnode_t * tdvp,char * tnm,cred_t * cr,caller_context_t * ct,int flags)579 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
580 cred_t *cr, caller_context_t *ct, int flags)
581 {
582 int error;
583 vnode_t *newsdvp;
584 rnode4_t *tdrp = VTOR4(tdvp);
585
586 /*
587 * We know that sdvp is a stub, otherwise we would not be here.
588 *
589 * If tdvp is also be a stub, there are two possibilities: it
590 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
591 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
592 *
593 * In the former case, just trigger sdvp, and treat tdvp as
594 * though it were not a stub.
595 *
596 * In the latter case, it might be a different stub for the
597 * same server fs as sdvp, or for a different server fs.
598 * Regardless, from the client perspective this would still
599 * be a cross-filesystem rename, and should not be allowed,
600 * so return EXDEV, without triggering either mount.
601 */
602 if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
603 return (EXDEV);
604
605 error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
606 if (error)
607 return (error);
608
609 error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);
610
611 VN_RELE(newsdvp);
612
613 return (error);
614 }
615
616 /* ARGSUSED */
617 static int
nfs4_trigger_mkdir(vnode_t * dvp,char * nm,struct vattr * va,vnode_t ** vpp,cred_t * cr,caller_context_t * ct,int flags,vsecattr_t * vsecp)618 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
619 cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
620 {
621 int error;
622 vnode_t *newdvp;
623
624 error = nfs4_trigger_mount(dvp, cr, &newdvp);
625 if (error)
626 return (error);
627
628 error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
629 VN_RELE(newdvp);
630
631 return (error);
632 }
633
634 static int
nfs4_trigger_rmdir(vnode_t * dvp,char * nm,vnode_t * cdir,cred_t * cr,caller_context_t * ct,int flags)635 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
636 caller_context_t *ct, int flags)
637 {
638 int error;
639 vnode_t *newdvp;
640
641 error = nfs4_trigger_mount(dvp, cr, &newdvp);
642 if (error)
643 return (error);
644
645 error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
646 VN_RELE(newdvp);
647
648 return (error);
649 }
650
651 static int
nfs4_trigger_symlink(vnode_t * dvp,char * lnm,struct vattr * tva,char * tnm,cred_t * cr,caller_context_t * ct,int flags)652 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
653 cred_t *cr, caller_context_t *ct, int flags)
654 {
655 int error;
656 vnode_t *newdvp;
657
658 error = nfs4_trigger_mount(dvp, cr, &newdvp);
659 if (error)
660 return (error);
661
662 error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
663 VN_RELE(newdvp);
664
665 return (error);
666 }
667
668 static int
nfs4_trigger_readlink(vnode_t * vp,struct uio * uiop,cred_t * cr,caller_context_t * ct)669 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
670 caller_context_t *ct)
671 {
672 int error;
673 vnode_t *newvp;
674
675 error = nfs4_trigger_mount(vp, cr, &newvp);
676 if (error)
677 return (error);
678
679 error = VOP_READLINK(newvp, uiop, cr, ct);
680 VN_RELE(newvp);
681
682 return (error);
683 }
684
685 /* end of trigger vnode ops */
686
/*
 * See if the mount has already been done by another caller.
 *
 * Returns 0 with *was_mounted == TRUE when a covering vfs exists and
 * its root vnode was obtained in *newvpp (VN_HELD); the ephemeral
 * reference time is refreshed in that case.  Returns 0 with
 * *was_mounted == FALSE when there is no usable covering mount, so
 * the caller should proceed to mount.  A non-zero return means
 * vn_vfsrlock_wait() failed (e.g. interrupted).
 *
 * NOTE(review): a VFS_ROOT() failure is swallowed -- the function
 * still returns 0 with *was_mounted == FALSE, so the caller retries
 * the mount path.  Confirm this best-effort behavior is intended.
 */
static int
nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
    bool_t *was_mounted, vfs_t **vfsp)
{
	int error;
	mntinfo4_t *mi = VTOMI4(vp);

	*was_mounted = FALSE;

	/* Serialize against mount/unmount activity covering vp. */
	error = vn_vfsrlock_wait(vp);
	if (error)
		return (error);

	*vfsp = vn_mountedvfs(vp);
	if (*vfsp != NULL) {
		/* the mount has already occurred */
		error = VFS_ROOT(*vfsp, newvpp);
		if (!error) {
			/* need to update the reference time */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_ephemeral)
				mi->mi_ephemeral->ne_ref_time =
				    gethrestime_sec();
			mutex_exit(&mi->mi_lock);

			*was_mounted = TRUE;
		}
	}

	vn_vfsunlock(vp);
	return (0);
}
722
/*
 * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
 *
 * The mount may have already occurred, via another thread. If not,
 * assemble the location information - which may require fetching - and
 * perform the mount.
 *
 * Sets newvp to be the root of the fs that is now covering vp. Note
 * that we return with VN_HELD(*newvp).
 *
 * The caller is responsible for passing the VOP onto the covering fs.
 */
static int
nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
{
	int error;
	vfs_t *vfsp;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	domount_args_t *dma;

	nfs4_ephemeral_tree_t *net;

	bool_t must_unlock = FALSE;	/* TRUE once tree is marked MOUNTING */
	bool_t is_building = FALSE;	/* TRUE if we allocated the tree here */
	bool_t was_mounted = FALSE;

	cred_t *mcred = NULL;

	nfs4_trigger_globals_t *ntg;

	zone_t *zone = curproc->p_zone;

	/* Only stub rnodes should ever reach the trigger vnodeops. */
	ASSERT(RP_ISSTUB(rp));

	*newvpp = NULL;

	/*
	 * Has the mount already occurred?
	 */
	error = nfs4_trigger_mounted_already(vp, newvpp,
	    &was_mounted, &vfsp);
	if (error || was_mounted)
		goto done;

	/* Per-zone ephemeral-mount state. */
	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		/*
		 * First ephemeral mount under this fs: create the tree
		 * with an initial reference for the mntinfo tie (see
		 * the REFCNT comment in the done: path).
		 */
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);

		/* Pin the mntinfo and its vfs for the life of the tree. */
		MI4_HOLD(mi);
		VFS_HOLD(mi->mi_vfsp);
	} else {
		/* Tree exists: take a reference before waiting on it. */
		net = mi->mi_ephemeral_tree;
		nfs4_ephemeral_tree_hold(net);

		mutex_exit(&mi->mi_lock);

		mutex_enter(&net->net_tree_lock);

		/*
		 * We can only procede if the tree is neither locked
		 * nor being torn down.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
			nfs4_ephemeral_tree_decr(net);
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			return (EIO);
		}
		mutex_exit(&net->net_cnt_lock);
	}

	/* Mark the tree as having a mount in progress. */
	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	must_unlock = TRUE;

	error = nfs4_trigger_domount_args_create(vp, cr, &dma);
	if (error)
		goto done;

	/*
	 * Note that since we define mirror mounts to work
	 * for any user, we simply extend the privileges of
	 * the user's credentials to allow the mount to
	 * proceed.
	 */
	mcred = crdup(cr);
	if (mcred == NULL) {
		error = EINVAL;
		nfs4_trigger_domount_args_destroy(dma, vp);
		goto done;
	}

	crset_zone_privall(mcred);
	if (is_system_labeled())
		(void) setpflags(NET_MAC_AWARE, 1, mcred);

	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
	nfs4_trigger_domount_args_destroy(dma, vp);

	DTRACE_PROBE2(nfs4clnt__func__referral__mount,
	    vnode_t *, vp, int, error);

	crfree(mcred);

done:

	if (must_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;

		/*
		 * REFCNT: If we are the root of the tree, then we need
		 * to keep a reference because we malloced the tree and
		 * this is where we tied it to our mntinfo.
		 *
		 * If we are not the root of the tree, then our tie to
		 * the mntinfo occured elsewhere and we need to
		 * decrement the reference to the tree.
		 */
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		else
			nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	/* Defensive: a "successful" path must have produced a root vnode. */
	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}
892
893 /*
894 * Collect together both the generic & mount-type specific args.
895 */
896 static int
nfs4_trigger_domount_args_create(vnode_t * vp,cred_t * cr,domount_args_t ** dmap)897 nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr, domount_args_t **dmap)
898 {
899 int nointr;
900 char *hostlist;
901 servinfo4_t *svp;
902 struct nfs_args *nargs, *nargs_head;
903 enum clnt_stat status;
904 ephemeral_servinfo_t *esi, *esi_first;
905 domount_args_t *dma;
906 mntinfo4_t *mi = VTOMI4(vp);
907
908 nointr = !(mi->mi_flags & MI4_INT);
909 hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
910
911 svp = mi->mi_curr_serv;
912 /* check if the current server is responding */
913 status = nfs4_trigger_ping_server(svp, nointr);
914 if (status == RPC_SUCCESS) {
915 esi_first = nfs4_trigger_esi_create(vp, svp, cr);
916 if (esi_first == NULL) {
917 kmem_free(hostlist, MAXPATHLEN);
918 return (EINVAL);
919 }
920
921 (void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);
922
923 nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
924 } else {
925 /* current server did not respond */
926 esi_first = NULL;
927 nargs_head = NULL;
928 }
929 nargs = nargs_head;
930
931 /*
932 * NFS RO failover.
933 *
934 * If we have multiple servinfo4 structures, linked via sv_next,
935 * we must create one nfs_args for each, linking the nfs_args via
936 * nfs_ext_u.nfs_extB.next.
937 *
938 * We need to build a corresponding esi for each, too, but that is
939 * used solely for building nfs_args, and may be immediately
940 * discarded, as domount() requires the info from just one esi,
941 * but all the nfs_args.
942 *
943 * Currently, the NFS mount code will hang if not all servers
944 * requested are available. To avoid that, we need to ping each
945 * server, here, and remove it from the list if it is not
946 * responding. This has the side-effect of that server then
947 * being permanently unavailable for this failover mount, even if
948 * it recovers. That's unfortunate, but the best we can do until
949 * the mount code path is fixed.
950 */
951
952 /*
953 * If the current server was down, loop indefinitely until we find
954 * at least one responsive server.
955 */
956 do {
957 /* no locking needed for sv_next; it is only set at fs mount */
958 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
959 struct nfs_args *next;
960
961 /*
962 * nargs_head: the head of the nfs_args list
963 * nargs: the current tail of the list
964 * next: the newly-created element to be added
965 */
966
967 /*
968 * We've already tried the current server, above;
969 * if it was responding, we have already included it
970 * and it may now be ignored.
971 *
972 * Otherwise, try it again, since it may now have
973 * recovered.
974 */
975 if (svp == mi->mi_curr_serv && esi_first != NULL)
976 continue;
977
978 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
979 if (svp->sv_flags & SV4_NOTINUSE) {
980 nfs_rw_exit(&svp->sv_lock);
981 continue;
982 }
983 nfs_rw_exit(&svp->sv_lock);
984
985 /* check if the server is responding */
986 status = nfs4_trigger_ping_server(svp, nointr);
987 if (status == RPC_INTR) {
988 kmem_free(hostlist, MAXPATHLEN);
989 nfs4_trigger_esi_destroy(esi_first, vp);
990 nargs = nargs_head;
991 while (nargs != NULL) {
992 next = nargs->nfs_ext_u.nfs_extB.next;
993 nfs4_trigger_nargs_destroy(nargs);
994 nargs = next;
995 }
996 return (EINTR);
997 } else if (status != RPC_SUCCESS) {
998 /* if the server did not respond, ignore it */
999 continue;
1000 }
1001
1002 esi = nfs4_trigger_esi_create(vp, svp, cr);
1003 if (esi == NULL)
1004 continue;
1005
1006 /*
1007 * If the original current server (mi_curr_serv)
1008 * was down when when we first tried it,
1009 * (i.e. esi_first == NULL),
1010 * we select this new server (svp) to be the server
1011 * that we will actually contact (esi_first).
1012 *
1013 * Note that it's possible that mi_curr_serv == svp,
1014 * if that mi_curr_serv was down but has now recovered.
1015 */
1016 next = nfs4_trigger_nargs_create(mi, svp, esi);
1017 if (esi_first == NULL) {
1018 ASSERT(nargs == NULL);
1019 ASSERT(nargs_head == NULL);
1020 nargs_head = next;
1021 esi_first = esi;
1022 (void) strlcpy(hostlist,
1023 esi_first->esi_hostname, MAXPATHLEN);
1024 } else {
1025 ASSERT(nargs_head != NULL);
1026 nargs->nfs_ext_u.nfs_extB.next = next;
1027 (void) strlcat(hostlist, ",", MAXPATHLEN);
1028 (void) strlcat(hostlist, esi->esi_hostname,
1029 MAXPATHLEN);
1030 /* esi was only needed for hostname & nargs */
1031 nfs4_trigger_esi_destroy(esi, vp);
1032 }
1033
1034 nargs = next;
1035 }
1036
1037 /* if we've had no response at all, wait a second */
1038 if (esi_first == NULL)
1039 delay(drv_usectohz(1000000));
1040
1041 } while (esi_first == NULL);
1042 ASSERT(nargs_head != NULL);
1043
1044 dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
1045 dma->dma_esi = esi_first;
1046 dma->dma_hostlist = hostlist;
1047 dma->dma_nargs = nargs_head;
1048 *dmap = dma;
1049
1050 return (0);
1051 }
1052
1053 static void
nfs4_trigger_domount_args_destroy(domount_args_t * dma,vnode_t * vp)1054 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
1055 {
1056 if (dma != NULL) {
1057 if (dma->dma_esi != NULL && vp != NULL)
1058 nfs4_trigger_esi_destroy(dma->dma_esi, vp);
1059
1060 if (dma->dma_hostlist != NULL)
1061 kmem_free(dma->dma_hostlist, MAXPATHLEN);
1062
1063 if (dma->dma_nargs != NULL) {
1064 struct nfs_args *nargs = dma->dma_nargs;
1065
1066 do {
1067 struct nfs_args *next =
1068 nargs->nfs_ext_u.nfs_extB.next;
1069
1070 nfs4_trigger_nargs_destroy(nargs);
1071 nargs = next;
1072 } while (nargs != NULL);
1073 }
1074
1075 kmem_free(dma, sizeof (domount_args_t));
1076 }
1077 }
1078
1079 /*
1080 * The ephemeral_servinfo_t struct contains basic information we will need to
1081 * perform the mount. Whilst the structure is generic across different
1082 * types of ephemeral mount, the way we gather its contents differs.
1083 */
1084 static ephemeral_servinfo_t *
nfs4_trigger_esi_create(vnode_t * vp,servinfo4_t * svp,cred_t * cr)1085 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
1086 {
1087 ephemeral_servinfo_t *esi;
1088 rnode4_t *rp = VTOR4(vp);
1089
1090 ASSERT(RP_ISSTUB(rp));
1091
1092 /* Call the ephemeral type-specific routine */
1093 if (RP_ISSTUB_MIRRORMOUNT(rp))
1094 esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
1095 else if (RP_ISSTUB_REFERRAL(rp))
1096 esi = nfs4_trigger_esi_create_referral(vp, cr);
1097 else
1098 esi = NULL;
1099 return (esi);
1100 }
1101
1102 static void
nfs4_trigger_esi_destroy(ephemeral_servinfo_t * esi,vnode_t * vp)1103 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
1104 {
1105 rnode4_t *rp = VTOR4(vp);
1106
1107 ASSERT(RP_ISSTUB(rp));
1108
1109 /* Currently, no need for an ephemeral type-specific routine */
1110
1111 /*
1112 * The contents of ephemeral_servinfo_t goes into nfs_args,
1113 * and will be handled by nfs4_trigger_nargs_destroy().
1114 * We need only free the structure itself.
1115 */
1116 if (esi != NULL)
1117 kmem_free(esi, sizeof (ephemeral_servinfo_t));
1118 }
1119
1120 /*
1121 * Some of this may turn out to be common with other ephemeral types,
1122 * in which case it should be moved to nfs4_trigger_esi_create(), or a
1123 * common function called.
1124 */
1125
/*
 * Mirror mounts case - should have all data available locally,
 * deep-copied from the stub rnode's parent-mount servinfo4_t.
 * Returns a kmem-allocated ephemeral_servinfo_t; never fails
 * (all allocations are KM_SLEEP).
 */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
{
	char *stubpath;
	struct knetconfig *sikncp, *svkncp;
	struct netbuf *bufp;
	ephemeral_servinfo_t *esi;

	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);

	/* initially set to be our type of ephemeral mount; may be added to */
	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;

	/*
	 * We're copying info from the stub rnode's servinfo4, but
	 * we must create new copies, not pointers, since this information
	 * is to be associated with the new mount, which will be
	 * unmounted (and its structures freed) separately
	 */

	/*
	 * Sizes passed to kmem_[z]alloc here must match those freed
	 * in nfs4_free_args()
	 */

	/*
	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
	 * is difficult to avoid: as we need to read svp to calculate the
	 * sizes to be allocated.
	 */
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	/* zalloc'd buffer starts empty, so strcat == strcpy here */
	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
	(void) strcat(esi->esi_hostname, svp->sv_hostname);

	/* deep-copy the server's address netbuf */
	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
	bufp = esi->esi_addr;
	bufp->len = svp->sv_addr.len;
	bufp->maxlen = svp->sv_addr.maxlen;
	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);

	/* deep-copy the transport (knetconfig) information */
	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
	sikncp = esi->esi_knconf;
	svkncp = svp->sv_knconf;
	sikncp->knc_semantics = svkncp->knc_semantics;
	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_protofmly,
	    (char *)svkncp->knc_protofmly);
	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
	sikncp->knc_rdev = svkncp->knc_rdev;

	/*
	 * Used when AUTH_DH is negotiated.
	 *
	 * This is ephemeral mount-type specific, since it contains the
	 * server's time-sync syncaddr.
	 */
	if (svp->sv_dhsec) {
		struct netbuf *bufp;
		sec_data_t *sdata;
		dh_k4_clntdata_t *data;

		sdata = svp->sv_dhsec;
		data = (dh_k4_clntdata_t *)sdata->data;
		ASSERT(sdata->rpcflavor == AUTH_DH);

		/* deep-copy the server's time-sync syncaddr */
		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
		bufp->len = data->syncaddr.len;
		bufp->maxlen = data->syncaddr.maxlen;
		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
		esi->esi_syncaddr = bufp;

		if (data->netname != NULL) {
			int nmlen = data->netnamelen;

			/*
			 * We need to copy from a dh_k4_clntdata_t
			 * netname/netnamelen pair to a NUL-terminated
			 * netname string suitable for putting in nfs_args,
			 * where the latter has no netnamelen field.
			 */
			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
			bcopy(data->netname, esi->esi_netname, nmlen);
		}
	} else {
		esi->esi_syncaddr = NULL;
		esi->esi_netname = NULL;
	}

	stubpath = fn_path(VTOSV(vp)->sv_name);
	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
	ASSERT(*stubpath == '.');
	stubpath += 1;

	/* for nfs_args->fh */
	esi->esi_path_len = strlen(stubpath) + 1;
	if (strcmp(svp->sv_path, "/") != 0)
		esi->esi_path_len += strlen(svp->sv_path);
	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
	/* don't prepend sv_path when it is bare "/", to avoid "//..." */
	if (strcmp(svp->sv_path, "/") != 0)
		(void) strcat(esi->esi_path, svp->sv_path);
	(void) strcat(esi->esi_path, stubpath);

	/* restore the leading '.' so the full fn_path() length is freed */
	stubpath -= 1;
	/* stubpath allocated by fn_path() */
	kmem_free(stubpath, strlen(stubpath) + 1);

	nfs_rw_exit(&svp->sv_lock);

	return (esi);
}
1243
/*
 * Makes an upcall to NFSMAPID daemon to resolve hostname of NFS server to
 * get network information required to do the mount call.
 *
 * 'server' is XDR-encoded and passed through the nfsmapid door; on
 * success '*resp' holds the XDR-decoded reply, which the caller must
 * release with xdr_free(xdr_nfs_fsl_info, ...).
 *
 * Returns 0 on success, non-zero otherwise.
 */
int
nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
{
	door_arg_t door_args;
	door_handle_t dh;
	XDR xdr;
	refd_door_args_t *xdr_argsp;
	refd_door_res_t *orig_resp;
	k_sigset_t smask;
	int xdr_len = 0;
	int res_len = 16; /* length of an ip address */
	int orig_reslen = res_len;
	int error = 0;
	struct nfsidmap_globals *nig;

	/* no upcalls once the zone has started shutting down */
	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
		return (ECONNREFUSED);

	nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
	ASSERT(nig != NULL);

	/* take a hold on the daemon's door for the duration of the upcall */
	mutex_enter(&nig->nfsidmap_daemon_lock);
	dh = nig->nfsidmap_daemon_dh;
	if (dh == NULL) {
		mutex_exit(&nig->nfsidmap_daemon_lock);
		cmn_err(CE_NOTE,
		    "nfs4_callmapid: nfsmapid daemon not " \
		    "running unable to resolve host name\n");
		return (EINVAL);
	}
	door_ki_hold(dh);
	mutex_exit(&nig->nfsidmap_daemon_lock);

	/* XDR-encode the hostname into the door request buffer */
	xdr_len = xdr_sizeof(&(xdr_utf8string), server);

	xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
	xdr_argsp->xdr_len = xdr_len;
	xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;

	xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
	    xdr_len, XDR_ENCODE);

	if (!xdr_utf8string(&xdr, server)) {
		kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
		door_ki_rele(dh);
		return (1);
	}

	/* orig_reslen is always non-zero here, so orig_resp is allocated */
	if (orig_reslen)
		orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);

	door_args.data_ptr = (char *)xdr_argsp;
	door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
	door_args.desc_ptr = NULL;
	door_args.desc_num = 0;
	door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
	door_args.rsize = res_len;

	/* mask signals across the upcall, then restore them */
	sigintr(&smask, 1);
	error = door_ki_upcall(dh, &door_args);
	sigunintr(&smask);

	door_ki_rele(dh);

	kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
	if (error) {
		kmem_free(orig_resp, orig_reslen);
		/*
		 * There is no door to connect to. The referral daemon
		 * must not be running yet.
		 */
		cmn_err(CE_WARN,
		    "nfsmapid not running cannot resolve host name");
		goto out;
	}

	/*
	 * If the results buffer passed back are not the same as
	 * what was sent free the old buffer and use the new one.
	 * (The door may return a larger, door-allocated rbuf.)
	 */
	if (orig_resp && orig_reslen) {
		refd_door_res_t *door_resp;

		door_resp = (refd_door_res_t *)door_args.rbuf;
		if ((void *)door_args.rbuf != orig_resp)
			kmem_free(orig_resp, orig_reslen);
		if (door_resp->res_status == 0) {
			/* XDR-decode the daemon's reply into *resp */
			xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
			    door_resp->xdr_len, XDR_DECODE);
			bzero(resp, sizeof (struct nfs_fsl_info));
			if (!xdr_nfs_fsl_info(&xdr, resp)) {
				DTRACE_PROBE2(
				    nfs4clnt__debug__referral__upcall__xdrfail,
				    struct nfs_fsl_info *, resp,
				    char *, "nfs4_callmapid");
				error = EINVAL;
			}
		} else {
			DTRACE_PROBE2(
			    nfs4clnt__debug__referral__upcall__badstatus,
			    int, door_resp->res_status,
			    char *, "nfs4_callmapid");
			error = door_resp->res_status;
		}
		kmem_free(door_args.rbuf, door_args.rsize);
	}
out:
	DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
	    char *, server, int, error);
	return (error);
}
1359
/*
 * Fetches the fs_locations attribute. Typically called
 * from a Replication/Migration/Referrals/Mirror-mount context
 *
 * Fills in the attributes in garp. The caller is assumed
 * to have allocated memory for garp.
 *
 * lock: if TRUE, take mi_recovlock (and s_recovlock, when a server
 *	is found) here around the rfs4call(). If FALSE, the caller
 *	must already hold them; this is ASSERTed below.
 *
 * Returns
 *	1 for success
 *	0 for failure
 *
 * On success, *callres holds the raw COMPOUND reply (which garp's
 * extended results point into); the caller must xdr_free() it.
 * On failure, the reply is freed here before returning.
 */
int
nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
    cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 *argop;
	int argoplist_size = 3 * sizeof (nfs_argop4);
	nfs4_server_t *sp = NULL;
	int doqueue = 1;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	int retval = 1;
	struct nfs4_clnt *nfscl;

	if (lock == TRUE)
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	else
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	sp = find_nfs4_server(mi);
	if (lock == TRUE)
		nfs_rw_exit(&mi->mi_recovlock);

	/* find_nfs4_server() returned sp with s_lock held; drop it */
	if (sp != NULL)
		mutex_exit(&sp->s_lock);

	if (lock == TRUE) {
		if (sp != NULL)
			(void) nfs_rw_enter_sig(&sp->s_recovlock,
			    RW_WRITER, 0);
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
	} else {
		if (sp != NULL) {
			ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
			    nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
		}
	}

	/*
	 * Do we want to do the setup for recovery here?
	 *
	 * We know that the server responded to a null ping a very
	 * short time ago, and we know that we intend to do a
	 * single stateless operation - we want to fetch attributes,
	 * so we know we can't encounter errors about state. If
	 * something goes wrong with the GETATTR, like not being
	 * able to get a response from the server or getting any
	 * kind of FH error, we should fail the mount.
	 *
	 * We may want to re-visit this at a later time.
	 */
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	args.ctag = TAG_GETATTR_FSLOCATION;
	/* PUTFH LOOKUP GETATTR */
	args.array_len = 3;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = sfh;

	/* 1. lookup name, can't be dotdot */
	argop[1].argop = OP_CLOOKUP;
	argop[1].nfs_argop4_u.opclookup.cname = nm;

	/* 2. file attrs */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request =
	    FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
	    FATTR4_MOUNTED_ON_FILEID_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (lock == TRUE) {
		nfs_rw_exit(&mi->mi_recovlock);
		if (sp != NULL)
			nfs_rw_exit(&sp->s_recovlock);
	}

	/* bump the per-zone referral kstat */
	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
	nfscl->nfscl_stat.referrals.value.ui64++;
	DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
	    nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);

	/* RPC-level failure: no reply to decode or free */
	if (e.error != 0) {
		if (sp != NULL)
			nfs4_server_rele(sp);
		kmem_free(argop, argoplist_size);
		return (0);
	}

	/*
	 * Check for all possible error conditions.
	 * For valid replies without an ops array or for illegal
	 * replies, return a failure.
	 */
	if (res.status != NFS4_OK || res.array_len < 3 ||
	    res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		retval = 0;
		goto exit;
	}

	/*
	 * There isn't much value in putting the attributes
	 * in the attr cache since fs_locations4 aren't
	 * encountered very frequently, so just make them
	 * available to the caller.
	 */
	*garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;

	DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
	    nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");

	/* No fs_locations? -- return a failure */
	if (garp->n4g_ext_res == NULL ||
	    garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
		retval = 0;
		goto exit;
	}

	if (!garp->n4g_fsid_valid)
		retval = 0;

exit:
	if (retval == 0) {
		/* the call was ok but failed validating the call results */
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	} else {
		/* hand the raw reply to the caller, who must xdr_free it */
		ASSERT(callres != NULL);
		*callres = res;
	}

	if (sp != NULL)
		nfs4_server_rele(sp);
	kmem_free(argop, argoplist_size);
	return (retval);
}
1515
/* tunable to disable referral mounts */
int nfs4_no_referrals = 0;

/*
 * Find or create the stub vnode for the referral named 'nm' in
 * directory 'dvp'. A synthetic filehandle is built from the
 * directory's FH with the referral's mounted_on_fileid appended,
 * making it unique on that server::fsid.
 *
 * Returns NULL if the vnode cannot be created or found.
 */
vnode_t *
find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
{
	nfs_fh4 *stub_fh, *dfh;
	nfs4_sharedfh_t *sfhp;
	char *newfhval;
	vnode_t *vp = NULL;
	fattr4_mounted_on_fileid mnt_on_fileid;
	nfs4_ga_res_t garp;
	mntinfo4_t *mi;
	COMPOUND4res_clnt callres;
	hrtime_t t;

	if (nfs4_no_referrals)
		return (NULL);

	/*
	 * Get the mounted_on_fileid, unique on that server::fsid
	 */
	mi = VTOMI4(dvp);
	if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
	    &garp, &callres, FALSE) == 0)
		return (NULL);
	mnt_on_fileid = garp.n4g_mon_fid;
	/* only the fileid was needed; release the raw reply now */
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);

	/*
	 * Build a fake filehandle from the dir FH and the mounted_on_fileid
	 */
	dfh = &VTOR4(dvp)->r_fh->sfh_fh;
	stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
	stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
	    sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
	newfhval = stub_fh->nfs_fh4_val;

	/* copy directory's file handle */
	bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
	stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
	newfhval = newfhval + dfh->nfs_fh4_len;

	/* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
	bcopy((char *)&mnt_on_fileid, newfhval,
	    sizeof (fattr4_mounted_on_fileid));
	stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);

	/* the temporary FH is no longer needed once handed to sfh4_put() */
	sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
	kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
	    sizeof (fattr4_mounted_on_fileid));
	kmem_free(stub_fh, sizeof (nfs_fh4));
	if (sfhp == NULL)
		return (NULL);

	/* create (or look up) the rnode for the synthetic filehandle */
	t = gethrtime();
	garp.n4g_va.va_type = VDIR;
	vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
	    cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));

	if (vp != NULL)
		vp->v_type = VDIR;

	sfh4_rele(&sfhp);
	return (vp);
}
1585
1586 int
nfs4_setup_referral(vnode_t * dvp,char * nm,vnode_t ** vpp,cred_t * cr)1587 nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1588 {
1589 vnode_t *nvp;
1590 rnode4_t *rp;
1591
1592 if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
1593 return (EINVAL);
1594
1595 rp = VTOR4(nvp);
1596 mutex_enter(&rp->r_statelock);
1597 r4_stub_referral(rp);
1598 mutex_exit(&rp->r_statelock);
1599 dnlc_enter(dvp, nm, nvp);
1600
1601 if (*vpp != NULL)
1602 VN_RELE(*vpp); /* no longer need this vnode */
1603
1604 *vpp = nvp;
1605
1606 return (0);
1607 }
1608
/*
 * Fetch the location information and resolve the new server.
 * Caller needs to free up the XDR data which is returned.
 * Input: mount info, shared filehandle, nodename
 * Return: Index to the result or Error(-1)
 * Output: FsLocations Info, Resolved Server Info.
 *
 * On success, ownership of the data in *res (COMPOUND reply) and
 * *fsloc (nfsmapid upcall result) passes to the caller, who must
 * xdr_free() both.
 */
int
nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
    char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
    struct nfs_fsl_info *fsloc)
{
	fs_location4 *fsp;
	struct nfs_fsl_info nfsfsloc;
	int ret, i, error;
	nfs4_ga_res_t garp;
	COMPOUND4res_clnt callres;
	struct knetconfig *knc;

	ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
	if (ret == 0)
		return (-1);

	/*
	 * As a lame attempt to figuring out if we're
	 * handling a migration event or a referral,
	 * look for rnodes with this fsid in the rnode
	 * cache.
	 *
	 * If we can find one or more such rnodes, it
	 * means we're handling a migration event and
	 * we want to bail out in that case.
	 */
	if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
		DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
		    mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
		    char *, "nfs4_process_referral");
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
		return (-1);
	}

	/*
	 * Find the first responsive server to mount. When we find
	 * one, fsp will point to it.
	 */
	for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {

		fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
		if (fsp->server_len == 0 || fsp->server_val == NULL)
			continue;

		/* resolve this location's hostname via nfsmapid */
		error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
		if (error != 0)
			continue;

		/* null-ping the resolved address; stop at the first reply */
		error = nfs4_ping_server_common(nfsfsloc.knconf,
		    nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
		if (error == RPC_SUCCESS)
			break;

		DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
		    sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
		    char *, "nfs4_process_referral");

		/* this server didn't respond; discard its upcall data */
		(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
	}
	/*
	 * NB: if the loop exhausted without a responsive server,
	 * nfsfsloc (and hence knc) may be uninitialized or freed;
	 * the (i >= locations_len) test below must stay first so
	 * knc is never dereferenced in that case.
	 */
	knc = nfsfsloc.knconf;
	if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
	    (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
		DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
		    nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
		return (-1);
	}

	/* Send the results back */
	*fsloc = nfsfsloc;
	*grp = garp;
	*res = callres;
	return (i);
}
1690
/*
 * Referrals case - need to fetch referral data and then upcall to
 * user-level to get complete mount data.
 *
 * Returns a kmem-allocated ephemeral_servinfo_t on success, or
 * NULL on any failure (non-referral stub, lookup failure, no
 * responsive server, or an over-long referral path).
 */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
{
	struct knetconfig *sikncp, *svkncp;
	struct netbuf *bufp;
	ephemeral_servinfo_t *esi;
	vnode_t *dvp;
	rnode4_t *drp;
	fs_location4 *fsp;
	struct nfs_fsl_info nfsfsloc;
	nfs4_ga_res_t garp;
	char *p;
	char fn[MAXNAMELEN];
	int i, index = -1;
	mntinfo4_t *mi;
	COMPOUND4res_clnt callres;

	/*
	 * If we're passed in a stub vnode that
	 * isn't a "referral" stub, bail out
	 * and return a failure
	 */
	if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
		return (NULL);

	/* find the stub's parent directory... */
	if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
		return (NULL);

	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
		VN_RELE(dvp);
		return (NULL);
	}

	/* ...and the stub's name within that directory */
	if (vtoname(vp, fn, MAXNAMELEN) != 0) {
		nfs_rw_exit(&drp->r_rwlock);
		VN_RELE(dvp);
		return (NULL);
	}

	/* fetch fs_locations and resolve a responsive server */
	mi = VTOMI4(dvp);
	index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
	    &garp, &callres, &nfsfsloc);
	nfs_rw_exit(&drp->r_rwlock);
	VN_RELE(dvp);
	if (index < 0)
		return (NULL);

	/* 'index' selects the location that answered the ping */
	fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);

	/* initially set to be our type of ephemeral mount; may be added to */
	esi->esi_mount_flags = NFSMNT_REFERRAL;

	/* NUL-terminated copy of the utf8 (not NUL-terminated) hostname */
	esi->esi_hostname =
	    kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
	bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
	    fsp->server_val->utf8string_len);
	esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';

	/* deep-copy the address returned by the nfsmapid upcall */
	bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
	bufp->len = nfsfsloc.addr->len;
	bufp->maxlen = nfsfsloc.addr->maxlen;
	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
	bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
	esi->esi_addr = bufp;

	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
	sikncp = esi->esi_knconf;

	DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
	    struct nfs_fsl_info *, &nfsfsloc,
	    char *, "nfs4_trigger_esi_create_referral");

	/* deep-copy the knetconfig; dest strings are zalloc'd (empty) */
	svkncp = nfsfsloc.knconf;
	sikncp->knc_semantics = svkncp->knc_semantics;
	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strlcat((char *)sikncp->knc_protofmly,
	    (char *)svkncp->knc_protofmly, KNC_STRSIZE);
	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
	    KNC_STRSIZE);
	sikncp->knc_rdev = svkncp->knc_rdev;

	DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
	    struct knetconfig *, sikncp,
	    char *, "nfs4_trigger_esi_create_referral");

	esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
	bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
	esi->esi_syncaddr = NULL;

	/*
	 * Build a '/'-separated path from the referral's rootpath
	 * components, using a MAXPATHLEN scratch buffer.
	 */
	esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	esi->esi_path_len = MAXPATHLEN;
	*p++ = '/';
	for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
		component4 *comp;

		comp = &fsp->rootpath.pathname4_val[i];
		/* If no space, null the string and bail */
		if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
			goto err;
		bcopy(comp->utf8string_val, p, comp->utf8string_len);
		p += comp->utf8string_len;
		*p++ = '/';
	}
	/* replace the trailing '/' (or terminate bare "/" if path empty) */
	if (fsp->rootpath.pathname4_len != 0)
		*(p - 1) = '\0';
	else
		*p = '\0';

	/* shrink the MAXPATHLEN scratch buffer to a right-sized copy */
	p = esi->esi_path;
	esi->esi_path = strdup(p);
	esi->esi_path_len = strlen(p) + 1;
	kmem_free(p, MAXPATHLEN);

	/* Allocated in nfs4_process_referral() */
	(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);

	return (esi);
err:
	/* unwind every allocation made above, then fail */
	kmem_free(esi->esi_path, esi->esi_path_len);
	kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
	kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
	kmem_free(esi->esi_addr, sizeof (struct netbuf));
	kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
	kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
	kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
	kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
	kmem_free(esi, sizeof (ephemeral_servinfo_t));
	(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
	return (NULL);
}
1829
/*
 * Assemble the args, and call the generic VFS mount function to
 * finally perform the ephemeral mount.
 *
 * On success, *vfsp is the new vfs and *newvpp holds its root vnode.
 * Returns 0 on success or an errno value on failure.
 */
static int
nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
    cred_t *cr, vnode_t **newvpp)
{
	struct mounta *uap;
	char *mntpt, *orig_path, *path;
	const char *orig_mntpt;
	int retval;
	int mntpt_len;
	int spec_len;
	zone_t *zone = curproc->p_zone;
	bool_t has_leading_slash;
	int i;

	vfs_t *stubvfsp = stubvp->v_vfsp;
	ephemeral_servinfo_t *esi = dma->dma_esi;
	struct nfs_args *nargs = dma->dma_nargs;

	/* first, construct the mount point for the ephemeral mount */
	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);

	/* skip the leading '.' that fn_path() produces */
	if (*orig_path == '.')
		orig_path++;

	/*
	 * Get rid of zone's root path
	 */
	if (zone != global_zone) {
		/*
		 * -1 for trailing '/' and -1 for EOS.
		 */
		if (strncmp(zone->zone_rootpath, orig_mntpt,
		    zone->zone_rootpathlen - 1) == 0) {
			orig_mntpt += (zone->zone_rootpathlen - 2);
		}
	}

	/* mntpt = parent's mount point + stub's path below the parent */
	mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
	mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
	(void) strcat(mntpt, orig_mntpt);
	(void) strcat(mntpt, orig_path);

	/* free the fn_path() buffer; 'path' still points to its start */
	kmem_free(path, strlen(path) + 1);
	path = esi->esi_path;
	if (*path == '.')
		path++;
	if (path[0] == '/' && path[1] == '/')
		path++;
	has_leading_slash = (*path == '/');

	/* size the special-device string: "hostlist:/path" */
	spec_len = strlen(dma->dma_hostlist);
	spec_len += strlen(path);

	/* We are going to have to add this in */
	if (!has_leading_slash)
		spec_len++;

	/* We need to get the ':' for dma_hostlist:esi_path */
	spec_len++;

	uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
	uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
	(void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
	    has_leading_slash ? "" : "/", path);

	uap->dir = mntpt;

	uap->flags = MS_SYSSPACE | MS_DATA;
	/* fstype-independent mount options not covered elsewhere */
	/* copy parent's mount(1M) "-m" flag */
	if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
		uap->flags |= MS_NOMNTTAB;

	uap->fstype = MNTTYPE_NFS4;
	uap->dataptr = (char *)nargs;
	/* not needed for MS_SYSSPACE */
	uap->datalen = 0;

	/* use optptr to pass in extra mount options */
	uap->flags |= MS_OPTIONSTR;
	uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
	if (uap->optptr == NULL) {
		retval = EINVAL;
		goto done;
	}

	/* domount() expects us to count the trailing NUL */
	uap->optlen = strlen(uap->optptr) + 1;

	/*
	 * If we get EBUSY, we try again once to see if we can perform
	 * the mount. We do this because of a spurious race condition.
	 */
	for (i = 0; i < 2; i++) {
		int error;
		bool_t was_mounted;

		retval = domount(NULL, uap, stubvp, cr, vfsp);
		if (retval == 0) {
			/* trade the vfs hold for a hold on its root vnode */
			retval = VFS_ROOT(*vfsp, newvpp);
			VFS_RELE(*vfsp);
			break;
		} else if (retval != EBUSY) {
			break;
		}

		/*
		 * We might find it mounted by the other racer...
		 */
		error = nfs4_trigger_mounted_already(stubvp,
		    newvpp, &was_mounted, vfsp);
		if (error) {
			goto done;
		} else if (was_mounted) {
			retval = 0;
			break;
		}
	}

done:
	if (uap->optptr)
		nfs4_trigger_destroy_mntopts(uap->optptr);

	kmem_free(uap->spec, spec_len + 1);
	kmem_free(uap, sizeof (struct mounta));
	kmem_free(mntpt, mntpt_len + 1);

	return (retval);
}
1964
1965 /*
1966 * Build an nfs_args structure for passing to domount().
1967 *
1968 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1969 * generic data - common to all ephemeral mount types - is read directly
1970 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1971 */
1972 static struct nfs_args *
nfs4_trigger_nargs_create(mntinfo4_t * mi,servinfo4_t * svp,ephemeral_servinfo_t * esi)1973 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1974 ephemeral_servinfo_t *esi)
1975 {
1976 sec_data_t *secdata;
1977 struct nfs_args *nargs;
1978
1979 /* setup the nfs args */
1980 nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1981
1982 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1983
1984 nargs->addr = esi->esi_addr;
1985
1986 /* for AUTH_DH by negotiation */
1987 if (esi->esi_syncaddr || esi->esi_netname) {
1988 nargs->flags |= NFSMNT_SECURE;
1989 nargs->syncaddr = esi->esi_syncaddr;
1990 nargs->netname = esi->esi_netname;
1991 }
1992
1993 nargs->flags |= NFSMNT_KNCONF;
1994 nargs->knconf = esi->esi_knconf;
1995 nargs->flags |= NFSMNT_HOSTNAME;
1996 nargs->hostname = esi->esi_hostname;
1997 nargs->fh = esi->esi_path;
1998
1999 /* general mount settings, all copied from parent mount */
2000 mutex_enter(&mi->mi_lock);
2001
2002 if (!(mi->mi_flags & MI4_HARD))
2003 nargs->flags |= NFSMNT_SOFT;
2004
2005 nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
2006 NFSMNT_RETRANS;
2007 nargs->wsize = mi->mi_stsize;
2008 nargs->rsize = mi->mi_tsize;
2009 nargs->timeo = mi->mi_timeo;
2010 nargs->retrans = mi->mi_retrans;
2011
2012 if (mi->mi_flags & MI4_INT)
2013 nargs->flags |= NFSMNT_INT;
2014 if (mi->mi_flags & MI4_NOAC)
2015 nargs->flags |= NFSMNT_NOAC;
2016
2017 nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
2018 NFSMNT_ACDIRMAX;
2019 nargs->acregmin = HR2SEC(mi->mi_acregmin);
2020 nargs->acregmax = HR2SEC(mi->mi_acregmax);
2021 nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
2022 nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
2023
2024 /* add any specific flags for this type of ephemeral mount */
2025 nargs->flags |= esi->esi_mount_flags;
2026
2027 if (mi->mi_flags & MI4_NOCTO)
2028 nargs->flags |= NFSMNT_NOCTO;
2029 if (mi->mi_flags & MI4_GRPID)
2030 nargs->flags |= NFSMNT_GRPID;
2031 if (mi->mi_flags & MI4_LLOCK)
2032 nargs->flags |= NFSMNT_LLOCK;
2033 if (mi->mi_flags & MI4_NOPRINT)
2034 nargs->flags |= NFSMNT_NOPRINT;
2035 if (mi->mi_flags & MI4_DIRECTIO)
2036 nargs->flags |= NFSMNT_DIRECTIO;
2037 if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
2038 nargs->flags |= NFSMNT_PUBLIC;
2039
2040 /* Do some referral-specific option tweaking */
2041 if (nargs->flags & NFSMNT_REFERRAL) {
2042 nargs->flags &= ~NFSMNT_DORDMA;
2043 nargs->flags |= NFSMNT_TRYRDMA;
2044 }
2045
2046 mutex_exit(&mi->mi_lock);
2047
2048 /*
2049 * Security data & negotiation policy.
2050 *
2051 * For mirror mounts, we need to preserve the parent mount's
2052 * preference for security negotiation, translating SV4_TRYSECDEFAULT
2053 * to NFSMNT_SECDEFAULT if present.
2054 *
2055 * For referrals, we always want security negotiation and will
2056 * set NFSMNT_SECDEFAULT and we will not copy current secdata.
2057 * The reason is that we can't negotiate down from a parent's
2058 * Kerberos flavor to AUTH_SYS.
2059 *
2060 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
2061 * security flavour was requested, with data in sv_secdata, and that
2062 * no negotiation should occur. If this specified flavour fails, that's
2063 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
2064 *
2065 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
2066 * default flavour, in sv_secdata, but then negotiate a new flavour.
2067 * Possible flavours are recorded in an array in sv_secinfo, with
2068 * currently in-use flavour pointed to by sv_currsec.
2069 *
2070 * If sv_currsec is set, i.e. if negotiation has already occurred,
2071 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
2072 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
2073 */
2074 if (nargs->flags & NFSMNT_REFERRAL) {
2075 /* enable negotiation for referral mount */
2076 nargs->flags |= NFSMNT_SECDEFAULT;
2077 secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
2078 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
2079 secdata->data = NULL;
2080 } else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
2081 /* enable negotiation for mirror mount */
2082 nargs->flags |= NFSMNT_SECDEFAULT;
2083
2084 /*
2085 * As a starting point for negotiation, copy parent
2086 * mount's negotiated flavour (sv_currsec) if available,
2087 * or its passed-in flavour (sv_secdata) if not.
2088 */
2089 if (svp->sv_currsec != NULL)
2090 secdata = copy_sec_data(svp->sv_currsec);
2091 else if (svp->sv_secdata != NULL)
2092 secdata = copy_sec_data(svp->sv_secdata);
2093 else
2094 secdata = NULL;
2095 } else {
2096 /* do not enable negotiation; copy parent's passed-in flavour */
2097 if (svp->sv_secdata != NULL)
2098 secdata = copy_sec_data(svp->sv_secdata);
2099 else
2100 secdata = NULL;
2101 }
2102
2103 nfs_rw_exit(&svp->sv_lock);
2104
2105 nargs->flags |= NFSMNT_NEWARGS;
2106 nargs->nfs_args_ext = NFS_ARGS_EXTB;
2107 nargs->nfs_ext_u.nfs_extB.secdata = secdata;
2108
2109 /* for NFS RO failover; caller will set if necessary */
2110 nargs->nfs_ext_u.nfs_extB.next = NULL;
2111
2112 return (nargs);
2113 }
2114
/*
 * Release an nfs_args built by nfs4_trigger_nargs_create().
 */
static void
nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
{
	/*
	 * Either the mount failed, in which case the data is not needed, or
	 * nfs4_mount() has either taken copies of what it needs or,
	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
	 * whereby nfs4_free_args() will ignore it.
	 */
	nfs4_free_args(nargs);
	kmem_free(nargs, sizeof (struct nfs_args));
}
2127
2128 /*
2129 * When we finally get into the mounting, we need to add this
2130 * node to the ephemeral tree.
2131 *
2132 * This is called from nfs4_mount().
2133 */
2134 int
nfs4_record_ephemeral_mount(mntinfo4_t * mi,vnode_t * mvp)2135 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
2136 {
2137 mntinfo4_t *mi_parent;
2138 nfs4_ephemeral_t *eph;
2139 nfs4_ephemeral_tree_t *net;
2140
2141 nfs4_ephemeral_t *prior;
2142 nfs4_ephemeral_t *child;
2143
2144 nfs4_ephemeral_t *peer;
2145
2146 nfs4_trigger_globals_t *ntg;
2147 zone_t *zone = curproc->p_zone;
2148
2149 int rc = 0;
2150
2151 mi_parent = VTOMI4(mvp);
2152
2153 /*
2154 * Get this before grabbing anything else!
2155 */
2156 ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2157 if (!ntg->ntg_thread_started) {
2158 nfs4_ephemeral_start_harvester(ntg);
2159 }
2160
2161 mutex_enter(&mi_parent->mi_lock);
2162 mutex_enter(&mi->mi_lock);
2163
2164 net = mi->mi_ephemeral_tree =
2165 mi_parent->mi_ephemeral_tree;
2166
2167 /*
2168 * If the mi_ephemeral_tree is NULL, then it
2169 * means that either the harvester or a manual
2170 * umount has cleared the tree out right before
2171 * we got here.
2172 *
2173 * There is nothing we can do here, so return
2174 * to the caller and let them decide whether they
2175 * try again.
2176 */
2177 if (net == NULL) {
2178 mutex_exit(&mi->mi_lock);
2179 mutex_exit(&mi_parent->mi_lock);
2180
2181 return (EBUSY);
2182 }
2183
2184 /*
2185 * We've just tied the mntinfo to the tree, so
2186 * now we bump the refcnt and hold it there until
2187 * this mntinfo is removed from the tree.
2188 */
2189 nfs4_ephemeral_tree_hold(net);
2190
2191 /*
2192 * We need to tack together the ephemeral mount
2193 * with this new mntinfo.
2194 */
2195 eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
2196 eph->ne_mount = mi;
2197 MI4_HOLD(mi);
2198 VFS_HOLD(mi->mi_vfsp);
2199 eph->ne_ref_time = gethrestime_sec();
2200
2201 /*
2202 * We need to tell the ephemeral mount when
2203 * to time out.
2204 */
2205 eph->ne_mount_to = ntg->ntg_mount_to;
2206
2207 mi->mi_ephemeral = eph;
2208
2209 /*
2210 * If the enclosing mntinfo4 is also ephemeral,
2211 * then we need to point to its enclosing parent.
2212 * Else the enclosing mntinfo4 is the enclosing parent.
2213 *
2214 * We also need to weave this ephemeral node
2215 * into the tree.
2216 */
2217 if (mi_parent->mi_flags & MI4_EPHEMERAL) {
2218 /*
2219 * We need to decide if we are
2220 * the root node of this branch
2221 * or if we are a sibling of this
2222 * branch.
2223 */
2224 prior = mi_parent->mi_ephemeral;
2225 if (prior == NULL) {
2226 /*
2227 * Race condition, clean up, and
2228 * let caller handle mntinfo.
2229 */
2230 mi->mi_flags &= ~MI4_EPHEMERAL;
2231 mi->mi_ephemeral = NULL;
2232 kmem_free(eph, sizeof (*eph));
2233 VFS_RELE(mi->mi_vfsp);
2234 MI4_RELE(mi);
2235 nfs4_ephemeral_tree_rele(net);
2236 rc = EBUSY;
2237 } else {
2238 if (prior->ne_child == NULL) {
2239 prior->ne_child = eph;
2240 } else {
2241 child = prior->ne_child;
2242
2243 prior->ne_child = eph;
2244 eph->ne_peer = child;
2245
2246 child->ne_prior = eph;
2247 }
2248
2249 eph->ne_prior = prior;
2250 }
2251 } else {
2252 /*
2253 * The parent mntinfo4 is the non-ephemeral
2254 * root of the ephemeral tree. We
2255 * need to decide if we are the root
2256 * node of that tree or if we are a
2257 * sibling of the root node.
2258 *
2259 * We are the root if there is no
2260 * other node.
2261 */
2262 if (net->net_root == NULL) {
2263 net->net_root = eph;
2264 } else {
2265 eph->ne_peer = peer = net->net_root;
2266 ASSERT(peer != NULL);
2267 net->net_root = eph;
2268
2269 peer->ne_prior = eph;
2270 }
2271
2272 eph->ne_prior = NULL;
2273 }
2274
2275 mutex_exit(&mi->mi_lock);
2276 mutex_exit(&mi_parent->mi_lock);
2277
2278 return (rc);
2279 }
2280
2281 /*
2282 * Commit the changes to the ephemeral tree for removing this node.
2283 */
2284 static void
nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t * eph)2285 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
2286 {
2287 nfs4_ephemeral_t *e = eph;
2288 nfs4_ephemeral_t *peer;
2289 nfs4_ephemeral_t *prior;
2290
2291 peer = eph->ne_peer;
2292 prior = e->ne_prior;
2293
2294 /*
2295 * If this branch root was not the
2296 * tree root, then we need to fix back pointers.
2297 */
2298 if (prior) {
2299 if (prior->ne_child == e) {
2300 prior->ne_child = peer;
2301 } else {
2302 prior->ne_peer = peer;
2303 }
2304
2305 if (peer)
2306 peer->ne_prior = prior;
2307 } else if (peer) {
2308 peer->ne_mount->mi_ephemeral_tree->net_root = peer;
2309 peer->ne_prior = NULL;
2310 } else {
2311 e->ne_mount->mi_ephemeral_tree->net_root = NULL;
2312 }
2313 }
2314
2315 /*
2316 * We want to avoid recursion at all costs. So we need to
2317 * unroll the tree. We do this by a depth first traversal to
2318 * leaf nodes. We blast away the leaf and work our way back
2319 * up and down the tree.
2320 */
2321 static int
nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t * eph,int isTreeRoot,int flag,cred_t * cr)2322 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
2323 int isTreeRoot, int flag, cred_t *cr)
2324 {
2325 nfs4_ephemeral_t *e = eph;
2326 nfs4_ephemeral_t *prior;
2327 mntinfo4_t *mi;
2328 vfs_t *vfsp;
2329 int error;
2330
2331 /*
2332 * We use the loop while unrolling the ephemeral tree.
2333 */
2334 for (;;) {
2335 /*
2336 * First we walk down the child.
2337 */
2338 if (e->ne_child) {
2339 prior = e;
2340 e = e->ne_child;
2341 continue;
2342 }
2343
2344 /*
2345 * If we are the root of the branch we are removing,
2346 * we end it here. But if the branch is the root of
2347 * the tree, we have to forge on. We do not consider
2348 * the peer list for the root because while it may
2349 * be okay to remove, it is both extra work and a
2350 * potential for a false-positive error to stall the
2351 * unmount attempt.
2352 */
2353 if (e == eph && isTreeRoot == FALSE)
2354 return (0);
2355
2356 /*
2357 * Next we walk down the peer list.
2358 */
2359 if (e->ne_peer) {
2360 prior = e;
2361 e = e->ne_peer;
2362 continue;
2363 }
2364
2365 /*
2366 * We can only remove the node passed in by the
2367 * caller if it is the root of the ephemeral tree.
2368 * Otherwise, the caller will remove it.
2369 */
2370 if (e == eph && isTreeRoot == FALSE)
2371 return (0);
2372
2373 /*
2374 * Okay, we have a leaf node, time
2375 * to prune it!
2376 *
2377 * Note that prior can only be NULL if
2378 * and only if it is the root of the
2379 * ephemeral tree.
2380 */
2381 prior = e->ne_prior;
2382
2383 mi = e->ne_mount;
2384 mutex_enter(&mi->mi_lock);
2385 vfsp = mi->mi_vfsp;
2386 ASSERT(vfsp != NULL);
2387
2388 /*
2389 * Cleared by umount2_engine.
2390 */
2391 VFS_HOLD(vfsp);
2392
2393 /*
2394 * Inform nfs4_unmount to not recursively
2395 * descend into this node's children when it
2396 * gets processed.
2397 */
2398 mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
2399 mutex_exit(&mi->mi_lock);
2400
2401 error = umount2_engine(vfsp, flag, cr, FALSE);
2402 if (error) {
2403 /*
2404 * We need to reenable nfs4_unmount's ability
2405 * to recursively descend on this node.
2406 */
2407 mutex_enter(&mi->mi_lock);
2408 mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
2409 mutex_exit(&mi->mi_lock);
2410
2411 return (error);
2412 }
2413
2414 /*
2415 * If we are the current node, we do not want to
2416 * touch anything else. At this point, the only
2417 * way the current node can have survived to here
2418 * is if it is the root of the ephemeral tree and
2419 * we are unmounting the enclosing mntinfo4.
2420 */
2421 if (e == eph) {
2422 ASSERT(prior == NULL);
2423 return (0);
2424 }
2425
2426 /*
2427 * Stitch up the prior node. Note that since
2428 * we have handled the root of the tree, prior
2429 * must be non-NULL.
2430 */
2431 ASSERT(prior != NULL);
2432 if (prior->ne_child == e) {
2433 prior->ne_child = NULL;
2434 } else {
2435 ASSERT(prior->ne_peer == e);
2436
2437 prior->ne_peer = NULL;
2438 }
2439
2440 e = prior;
2441 }
2442
2443 /* NOTREACHED */
2444 }
2445
2446 /*
2447 * Common code to safely release net_cnt_lock and net_tree_lock
2448 */
2449 void
nfs4_ephemeral_umount_unlock(bool_t * pmust_unlock,nfs4_ephemeral_tree_t ** pnet)2450 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
2451 nfs4_ephemeral_tree_t **pnet)
2452 {
2453 nfs4_ephemeral_tree_t *net = *pnet;
2454
2455 if (*pmust_unlock) {
2456 mutex_enter(&net->net_cnt_lock);
2457 net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
2458 mutex_exit(&net->net_cnt_lock);
2459
2460 mutex_exit(&net->net_tree_lock);
2461
2462 *pmust_unlock = FALSE;
2463 }
2464 }
2465
2466 /*
2467 * While we may have removed any child or sibling nodes of this
2468 * ephemeral node, we can not nuke it until we know that there
2469 * were no actived vnodes on it. This will do that final
2470 * work once we know it is not busy.
2471 */
2472 void
nfs4_ephemeral_umount_activate(mntinfo4_t * mi,bool_t * pmust_unlock,nfs4_ephemeral_tree_t ** pnet)2473 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
2474 nfs4_ephemeral_tree_t **pnet)
2475 {
2476 /*
2477 * Now we need to get rid of the ephemeral data if it exists.
2478 */
2479 mutex_enter(&mi->mi_lock);
2480 if (mi->mi_ephemeral) {
2481 /*
2482 * If we are the root node of an ephemeral branch
2483 * which is being removed, then we need to fixup
2484 * pointers into and out of the node.
2485 */
2486 if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
2487 nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
2488
2489 nfs4_ephemeral_tree_rele(*pnet);
2490 ASSERT(mi->mi_ephemeral != NULL);
2491
2492 kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
2493 mi->mi_ephemeral = NULL;
2494 VFS_RELE(mi->mi_vfsp);
2495 MI4_RELE(mi);
2496 }
2497 mutex_exit(&mi->mi_lock);
2498
2499 nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2500 }
2501
2502 /*
2503 * Unmount an ephemeral node.
2504 *
2505 * Note that if this code fails, then it must unlock.
2506 *
2507 * If it succeeds, then the caller must be prepared to do so.
2508 */
2509 int
nfs4_ephemeral_umount(mntinfo4_t * mi,int flag,cred_t * cr,bool_t * pmust_unlock,nfs4_ephemeral_tree_t ** pnet)2510 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
2511 bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
2512 {
2513 int error = 0;
2514 nfs4_ephemeral_t *eph;
2515 nfs4_ephemeral_tree_t *net;
2516 int is_derooting = FALSE;
2517 int is_recursed = FALSE;
2518 int was_locked = FALSE;
2519
2520 /*
2521 * Make sure to set the default state for cleaning
2522 * up the tree in the caller (and on the way out).
2523 */
2524 *pmust_unlock = FALSE;
2525
2526 /*
2527 * The active vnodes on this file system may be ephemeral
2528 * children. We need to check for and try to unmount them
2529 * here. If any can not be unmounted, we are going
2530 * to return EBUSY.
2531 */
2532 mutex_enter(&mi->mi_lock);
2533
2534 /*
2535 * If an ephemeral tree, we need to check to see if
2536 * the lock is already held. If it is, then we need
2537 * to see if we are being called as a result of
2538 * the recursive removal of some node of the tree or
2539 * if we are another attempt to remove the tree.
2540 *
2541 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
2542 * node. mi_ephemeral being non-NULL also does this.
2543 *
2544 * mi_ephemeral_tree being non-NULL is sufficient
2545 * to also indicate either it is an ephemeral node
2546 * or the enclosing mntinfo4.
2547 *
2548 * Do we need MI4_EPHEMERAL? Yes, it is useful for
2549 * when we delete the ephemeral node and need to
2550 * differentiate from an ephemeral node and the
2551 * enclosing root node.
2552 */
2553 *pnet = net = mi->mi_ephemeral_tree;
2554 if (net == NULL) {
2555 mutex_exit(&mi->mi_lock);
2556 return (0);
2557 }
2558
2559 eph = mi->mi_ephemeral;
2560 is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
2561 is_derooting = (eph == NULL);
2562
2563 mutex_enter(&net->net_cnt_lock);
2564
2565 /*
2566 * If this is not recursion, then we need to
2567 * check to see if a harvester thread has
2568 * already grabbed the lock.
2569 *
2570 * After we exit this branch, we may not
2571 * blindly return, we need to jump to
2572 * is_busy!
2573 */
2574 if (!is_recursed) {
2575 if (net->net_status &
2576 NFS4_EPHEMERAL_TREE_LOCKED) {
2577 /*
2578 * If the tree is locked, we need
2579 * to decide whether we are the
2580 * harvester or some explicit call
2581 * for a umount. The only way that
2582 * we are the harvester is if
2583 * MS_SYSSPACE is set.
2584 *
2585 * We only let the harvester through
2586 * at this point.
2587 *
2588 * We return EBUSY so that the
2589 * caller knows something is
2590 * going on. Note that by that
2591 * time, the umount in the other
2592 * thread may have already occured.
2593 */
2594 if (!(flag & MS_SYSSPACE)) {
2595 mutex_exit(&net->net_cnt_lock);
2596 mutex_exit(&mi->mi_lock);
2597
2598 return (EBUSY);
2599 }
2600
2601 was_locked = TRUE;
2602 }
2603 }
2604
2605 mutex_exit(&net->net_cnt_lock);
2606 mutex_exit(&mi->mi_lock);
2607
2608 /*
2609 * If we are not the harvester, we need to check
2610 * to see if we need to grab the tree lock.
2611 */
2612 if (was_locked == FALSE) {
2613 /*
2614 * If we grab the lock, it means that no other
2615 * operation is working on the tree. If we don't
2616 * grab it, we need to decide if this is because
2617 * we are a recursive call or a new operation.
2618 */
2619 if (mutex_tryenter(&net->net_tree_lock)) {
2620 *pmust_unlock = TRUE;
2621 } else {
2622 /*
2623 * If we are a recursive call, we can
2624 * proceed without the lock.
2625 * Otherwise we have to wait until
2626 * the lock becomes free.
2627 */
2628 if (!is_recursed) {
2629 mutex_enter(&net->net_cnt_lock);
2630 if (net->net_status &
2631 (NFS4_EPHEMERAL_TREE_DEROOTING
2632 | NFS4_EPHEMERAL_TREE_INVALID)) {
2633 mutex_exit(&net->net_cnt_lock);
2634 goto is_busy;
2635 }
2636 mutex_exit(&net->net_cnt_lock);
2637
2638 /*
2639 * We can't hold any other locks whilst
2640 * we wait on this to free up.
2641 */
2642 mutex_enter(&net->net_tree_lock);
2643
2644 /*
2645 * Note that while mi->mi_ephemeral
2646 * may change and thus we have to
2647 * update eph, it is the case that
2648 * we have tied down net and
2649 * do not care if mi->mi_ephemeral_tree
2650 * has changed.
2651 */
2652 mutex_enter(&mi->mi_lock);
2653 eph = mi->mi_ephemeral;
2654 mutex_exit(&mi->mi_lock);
2655
2656 /*
2657 * Okay, we need to see if either the
2658 * tree got nuked or the current node
2659 * got nuked. Both of which will cause
2660 * an error.
2661 *
2662 * Note that a subsequent retry of the
2663 * umount shall work.
2664 */
2665 mutex_enter(&net->net_cnt_lock);
2666 if (net->net_status &
2667 NFS4_EPHEMERAL_TREE_INVALID ||
2668 (!is_derooting && eph == NULL)) {
2669 mutex_exit(&net->net_cnt_lock);
2670 mutex_exit(&net->net_tree_lock);
2671 goto is_busy;
2672 }
2673 mutex_exit(&net->net_cnt_lock);
2674 *pmust_unlock = TRUE;
2675 }
2676 }
2677 }
2678
2679 /*
2680 * Only once we have grabbed the lock can we mark what we
2681 * are planning on doing to the ephemeral tree.
2682 */
2683 if (*pmust_unlock) {
2684 mutex_enter(&net->net_cnt_lock);
2685 net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
2686
2687 /*
2688 * Check to see if we are nuking the root.
2689 */
2690 if (is_derooting)
2691 net->net_status |=
2692 NFS4_EPHEMERAL_TREE_DEROOTING;
2693 mutex_exit(&net->net_cnt_lock);
2694 }
2695
2696 if (!is_derooting) {
2697 /*
2698 * Only work on children if the caller has not already
2699 * done so.
2700 */
2701 if (!is_recursed) {
2702 ASSERT(eph != NULL);
2703
2704 error = nfs4_ephemeral_unmount_engine(eph,
2705 FALSE, flag, cr);
2706 if (error)
2707 goto is_busy;
2708 }
2709 } else {
2710 eph = net->net_root;
2711
2712 /*
2713 * Only work if there is something there.
2714 */
2715 if (eph) {
2716 error = nfs4_ephemeral_unmount_engine(eph, TRUE,
2717 flag, cr);
2718 if (error) {
2719 mutex_enter(&net->net_cnt_lock);
2720 net->net_status &=
2721 ~NFS4_EPHEMERAL_TREE_DEROOTING;
2722 mutex_exit(&net->net_cnt_lock);
2723 goto is_busy;
2724 }
2725
2726 /*
2727 * Nothing else which goes wrong will
2728 * invalidate the blowing away of the
2729 * ephmeral tree.
2730 */
2731 net->net_root = NULL;
2732 }
2733
2734 /*
2735 * We have derooted and we have caused the tree to be
2736 * invalidated.
2737 */
2738 mutex_enter(&net->net_cnt_lock);
2739 net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
2740 net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
2741 DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
2742 uint_t, net->net_refcnt);
2743
2744 /*
2745 * We will not finalize this node, so safe to
2746 * release it.
2747 */
2748 nfs4_ephemeral_tree_decr(net);
2749 mutex_exit(&net->net_cnt_lock);
2750
2751 if (was_locked == FALSE)
2752 mutex_exit(&net->net_tree_lock);
2753
2754 /*
2755 * We have just blown away any notation of this
2756 * tree being locked or having a refcnt.
2757 * We can't let the caller try to clean things up.
2758 */
2759 *pmust_unlock = FALSE;
2760
2761 /*
2762 * At this point, the tree should no longer be
2763 * associated with the mntinfo4. We need to pull
2764 * it off there and let the harvester take
2765 * care of it once the refcnt drops.
2766 */
2767 mutex_enter(&mi->mi_lock);
2768 mi->mi_ephemeral_tree = NULL;
2769 mutex_exit(&mi->mi_lock);
2770 }
2771
2772 return (0);
2773
2774 is_busy:
2775
2776 nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2777
2778 return (error);
2779 }
2780
2781 /*
2782 * Do the umount and record any error in the parent.
2783 */
2784 static void
nfs4_ephemeral_record_umount(vfs_t * vfsp,int flag,nfs4_ephemeral_t * e,nfs4_ephemeral_t * prior)2785 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2786 nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2787 {
2788 int error;
2789
2790 /*
2791 * Only act on if the fs is still mounted.
2792 */
2793 if (vfsp == NULL)
2794 return;
2795
2796 error = umount2_engine(vfsp, flag, kcred, FALSE);
2797 if (error) {
2798 if (prior) {
2799 if (prior->ne_child == e)
2800 prior->ne_state |=
2801 NFS4_EPHEMERAL_CHILD_ERROR;
2802 else
2803 prior->ne_state |=
2804 NFS4_EPHEMERAL_PEER_ERROR;
2805 }
2806 }
2807 }
2808
2809 /*
2810 * For each tree in the forest (where the forest is in
2811 * effect all of the ephemeral trees for this zone),
2812 * scan to see if a node can be unmounted. Note that
2813 * unlike nfs4_ephemeral_unmount_engine(), we do
2814 * not process the current node before children or
2815 * siblings. I.e., if a node can be unmounted, we
2816 * do not recursively check to see if the nodes
2817 * hanging off of it can also be unmounted.
2818 *
2819 * Instead, we delve down deep to try and remove the
2820 * children first. Then, because we share code with
2821 * nfs4_ephemeral_unmount_engine(), we will try
2822 * them again. This could be a performance issue in
2823 * the future.
2824 *
2825 * Also note that unlike nfs4_ephemeral_unmount_engine(),
2826 * we do not halt on an error. We will not remove the
2827 * current node, but we will keep on trying to remove
2828 * the others.
2829 *
2830 * force indicates that we want the unmount to occur
2831 * even if there is something blocking it.
2832 *
2833 * time_check indicates that we want to see if the
2834 * mount has expired past mount_to or not. Typically
2835 * we want to do this and only on a shutdown of the
2836 * zone would we want to ignore the check.
2837 */
2838 static void
nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t * ntg,bool_t force,bool_t time_check)2839 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
2840 bool_t force, bool_t time_check)
2841 {
2842 nfs4_ephemeral_tree_t *net;
2843 nfs4_ephemeral_tree_t *prev = NULL;
2844 nfs4_ephemeral_tree_t *next;
2845 nfs4_ephemeral_t *e;
2846 nfs4_ephemeral_t *prior;
2847 time_t now = gethrestime_sec();
2848
2849 nfs4_ephemeral_tree_t *harvest = NULL;
2850
2851 int flag;
2852
2853 mntinfo4_t *mi;
2854 vfs_t *vfsp;
2855
2856 if (force)
2857 flag = MS_FORCE | MS_SYSSPACE;
2858 else
2859 flag = MS_SYSSPACE;
2860
2861 mutex_enter(&ntg->ntg_forest_lock);
2862 for (net = ntg->ntg_forest; net != NULL; net = next) {
2863 next = net->net_next;
2864
2865 nfs4_ephemeral_tree_hold(net);
2866
2867 mutex_enter(&net->net_tree_lock);
2868
2869 /*
2870 * Let the unmount code know that the
2871 * tree is already locked!
2872 */
2873 mutex_enter(&net->net_cnt_lock);
2874 net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2875 mutex_exit(&net->net_cnt_lock);
2876
2877 /*
2878 * If the intent is force all ephemeral nodes to
2879 * be unmounted in this zone, we can short circuit a
2880 * lot of tree traversal and simply zap the root node.
2881 */
2882 if (force) {
2883 if (net->net_root) {
2884 mi = net->net_root->ne_mount;
2885
2886 vfsp = mi->mi_vfsp;
2887 ASSERT(vfsp != NULL);
2888
2889 /*
2890 * Cleared by umount2_engine.
2891 */
2892 VFS_HOLD(vfsp);
2893
2894 (void) umount2_engine(vfsp, flag,
2895 kcred, FALSE);
2896
2897 goto check_done;
2898 }
2899 }
2900
2901 e = net->net_root;
2902 if (e)
2903 e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2904
2905 while (e) {
2906 if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2907 e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2908 if (e->ne_child) {
2909 e = e->ne_child;
2910 e->ne_state =
2911 NFS4_EPHEMERAL_VISIT_CHILD;
2912 }
2913
2914 continue;
2915 } else if (e->ne_state ==
2916 NFS4_EPHEMERAL_VISIT_SIBLING) {
2917 e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2918 if (e->ne_peer) {
2919 e = e->ne_peer;
2920 e->ne_state =
2921 NFS4_EPHEMERAL_VISIT_CHILD;
2922 }
2923
2924 continue;
2925 } else if (e->ne_state ==
2926 NFS4_EPHEMERAL_CHILD_ERROR) {
2927 prior = e->ne_prior;
2928
2929 /*
2930 * If a child reported an error, do
2931 * not bother trying to unmount.
2932 *
2933 * If your prior node is a parent,
2934 * pass the error up such that they
2935 * also do not try to unmount.
2936 *
2937 * However, if your prior is a sibling,
2938 * let them try to unmount if they can.
2939 */
2940 if (prior) {
2941 if (prior->ne_child == e)
2942 prior->ne_state |=
2943 NFS4_EPHEMERAL_CHILD_ERROR;
2944 else
2945 prior->ne_state |=
2946 NFS4_EPHEMERAL_PEER_ERROR;
2947 }
2948
2949 /*
2950 * Clear the error and if needed, process peers.
2951 *
2952 * Once we mask out the error, we know whether
2953 * or we have to process another node.
2954 */
2955 e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2956 if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2957 e = prior;
2958
2959 continue;
2960 } else if (e->ne_state ==
2961 NFS4_EPHEMERAL_PEER_ERROR) {
2962 prior = e->ne_prior;
2963
2964 if (prior) {
2965 if (prior->ne_child == e)
2966 prior->ne_state =
2967 NFS4_EPHEMERAL_CHILD_ERROR;
2968 else
2969 prior->ne_state =
2970 NFS4_EPHEMERAL_PEER_ERROR;
2971 }
2972
2973 /*
2974 * Clear the error from this node and do the
2975 * correct processing.
2976 */
2977 e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2978 continue;
2979 }
2980
2981 prior = e->ne_prior;
2982 e->ne_state = NFS4_EPHEMERAL_OK;
2983
2984 /*
2985 * It must be the case that we need to process
2986 * this node.
2987 */
2988 if (!time_check ||
2989 now - e->ne_ref_time > e->ne_mount_to) {
2990 mi = e->ne_mount;
2991 vfsp = mi->mi_vfsp;
2992
2993 /*
2994 * Cleared by umount2_engine.
2995 */
2996 if (vfsp != NULL)
2997 VFS_HOLD(vfsp);
2998
2999 /*
3000 * Note that we effectively work down to the
3001 * leaf nodes first, try to unmount them,
3002 * then work our way back up into the leaf
3003 * nodes.
3004 *
3005 * Also note that we deal with a lot of
3006 * complexity by sharing the work with
3007 * the manual unmount code.
3008 */
3009 nfs4_ephemeral_record_umount(vfsp, flag,
3010 e, prior);
3011 }
3012
3013 e = prior;
3014 }
3015
3016 check_done:
3017
3018 /*
3019 * At this point we are done processing this tree.
3020 *
3021 * If the tree is invalid and we were the only reference
3022 * to it, then we push it on the local linked list
3023 * to remove it at the end. We avoid that action now
3024 * to keep the tree processing going along at a fair clip.
3025 *
3026 * Else, even if we were the only reference, we
3027 * allow it to be reused as needed.
3028 */
3029 mutex_enter(&net->net_cnt_lock);
3030 nfs4_ephemeral_tree_decr(net);
3031 if (net->net_refcnt == 0 &&
3032 net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
3033 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3034 mutex_exit(&net->net_cnt_lock);
3035 mutex_exit(&net->net_tree_lock);
3036
3037 if (prev)
3038 prev->net_next = net->net_next;
3039 else
3040 ntg->ntg_forest = net->net_next;
3041
3042 net->net_next = harvest;
3043 harvest = net;
3044
3045 VFS_RELE(net->net_mount->mi_vfsp);
3046 MI4_RELE(net->net_mount);
3047
3048 continue;
3049 }
3050
3051 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3052 mutex_exit(&net->net_cnt_lock);
3053 mutex_exit(&net->net_tree_lock);
3054
3055 prev = net;
3056 }
3057 mutex_exit(&ntg->ntg_forest_lock);
3058
3059 for (net = harvest; net != NULL; net = next) {
3060 next = net->net_next;
3061
3062 mutex_destroy(&net->net_tree_lock);
3063 mutex_destroy(&net->net_cnt_lock);
3064 kmem_free(net, sizeof (*net));
3065 }
3066 }
3067
3068 /*
3069 * This is the thread which decides when the harvesting
3070 * can proceed and when to kill it off for this zone.
3071 */
3072 static void
nfs4_ephemeral_harvester(nfs4_trigger_globals_t * ntg)3073 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
3074 {
3075 clock_t timeleft;
3076 zone_t *zone = curproc->p_zone;
3077
3078 for (;;) {
3079 timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
3080 nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
3081
3082 /*
3083 * zone is exiting...
3084 */
3085 if (timeleft != -1) {
3086 ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
3087 zthread_exit();
3088 /* NOTREACHED */
3089 }
3090
3091 /*
3092 * Only bother scanning if there is potential
3093 * work to be done.
3094 */
3095 if (ntg->ntg_forest == NULL)
3096 continue;
3097
3098 /*
3099 * Now scan the list and get rid of everything which
3100 * is old.
3101 */
3102 nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
3103 }
3104
3105 /* NOTREACHED */
3106 }
3107
3108 /*
3109 * The zone specific glue needed to start the unmount harvester.
3110 *
3111 * Note that we want to avoid holding the mutex as long as possible,
3112 * hence the multiple checks.
3113 *
3114 * The caller should avoid us getting down here in the first
3115 * place.
3116 */
3117 static void
nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t * ntg)3118 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
3119 {
3120 /*
3121 * It got started before we got here...
3122 */
3123 if (ntg->ntg_thread_started)
3124 return;
3125
3126 mutex_enter(&nfs4_ephemeral_thread_lock);
3127
3128 if (ntg->ntg_thread_started) {
3129 mutex_exit(&nfs4_ephemeral_thread_lock);
3130 return;
3131 }
3132
3133 /*
3134 * Start the unmounter harvester thread for this zone.
3135 */
3136 (void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
3137 ntg, 0, minclsyspri);
3138
3139 ntg->ntg_thread_started = TRUE;
3140 mutex_exit(&nfs4_ephemeral_thread_lock);
3141 }
3142
3143 /*ARGSUSED*/
3144 static void *
nfs4_ephemeral_zsd_create(zoneid_t zoneid)3145 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
3146 {
3147 nfs4_trigger_globals_t *ntg;
3148
3149 ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
3150 ntg->ntg_thread_started = FALSE;
3151
3152 /*
3153 * This is the default....
3154 */
3155 ntg->ntg_mount_to = nfs4_trigger_thread_timer;
3156
3157 mutex_init(&ntg->ntg_forest_lock, NULL,
3158 MUTEX_DEFAULT, NULL);
3159
3160 return (ntg);
3161 }
3162
3163 /*
3164 * Try a nice gentle walk down the forest and convince
3165 * all of the trees to gracefully give it up.
3166 */
3167 /*ARGSUSED*/
3168 static void
nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid,void * arg)3169 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
3170 {
3171 nfs4_trigger_globals_t *ntg = arg;
3172
3173 if (!ntg)
3174 return;
3175
3176 nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
3177 }
3178
3179 /*
3180 * Race along the forest and rip all of the trees out by
3181 * their rootballs!
3182 */
3183 /*ARGSUSED*/
3184 static void
nfs4_ephemeral_zsd_destroy(zoneid_t zoneid,void * arg)3185 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
3186 {
3187 nfs4_trigger_globals_t *ntg = arg;
3188
3189 if (!ntg)
3190 return;
3191
3192 nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
3193
3194 mutex_destroy(&ntg->ntg_forest_lock);
3195 kmem_free(ntg, sizeof (*ntg));
3196 }
3197
3198 /*
3199 * This is the zone independent cleanup needed for
3200 * emphemeral mount processing.
3201 */
3202 void
nfs4_ephemeral_fini(void)3203 nfs4_ephemeral_fini(void)
3204 {
3205 (void) zone_key_delete(nfs4_ephemeral_key);
3206 mutex_destroy(&nfs4_ephemeral_thread_lock);
3207 }
3208
3209 /*
3210 * This is the zone independent initialization needed for
3211 * emphemeral mount processing.
3212 */
3213 void
nfs4_ephemeral_init(void)3214 nfs4_ephemeral_init(void)
3215 {
3216 mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
3217 NULL);
3218
3219 zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
3220 nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
3221 }
3222
3223 /*
3224 * nfssys() calls this function to set the per-zone
3225 * value of mount_to to drive when an ephemeral mount is
3226 * timed out. Each mount will grab a copy of this value
3227 * when mounted.
3228 */
3229 void
nfs4_ephemeral_set_mount_to(uint_t mount_to)3230 nfs4_ephemeral_set_mount_to(uint_t mount_to)
3231 {
3232 nfs4_trigger_globals_t *ntg;
3233 zone_t *zone = curproc->p_zone;
3234
3235 ntg = zone_getspecific(nfs4_ephemeral_key, zone);
3236
3237 ntg->ntg_mount_to = mount_to;
3238 }
3239
3240 /*
3241 * Walk the list of v4 mount options; if they are currently set in vfsp,
3242 * append them to a new comma-separated mount option string, and return it.
3243 *
3244 * Caller should free by calling nfs4_trigger_destroy_mntopts().
3245 */
3246 static char *
nfs4_trigger_create_mntopts(vfs_t * vfsp)3247 nfs4_trigger_create_mntopts(vfs_t *vfsp)
3248 {
3249 uint_t i;
3250 char *mntopts;
3251 struct vfssw *vswp;
3252 mntopts_t *optproto;
3253
3254 mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
3255
3256 /* get the list of applicable mount options for v4; locks *vswp */
3257 vswp = vfs_getvfssw(MNTTYPE_NFS4);
3258 optproto = &vswp->vsw_optproto;
3259
3260 for (i = 0; i < optproto->mo_count; i++) {
3261 struct mntopt *mop = &optproto->mo_list[i];
3262
3263 if (mop->mo_flags & MO_EMPTY)
3264 continue;
3265
3266 if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
3267 kmem_free(mntopts, MAX_MNTOPT_STR);
3268 vfs_unrefvfssw(vswp);
3269 return (NULL);
3270 }
3271 }
3272
3273 vfs_unrefvfssw(vswp);
3274
3275 /*
3276 * MNTOPT_XATTR is not in the v4 mount opt proto list,
3277 * and it may only be passed via MS_OPTIONSTR, so we
3278 * must handle it here.
3279 *
3280 * Ideally, it would be in the list, but NFS does not specify its
3281 * own opt proto list, it uses instead the default one. Since
3282 * not all filesystems support extended attrs, it would not be
3283 * appropriate to add it there.
3284 */
3285 if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
3286 nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
3287 kmem_free(mntopts, MAX_MNTOPT_STR);
3288 return (NULL);
3289 }
3290
3291 return (mntopts);
3292 }
3293
3294 static void
nfs4_trigger_destroy_mntopts(char * mntopts)3295 nfs4_trigger_destroy_mntopts(char *mntopts)
3296 {
3297 if (mntopts)
3298 kmem_free(mntopts, MAX_MNTOPT_STR);
3299 }
3300
3301 /*
3302 * Check a single mount option (optname). Add to mntopts if it is set in VFS.
3303 */
3304 static int
nfs4_trigger_add_mntopt(char * mntopts,char * optname,vfs_t * vfsp)3305 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
3306 {
3307 if (mntopts == NULL || optname == NULL || vfsp == NULL)
3308 return (EINVAL);
3309
3310 if (vfs_optionisset(vfsp, optname, NULL)) {
3311 size_t mntoptslen = strlen(mntopts);
3312 size_t optnamelen = strlen(optname);
3313
3314 /* +1 for ',', +1 for NUL */
3315 if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
3316 return (EOVERFLOW);
3317
3318 /* first or subsequent mount option? */
3319 if (*mntopts != '\0')
3320 (void) strcat(mntopts, ",");
3321
3322 (void) strcat(mntopts, optname);
3323 }
3324
3325 return (0);
3326 }
3327
3328 static enum clnt_stat
nfs4_ping_server_common(struct knetconfig * knc,struct netbuf * addr,int nointr)3329 nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
3330 {
3331 int retries;
3332 uint_t max_msgsize;
3333 enum clnt_stat status;
3334 CLIENT *cl;
3335 struct timeval timeout;
3336
3337 /* as per recov_newserver() */
3338 max_msgsize = 0;
3339 retries = 1;
3340 timeout.tv_sec = 2;
3341 timeout.tv_usec = 0;
3342
3343 if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
3344 max_msgsize, retries, CRED(), &cl) != 0)
3345 return (RPC_FAILED);
3346
3347 if (nointr)
3348 cl->cl_nosignal = TRUE;
3349 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
3350 timeout);
3351 if (nointr)
3352 cl->cl_nosignal = FALSE;
3353
3354 AUTH_DESTROY(cl->cl_auth);
3355 CLNT_DESTROY(cl);
3356
3357 return (status);
3358 }
3359
3360 static enum clnt_stat
nfs4_trigger_ping_server(servinfo4_t * svp,int nointr)3361 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
3362 {
3363 return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));
3364 }
3365