1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are 29 * triggered from a "stub" rnode via a special set of vnodeops. 30 */ 31 32 #include <sys/param.h> 33 #include <sys/types.h> 34 #include <sys/systm.h> 35 #include <sys/cred.h> 36 #include <sys/time.h> 37 #include <sys/vnode.h> 38 #include <sys/vfs.h> 39 #include <sys/vfs_opreg.h> 40 #include <sys/file.h> 41 #include <sys/filio.h> 42 #include <sys/uio.h> 43 #include <sys/buf.h> 44 #include <sys/mman.h> 45 #include <sys/pathname.h> 46 #include <sys/dirent.h> 47 #include <sys/debug.h> 48 #include <sys/vmsystm.h> 49 #include <sys/fcntl.h> 50 #include <sys/flock.h> 51 #include <sys/swap.h> 52 #include <sys/errno.h> 53 #include <sys/strsubr.h> 54 #include <sys/sysmacros.h> 55 #include <sys/kmem.h> 56 #include <sys/mount.h> 57 #include <sys/cmn_err.h> 58 #include <sys/pathconf.h> 59 #include <sys/utsname.h> 60 #include <sys/dnlc.h> 61 #include <sys/acl.h> 62 #include <sys/systeminfo.h> 63 #include <sys/policy.h> 64 #include <sys/sdt.h> 65 #include <sys/list.h> 66 #include <sys/stat.h> 67 #include <sys/mntent.h> 68 69 #include <rpc/types.h> 70 #include <rpc/auth.h> 71 #include <rpc/clnt.h> 72 73 #include <nfs/nfs.h> 74 #include <nfs/nfs_clnt.h> 75 #include <nfs/nfs_acl.h> 76 #include <nfs/lm.h> 77 #include <nfs/nfs4.h> 78 #include <nfs/nfs4_kprot.h> 79 #include <nfs/rnode4.h> 80 #include <nfs/nfs4_clnt.h> 81 82 #include <vm/hat.h> 83 #include <vm/as.h> 84 #include <vm/page.h> 85 #include <vm/pvn.h> 86 #include <vm/seg.h> 87 #include <vm/seg_map.h> 88 #include <vm/seg_kpm.h> 89 #include <vm/seg_vn.h> 90 91 #include <fs/fs_subr.h> 92 93 #include <sys/ddi.h> 94 #include <sys/int_fmtio.h> 95 96 #include <sys/sunddi.h> 97 98 #include <sys/priv_names.h> 99 100 /* 101 * The automatic unmounter thread stuff! 102 */ 103 static int nfs4_trigger_thread_timer = 20; /* in seconds */ 104 105 /* 106 * Just a default.... 107 */ 108 static uint_t nfs4_trigger_mount_to = 240; 109 110 typedef struct nfs4_trigger_globals { 111 kmutex_t ntg_forest_lock; 112 uint_t ntg_mount_to; 113 int ntg_thread_started; 114 nfs4_ephemeral_tree_t *ntg_forest; 115 } nfs4_trigger_globals_t; 116 117 kmutex_t nfs4_ephemeral_thread_lock; 118 119 zone_key_t nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED; 120 121 static void nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *); 122 123 /* 124 * Used for ephemeral mounts; contains data either duplicated from 125 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount. 126 * 127 * It's intended that this structure is used solely for ephemeral 128 * mount-type specific data, for passing this data to 129 * nfs4_trigger_nargs_create(). 130 */ 131 typedef struct ephemeral_servinfo { 132 char *esi_hostname; 133 char *esi_netname; 134 char *esi_path; 135 int esi_path_len; 136 int esi_mount_flags; 137 struct netbuf *esi_addr; 138 struct netbuf *esi_syncaddr; 139 struct knetconfig *esi_knconf; 140 } ephemeral_servinfo_t; 141 142 /* 143 * Collect together the mount-type specific and generic data args. 144 */ 145 typedef struct domount_args { 146 ephemeral_servinfo_t *dma_esi; 147 char *dma_hostlist; /* comma-sep. for RO failover */ 148 struct nfs_args *dma_nargs; 149 } domount_args_t; 150 151 152 /* 153 * The vnode ops functions for a trigger stub vnode 154 */ 155 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *); 156 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *, 157 caller_context_t *); 158 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *, 159 caller_context_t *); 160 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *, 161 caller_context_t *); 162 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *, 163 caller_context_t *); 164 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **, 165 struct pathname *, int, vnode_t *, cred_t *, caller_context_t *, 166 int *, pathname_t *); 167 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *, 168 enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *, 169 vsecattr_t *); 170 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *, 171 int); 172 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *, 173 caller_context_t *, int); 174 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *, 175 cred_t *, caller_context_t *, int); 176 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *, 177 vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp); 178 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 179 caller_context_t *, int); 180 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *, 181 cred_t *, caller_context_t *, int); 182 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *); 183 184 /* 185 * Regular NFSv4 vnodeops that we need to reference directly 186 */ 187 extern int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *, 188 caller_context_t *); 189 extern void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *); 190 extern int nfs4_rwlock(vnode_t *, int, caller_context_t *); 191 extern void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 192 extern int nfs4_lookup(vnode_t *, char *, vnode_t **, 193 struct pathname *, int, vnode_t *, cred_t *, 194 caller_context_t *, int *, pathname_t *); 195 extern int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *, 196 caller_context_t *); 197 extern int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 198 caller_context_t *); 199 extern int nfs4_fid(vnode_t *, fid_t *, caller_context_t *); 200 extern int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *); 201 202 static int nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **); 203 static int nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **, 204 cred_t *, vnode_t **); 205 static domount_args_t *nfs4_trigger_domount_args_create(vnode_t *); 206 static void nfs4_trigger_domount_args_destroy(domount_args_t *dma, 207 vnode_t *vp); 208 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *); 209 static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *); 210 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *, 211 servinfo4_t *); 212 static struct nfs_args *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *, 213 ephemeral_servinfo_t *); 214 static void nfs4_trigger_nargs_destroy(struct nfs_args *); 215 static char *nfs4_trigger_create_mntopts(vfs_t *); 216 static void nfs4_trigger_destroy_mntopts(char *); 217 static int nfs4_trigger_add_mntopt(char *, char *, vfs_t *); 218 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int); 219 220 extern int umount2_engine(vfs_t *, int, cred_t *, int); 221 222 223 vnodeops_t *nfs4_trigger_vnodeops; 224 225 /* 226 * These are the vnodeops that we must define for stub vnodes. 227 * 228 * 229 * Many of the VOPs defined for NFSv4 do not need to be defined here, 230 * for various reasons. This will result in the VFS default function being 231 * used: 232 * 233 * - These VOPs require a previous VOP_OPEN to have occurred. That will have 234 * lost the reference to the stub vnode, meaning these should not be called: 235 * close, read, write, ioctl, readdir, seek. 236 * 237 * - These VOPs are meaningless for vnodes without data pages. Since the 238 * stub vnode is of type VDIR, these should not be called: 239 * space, getpage, putpage, map, addmap, delmap, pageio, fsync. 240 * 241 * - These VOPs are otherwise not applicable, and should not be called: 242 * dump, setsecattr. 243 * 244 * 245 * These VOPs we do not want to define, but nor do we want the VFS default 246 * action. Instead, we specify the VFS error function, with fs_error(), but 247 * note that fs_error() is not actually called. Instead it results in the 248 * use of the error function defined for the particular VOP, in vn_ops_table[]: 249 * 250 * - frlock, dispose, shrlock. 251 * 252 * 253 * These VOPs we define to use the corresponding regular NFSv4 vnodeop. 254 * NOTE: if any of these ops involve an OTW call with the stub FH, then 255 * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo() 256 * to protect the security data in the servinfo4_t for the "parent" 257 * filesystem that contains the stub. 258 * 259 * - These VOPs should not trigger a mount, so that "ls -l" does not: 260 * pathconf, getsecattr. 261 * 262 * - These VOPs would not make sense to trigger: 263 * inactive, rwlock, rwunlock, fid, realvp. 264 */ 265 const fs_operation_def_t nfs4_trigger_vnodeops_template[] = { 266 VOPNAME_OPEN, { .vop_open = nfs4_trigger_open }, 267 VOPNAME_GETATTR, { .vop_getattr = nfs4_trigger_getattr }, 268 VOPNAME_SETATTR, { .vop_setattr = nfs4_trigger_setattr }, 269 VOPNAME_ACCESS, { .vop_access = nfs4_trigger_access }, 270 VOPNAME_LOOKUP, { .vop_lookup = nfs4_trigger_lookup }, 271 VOPNAME_CREATE, { .vop_create = nfs4_trigger_create }, 272 VOPNAME_REMOVE, { .vop_remove = nfs4_trigger_remove }, 273 VOPNAME_LINK, { .vop_link = nfs4_trigger_link }, 274 VOPNAME_RENAME, { .vop_rename = nfs4_trigger_rename }, 275 VOPNAME_MKDIR, { .vop_mkdir = nfs4_trigger_mkdir }, 276 VOPNAME_RMDIR, { .vop_rmdir = nfs4_trigger_rmdir }, 277 VOPNAME_SYMLINK, { .vop_symlink = nfs4_trigger_symlink }, 278 VOPNAME_READLINK, { .vop_readlink = nfs4_trigger_readlink }, 279 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive }, 280 VOPNAME_FID, { .vop_fid = nfs4_fid }, 281 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock }, 282 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock }, 283 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp }, 284 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr }, 285 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf }, 286 VOPNAME_FRLOCK, { .error = fs_error }, 287 VOPNAME_DISPOSE, { .error = fs_error }, 288 VOPNAME_SHRLOCK, { .error = fs_error }, 289 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 290 NULL, NULL 291 }; 292 293 static void 294 nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net) 295 { 296 ASSERT(mutex_owned(&net->net_cnt_lock)); 297 net->net_refcnt++; 298 ASSERT(net->net_refcnt != 0); 299 } 300 301 static void 302 nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net) 303 { 304 mutex_enter(&net->net_cnt_lock); 305 nfs4_ephemeral_tree_incr(net); 306 mutex_exit(&net->net_cnt_lock); 307 } 308 309 /* 310 * We need a safe way to decrement the refcnt whilst the 311 * lock is being held. 312 */ 313 static void 314 nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net) 315 { 316 ASSERT(mutex_owned(&net->net_cnt_lock)); 317 ASSERT(net->net_refcnt != 0); 318 net->net_refcnt--; 319 } 320 321 static void 322 nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net) 323 { 324 mutex_enter(&net->net_cnt_lock); 325 nfs4_ephemeral_tree_decr(net); 326 mutex_exit(&net->net_cnt_lock); 327 } 328 329 /* 330 * Trigger ops for stub vnodes; for mirror mounts, etc. 331 * 332 * The general idea is that a "triggering" op will first call 333 * nfs4_trigger_mount(), which will find out whether a mount has already 334 * been triggered. 335 * 336 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode 337 * of the covering vfs. 338 * 339 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so, 340 * and again set newvp, as above. 341 * 342 * The triggering op may then re-issue the VOP by calling it on newvp. 343 * 344 * Note that some ops may perform custom action, and may or may not need 345 * to trigger a mount. 346 * 347 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We 348 * obviously can't do this with VOP_<whatever>, since it's a stub vnode 349 * and that would just recurse. Instead, we call the v4 op directly, 350 * by name. This is OK, since we know that the vnode is for NFSv4, 351 * otherwise it couldn't be a stub. 352 * 353 */ 354 355 static int 356 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 357 { 358 int error; 359 vnode_t *newvp; 360 361 error = nfs4_trigger_mount(*vpp, cr, &newvp); 362 if (error) 363 return (error); 364 365 /* Release the stub vnode, as we're losing the reference to it */ 366 VN_RELE(*vpp); 367 368 /* Give the caller the root vnode of the newly-mounted fs */ 369 *vpp = newvp; 370 371 /* return with VN_HELD(newvp) */ 372 return (VOP_OPEN(vpp, flag, cr, ct)); 373 } 374 375 /* 376 * For the majority of cases, nfs4_trigger_getattr() will not trigger 377 * a mount. However, if ATTR_TRIGGER is set, we are being informed 378 * that we need to force the mount before we attempt to determine 379 * the attributes. The intent is an atomic operation for security 380 * testing. 381 */ 382 static int 383 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 384 caller_context_t *ct) 385 { 386 int error; 387 388 if (flags & ATTR_TRIGGER) { 389 vnode_t *newvp; 390 391 error = nfs4_trigger_mount(vp, cr, &newvp); 392 if (error) 393 return (error); 394 395 error = VOP_GETATTR(newvp, vap, flags, cr, ct); 396 VN_RELE(newvp); 397 } else { 398 error = nfs4_getattr(vp, vap, flags, cr, ct); 399 } 400 401 return (error); 402 } 403 404 static int 405 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 406 caller_context_t *ct) 407 { 408 int error; 409 vnode_t *newvp; 410 411 error = nfs4_trigger_mount(vp, cr, &newvp); 412 if (error) 413 return (error); 414 415 error = VOP_SETATTR(newvp, vap, flags, cr, ct); 416 VN_RELE(newvp); 417 418 return (error); 419 } 420 421 static int 422 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr, 423 caller_context_t *ct) 424 { 425 int error; 426 vnode_t *newvp; 427 428 error = nfs4_trigger_mount(vp, cr, &newvp); 429 if (error) 430 return (error); 431 432 error = VOP_ACCESS(newvp, mode, flags, cr, ct); 433 VN_RELE(newvp); 434 435 return (error); 436 } 437 438 static int 439 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, 440 struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr, 441 caller_context_t *ct, int *deflags, pathname_t *rpnp) 442 { 443 int error; 444 vnode_t *newdvp; 445 rnode4_t *drp = VTOR4(dvp); 446 447 ASSERT(RP_ISSTUB(drp)); 448 449 /* for now, we only support mirror-mounts */ 450 ASSERT(RP_ISSTUB_MIRRORMOUNT(drp)); 451 452 /* 453 * It's not legal to lookup ".." for an fs root, so we mustn't pass 454 * that up. Instead, pass onto the regular op, regardless of whether 455 * we've triggered a mount. 456 */ 457 if (strcmp(nm, "..") == 0) 458 return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 459 ct, deflags, rpnp)); 460 461 error = nfs4_trigger_mount(dvp, cr, &newdvp); 462 if (error) 463 return (error); 464 465 error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct, 466 deflags, rpnp); 467 VN_RELE(newdvp); 468 469 return (error); 470 } 471 472 static int 473 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va, 474 enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr, 475 int flags, caller_context_t *ct, vsecattr_t *vsecp) 476 { 477 int error; 478 vnode_t *newdvp; 479 480 error = nfs4_trigger_mount(dvp, cr, &newdvp); 481 if (error) 482 return (error); 483 484 error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr, 485 flags, ct, vsecp); 486 VN_RELE(newdvp); 487 488 return (error); 489 } 490 491 static int 492 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, 493 int flags) 494 { 495 int error; 496 vnode_t *newdvp; 497 498 error = nfs4_trigger_mount(dvp, cr, &newdvp); 499 if (error) 500 return (error); 501 502 error = VOP_REMOVE(newdvp, nm, cr, ct, flags); 503 VN_RELE(newdvp); 504 505 return (error); 506 } 507 508 static int 509 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 510 caller_context_t *ct, int flags) 511 { 512 int error; 513 vnode_t *newtdvp; 514 515 error = nfs4_trigger_mount(tdvp, cr, &newtdvp); 516 if (error) 517 return (error); 518 519 /* 520 * We don't check whether svp is a stub. Let the NFSv4 code 521 * detect that error, and return accordingly. 522 */ 523 error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags); 524 VN_RELE(newtdvp); 525 526 return (error); 527 } 528 529 static int 530 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, 531 cred_t *cr, caller_context_t *ct, int flags) 532 { 533 int error; 534 vnode_t *newsdvp; 535 rnode4_t *tdrp = VTOR4(tdvp); 536 537 /* 538 * We know that sdvp is a stub, otherwise we would not be here. 539 * 540 * If tdvp is also be a stub, there are two possibilities: it 541 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)] 542 * or it is a different stub [!VN_CMP(sdvp, tdvp)]. 543 * 544 * In the former case, just trigger sdvp, and treat tdvp as 545 * though it were not a stub. 546 * 547 * In the latter case, it might be a different stub for the 548 * same server fs as sdvp, or for a different server fs. 549 * Regardless, from the client perspective this would still 550 * be a cross-filesystem rename, and should not be allowed, 551 * so return EXDEV, without triggering either mount. 552 */ 553 if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp)) 554 return (EXDEV); 555 556 error = nfs4_trigger_mount(sdvp, cr, &newsdvp); 557 if (error) 558 return (error); 559 560 error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags); 561 562 VN_RELE(newsdvp); 563 564 return (error); 565 } 566 567 /* ARGSUSED */ 568 static int 569 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, 570 cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) 571 { 572 int error; 573 vnode_t *newdvp; 574 575 error = nfs4_trigger_mount(dvp, cr, &newdvp); 576 if (error) 577 return (error); 578 579 error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp); 580 VN_RELE(newdvp); 581 582 return (error); 583 } 584 585 static int 586 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, 587 caller_context_t *ct, int flags) 588 { 589 int error; 590 vnode_t *newdvp; 591 592 error = nfs4_trigger_mount(dvp, cr, &newdvp); 593 if (error) 594 return (error); 595 596 error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags); 597 VN_RELE(newdvp); 598 599 return (error); 600 } 601 602 static int 603 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, 604 cred_t *cr, caller_context_t *ct, int flags) 605 { 606 int error; 607 vnode_t *newdvp; 608 609 error = nfs4_trigger_mount(dvp, cr, &newdvp); 610 if (error) 611 return (error); 612 613 error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags); 614 VN_RELE(newdvp); 615 616 return (error); 617 } 618 619 static int 620 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, 621 caller_context_t *ct) 622 { 623 int error; 624 vnode_t *newvp; 625 626 error = nfs4_trigger_mount(vp, cr, &newvp); 627 if (error) 628 return (error); 629 630 error = VOP_READLINK(newvp, uiop, cr, ct); 631 VN_RELE(newvp); 632 633 return (error); 634 } 635 636 /* end of trigger vnode ops */ 637 638 /* 639 * See if the mount has already been done by another caller. 640 */ 641 static int 642 nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp, 643 bool_t *was_mounted, vfs_t **vfsp) 644 { 645 int error; 646 mntinfo4_t *mi = VTOMI4(vp); 647 648 *was_mounted = FALSE; 649 650 error = vn_vfsrlock_wait(vp); 651 if (error) 652 return (error); 653 654 *vfsp = vn_mountedvfs(vp); 655 if (*vfsp != NULL) { 656 /* the mount has already occurred */ 657 error = VFS_ROOT(*vfsp, newvpp); 658 if (!error) { 659 /* need to update the reference time */ 660 mutex_enter(&mi->mi_lock); 661 if (mi->mi_ephemeral) 662 mi->mi_ephemeral->ne_ref_time = 663 gethrestime_sec(); 664 mutex_exit(&mi->mi_lock); 665 666 *was_mounted = TRUE; 667 } 668 } 669 670 vn_vfsunlock(vp); 671 return (0); 672 } 673 674 /* 675 * Mount upon a trigger vnode; for mirror-mounts, etc. 676 * 677 * The mount may have already occurred, via another thread. If not, 678 * assemble the location information - which may require fetching - and 679 * perform the mount. 680 * 681 * Sets newvp to be the root of the fs that is now covering vp. Note 682 * that we return with VN_HELD(*newvp). 683 * 684 * The caller is responsible for passing the VOP onto the covering fs. 685 */ 686 static int 687 nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp) 688 { 689 int error; 690 vfs_t *vfsp; 691 rnode4_t *rp = VTOR4(vp); 692 mntinfo4_t *mi = VTOMI4(vp); 693 domount_args_t *dma; 694 695 nfs4_ephemeral_tree_t *net; 696 697 bool_t must_unlock = FALSE; 698 bool_t is_building = FALSE; 699 bool_t was_mounted = FALSE; 700 701 cred_t *mcred = NULL; 702 703 nfs4_trigger_globals_t *ntg; 704 705 zone_t *zone = curproc->p_zone; 706 707 ASSERT(RP_ISSTUB(rp)); 708 709 /* for now, we only support mirror-mounts */ 710 ASSERT(RP_ISSTUB_MIRRORMOUNT(rp)); 711 712 *newvpp = NULL; 713 714 /* 715 * Has the mount already occurred? 716 */ 717 error = nfs4_trigger_mounted_already(vp, newvpp, 718 &was_mounted, &vfsp); 719 if (error || was_mounted) 720 goto done; 721 722 ntg = zone_getspecific(nfs4_ephemeral_key, zone); 723 ASSERT(ntg != NULL); 724 725 mutex_enter(&mi->mi_lock); 726 727 /* 728 * We need to lock down the ephemeral tree. 729 */ 730 if (mi->mi_ephemeral_tree == NULL) { 731 net = kmem_zalloc(sizeof (*net), KM_SLEEP); 732 mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL); 733 mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL); 734 net->net_refcnt = 1; 735 net->net_status = NFS4_EPHEMERAL_TREE_BUILDING; 736 is_building = TRUE; 737 738 /* 739 * We need to add it to the zone specific list for 740 * automatic unmounting and harvesting of deadwood. 741 */ 742 mutex_enter(&ntg->ntg_forest_lock); 743 if (ntg->ntg_forest != NULL) 744 net->net_next = ntg->ntg_forest; 745 ntg->ntg_forest = net; 746 mutex_exit(&ntg->ntg_forest_lock); 747 748 /* 749 * No lock order confusion with mi_lock because no 750 * other node could have grabbed net_tree_lock. 751 */ 752 mutex_enter(&net->net_tree_lock); 753 mi->mi_ephemeral_tree = net; 754 net->net_mount = mi; 755 mutex_exit(&mi->mi_lock); 756 } else { 757 net = mi->mi_ephemeral_tree; 758 nfs4_ephemeral_tree_hold(net); 759 760 mutex_exit(&mi->mi_lock); 761 762 mutex_enter(&net->net_tree_lock); 763 764 /* 765 * We can only procede if the tree is neither locked 766 * nor being torn down. 767 */ 768 mutex_enter(&net->net_cnt_lock); 769 if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) { 770 nfs4_ephemeral_tree_decr(net); 771 mutex_exit(&net->net_cnt_lock); 772 mutex_exit(&net->net_tree_lock); 773 774 return (EIO); 775 } 776 mutex_exit(&net->net_cnt_lock); 777 } 778 779 mutex_enter(&net->net_cnt_lock); 780 net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING; 781 mutex_exit(&net->net_cnt_lock); 782 783 must_unlock = TRUE; 784 785 dma = nfs4_trigger_domount_args_create(vp); 786 if (dma == NULL) { 787 error = EINVAL; 788 goto done; 789 } 790 791 /* 792 * Note that since we define mirror mounts to work 793 * for any user, we simply extend the privileges of 794 * the user's credentials to allow the mount to 795 * proceed. 796 */ 797 mcred = crdup(cr); 798 if (mcred == NULL) { 799 error = EINVAL; 800 goto done; 801 } 802 803 crset_zone_privall(mcred); 804 805 error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp); 806 nfs4_trigger_domount_args_destroy(dma, vp); 807 808 crfree(mcred); 809 810 done: 811 812 if (must_unlock) { 813 mutex_enter(&net->net_cnt_lock); 814 net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING; 815 if (is_building) 816 net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING; 817 nfs4_ephemeral_tree_decr(net); 818 mutex_exit(&net->net_cnt_lock); 819 820 mutex_exit(&net->net_tree_lock); 821 } 822 823 if (!error && (newvpp == NULL || *newvpp == NULL)) 824 error = ENOSYS; 825 826 return (error); 827 } 828 829 /* 830 * Collect together both the generic & mount-type specific args. 831 */ 832 static domount_args_t * 833 nfs4_trigger_domount_args_create(vnode_t *vp) 834 { 835 int nointr; 836 char *hostlist; 837 servinfo4_t *svp; 838 struct nfs_args *nargs, *nargs_head; 839 enum clnt_stat status; 840 ephemeral_servinfo_t *esi, *esi_first; 841 domount_args_t *dma; 842 mntinfo4_t *mi = VTOMI4(vp); 843 844 nointr = !(mi->mi_flags & MI4_INT); 845 hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 846 847 svp = mi->mi_curr_serv; 848 /* check if the current server is responding */ 849 status = nfs4_trigger_ping_server(svp, nointr); 850 if (status == RPC_SUCCESS) { 851 esi_first = nfs4_trigger_esi_create(vp, svp); 852 if (esi_first == NULL) { 853 kmem_free(hostlist, MAXPATHLEN); 854 return (NULL); 855 } 856 857 (void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN); 858 859 nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first); 860 } else { 861 /* current server did not respond */ 862 esi_first = NULL; 863 nargs_head = NULL; 864 } 865 nargs = nargs_head; 866 867 /* 868 * NFS RO failover. 869 * 870 * If we have multiple servinfo4 structures, linked via sv_next, 871 * we must create one nfs_args for each, linking the nfs_args via 872 * nfs_ext_u.nfs_extB.next. 873 * 874 * We need to build a corresponding esi for each, too, but that is 875 * used solely for building nfs_args, and may be immediately 876 * discarded, as domount() requires the info from just one esi, 877 * but all the nfs_args. 878 * 879 * Currently, the NFS mount code will hang if not all servers 880 * requested are available. To avoid that, we need to ping each 881 * server, here, and remove it from the list if it is not 882 * responding. This has the side-effect of that server then 883 * being permanently unavailable for this failover mount, even if 884 * it recovers. That's unfortunate, but the best we can do until 885 * the mount code path is fixed. 886 */ 887 888 /* 889 * If the current server was down, loop indefinitely until we find 890 * at least one responsive server. 891 */ 892 do { 893 /* no locking needed for sv_next; it is only set at fs mount */ 894 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 895 struct nfs_args *next; 896 897 /* 898 * nargs_head: the head of the nfs_args list 899 * nargs: the current tail of the list 900 * next: the newly-created element to be added 901 */ 902 903 /* 904 * We've already tried the current server, above; 905 * if it was responding, we have already included it 906 * and it may now be ignored. 907 * 908 * Otherwise, try it again, since it may now have 909 * recovered. 910 */ 911 if (svp == mi->mi_curr_serv && esi_first != NULL) 912 continue; 913 914 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 915 if (svp->sv_flags & SV4_NOTINUSE) { 916 nfs_rw_exit(&svp->sv_lock); 917 continue; 918 } 919 nfs_rw_exit(&svp->sv_lock); 920 921 /* check if the server is responding */ 922 status = nfs4_trigger_ping_server(svp, nointr); 923 /* if the server did not respond, ignore it */ 924 if (status != RPC_SUCCESS) 925 continue; 926 927 esi = nfs4_trigger_esi_create(vp, svp); 928 if (esi == NULL) 929 continue; 930 931 /* 932 * If the original current server (mi_curr_serv) 933 * was down when when we first tried it, 934 * (i.e. esi_first == NULL), 935 * we select this new server (svp) to be the server 936 * that we will actually contact (esi_first). 937 * 938 * Note that it's possible that mi_curr_serv == svp, 939 * if that mi_curr_serv was down but has now recovered. 940 */ 941 next = nfs4_trigger_nargs_create(mi, svp, esi); 942 if (esi_first == NULL) { 943 ASSERT(nargs == NULL); 944 ASSERT(nargs_head == NULL); 945 nargs_head = next; 946 esi_first = esi; 947 (void) strlcpy(hostlist, 948 esi_first->esi_hostname, MAXPATHLEN); 949 } else { 950 ASSERT(nargs_head != NULL); 951 nargs->nfs_ext_u.nfs_extB.next = next; 952 (void) strlcat(hostlist, ",", MAXPATHLEN); 953 (void) strlcat(hostlist, esi->esi_hostname, 954 MAXPATHLEN); 955 /* esi was only needed for hostname & nargs */ 956 nfs4_trigger_esi_destroy(esi, vp); 957 } 958 959 nargs = next; 960 } 961 962 /* if we've had no response at all, wait a second */ 963 if (esi_first == NULL) 964 delay(drv_usectohz(1000000)); 965 966 } while (esi_first == NULL); 967 ASSERT(nargs_head != NULL); 968 969 dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP); 970 dma->dma_esi = esi_first; 971 dma->dma_hostlist = hostlist; 972 dma->dma_nargs = nargs_head; 973 974 return (dma); 975 } 976 977 static void 978 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp) 979 { 980 if (dma != NULL) { 981 if (dma->dma_esi != NULL && vp != NULL) 982 nfs4_trigger_esi_destroy(dma->dma_esi, vp); 983 984 if (dma->dma_hostlist != NULL) 985 kmem_free(dma->dma_hostlist, MAXPATHLEN); 986 987 if (dma->dma_nargs != NULL) { 988 struct nfs_args *nargs = dma->dma_nargs; 989 990 do { 991 struct nfs_args *next = 992 nargs->nfs_ext_u.nfs_extB.next; 993 994 nfs4_trigger_nargs_destroy(nargs); 995 nargs = next; 996 } while (nargs != NULL); 997 } 998 999 kmem_free(dma, sizeof (domount_args_t)); 1000 } 1001 } 1002 1003 /* 1004 * The ephemeral_servinfo_t struct contains basic information we will need to 1005 * perform the mount. Whilst the structure is generic across different 1006 * types of ephemeral mount, the way we gather its contents differs. 1007 */ 1008 static ephemeral_servinfo_t * 1009 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp) 1010 { 1011 ephemeral_servinfo_t *esi; 1012 rnode4_t *rp = VTOR4(vp); 1013 1014 ASSERT(RP_ISSTUB(rp)); 1015 1016 /* Call the ephemeral type-specific routine */ 1017 if (RP_ISSTUB_MIRRORMOUNT(rp)) 1018 esi = nfs4_trigger_esi_create_mirrormount(vp, svp); 1019 else 1020 esi = NULL; 1021 1022 /* for now, we only support mirror-mounts */ 1023 ASSERT(esi != NULL); 1024 1025 return (esi); 1026 } 1027 1028 static void 1029 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp) 1030 { 1031 rnode4_t *rp = VTOR4(vp); 1032 1033 ASSERT(RP_ISSTUB(rp)); 1034 1035 /* for now, we only support mirror-mounts */ 1036 ASSERT(RP_ISSTUB_MIRRORMOUNT(rp)); 1037 1038 /* Currently, no need for an ephemeral type-specific routine */ 1039 1040 /* 1041 * The contents of ephemeral_servinfo_t goes into nfs_args, 1042 * and will be handled by nfs4_trigger_nargs_destroy(). 1043 * We need only free the structure itself. 1044 */ 1045 if (esi != NULL) 1046 kmem_free(esi, sizeof (ephemeral_servinfo_t)); 1047 } 1048 1049 /* 1050 * Some of this may turn out to be common with other ephemeral types, 1051 * in which case it should be moved to nfs4_trigger_esi_create(), or a 1052 * common function called. 1053 */ 1054 static ephemeral_servinfo_t * 1055 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp) 1056 { 1057 char *stubpath; 1058 struct knetconfig *sikncp, *svkncp; 1059 struct netbuf *bufp; 1060 ephemeral_servinfo_t *esi; 1061 1062 esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP); 1063 1064 /* initially set to be our type of ephemeral mount; may be added to */ 1065 esi->esi_mount_flags = NFSMNT_MIRRORMOUNT; 1066 1067 /* 1068 * We're copying info from the stub rnode's servinfo4, but 1069 * we must create new copies, not pointers, since this information 1070 * is to be associated with the new mount, which will be 1071 * unmounted (and its structures freed) separately 1072 */ 1073 1074 /* 1075 * Sizes passed to kmem_[z]alloc here must match those freed 1076 * in nfs4_free_args() 1077 */ 1078 1079 /* 1080 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this 1081 * is difficult to avoid: as we need to read svp to calculate the 1082 * sizes to be allocated. 1083 */ 1084 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1085 1086 esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP); 1087 (void) strcat(esi->esi_hostname, svp->sv_hostname); 1088 1089 esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP); 1090 bufp = esi->esi_addr; 1091 bufp->len = svp->sv_addr.len; 1092 bufp->maxlen = svp->sv_addr.maxlen; 1093 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP); 1094 bcopy(svp->sv_addr.buf, bufp->buf, bufp->len); 1095 1096 esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP); 1097 sikncp = esi->esi_knconf; 1098 svkncp = svp->sv_knconf; 1099 sikncp->knc_semantics = svkncp->knc_semantics; 1100 sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP); 1101 (void) strcat((char *)sikncp->knc_protofmly, 1102 (char *)svkncp->knc_protofmly); 1103 sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP); 1104 (void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto); 1105 sikncp->knc_rdev = svkncp->knc_rdev; 1106 1107 /* 1108 * Used when AUTH_DH is negotiated. 1109 * 1110 * This is ephemeral mount-type specific, since it contains the 1111 * server's time-sync syncaddr. 1112 */ 1113 if (svp->sv_dhsec) { 1114 struct netbuf *bufp; 1115 sec_data_t *sdata; 1116 dh_k4_clntdata_t *data; 1117 1118 sdata = svp->sv_dhsec; 1119 data = (dh_k4_clntdata_t *)sdata->data; 1120 ASSERT(sdata->rpcflavor == AUTH_DH); 1121 1122 bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP); 1123 bufp->len = data->syncaddr.len; 1124 bufp->maxlen = data->syncaddr.maxlen; 1125 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP); 1126 bcopy(data->syncaddr.buf, bufp->buf, bufp->len); 1127 esi->esi_syncaddr = bufp; 1128 1129 if (data->netname != NULL) { 1130 int nmlen = data->netnamelen; 1131 1132 /* 1133 * We need to copy from a dh_k4_clntdata_t 1134 * netname/netnamelen pair to a NUL-terminated 1135 * netname string suitable for putting in nfs_args, 1136 * where the latter has no netnamelen field. 1137 */ 1138 esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP); 1139 bcopy(data->netname, esi->esi_netname, nmlen); 1140 } 1141 } else { 1142 esi->esi_syncaddr = NULL; 1143 esi->esi_netname = NULL; 1144 } 1145 1146 stubpath = fn_path(VTOSV(vp)->sv_name); 1147 /* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */ 1148 ASSERT(*stubpath == '.'); 1149 stubpath += 1; 1150 1151 /* for nfs_args->fh */ 1152 esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1; 1153 esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP); 1154 (void) strcat(esi->esi_path, svp->sv_path); 1155 (void) strcat(esi->esi_path, stubpath); 1156 1157 stubpath -= 1; 1158 /* stubpath allocated by fn_path() */ 1159 kmem_free(stubpath, strlen(stubpath) + 1); 1160 1161 nfs_rw_exit(&svp->sv_lock); 1162 1163 return (esi); 1164 } 1165 1166 /* 1167 * Assemble the args, and call the generic VFS mount function to 1168 * finally perform the ephemeral mount. 1169 */ 1170 static int 1171 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp, 1172 cred_t *cr, vnode_t **newvpp) 1173 { 1174 struct mounta *uap; 1175 char *mntpt, *orig_path, *path; 1176 const char *orig_mntpt; 1177 int retval; 1178 int mntpt_len; 1179 int spec_len; 1180 zone_t *zone = curproc->p_zone; 1181 bool_t has_leading_slash; 1182 int i; 1183 1184 vfs_t *stubvfsp = stubvp->v_vfsp; 1185 ephemeral_servinfo_t *esi = dma->dma_esi; 1186 struct nfs_args *nargs = dma->dma_nargs; 1187 1188 /* first, construct the mount point for the ephemeral mount */ 1189 orig_path = path = fn_path(VTOSV(stubvp)->sv_name); 1190 orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt); 1191 1192 if (*orig_path == '.') 1193 orig_path++; 1194 1195 /* 1196 * Get rid of zone's root path 1197 */ 1198 if (zone != global_zone) { 1199 /* 1200 * -1 for trailing '/' and -1 for EOS. 1201 */ 1202 if (strncmp(zone->zone_rootpath, orig_mntpt, 1203 zone->zone_rootpathlen - 1) == 0) { 1204 orig_mntpt += (zone->zone_rootpathlen - 2); 1205 } 1206 } 1207 1208 mntpt_len = strlen(orig_mntpt) + strlen(orig_path); 1209 mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP); 1210 (void) strcat(mntpt, orig_mntpt); 1211 (void) strcat(mntpt, orig_path); 1212 1213 kmem_free(path, strlen(path) + 1); 1214 path = esi->esi_path; 1215 if (*path == '.') 1216 path++; 1217 if (path[0] == '/' && path[1] == '/') 1218 path++; 1219 has_leading_slash = (*path == '/'); 1220 1221 spec_len = strlen(dma->dma_hostlist); 1222 spec_len += strlen(path); 1223 1224 /* We are going to have to add this in */ 1225 if (!has_leading_slash) 1226 spec_len++; 1227 1228 /* We need to get the ':' for dma_hostlist:esi_path */ 1229 spec_len++; 1230 1231 uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP); 1232 uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP); 1233 (void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist, 1234 has_leading_slash ? "" : "/", path); 1235 1236 uap->dir = mntpt; 1237 1238 uap->flags = MS_SYSSPACE | MS_DATA; 1239 /* fstype-independent mount options not covered elsewhere */ 1240 /* copy parent's mount(1M) "-m" flag */ 1241 if (stubvfsp->vfs_flag & VFS_NOMNTTAB) 1242 uap->flags |= MS_NOMNTTAB; 1243 1244 uap->fstype = MNTTYPE_NFS4; 1245 uap->dataptr = (char *)nargs; 1246 /* not needed for MS_SYSSPACE */ 1247 uap->datalen = 0; 1248 1249 /* use optptr to pass in extra mount options */ 1250 uap->flags |= MS_OPTIONSTR; 1251 uap->optptr = nfs4_trigger_create_mntopts(stubvfsp); 1252 if (uap->optptr == NULL) { 1253 retval = EINVAL; 1254 goto done; 1255 } 1256 1257 /* domount() expects us to count the trailing NUL */ 1258 uap->optlen = strlen(uap->optptr) + 1; 1259 1260 /* 1261 * If we get EBUSY, we try again once to see if we can perform 1262 * the mount. We do this because of a spurious race condition. 1263 */ 1264 for (i = 0; i < 2; i++) { 1265 int error; 1266 bool_t was_mounted; 1267 1268 retval = domount(NULL, uap, stubvp, cr, vfsp); 1269 if (retval == 0) { 1270 retval = VFS_ROOT(*vfsp, newvpp); 1271 VFS_RELE(*vfsp); 1272 break; 1273 } else if (retval != EBUSY) { 1274 break; 1275 } 1276 1277 /* 1278 * We might find it mounted by the other racer... 1279 */ 1280 error = nfs4_trigger_mounted_already(stubvp, 1281 newvpp, &was_mounted, vfsp); 1282 if (error) { 1283 goto done; 1284 } else if (was_mounted) { 1285 retval = 0; 1286 break; 1287 } 1288 } 1289 1290 done: 1291 if (uap->optptr) 1292 nfs4_trigger_destroy_mntopts(uap->optptr); 1293 1294 kmem_free(uap->spec, spec_len + 1); 1295 kmem_free(uap, sizeof (struct mounta)); 1296 kmem_free(mntpt, mntpt_len + 1); 1297 1298 return (retval); 1299 } 1300 1301 /* 1302 * Build an nfs_args structure for passing to domount(). 1303 * 1304 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t; 1305 * generic data - common to all ephemeral mount types - is read directly 1306 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode. 1307 */ 1308 static struct nfs_args * 1309 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp, 1310 ephemeral_servinfo_t *esi) 1311 { 1312 sec_data_t *secdata; 1313 struct nfs_args *nargs; 1314 1315 /* setup the nfs args */ 1316 nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP); 1317 1318 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1319 1320 nargs->addr = esi->esi_addr; 1321 1322 /* for AUTH_DH by negotiation */ 1323 if (esi->esi_syncaddr || esi->esi_netname) { 1324 nargs->flags |= NFSMNT_SECURE; 1325 nargs->syncaddr = esi->esi_syncaddr; 1326 nargs->netname = esi->esi_netname; 1327 } 1328 1329 nargs->flags |= NFSMNT_KNCONF; 1330 nargs->knconf = esi->esi_knconf; 1331 nargs->flags |= NFSMNT_HOSTNAME; 1332 nargs->hostname = esi->esi_hostname; 1333 nargs->fh = esi->esi_path; 1334 1335 /* general mount settings, all copied from parent mount */ 1336 mutex_enter(&mi->mi_lock); 1337 1338 if (!(mi->mi_flags & MI4_HARD)) 1339 nargs->flags |= NFSMNT_SOFT; 1340 1341 nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO | 1342 NFSMNT_RETRANS; 1343 nargs->wsize = mi->mi_stsize; 1344 nargs->rsize = mi->mi_tsize; 1345 nargs->timeo = mi->mi_timeo; 1346 nargs->retrans = mi->mi_retrans; 1347 1348 if (mi->mi_flags & MI4_INT) 1349 nargs->flags |= NFSMNT_INT; 1350 if (mi->mi_flags & MI4_NOAC) 1351 nargs->flags |= NFSMNT_NOAC; 1352 1353 nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN | 1354 NFSMNT_ACDIRMAX; 1355 nargs->acregmin = HR2SEC(mi->mi_acregmin); 1356 nargs->acregmax = HR2SEC(mi->mi_acregmax); 1357 nargs->acdirmin = HR2SEC(mi->mi_acdirmin); 1358 nargs->acdirmax = HR2SEC(mi->mi_acdirmax); 1359 1360 if (mi->mi_flags & MI4_NOCTO) 1361 nargs->flags |= NFSMNT_NOCTO; 1362 if (mi->mi_flags & MI4_GRPID) 1363 nargs->flags |= NFSMNT_GRPID; 1364 if (mi->mi_flags & MI4_LLOCK) 1365 nargs->flags |= NFSMNT_LLOCK; 1366 if (mi->mi_flags & MI4_NOPRINT) 1367 nargs->flags |= NFSMNT_NOPRINT; 1368 if (mi->mi_flags & MI4_DIRECTIO) 1369 nargs->flags |= NFSMNT_DIRECTIO; 1370 if (mi->mi_flags & MI4_PUBLIC) 1371 nargs->flags |= NFSMNT_PUBLIC; 1372 1373 mutex_exit(&mi->mi_lock); 1374 1375 /* add any specific flags for this type of ephemeral mount */ 1376 nargs->flags |= esi->esi_mount_flags; 1377 1378 /* 1379 * Security data & negotiation policy. 1380 * 1381 * We need to preserve the parent mount's preference for security 1382 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT. 1383 * 1384 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific 1385 * security flavour was requested, with data in sv_secdata, and that 1386 * no negotiation should occur. If this specified flavour fails, that's 1387 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT. 1388 * 1389 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in 1390 * default flavour, in sv_secdata, but then negotiate a new flavour. 1391 * Possible flavours are recorded in an array in sv_secinfo, with 1392 * currently in-use flavour pointed to by sv_currsec. 1393 * 1394 * If sv_currsec is set, i.e. if negotiation has already occurred, 1395 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless, 1396 * we will set NFSMNT_SECDEFAULT, to enable negotiation. 1397 */ 1398 if (svp->sv_flags & SV4_TRYSECDEFAULT) { 1399 /* enable negotiation for ephemeral mount */ 1400 nargs->flags |= NFSMNT_SECDEFAULT; 1401 1402 /* 1403 * As a starting point for negotiation, copy parent 1404 * mount's negotiated flavour (sv_currsec) if available, 1405 * or its passed-in flavour (sv_secdata) if not. 1406 */ 1407 if (svp->sv_currsec != NULL) 1408 secdata = copy_sec_data(svp->sv_currsec); 1409 else if (svp->sv_secdata != NULL) 1410 secdata = copy_sec_data(svp->sv_secdata); 1411 else 1412 secdata = NULL; 1413 } else { 1414 /* do not enable negotiation; copy parent's passed-in flavour */ 1415 if (svp->sv_secdata != NULL) 1416 secdata = copy_sec_data(svp->sv_secdata); 1417 else 1418 secdata = NULL; 1419 } 1420 1421 nfs_rw_exit(&svp->sv_lock); 1422 1423 nargs->flags |= NFSMNT_NEWARGS; 1424 nargs->nfs_args_ext = NFS_ARGS_EXTB; 1425 nargs->nfs_ext_u.nfs_extB.secdata = secdata; 1426 1427 /* for NFS RO failover; caller will set if necessary */ 1428 nargs->nfs_ext_u.nfs_extB.next = NULL; 1429 1430 return (nargs); 1431 } 1432 1433 static void 1434 nfs4_trigger_nargs_destroy(struct nfs_args *nargs) 1435 { 1436 /* 1437 * Either the mount failed, in which case the data is not needed, or 1438 * nfs4_mount() has either taken copies of what it needs or, 1439 * where it has merely copied the ptr, it has set *our* ptr to NULL, 1440 * whereby nfs4_free_args() will ignore it. 1441 */ 1442 nfs4_free_args(nargs); 1443 kmem_free(nargs, sizeof (struct nfs_args)); 1444 } 1445 1446 /* 1447 * When we finally get into the mounting, we need to add this 1448 * node to the ephemeral tree. 1449 * 1450 * This is called from nfs4_mount(). 1451 */ 1452 int 1453 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp) 1454 { 1455 mntinfo4_t *mi_parent; 1456 nfs4_ephemeral_t *eph; 1457 nfs4_ephemeral_tree_t *net; 1458 1459 nfs4_ephemeral_t *prior; 1460 nfs4_ephemeral_t *child; 1461 1462 nfs4_ephemeral_t *peer; 1463 1464 nfs4_trigger_globals_t *ntg; 1465 zone_t *zone = curproc->p_zone; 1466 1467 int rc = 0; 1468 1469 mi_parent = VTOMI4(mvp); 1470 1471 /* 1472 * Get this before grabbing anything else! 1473 */ 1474 ntg = zone_getspecific(nfs4_ephemeral_key, zone); 1475 if (!ntg->ntg_thread_started) { 1476 nfs4_ephemeral_start_harvester(ntg); 1477 } 1478 1479 mutex_enter(&mi_parent->mi_lock); 1480 mutex_enter(&mi->mi_lock); 1481 1482 net = mi->mi_ephemeral_tree = 1483 mi_parent->mi_ephemeral_tree; 1484 1485 /* 1486 * If the mi_ephemeral_tree is NULL, then it 1487 * means that either the harvester or a manual 1488 * umount has cleared the tree out right before 1489 * we got here. 1490 * 1491 * There is nothing we can do here, so return 1492 * to the caller and let them decide whether they 1493 * try again. 1494 */ 1495 if (net == NULL) { 1496 mutex_exit(&mi->mi_lock); 1497 mutex_exit(&mi_parent->mi_lock); 1498 1499 return (EBUSY); 1500 } 1501 1502 nfs4_ephemeral_tree_hold(net); 1503 1504 /* 1505 * We need to tack together the ephemeral mount 1506 * with this new mntinfo. 1507 */ 1508 eph = kmem_zalloc(sizeof (*eph), KM_SLEEP); 1509 eph->ne_mount = mi; 1510 eph->ne_ref_time = gethrestime_sec(); 1511 1512 /* 1513 * We need to tell the ephemeral mount when 1514 * to time out. 1515 */ 1516 eph->ne_mount_to = ntg->ntg_mount_to; 1517 1518 mi->mi_flags |= MI4_EPHEMERAL; 1519 mi->mi_ephemeral = eph; 1520 1521 /* 1522 * If the enclosing mntinfo4 is also ephemeral, 1523 * then we need to point to its enclosing parent. 1524 * Else the enclosing mntinfo4 is the enclosing parent. 1525 * 1526 * We also need to weave this ephemeral node 1527 * into the tree. 1528 */ 1529 if (mi_parent->mi_flags & MI4_EPHEMERAL) { 1530 /* 1531 * We need to decide if we are 1532 * the root node of this branch 1533 * or if we are a sibling of this 1534 * branch. 1535 */ 1536 prior = mi_parent->mi_ephemeral; 1537 if (prior == NULL) { 1538 /* 1539 * Race condition, clean up, and 1540 * let caller handle mntinfo. 1541 */ 1542 mi->mi_flags &= ~MI4_EPHEMERAL; 1543 mi->mi_ephemeral = NULL; 1544 kmem_free(eph, sizeof (*eph)); 1545 rc = EBUSY; 1546 } else { 1547 if (prior->ne_child == NULL) { 1548 prior->ne_child = eph; 1549 } else { 1550 child = prior->ne_child; 1551 1552 prior->ne_child = eph; 1553 eph->ne_peer = child; 1554 1555 child->ne_prior = eph; 1556 } 1557 1558 eph->ne_prior = prior; 1559 } 1560 } else { 1561 /* 1562 * The parent mntinfo4 is the non-ephemeral 1563 * root of the ephemeral tree. We 1564 * need to decide if we are the root 1565 * node of that tree or if we are a 1566 * sibling of the root node. 1567 * 1568 * We are the root if there is no 1569 * other node. 1570 */ 1571 if (net->net_root == NULL) { 1572 net->net_root = eph; 1573 } else { 1574 eph->ne_peer = peer = net->net_root; 1575 ASSERT(peer != NULL); 1576 net->net_root = eph; 1577 1578 peer->ne_prior = eph; 1579 } 1580 1581 eph->ne_prior = NULL; 1582 } 1583 1584 nfs4_ephemeral_tree_rele(net); 1585 1586 mutex_exit(&mi->mi_lock); 1587 mutex_exit(&mi_parent->mi_lock); 1588 1589 return (rc); 1590 } 1591 1592 /* 1593 * Commit the changes to the ephemeral tree for removing this node. 1594 */ 1595 static void 1596 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph) 1597 { 1598 nfs4_ephemeral_t *e = eph; 1599 nfs4_ephemeral_t *peer; 1600 nfs4_ephemeral_t *prior; 1601 1602 peer = eph->ne_peer; 1603 prior = e->ne_prior; 1604 1605 /* 1606 * If this branch root was not the 1607 * tree root, then we need to fix back pointers. 1608 */ 1609 if (prior) { 1610 if (prior->ne_child == e) { 1611 prior->ne_child = peer; 1612 } else { 1613 prior->ne_peer = peer; 1614 } 1615 1616 if (peer) 1617 peer->ne_prior = prior; 1618 } else if (peer) { 1619 peer->ne_mount->mi_ephemeral_tree->net_root = peer; 1620 peer->ne_prior = NULL; 1621 } else { 1622 e->ne_mount->mi_ephemeral_tree->net_root = NULL; 1623 } 1624 } 1625 1626 /* 1627 * We want to avoid recursion at all costs. So we need to 1628 * unroll the tree. We do this by a depth first traversal to 1629 * leaf nodes. We blast away the leaf and work our way back 1630 * up and down the tree. 1631 */ 1632 static int 1633 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph, 1634 int isTreeRoot, int flag, cred_t *cr) 1635 { 1636 nfs4_ephemeral_t *e = eph; 1637 nfs4_ephemeral_t *prior; 1638 mntinfo4_t *mi; 1639 vfs_t *vfsp; 1640 int error; 1641 1642 /* 1643 * We use the loop while unrolling the ephemeral tree. 1644 */ 1645 for (;;) { 1646 /* 1647 * First we walk down the child. 1648 */ 1649 if (e->ne_child) { 1650 prior = e; 1651 e = e->ne_child; 1652 continue; 1653 } 1654 1655 /* 1656 * If we are the root of the branch we are removing, 1657 * we end it here. But if the branch is the root of 1658 * the tree, we have to forge on. We do not consider 1659 * the peer list for the root because while it may 1660 * be okay to remove, it is both extra work and a 1661 * potential for a false-positive error to stall the 1662 * unmount attempt. 1663 */ 1664 if (e == eph && isTreeRoot == FALSE) 1665 return (0); 1666 1667 /* 1668 * Next we walk down the peer list. 1669 */ 1670 if (e->ne_peer) { 1671 prior = e; 1672 e = e->ne_peer; 1673 continue; 1674 } 1675 1676 /* 1677 * We can only remove the node passed in by the 1678 * caller if it is the root of the ephemeral tree. 1679 * Otherwise, the caller will remove it. 1680 */ 1681 if (e == eph && isTreeRoot == FALSE) 1682 return (0); 1683 1684 /* 1685 * Okay, we have a leaf node, time 1686 * to prune it! 1687 * 1688 * Note that prior can only be NULL if 1689 * and only if it is the root of the 1690 * ephemeral tree. 1691 */ 1692 prior = e->ne_prior; 1693 1694 mi = e->ne_mount; 1695 mutex_enter(&mi->mi_lock); 1696 vfsp = mi->mi_vfsp; 1697 1698 /* 1699 * Cleared by umount2_engine. 1700 */ 1701 VFS_HOLD(vfsp); 1702 1703 /* 1704 * Inform nfs4_unmount to not recursively 1705 * descend into this node's children when it 1706 * gets processed. 1707 */ 1708 mi->mi_flags |= MI4_EPHEMERAL_RECURSED; 1709 mutex_exit(&mi->mi_lock); 1710 1711 error = umount2_engine(vfsp, flag, cr, FALSE); 1712 if (error) { 1713 /* 1714 * We need to reenable nfs4_unmount's ability 1715 * to recursively descend on this node. 1716 */ 1717 mutex_enter(&mi->mi_lock); 1718 mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED; 1719 mutex_exit(&mi->mi_lock); 1720 1721 return (error); 1722 } 1723 1724 /* 1725 * If we are the current node, we do not want to 1726 * touch anything else. At this point, the only 1727 * way the current node can have survived to here 1728 * is if it is the root of the ephemeral tree and 1729 * we are unmounting the enclosing mntinfo4. 1730 */ 1731 if (e == eph) { 1732 ASSERT(prior == NULL); 1733 return (0); 1734 } 1735 1736 /* 1737 * Stitch up the prior node. Note that since 1738 * we have handled the root of the tree, prior 1739 * must be non-NULL. 1740 */ 1741 ASSERT(prior != NULL); 1742 if (prior->ne_child == e) { 1743 prior->ne_child = NULL; 1744 } else { 1745 ASSERT(prior->ne_peer == e); 1746 1747 prior->ne_peer = NULL; 1748 } 1749 1750 e = prior; 1751 } 1752 1753 /* NOTREACHED */ 1754 } 1755 1756 /* 1757 * Common code to safely release net_cnt_lock and net_tree_lock 1758 */ 1759 void 1760 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock, 1761 bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet) 1762 { 1763 nfs4_ephemeral_tree_t *net = *pnet; 1764 1765 if (*pmust_unlock) { 1766 mutex_enter(&net->net_cnt_lock); 1767 net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING; 1768 if (*pmust_rele) 1769 nfs4_ephemeral_tree_decr(net); 1770 mutex_exit(&net->net_cnt_lock); 1771 1772 mutex_exit(&net->net_tree_lock); 1773 1774 *pmust_unlock = FALSE; 1775 } 1776 } 1777 1778 /* 1779 * While we may have removed any child or sibling nodes of this 1780 * ephemeral node, we can not nuke it until we know that there 1781 * were no actived vnodes on it. This will do that final 1782 * work once we know it is not busy. 1783 */ 1784 void 1785 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock, 1786 bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet) 1787 { 1788 /* 1789 * Now we need to get rid of the ephemeral data if it exists. 1790 */ 1791 mutex_enter(&mi->mi_lock); 1792 if (mi->mi_ephemeral) { 1793 /* 1794 * If we are the root node of an ephemeral branch 1795 * which is being removed, then we need to fixup 1796 * pointers into and out of the node. 1797 */ 1798 if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED)) 1799 nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral); 1800 1801 ASSERT(mi->mi_ephemeral != NULL); 1802 1803 kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral)); 1804 mi->mi_ephemeral = NULL; 1805 } 1806 mutex_exit(&mi->mi_lock); 1807 1808 nfs4_ephemeral_umount_unlock(pmust_unlock, pmust_rele, pnet); 1809 } 1810 1811 /* 1812 * Unmount an ephemeral node. 1813 */ 1814 int 1815 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr, 1816 bool_t *pmust_unlock, bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet) 1817 { 1818 int error = 0; 1819 nfs4_ephemeral_t *eph; 1820 nfs4_ephemeral_tree_t *net; 1821 int is_derooting = FALSE; 1822 int is_recursed = FALSE; 1823 int was_locked = FALSE; 1824 1825 /* 1826 * Make sure to set the default state for cleaning 1827 * up the tree in the caller (and on the way out). 1828 */ 1829 *pmust_unlock = *pmust_rele = FALSE; 1830 1831 /* 1832 * The active vnodes on this file system may be ephemeral 1833 * children. We need to check for and try to unmount them 1834 * here. If any can not be unmounted, we are going 1835 * to return EBUSY. 1836 */ 1837 mutex_enter(&mi->mi_lock); 1838 1839 /* 1840 * If an ephemeral tree, we need to check to see if 1841 * the lock is already held. If it is, then we need 1842 * to see if we are being called as a result of 1843 * the recursive removal of some node of the tree or 1844 * if we are another attempt to remove the tree. 1845 * 1846 * mi_flags & MI4_EPHEMERAL indicates an ephemeral 1847 * node. mi_ephemeral being non-NULL also does this. 1848 * 1849 * mi_ephemeral_tree being non-NULL is sufficient 1850 * to also indicate either it is an ephemeral node 1851 * or the enclosing mntinfo4. 1852 * 1853 * Do we need MI4_EPHEMERAL? Yes, it is useful for 1854 * when we delete the ephemeral node and need to 1855 * differentiate from an ephemeral node and the 1856 * enclosing root node. 1857 */ 1858 *pnet = net = mi->mi_ephemeral_tree; 1859 if (net == NULL) { 1860 mutex_exit(&mi->mi_lock); 1861 return (0); 1862 } 1863 1864 eph = mi->mi_ephemeral; 1865 is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED; 1866 is_derooting = (eph == NULL); 1867 1868 /* 1869 * If this is not recursion, then we need to 1870 * grab a ref count. 1871 * 1872 * But wait, we also do not want to do that 1873 * if a harvester thread has already grabbed 1874 * the lock. 1875 */ 1876 if (!is_recursed) { 1877 mutex_enter(&net->net_cnt_lock); 1878 if (net->net_status & 1879 NFS4_EPHEMERAL_TREE_LOCKED) { 1880 /* 1881 * If the tree is locked, we need 1882 * to decide whether we are the 1883 * harvester or some explicit call 1884 * for a umount. The only way that 1885 * we are the harvester is if 1886 * MS_SYSSPACE is set. 1887 * 1888 * We only let the harvester through 1889 * at this point. 1890 * 1891 * We return EBUSY so that the 1892 * caller knows something is 1893 * going on. Note that by that 1894 * time, the umount in the other 1895 * thread may have already occured. 1896 */ 1897 if (!(flag & MS_SYSSPACE)) { 1898 mutex_exit(&net->net_cnt_lock); 1899 mutex_exit(&mi->mi_lock); 1900 1901 return (EBUSY); 1902 } 1903 1904 was_locked = TRUE; 1905 } else { 1906 nfs4_ephemeral_tree_incr(net); 1907 *pmust_rele = TRUE; 1908 } 1909 1910 mutex_exit(&net->net_cnt_lock); 1911 } 1912 mutex_exit(&mi->mi_lock); 1913 1914 /* 1915 * If we are not the harvester, we need to check 1916 * to see if we need to grab the tree lock. 1917 */ 1918 if (was_locked == FALSE) { 1919 /* 1920 * If we grab the lock, it means that no other 1921 * operation is working on the tree. If we don't 1922 * grab it, we need to decide if this is because 1923 * we are a recursive call or a new operation. 1924 */ 1925 if (mutex_tryenter(&net->net_tree_lock)) { 1926 *pmust_unlock = TRUE; 1927 } else { 1928 /* 1929 * If we are a recursive call, we can 1930 * proceed without the lock. 1931 * Otherwise we have to wait until 1932 * the lock becomes free. 1933 */ 1934 if (!is_recursed) { 1935 mutex_enter(&net->net_cnt_lock); 1936 if (net->net_status & 1937 (NFS4_EPHEMERAL_TREE_DEROOTING 1938 | NFS4_EPHEMERAL_TREE_INVALID)) { 1939 nfs4_ephemeral_tree_decr(net); 1940 mutex_exit(&net->net_cnt_lock); 1941 *pmust_rele = FALSE; 1942 goto is_busy; 1943 } 1944 mutex_exit(&net->net_cnt_lock); 1945 1946 /* 1947 * We can't hold any other locks whilst 1948 * we wait on this to free up. 1949 */ 1950 mutex_enter(&net->net_tree_lock); 1951 1952 /* 1953 * Note that while mi->mi_ephemeral 1954 * may change and thus we have to 1955 * update eph, it is the case that 1956 * we have tied down net and 1957 * do not care if mi->mi_ephemeral_tree 1958 * has changed. 1959 */ 1960 mutex_enter(&mi->mi_lock); 1961 eph = mi->mi_ephemeral; 1962 mutex_exit(&mi->mi_lock); 1963 1964 /* 1965 * Okay, we need to see if either the 1966 * tree got nuked or the current node 1967 * got nuked. Both of which will cause 1968 * an error. 1969 * 1970 * Note that a subsequent retry of the 1971 * umount shall work. 1972 */ 1973 mutex_enter(&net->net_cnt_lock); 1974 if (net->net_status & 1975 NFS4_EPHEMERAL_TREE_INVALID || 1976 (!is_derooting && eph == NULL)) { 1977 nfs4_ephemeral_tree_decr(net); 1978 mutex_exit(&net->net_cnt_lock); 1979 mutex_exit(&net->net_tree_lock); 1980 *pmust_rele = FALSE; 1981 goto is_busy; 1982 } 1983 mutex_exit(&net->net_cnt_lock); 1984 *pmust_unlock = TRUE; 1985 } 1986 } 1987 } 1988 1989 /* 1990 * Only once we have grabbed the lock can we mark what we 1991 * are planning on doing to the ephemeral tree. 1992 */ 1993 if (*pmust_unlock) { 1994 mutex_enter(&net->net_cnt_lock); 1995 net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING; 1996 1997 /* 1998 * Check to see if we are nuking the root. 1999 */ 2000 if (is_derooting) 2001 net->net_status |= 2002 NFS4_EPHEMERAL_TREE_DEROOTING; 2003 mutex_exit(&net->net_cnt_lock); 2004 } 2005 2006 if (!is_derooting) { 2007 /* 2008 * Only work on children if the caller has not already 2009 * done so. 2010 */ 2011 if (!is_recursed) { 2012 ASSERT(eph != NULL); 2013 2014 error = nfs4_ephemeral_unmount_engine(eph, 2015 FALSE, flag, cr); 2016 if (error) 2017 goto is_busy; 2018 } 2019 } else { 2020 eph = net->net_root; 2021 2022 /* 2023 * Only work if there is something there. 2024 */ 2025 if (eph) { 2026 error = nfs4_ephemeral_unmount_engine(eph, TRUE, 2027 flag, cr); 2028 if (error) { 2029 mutex_enter(&net->net_cnt_lock); 2030 net->net_status &= 2031 ~NFS4_EPHEMERAL_TREE_DEROOTING; 2032 mutex_exit(&net->net_cnt_lock); 2033 goto is_busy; 2034 } 2035 2036 /* 2037 * Nothing else which goes wrong will 2038 * invalidate the blowing away of the 2039 * ephmeral tree. 2040 */ 2041 net->net_root = NULL; 2042 } 2043 2044 /* 2045 * We have derooted and we have caused the tree to be 2046 * invalidated. 2047 */ 2048 mutex_enter(&net->net_cnt_lock); 2049 net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING; 2050 net->net_status |= NFS4_EPHEMERAL_TREE_INVALID; 2051 if (was_locked == FALSE) 2052 nfs4_ephemeral_tree_decr(net); 2053 mutex_exit(&net->net_cnt_lock); 2054 2055 if (was_locked == FALSE) 2056 mutex_exit(&net->net_tree_lock); 2057 2058 /* 2059 * We have just blown away any notation of this 2060 * tree being locked. We can't let the caller 2061 * try to clean things up. 2062 */ 2063 *pmust_unlock = FALSE; 2064 2065 /* 2066 * At this point, the tree should no longer be 2067 * associated with the mntinfo4. We need to pull 2068 * it off there and let the harvester take 2069 * care of it once the refcnt drops. 2070 */ 2071 mutex_enter(&mi->mi_lock); 2072 mi->mi_ephemeral_tree = NULL; 2073 mutex_exit(&mi->mi_lock); 2074 } 2075 2076 return (0); 2077 2078 is_busy: 2079 2080 nfs4_ephemeral_umount_unlock(pmust_unlock, pmust_rele, 2081 pnet); 2082 2083 return (error); 2084 } 2085 2086 /* 2087 * Do the umount and record any error in the parent. 2088 */ 2089 static void 2090 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag, 2091 nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior) 2092 { 2093 int error; 2094 2095 error = umount2_engine(vfsp, flag, kcred, FALSE); 2096 if (error) { 2097 if (prior) { 2098 if (prior->ne_child == e) 2099 prior->ne_state |= 2100 NFS4_EPHEMERAL_CHILD_ERROR; 2101 else 2102 prior->ne_state |= 2103 NFS4_EPHEMERAL_PEER_ERROR; 2104 } 2105 } 2106 } 2107 2108 /* 2109 * For each tree in the forest (where the forest is in 2110 * effect all of the ephemeral trees for this zone), 2111 * scan to see if a node can be unmounted. Note that 2112 * unlike nfs4_ephemeral_unmount_engine(), we do 2113 * not process the current node before children or 2114 * siblings. I.e., if a node can be unmounted, we 2115 * do not recursively check to see if the nodes 2116 * hanging off of it can also be unmounted. 2117 * 2118 * Instead, we delve down deep to try and remove the 2119 * children first. Then, because we share code with 2120 * nfs4_ephemeral_unmount_engine(), we will try 2121 * them again. This could be a performance issue in 2122 * the future. 2123 * 2124 * Also note that unlike nfs4_ephemeral_unmount_engine(), 2125 * we do not halt on an error. We will not remove the 2126 * current node, but we will keep on trying to remove 2127 * the others. 2128 * 2129 * force indicates that we want the unmount to occur 2130 * even if there is something blocking it. 2131 * 2132 * time_check indicates that we want to see if the 2133 * mount has expired past mount_to or not. Typically 2134 * we want to do this and only on a shutdown of the 2135 * zone would we want to ignore the check. 2136 */ 2137 static void 2138 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg, 2139 bool_t force, bool_t time_check) 2140 { 2141 nfs4_ephemeral_tree_t *net; 2142 nfs4_ephemeral_tree_t *prev = NULL; 2143 nfs4_ephemeral_tree_t *next; 2144 nfs4_ephemeral_t *e; 2145 nfs4_ephemeral_t *prior; 2146 time_t now = gethrestime_sec(); 2147 2148 nfs4_ephemeral_tree_t *harvest = NULL; 2149 2150 int flag; 2151 2152 mntinfo4_t *mi; 2153 vfs_t *vfsp; 2154 2155 if (force) 2156 flag = MS_FORCE | MS_SYSSPACE; 2157 else 2158 flag = MS_SYSSPACE; 2159 2160 mutex_enter(&ntg->ntg_forest_lock); 2161 for (net = ntg->ntg_forest; net != NULL; net = next) { 2162 next = net->net_next; 2163 2164 nfs4_ephemeral_tree_hold(net); 2165 2166 mutex_enter(&net->net_tree_lock); 2167 2168 /* 2169 * Let the unmount code know that the 2170 * tree is already locked! 2171 */ 2172 mutex_enter(&net->net_cnt_lock); 2173 net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED; 2174 mutex_exit(&net->net_cnt_lock); 2175 2176 /* 2177 * If the intent is force all ephemeral nodes to 2178 * be unmounted in this zone, we can short circuit a 2179 * lot of tree traversal and simply zap the root node. 2180 */ 2181 if (force) { 2182 if (net->net_root) { 2183 mi = net->net_root->ne_mount; 2184 vfsp = mi->mi_vfsp; 2185 2186 /* 2187 * Cleared by umount2_engine. 2188 */ 2189 VFS_HOLD(vfsp); 2190 2191 (void) umount2_engine(vfsp, flag, 2192 kcred, FALSE); 2193 2194 goto check_done; 2195 } 2196 } 2197 2198 e = net->net_root; 2199 if (e) 2200 e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD; 2201 2202 while (e) { 2203 if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) { 2204 e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING; 2205 if (e->ne_child) { 2206 e = e->ne_child; 2207 e->ne_state = 2208 NFS4_EPHEMERAL_VISIT_CHILD; 2209 } 2210 2211 continue; 2212 } else if (e->ne_state == 2213 NFS4_EPHEMERAL_VISIT_SIBLING) { 2214 e->ne_state = NFS4_EPHEMERAL_PROCESS_ME; 2215 if (e->ne_peer) { 2216 e = e->ne_peer; 2217 e->ne_state = 2218 NFS4_EPHEMERAL_VISIT_CHILD; 2219 } 2220 2221 continue; 2222 } else if (e->ne_state == 2223 NFS4_EPHEMERAL_CHILD_ERROR) { 2224 prior = e->ne_prior; 2225 2226 /* 2227 * If a child reported an error, do 2228 * not bother trying to unmount. 2229 * 2230 * If your prior node is a parent, 2231 * pass the error up such that they 2232 * also do not try to unmount. 2233 * 2234 * However, if your prior is a sibling, 2235 * let them try to unmount if they can. 2236 */ 2237 if (prior) { 2238 if (prior->ne_child == e) 2239 prior->ne_state |= 2240 NFS4_EPHEMERAL_CHILD_ERROR; 2241 else 2242 prior->ne_state |= 2243 NFS4_EPHEMERAL_PEER_ERROR; 2244 } 2245 2246 /* 2247 * Clear the error and if needed, process peers. 2248 * 2249 * Once we mask out the error, we know whether 2250 * or we have to process another node. 2251 */ 2252 e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR; 2253 if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME) 2254 e = prior; 2255 2256 continue; 2257 } else if (e->ne_state == 2258 NFS4_EPHEMERAL_PEER_ERROR) { 2259 prior = e->ne_prior; 2260 2261 if (prior) { 2262 if (prior->ne_child == e) 2263 prior->ne_state = 2264 NFS4_EPHEMERAL_CHILD_ERROR; 2265 else 2266 prior->ne_state = 2267 NFS4_EPHEMERAL_PEER_ERROR; 2268 } 2269 2270 /* 2271 * Clear the error from this node and do the 2272 * correct processing. 2273 */ 2274 e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR; 2275 continue; 2276 } 2277 2278 prior = e->ne_prior; 2279 e->ne_state = NFS4_EPHEMERAL_OK; 2280 2281 /* 2282 * It must be the case that we need to process 2283 * this node. 2284 */ 2285 if (!time_check || 2286 now - e->ne_ref_time > e->ne_mount_to) { 2287 mi = e->ne_mount; 2288 vfsp = mi->mi_vfsp; 2289 2290 /* 2291 * Cleared by umount2_engine. 2292 */ 2293 VFS_HOLD(vfsp); 2294 2295 /* 2296 * Note that we effectively work down to the 2297 * leaf nodes first, try to unmount them, 2298 * then work our way back up into the leaf 2299 * nodes. 2300 * 2301 * Also note that we deal with a lot of 2302 * complexity by sharing the work with 2303 * the manual unmount code. 2304 */ 2305 nfs4_ephemeral_record_umount(vfsp, flag, 2306 e, prior); 2307 } 2308 2309 e = prior; 2310 } 2311 2312 check_done: 2313 2314 /* 2315 * At this point we are done processing this tree. 2316 * 2317 * If the tree is invalid and we are the only reference 2318 * to it, then we push it on the local linked list 2319 * to remove it at the end. We avoid that action now 2320 * to keep the tree processing going along at a fair clip. 2321 * 2322 * Else, even if we are the only reference, we drop 2323 * our hold on the current tree and allow it to be 2324 * reused as needed. 2325 */ 2326 mutex_enter(&net->net_cnt_lock); 2327 if (net->net_refcnt == 1 && 2328 net->net_status & NFS4_EPHEMERAL_TREE_INVALID) { 2329 nfs4_ephemeral_tree_decr(net); 2330 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED; 2331 mutex_exit(&net->net_cnt_lock); 2332 mutex_exit(&net->net_tree_lock); 2333 2334 if (prev) 2335 prev->net_next = net->net_next; 2336 else 2337 ntg->ntg_forest = net->net_next; 2338 2339 net->net_next = harvest; 2340 harvest = net; 2341 continue; 2342 } 2343 2344 nfs4_ephemeral_tree_decr(net); 2345 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED; 2346 mutex_exit(&net->net_cnt_lock); 2347 mutex_exit(&net->net_tree_lock); 2348 2349 prev = net; 2350 } 2351 mutex_exit(&ntg->ntg_forest_lock); 2352 2353 for (net = harvest; net != NULL; net = next) { 2354 next = net->net_next; 2355 2356 mutex_destroy(&net->net_tree_lock); 2357 mutex_destroy(&net->net_cnt_lock); 2358 kmem_free(net, sizeof (*net)); 2359 } 2360 } 2361 2362 /* 2363 * This is the thread which decides when the harvesting 2364 * can proceed and when to kill it off for this zone. 2365 */ 2366 static void 2367 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg) 2368 { 2369 clock_t timeleft; 2370 zone_t *zone = curproc->p_zone; 2371 2372 for (;;) { 2373 timeleft = zone_status_timedwait(zone, lbolt + 2374 nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN); 2375 2376 /* 2377 * zone is exiting... 2378 */ 2379 if (timeleft != -1) { 2380 ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN); 2381 zthread_exit(); 2382 /* NOTREACHED */ 2383 } 2384 2385 /* 2386 * Only bother scanning if there is potential 2387 * work to be done. 2388 */ 2389 if (ntg->ntg_forest == NULL) 2390 continue; 2391 2392 /* 2393 * Now scan the list and get rid of everything which 2394 * is old. 2395 */ 2396 nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE); 2397 } 2398 2399 /* NOTREACHED */ 2400 } 2401 2402 /* 2403 * The zone specific glue needed to start the unmount harvester. 2404 * 2405 * Note that we want to avoid holding the mutex as long as possible, 2406 * hence the multiple checks. 2407 * 2408 * The caller should avoid us getting down here in the first 2409 * place. 2410 */ 2411 static void 2412 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg) 2413 { 2414 /* 2415 * It got started before we got here... 2416 */ 2417 if (ntg->ntg_thread_started) 2418 return; 2419 2420 mutex_enter(&nfs4_ephemeral_thread_lock); 2421 2422 if (ntg->ntg_thread_started) { 2423 mutex_exit(&nfs4_ephemeral_thread_lock); 2424 return; 2425 } 2426 2427 /* 2428 * Start the unmounter harvester thread for this zone. 2429 */ 2430 (void) zthread_create(NULL, 0, nfs4_ephemeral_harvester, 2431 ntg, 0, minclsyspri); 2432 2433 ntg->ntg_thread_started = TRUE; 2434 mutex_exit(&nfs4_ephemeral_thread_lock); 2435 } 2436 2437 /*ARGSUSED*/ 2438 static void * 2439 nfs4_ephemeral_zsd_create(zoneid_t zoneid) 2440 { 2441 nfs4_trigger_globals_t *ntg; 2442 2443 ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP); 2444 ntg->ntg_thread_started = FALSE; 2445 2446 /* 2447 * This is the default.... 2448 */ 2449 ntg->ntg_mount_to = nfs4_trigger_thread_timer; 2450 2451 mutex_init(&ntg->ntg_forest_lock, NULL, 2452 MUTEX_DEFAULT, NULL); 2453 2454 return (ntg); 2455 } 2456 2457 /* 2458 * Try a nice gentle walk down the forest and convince 2459 * all of the trees to gracefully give it up. 2460 */ 2461 /*ARGSUSED*/ 2462 static void 2463 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg) 2464 { 2465 nfs4_trigger_globals_t *ntg = arg; 2466 2467 if (!ntg) 2468 return; 2469 2470 nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE); 2471 } 2472 2473 /* 2474 * Race along the forest and rip all of the trees out by 2475 * their rootballs! 2476 */ 2477 /*ARGSUSED*/ 2478 static void 2479 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg) 2480 { 2481 nfs4_trigger_globals_t *ntg = arg; 2482 2483 if (!ntg) 2484 return; 2485 2486 nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE); 2487 2488 mutex_destroy(&ntg->ntg_forest_lock); 2489 kmem_free(ntg, sizeof (*ntg)); 2490 } 2491 2492 /* 2493 * This is the zone independent cleanup needed for 2494 * emphemeral mount processing. 2495 */ 2496 void 2497 nfs4_ephemeral_fini(void) 2498 { 2499 (void) zone_key_delete(nfs4_ephemeral_key); 2500 mutex_destroy(&nfs4_ephemeral_thread_lock); 2501 } 2502 2503 /* 2504 * This is the zone independent initialization needed for 2505 * emphemeral mount processing. 2506 */ 2507 void 2508 nfs4_ephemeral_init(void) 2509 { 2510 mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT, 2511 NULL); 2512 2513 zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create, 2514 nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy); 2515 } 2516 2517 /* 2518 * nfssys() calls this function to set the per-zone 2519 * value of mount_to to drive when an ephemeral mount is 2520 * timed out. Each mount will grab a copy of this value 2521 * when mounted. 2522 */ 2523 void 2524 nfs4_ephemeral_set_mount_to(uint_t mount_to) 2525 { 2526 nfs4_trigger_globals_t *ntg; 2527 zone_t *zone = curproc->p_zone; 2528 2529 ntg = zone_getspecific(nfs4_ephemeral_key, zone); 2530 2531 ntg->ntg_mount_to = mount_to; 2532 } 2533 2534 /* 2535 * Walk the list of v4 mount options; if they are currently set in vfsp, 2536 * append them to a new comma-separated mount option string, and return it. 2537 * 2538 * Caller should free by calling nfs4_trigger_destroy_mntopts(). 2539 */ 2540 static char * 2541 nfs4_trigger_create_mntopts(vfs_t *vfsp) 2542 { 2543 uint_t i; 2544 char *mntopts; 2545 struct vfssw *vswp; 2546 mntopts_t *optproto; 2547 2548 mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP); 2549 2550 /* get the list of applicable mount options for v4; locks *vswp */ 2551 vswp = vfs_getvfssw(MNTTYPE_NFS4); 2552 optproto = &vswp->vsw_optproto; 2553 2554 for (i = 0; i < optproto->mo_count; i++) { 2555 struct mntopt *mop = &optproto->mo_list[i]; 2556 2557 if (mop->mo_flags & MO_EMPTY) 2558 continue; 2559 2560 if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) { 2561 kmem_free(mntopts, MAX_MNTOPT_STR); 2562 vfs_unrefvfssw(vswp); 2563 return (NULL); 2564 } 2565 } 2566 2567 vfs_unrefvfssw(vswp); 2568 2569 /* 2570 * MNTOPT_XATTR is not in the v4 mount opt proto list, 2571 * and it may only be passed via MS_OPTIONSTR, so we 2572 * must handle it here. 2573 * 2574 * Ideally, it would be in the list, but NFS does not specify its 2575 * own opt proto list, it uses instead the default one. Since 2576 * not all filesystems support extended attrs, it would not be 2577 * appropriate to add it there. 2578 */ 2579 if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) || 2580 nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) { 2581 kmem_free(mntopts, MAX_MNTOPT_STR); 2582 return (NULL); 2583 } 2584 2585 return (mntopts); 2586 } 2587 2588 static void 2589 nfs4_trigger_destroy_mntopts(char *mntopts) 2590 { 2591 if (mntopts) 2592 kmem_free(mntopts, MAX_MNTOPT_STR); 2593 } 2594 2595 /* 2596 * Check a single mount option (optname). Add to mntopts if it is set in VFS. 2597 */ 2598 static int 2599 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp) 2600 { 2601 if (mntopts == NULL || optname == NULL || vfsp == NULL) 2602 return (EINVAL); 2603 2604 if (vfs_optionisset(vfsp, optname, NULL)) { 2605 size_t mntoptslen = strlen(mntopts); 2606 size_t optnamelen = strlen(optname); 2607 2608 /* +1 for ',', +1 for NUL */ 2609 if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR) 2610 return (EOVERFLOW); 2611 2612 /* first or subsequent mount option? */ 2613 if (*mntopts != '\0') 2614 (void) strcat(mntopts, ","); 2615 2616 (void) strcat(mntopts, optname); 2617 } 2618 2619 return (0); 2620 } 2621 2622 static enum clnt_stat 2623 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr) 2624 { 2625 int retries, error; 2626 uint_t max_msgsize; 2627 enum clnt_stat status; 2628 CLIENT *cl; 2629 struct timeval timeout; 2630 2631 /* as per recov_newserver() */ 2632 max_msgsize = 0; 2633 retries = 1; 2634 timeout.tv_sec = 2; 2635 timeout.tv_usec = 0; 2636 2637 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM, 2638 NFS_V4, max_msgsize, retries, CRED(), &cl); 2639 if (error) 2640 return (RPC_FAILED); 2641 2642 if (nointr) 2643 cl->cl_nosignal = TRUE; 2644 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL, 2645 timeout); 2646 if (nointr) 2647 cl->cl_nosignal = FALSE; 2648 2649 AUTH_DESTROY(cl->cl_auth); 2650 CLNT_DESTROY(cl); 2651 2652 return (status); 2653 } 2654