1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are 29 * triggered from a "stub" rnode via a special set of vnodeops. 
30 */ 31 32 #include <sys/param.h> 33 #include <sys/types.h> 34 #include <sys/systm.h> 35 #include <sys/cred.h> 36 #include <sys/time.h> 37 #include <sys/vnode.h> 38 #include <sys/vfs.h> 39 #include <sys/vfs_opreg.h> 40 #include <sys/file.h> 41 #include <sys/filio.h> 42 #include <sys/uio.h> 43 #include <sys/buf.h> 44 #include <sys/mman.h> 45 #include <sys/pathname.h> 46 #include <sys/dirent.h> 47 #include <sys/debug.h> 48 #include <sys/vmsystm.h> 49 #include <sys/fcntl.h> 50 #include <sys/flock.h> 51 #include <sys/swap.h> 52 #include <sys/errno.h> 53 #include <sys/strsubr.h> 54 #include <sys/sysmacros.h> 55 #include <sys/kmem.h> 56 #include <sys/mount.h> 57 #include <sys/cmn_err.h> 58 #include <sys/pathconf.h> 59 #include <sys/utsname.h> 60 #include <sys/dnlc.h> 61 #include <sys/acl.h> 62 #include <sys/systeminfo.h> 63 #include <sys/policy.h> 64 #include <sys/sdt.h> 65 #include <sys/list.h> 66 #include <sys/stat.h> 67 #include <sys/mntent.h> 68 69 #include <rpc/types.h> 70 #include <rpc/auth.h> 71 #include <rpc/clnt.h> 72 73 #include <nfs/nfs.h> 74 #include <nfs/nfs_clnt.h> 75 #include <nfs/nfs_acl.h> 76 #include <nfs/lm.h> 77 #include <nfs/nfs4.h> 78 #include <nfs/nfs4_kprot.h> 79 #include <nfs/rnode4.h> 80 #include <nfs/nfs4_clnt.h> 81 82 #include <vm/hat.h> 83 #include <vm/as.h> 84 #include <vm/page.h> 85 #include <vm/pvn.h> 86 #include <vm/seg.h> 87 #include <vm/seg_map.h> 88 #include <vm/seg_kpm.h> 89 #include <vm/seg_vn.h> 90 91 #include <fs/fs_subr.h> 92 93 #include <sys/ddi.h> 94 #include <sys/int_fmtio.h> 95 96 #include <sys/sunddi.h> 97 98 #include <sys/priv_names.h> 99 100 /* 101 * The automatic unmounter thread stuff! 102 */ 103 static int nfs4_trigger_thread_timer = 20; /* in seconds */ 104 105 /* 106 * Just a default.... 
107 */ 108 static uint_t nfs4_trigger_mount_to = 240; 109 110 typedef struct nfs4_trigger_globals { 111 kmutex_t ntg_forest_lock; 112 uint_t ntg_mount_to; 113 int ntg_thread_started; 114 nfs4_ephemeral_tree_t *ntg_forest; 115 } nfs4_trigger_globals_t; 116 117 kmutex_t nfs4_ephemeral_thread_lock; 118 119 zone_key_t nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED; 120 121 static void nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *); 122 123 /* 124 * Used for ephemeral mounts; contains data either duplicated from 125 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount. 126 * 127 * It's intended that this structure is used solely for ephemeral 128 * mount-type specific data, for passing this data to 129 * nfs4_trigger_nargs_create(). 130 */ 131 typedef struct ephemeral_servinfo { 132 char *esi_hostname; 133 char *esi_netname; 134 char *esi_path; 135 int esi_path_len; 136 int esi_mount_flags; 137 struct netbuf *esi_addr; 138 struct netbuf *esi_syncaddr; 139 struct knetconfig *esi_knconf; 140 } ephemeral_servinfo_t; 141 142 /* 143 * Collect together the mount-type specific and generic data args. 144 */ 145 typedef struct domount_args { 146 ephemeral_servinfo_t *dma_esi; 147 char *dma_hostlist; /* comma-sep. 
for RO failover */ 148 struct nfs_args *dma_nargs; 149 } domount_args_t; 150 151 152 /* 153 * The vnode ops functions for a trigger stub vnode 154 */ 155 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *); 156 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *, 157 caller_context_t *); 158 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *, 159 caller_context_t *); 160 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *, 161 caller_context_t *); 162 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *, 163 caller_context_t *); 164 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **, 165 struct pathname *, int, vnode_t *, cred_t *, caller_context_t *, 166 int *, pathname_t *); 167 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *, 168 enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *, 169 vsecattr_t *); 170 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *, 171 int); 172 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *, 173 caller_context_t *, int); 174 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *, 175 cred_t *, caller_context_t *, int); 176 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *, 177 vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp); 178 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 179 caller_context_t *, int); 180 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *, 181 cred_t *, caller_context_t *, int); 182 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *); 183 184 /* 185 * Regular NFSv4 vnodeops that we need to reference directly 186 */ 187 extern int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *, 188 caller_context_t *); 189 extern void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *); 190 extern int nfs4_rwlock(vnode_t *, int, 
caller_context_t *); 191 extern void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 192 extern int nfs4_lookup(vnode_t *, char *, vnode_t **, 193 struct pathname *, int, vnode_t *, cred_t *, 194 caller_context_t *, int *, pathname_t *); 195 extern int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *, 196 caller_context_t *); 197 extern int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 198 caller_context_t *); 199 extern int nfs4_fid(vnode_t *, fid_t *, caller_context_t *); 200 extern int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *); 201 202 static int nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **); 203 static int nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **, 204 cred_t *); 205 static domount_args_t *nfs4_trigger_domount_args_create(vnode_t *); 206 static void nfs4_trigger_domount_args_destroy(domount_args_t *dma, 207 vnode_t *vp); 208 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *); 209 static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *); 210 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *, 211 servinfo4_t *); 212 static struct nfs_args *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *, 213 ephemeral_servinfo_t *); 214 static void nfs4_trigger_nargs_destroy(struct nfs_args *); 215 static char *nfs4_trigger_create_mntopts(vfs_t *); 216 static void nfs4_trigger_destroy_mntopts(char *); 217 static int nfs4_trigger_add_mntopt(char *, char *, vfs_t *); 218 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int); 219 220 extern int umount2_engine(vfs_t *, int, cred_t *, int); 221 222 223 vnodeops_t *nfs4_trigger_vnodeops; 224 225 /* 226 * These are the vnodeops that we must define for stub vnodes. 227 * 228 * 229 * Many of the VOPs defined for NFSv4 do not need to be defined here, 230 * for various reasons. 
This will result in the VFS default function being 231 * used: 232 * 233 * - These VOPs require a previous VOP_OPEN to have occurred. That will have 234 * lost the reference to the stub vnode, meaning these should not be called: 235 * close, read, write, ioctl, readdir, seek. 236 * 237 * - These VOPs are meaningless for vnodes without data pages. Since the 238 * stub vnode is of type VDIR, these should not be called: 239 * space, getpage, putpage, map, addmap, delmap, pageio, fsync. 240 * 241 * - These VOPs are otherwise not applicable, and should not be called: 242 * dump, setsecattr. 243 * 244 * 245 * These VOPs we do not want to define, but nor do we want the VFS default 246 * action. Instead, we specify the VFS error function, with fs_error(), but 247 * note that fs_error() is not actually called. Instead it results in the 248 * use of the error function defined for the particular VOP, in vn_ops_table[]: 249 * 250 * - frlock, dispose, shrlock. 251 * 252 * 253 * These VOPs we define to use the corresponding regular NFSv4 vnodeop. 254 * NOTE: if any of these ops involve an OTW call with the stub FH, then 255 * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo() 256 * to protect the security data in the servinfo4_t for the "parent" 257 * filesystem that contains the stub. 258 * 259 * - These VOPs should not trigger a mount, so that "ls -l" does not: 260 * pathconf, getsecattr. 261 * 262 * - These VOPs would not make sense to trigger: 263 * inactive, rwlock, rwunlock, fid, realvp. 
 */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL, NULL
};

/*
 * Bump the ephemeral tree's reference count.  The caller must already
 * hold net_cnt_lock; the post-increment ASSERT catches wrap-around.
 */
static void
nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	net->net_refcnt++;
	ASSERT(net->net_refcnt != 0);
}

/*
 * Take a hold on the ephemeral tree: acquire net_cnt_lock, bump the
 * refcnt, and drop the lock again.
 */
static void
nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	nfs4_ephemeral_tree_incr(net);
	mutex_exit(&net->net_cnt_lock);
}

/*
 * We need a safe way to decrement the refcnt whilst the
 * lock is being held.
 */
static void
nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	ASSERT(net->net_refcnt != 0);
	net->net_refcnt--;
}

/*
 * Release a hold on the ephemeral tree: acquire net_cnt_lock, drop the
 * refcnt, and release the lock again.
 */
static void
nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	nfs4_ephemeral_tree_decr(net);
	mutex_exit(&net->net_cnt_lock);
}

/*
 * Trigger ops for stub vnodes; for mirror mounts, etc.
 *
 * The general idea is that a "triggering" op will first call
 * nfs4_trigger_mount(), which will find out whether a mount has already
 * been triggered.
 *
 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
 * of the covering vfs.
 *
 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
 * and again set newvp, as above.
 *
 * The triggering op may then re-issue the VOP by calling it on newvp.
 *
 * Note that some ops may perform custom action, and may or may not need
 * to trigger a mount.
 *
 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
 * obviously can't do this with VOP_<whatever>, since it's a stub vnode
 * and that would just recurse. Instead, we call the v4 op directly,
 * by name. This is OK, since we know that the vnode is for NFSv4,
 * otherwise it couldn't be a stub.
 *
 */

/*
 * Trigger the mount, exchange the caller's reference to the stub vnode
 * for the (held) root vnode of the covering fs, then re-issue the open
 * against that new vnode.
 */
static int
nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(*vpp, cr, &newvp);
	if (error)
		return (error);

	/* Release the stub vnode, as we're losing the reference to it */
	VN_RELE(*vpp);

	/* Give the caller the root vnode of the newly-mounted fs */
	*vpp = newvp;

	/* return with VN_HELD(newvp) */
	return (VOP_OPEN(vpp, flag, cr, ct));
}

/*
 * For the majority of cases, nfs4_trigger_getattr() will not trigger
 * a mount.
However, if ATTR_TRIGGER is set, we are being informed
 * that we need to force the mount before we attempt to determine
 * the attributes. The intent is an atomic operation for security
 * testing.
 */
static int
nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (flags & ATTR_TRIGGER) {
		vnode_t *newvp;

		error = nfs4_trigger_mount(vp, cr, &newvp);
		if (error)
			return (error);

		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
		VN_RELE(newvp);
	} else {
		/* no mount forced; use the regular NFSv4 op on the stub */
		error = nfs4_getattr(vp, vap, flags, cr, ct);
	}

	return (error);
}

/*
 * Trigger the mount (if not already covered) and re-issue the setattr
 * against the root vnode of the covering fs.
 */
static int
nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, cr, &newvp);
	if (error)
		return (error);

	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
	VN_RELE(newvp);

	return (error);
}

/*
 * Trigger the mount and re-issue the access check against the root
 * vnode of the covering fs.
 */
static int
nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, cr, &newvp);
	if (error)
		return (error);

	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
	VN_RELE(newvp);

	return (error);
}

/*
 * Lookup in a stub directory: ".." is handed to the regular NFSv4
 * lookup (no mount triggered); anything else triggers the mount and
 * is looked up in the covering fs.
 */
static int
nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
    struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
    caller_context_t *ct, int *deflags, pathname_t *rpnp)
{
	int error;
	vnode_t *newdvp;
	rnode4_t *drp = VTOR4(dvp);

	ASSERT(RP_ISSTUB(drp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(drp));

	/*
	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
	 * that up. Instead, pass onto the regular op, regardless of whether
	 * we've triggered a mount.
	 */
	if (strcmp(nm, "..") == 0)
		return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
		    ct, deflags, rpnp));

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
	    deflags, rpnp);
	VN_RELE(newdvp);

	return (error);
}

/*
 * Trigger the mount and re-issue the create in the covering fs.
 */
static int
nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
    enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
    int flags, caller_context_t *ct, vsecattr_t *vsecp)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
	    flags, ct, vsecp);
	VN_RELE(newdvp);

	return (error);
}

/*
 * Trigger the mount and re-issue the remove in the covering fs.
 */
static int
nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
    int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

/*
 * Trigger the mount on the target directory and re-issue the link there.
 */
static int
nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newtdvp;

	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
	if (error)
		return (error);

	/*
	 * We don't check whether svp is a stub. Let the NFSv4 code
	 * detect that error, and return accordingly.
	 */
	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
	VN_RELE(newtdvp);

	return (error);
}

/*
 * Rename from a stub directory; cross-stub renames are rejected with
 * EXDEV before any mount is triggered (see comment below).
 */
static int
nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newsdvp;
	rnode4_t *tdrp = VTOR4(tdvp);

	/*
	 * We know that sdvp is a stub, otherwise we would not be here.
	 *
	 * If tdvp is also be a stub, there are two possibilities: it
	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
	 *
	 * In the former case, just trigger sdvp, and treat tdvp as
	 * though it were not a stub.
	 *
	 * In the latter case, it might be a different stub for the
	 * same server fs as sdvp, or for a different server fs.
	 * Regardless, from the client perspective this would still
	 * be a cross-filesystem rename, and should not be allowed,
	 * so return EXDEV, without triggering either mount.
	 */
	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
		return (EXDEV);

	error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
	if (error)
		return (error);

	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);

	VN_RELE(newsdvp);

	return (error);
}

/*
 * Trigger the mount and re-issue the mkdir in the covering fs.
 */
/* ARGSUSED */
static int
nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
    cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
	VN_RELE(newdvp);

	return (error);
}

/*
 * Trigger the mount and re-issue the rmdir in the covering fs.
 */
static int
nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

/*
 * Trigger the mount and re-issue the symlink in the covering fs.
 */
static int
nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

/*
 * Trigger the mount and re-issue the readlink against the root vnode
 * of the covering fs.
 */
static int
nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, cr, &newvp);
	if (error)
		return (error);

	error = VOP_READLINK(newvp, uiop, cr, ct);
	VN_RELE(newvp);

	return (error);
}

/* end of trigger vnode ops */

/*
 * Mount upon a trigger vnode; for mirror-mounts, etc.
 *
 * The mount may have already occurred, via another thread. If not,
 * assemble the location information - which may require fetching - and
 * perform the mount.
 *
 * Sets newvp to be the root of the fs that is now covering vp. Note
 * that we return with VN_HELD(*newvp).
 *
 * The caller is responsible for passing the VOP onto the covering fs.
 */
static int
nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
{
	int error;
	vfs_t *vfsp;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	domount_args_t *dma;

	nfs4_ephemeral_tree_t *net;

	/* TRUE once the MOUNTING flag is set and must be cleared at done: */
	bool_t must_unlock = FALSE;
	/* TRUE if this thread created the tree and set TREE_BUILDING */
	bool_t is_building = FALSE;

	cred_t *mcred = NULL;

	nfs4_trigger_globals_t *ntg;

	zone_t *zone = curproc->p_zone;

	ASSERT(RP_ISSTUB(rp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));

	*newvpp = NULL;

	/*
	 * Has the mount already occurred?
	 */
	error = vn_vfsrlock_wait(vp);
	if (error)
		goto done;
	vfsp = vn_mountedvfs(vp);
	if (vfsp != NULL) {
		/* the mount has already occurred */
		error = VFS_ROOT(vfsp, newvpp);
		if (!error) {
			/* need to update the reference time */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_ephemeral)
				mi->mi_ephemeral->ne_ref_time =
				    gethrestime_sec();
			mutex_exit(&mi->mi_lock);
		}

		vn_vfsunlock(vp);
		goto done;
	}
	vn_vfsunlock(vp);

	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		/* first ephemeral mount under this fs: build a new tree */
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);
	} else {
		net = mi->mi_ephemeral_tree;
		nfs4_ephemeral_tree_hold(net);

		/* mi_lock dropped before blocking on net_tree_lock */
		mutex_exit(&mi->mi_lock);

		mutex_enter(&net->net_tree_lock);

		/*
		 * We can only proceed if the tree is neither locked
		 * nor being torn down.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
			nfs4_ephemeral_tree_decr(net);
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			return (EIO);
		}
		mutex_exit(&net->net_cnt_lock);
	}

	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	must_unlock = TRUE;

	dma = nfs4_trigger_domount_args_create(vp);
	if (dma == NULL) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Note that since we define mirror mounts to work
	 * for any user, we simply extend the privileges of
	 * the user's credentials to allow the mount to
	 * proceed.
	 */
	mcred = crdup(cr);
	if (mcred == NULL) {
		error = EINVAL;
		goto done;
	}

	crset_zone_privall(mcred);

	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred);
	nfs4_trigger_domount_args_destroy(dma, vp);

	crfree(mcred);

	if (!error)
		error = VFS_ROOT(vfsp, newvpp);
done:
	if (must_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}

/*
 * Collect together both the generic & mount-type specific args.
 */
static domount_args_t *
nfs4_trigger_domount_args_create(vnode_t *vp)
{
	int nointr;
	char *hostlist;
	servinfo4_t *svp;
	struct nfs_args *nargs, *nargs_head;
	enum clnt_stat status;
	ephemeral_servinfo_t *esi, *esi_first;
	domount_args_t *dma;
	mntinfo4_t *mi = VTOMI4(vp);

	nointr = !(mi->mi_flags & MI4_INT);
	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	svp = mi->mi_curr_serv;
	/* check if the current server is responding */
	status = nfs4_trigger_ping_server(svp, nointr);
	if (status == RPC_SUCCESS) {
		esi_first = nfs4_trigger_esi_create(vp, svp);
		if (esi_first == NULL) {
			kmem_free(hostlist, MAXPATHLEN);
			return (NULL);
		}

		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);

		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
	} else {
		/* current server did not respond */
		esi_first = NULL;
		nargs_head = NULL;
	}
	nargs = nargs_head;

	/*
	 * NFS RO failover.
	 *
	 * If we have multiple servinfo4 structures, linked via sv_next,
	 * we must create one nfs_args for each, linking the nfs_args via
	 * nfs_ext_u.nfs_extB.next.
	 *
	 * We need to build a corresponding esi for each, too, but that is
	 * used solely for building nfs_args, and may be immediately
	 * discarded, as domount() requires the info from just one esi,
	 * but all the nfs_args.
	 *
	 * Currently, the NFS mount code will hang if not all servers
	 * requested are available. To avoid that, we need to ping each
	 * server, here, and remove it from the list if it is not
	 * responding. This has the side-effect of that server then
	 * being permanently unavailable for this failover mount, even if
	 * it recovers. That's unfortunate, but the best we can do until
	 * the mount code path is fixed.
	 */

	/*
	 * If the current server was down, loop indefinitely until we find
	 * at least one responsive server.
	 */
	do {
		/* no locking needed for sv_next; it is only set at fs mount */
		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
			struct nfs_args *next;

			/*
			 * nargs_head: the head of the nfs_args list
			 * nargs: the current tail of the list
			 * next: the newly-created element to be added
			 */

			/*
			 * We've already tried the current server, above;
			 * if it was responding, we have already included it
			 * and it may now be ignored.
			 *
			 * Otherwise, try it again, since it may now have
			 * recovered.
			 */
			if (svp == mi->mi_curr_serv && esi_first != NULL)
				continue;

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			/* check if the server is responding */
			status = nfs4_trigger_ping_server(svp, nointr);
			/* if the server did not respond, ignore it */
			if (status != RPC_SUCCESS)
				continue;

			esi = nfs4_trigger_esi_create(vp, svp);
			if (esi == NULL)
				continue;

			/*
			 * If the original current server (mi_curr_serv)
			 * was down when we first tried it,
			 * (i.e. esi_first == NULL),
			 * we select this new server (svp) to be the server
			 * that we will actually contact (esi_first).
			 *
			 * Note that it's possible that mi_curr_serv == svp,
			 * if that mi_curr_serv was down but has now recovered.
			 */
			next = nfs4_trigger_nargs_create(mi, svp, esi);
			if (esi_first == NULL) {
				ASSERT(nargs == NULL);
				ASSERT(nargs_head == NULL);
				nargs_head = next;
				esi_first = esi;
				(void) strlcpy(hostlist,
				    esi_first->esi_hostname, MAXPATHLEN);
			} else {
				ASSERT(nargs_head != NULL);
				nargs->nfs_ext_u.nfs_extB.next = next;
				(void) strlcat(hostlist, ",", MAXPATHLEN);
				(void) strlcat(hostlist, esi->esi_hostname,
				    MAXPATHLEN);
				/* esi was only needed for hostname & nargs */
				nfs4_trigger_esi_destroy(esi, vp);
			}

			nargs = next;
		}

		/* if we've had no response at all, wait a second */
		if (esi_first == NULL)
			delay(drv_usectohz(1000000));

	} while (esi_first == NULL);
	ASSERT(nargs_head != NULL);

	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
	dma->dma_esi = esi_first;
	dma->dma_hostlist = hostlist;
	dma->dma_nargs = nargs_head;

	return (dma);
}

/*
 * Tear down a domount_args_t built by nfs4_trigger_domount_args_create():
 * free the selected esi, the hostlist buffer, every nfs_args on the
 * linked list, and finally the container itself.
 */
static void
nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
{
	if (dma != NULL) {
		if (dma->dma_esi != NULL && vp != NULL)
			nfs4_trigger_esi_destroy(dma->dma_esi, vp);

		if (dma->dma_hostlist != NULL)
			kmem_free(dma->dma_hostlist, MAXPATHLEN);

		if (dma->dma_nargs != NULL) {
			struct nfs_args *nargs = dma->dma_nargs;

			do {
				struct nfs_args *next =
				    nargs->nfs_ext_u.nfs_extB.next;

				nfs4_trigger_nargs_destroy(nargs);
				nargs = next;
			} while (nargs != NULL);
		}

		kmem_free(dma, sizeof (domount_args_t));
	}
}

/*
 * The ephemeral_servinfo_t struct contains basic information we will need to
 * perform the mount. Whilst the structure is generic across different
 * types of ephemeral mount, the way we gather its contents differs.
987 */ 988 static ephemeral_servinfo_t * 989 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp) 990 { 991 ephemeral_servinfo_t *esi; 992 rnode4_t *rp = VTOR4(vp); 993 994 ASSERT(RP_ISSTUB(rp)); 995 996 /* Call the ephemeral type-specific routine */ 997 if (RP_ISSTUB_MIRRORMOUNT(rp)) 998 esi = nfs4_trigger_esi_create_mirrormount(vp, svp); 999 else 1000 esi = NULL; 1001 1002 /* for now, we only support mirror-mounts */ 1003 ASSERT(esi != NULL); 1004 1005 return (esi); 1006 } 1007 1008 static void 1009 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp) 1010 { 1011 rnode4_t *rp = VTOR4(vp); 1012 1013 ASSERT(RP_ISSTUB(rp)); 1014 1015 /* for now, we only support mirror-mounts */ 1016 ASSERT(RP_ISSTUB_MIRRORMOUNT(rp)); 1017 1018 /* Currently, no need for an ephemeral type-specific routine */ 1019 1020 /* 1021 * The contents of ephemeral_servinfo_t goes into nfs_args, 1022 * and will be handled by nfs4_trigger_nargs_destroy(). 1023 * We need only free the structure itself. 1024 */ 1025 if (esi != NULL) 1026 kmem_free(esi, sizeof (ephemeral_servinfo_t)); 1027 } 1028 1029 /* 1030 * Some of this may turn out to be common with other ephemeral types, 1031 * in which case it should be moved to nfs4_trigger_esi_create(), or a 1032 * common function called. 
 */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
{
	char *stubpath;
	struct knetconfig *sikncp, *svkncp;
	struct netbuf *bufp;
	ephemeral_servinfo_t *esi;

	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);

	/* initially set to be our type of ephemeral mount; may be added to */
	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;

	/*
	 * We're copying info from the stub rnode's servinfo4, but
	 * we must create new copies, not pointers, since this information
	 * is to be associated with the new mount, which will be
	 * unmounted (and its structures freed) separately
	 */

	/*
	 * Sizes passed to kmem_[z]alloc here must match those freed
	 * in nfs4_free_args()
	 */

	/*
	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
	 * is difficult to avoid: as we need to read svp to calculate the
	 * sizes to be allocated.
	 */
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	/* zalloc'd buffer is all-zero, so strcat == copy into empty string */
	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
	(void) strcat(esi->esi_hostname, svp->sv_hostname);

	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
	bufp = esi->esi_addr;
	bufp->len = svp->sv_addr.len;
	bufp->maxlen = svp->sv_addr.maxlen;
	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);

	/* deep-copy the knetconfig, including its two string fields */
	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
	sikncp = esi->esi_knconf;
	svkncp = svp->sv_knconf;
	sikncp->knc_semantics = svkncp->knc_semantics;
	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_protofmly,
	    (char *)svkncp->knc_protofmly);
	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
	sikncp->knc_rdev = svkncp->knc_rdev;

	/*
	 * Used when AUTH_DH is negotiated.
	 *
	 * This is ephemeral mount-type specific, since it contains the
	 * server's time-sync syncaddr.
	 */
	if (svp->sv_dhsec) {
		/* NB: this bufp deliberately shadows the outer bufp */
		struct netbuf *bufp;
		sec_data_t *sdata;
		dh_k4_clntdata_t *data;

		sdata = svp->sv_dhsec;
		data = (dh_k4_clntdata_t *)sdata->data;
		ASSERT(sdata->rpcflavor == AUTH_DH);

		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
		bufp->len = data->syncaddr.len;
		bufp->maxlen = data->syncaddr.maxlen;
		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
		esi->esi_syncaddr = bufp;

		if (data->netname != NULL) {
			int nmlen = data->netnamelen;

			/*
			 * We need to copy from a dh_k4_clntdata_t
			 * netname/netnamelen pair to a NUL-terminated
			 * netname string suitable for putting in nfs_args,
			 * where the latter has no netnamelen field.
			 */
			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
			bcopy(data->netname, esi->esi_netname, nmlen);
		}
	} else {
		esi->esi_syncaddr = NULL;
		esi->esi_netname = NULL;
	}

	stubpath = fn_path(VTOSV(vp)->sv_name);
	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
	ASSERT(*stubpath == '.');
	stubpath += 1;

	/* for nfs_args->fh */
	esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1;
	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
	(void) strcat(esi->esi_path, svp->sv_path);
	(void) strcat(esi->esi_path, stubpath);

	stubpath -= 1;
	/* stubpath allocated by fn_path() */
	kmem_free(stubpath, strlen(stubpath) + 1);

	nfs_rw_exit(&svp->sv_lock);

	return (esi);
}

/*
 * Assemble the args, and call the generic VFS mount function to
 * finally perform the ephemeral mount.
 */
static int
nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
    cred_t *cr)
{
	struct mounta *uap;
	char *mntpt, *orig_path, *path;
	const char *orig_mntpt;
	int retval;
	int mntpt_len;
	int spec_len;
	zone_t *zone = curproc->p_zone;
	bool_t has_leading_slash;

	vfs_t *stubvfsp = stubvp->v_vfsp;
	ephemeral_servinfo_t *esi = dma->dma_esi;
	struct nfs_args *nargs = dma->dma_nargs;

	/* first, construct the mount point for the ephemeral mount */
	orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
	orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);

	if (*orig_path == '.')
		orig_path++;

	/*
	 * Get rid of zone's root path
	 */
	if (zone != global_zone) {
		/*
		 * -1 for trailing '/' and -1 for EOS.
1180 */ 1181 if (strncmp(zone->zone_rootpath, orig_mntpt, 1182 zone->zone_rootpathlen - 1) == 0) { 1183 orig_mntpt += (zone->zone_rootpathlen - 2); 1184 } 1185 } 1186 1187 mntpt_len = strlen(orig_mntpt) + strlen(orig_path); 1188 mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP); 1189 (void) strcat(mntpt, orig_mntpt); 1190 (void) strcat(mntpt, orig_path); 1191 1192 kmem_free(path, strlen(path) + 1); 1193 path = esi->esi_path; 1194 if (*path == '.') 1195 path++; 1196 if (path[0] == '/' && path[1] == '/') 1197 path++; 1198 has_leading_slash = (*path == '/'); 1199 1200 spec_len = strlen(dma->dma_hostlist); 1201 spec_len += strlen(path); 1202 1203 /* We are going to have to add this in */ 1204 if (!has_leading_slash) 1205 spec_len++; 1206 1207 /* We need to get the ':' for dma_hostlist:esi_path */ 1208 spec_len++; 1209 1210 uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP); 1211 uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP); 1212 (void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist, 1213 has_leading_slash ? 
"" : "/", path); 1214 1215 uap->dir = mntpt; 1216 1217 uap->flags = MS_SYSSPACE | MS_DATA; 1218 /* fstype-independent mount options not covered elsewhere */ 1219 /* copy parent's mount(1M) "-m" flag */ 1220 if (stubvfsp->vfs_flag & VFS_NOMNTTAB) 1221 uap->flags |= MS_NOMNTTAB; 1222 1223 uap->fstype = MNTTYPE_NFS4; 1224 uap->dataptr = (char *)nargs; 1225 /* not needed for MS_SYSSPACE */ 1226 uap->datalen = 0; 1227 1228 /* use optptr to pass in extra mount options */ 1229 uap->flags |= MS_OPTIONSTR; 1230 uap->optptr = nfs4_trigger_create_mntopts(stubvfsp); 1231 if (uap->optptr == NULL) { 1232 retval = EINVAL; 1233 goto done; 1234 } 1235 1236 /* domount() expects us to count the trailing NUL */ 1237 uap->optlen = strlen(uap->optptr) + 1; 1238 1239 retval = domount(NULL, uap, stubvp, cr, vfsp); 1240 if (retval == 0) 1241 VFS_RELE(*vfsp); 1242 1243 done: 1244 if (uap->optptr) 1245 nfs4_trigger_destroy_mntopts(uap->optptr); 1246 1247 kmem_free(uap->spec, spec_len + 1); 1248 kmem_free(uap, sizeof (struct mounta)); 1249 kmem_free(mntpt, mntpt_len + 1); 1250 1251 return (retval); 1252 } 1253 1254 /* 1255 * Build an nfs_args structure for passing to domount(). 1256 * 1257 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t; 1258 * generic data - common to all ephemeral mount types - is read directly 1259 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode. 
 */
static struct nfs_args *
nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
    ephemeral_servinfo_t *esi)
{
	sec_data_t *secdata;
	struct nfs_args *nargs;

	/* setup the nfs args */
	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);

	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	/* pointer copy: nargs shares the esi's buffers, it does not own them */
	nargs->addr = esi->esi_addr;

	/* for AUTH_DH by negotiation */
	if (esi->esi_syncaddr || esi->esi_netname) {
		nargs->flags |= NFSMNT_SECURE;
		nargs->syncaddr = esi->esi_syncaddr;
		nargs->netname = esi->esi_netname;
	}

	nargs->flags |= NFSMNT_KNCONF;
	nargs->knconf = esi->esi_knconf;
	nargs->flags |= NFSMNT_HOSTNAME;
	nargs->hostname = esi->esi_hostname;
	nargs->fh = esi->esi_path;

	/* general mount settings, all copied from parent mount */
	mutex_enter(&mi->mi_lock);

	if (!(mi->mi_flags & MI4_HARD))
		nargs->flags |= NFSMNT_SOFT;

	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
	    NFSMNT_RETRANS;
	nargs->wsize = mi->mi_stsize;
	nargs->rsize = mi->mi_tsize;
	nargs->timeo = mi->mi_timeo;
	nargs->retrans = mi->mi_retrans;

	if (mi->mi_flags & MI4_INT)
		nargs->flags |= NFSMNT_INT;
	if (mi->mi_flags & MI4_NOAC)
		nargs->flags |= NFSMNT_NOAC;

	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
	    NFSMNT_ACDIRMAX;
	nargs->acregmin = HR2SEC(mi->mi_acregmin);
	nargs->acregmax = HR2SEC(mi->mi_acregmax);
	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);

	if (mi->mi_flags & MI4_NOCTO)
		nargs->flags |= NFSMNT_NOCTO;
	if (mi->mi_flags & MI4_GRPID)
		nargs->flags |= NFSMNT_GRPID;
	if (mi->mi_flags & MI4_LLOCK)
		nargs->flags |= NFSMNT_LLOCK;
	if (mi->mi_flags & MI4_NOPRINT)
		nargs->flags |= NFSMNT_NOPRINT;
	if (mi->mi_flags & MI4_DIRECTIO)
		nargs->flags |= NFSMNT_DIRECTIO;
	if (mi->mi_flags & MI4_PUBLIC)
		nargs->flags |= NFSMNT_PUBLIC;

	mutex_exit(&mi->mi_lock);

	/* add any specific flags for this type of ephemeral mount */
	nargs->flags |= esi->esi_mount_flags;

	/*
	 * Security data & negotiation policy.
	 *
	 * We need to preserve the parent mount's preference for security
	 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT.
	 *
	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
	 * security flavour was requested, with data in sv_secdata, and that
	 * no negotiation should occur. If this specified flavour fails, that's
	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
	 *
	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
	 * default flavour, in sv_secdata, but then negotiate a new flavour.
	 * Possible flavours are recorded in an array in sv_secinfo, with
	 * currently in-use flavour pointed to by sv_currsec.
	 *
	 * If sv_currsec is set, i.e. if negotiation has already occurred,
	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
	 */
	if (svp->sv_flags & SV4_TRYSECDEFAULT) {
		/* enable negotiation for ephemeral mount */
		nargs->flags |= NFSMNT_SECDEFAULT;

		/*
		 * As a starting point for negotiation, copy parent
		 * mount's negotiated flavour (sv_currsec) if available,
		 * or its passed-in flavour (sv_secdata) if not.
		 */
		if (svp->sv_currsec != NULL)
			secdata = copy_sec_data(svp->sv_currsec);
		else if (svp->sv_secdata != NULL)
			secdata = copy_sec_data(svp->sv_secdata);
		else
			secdata = NULL;
	} else {
		/* do not enable negotiation; copy parent's passed-in flavour */
		if (svp->sv_secdata != NULL)
			secdata = copy_sec_data(svp->sv_secdata);
		else
			secdata = NULL;
	}

	nfs_rw_exit(&svp->sv_lock);

	nargs->flags |= NFSMNT_NEWARGS;
	nargs->nfs_args_ext = NFS_ARGS_EXTB;
	nargs->nfs_ext_u.nfs_extB.secdata = secdata;

	/* for NFS RO failover; caller will set if necessary */
	nargs->nfs_ext_u.nfs_extB.next = NULL;

	return (nargs);
}

/*
 * Free an nfs_args built by nfs4_trigger_nargs_create().
 */
static void
nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
{
	/*
	 * Either the mount failed, in which case the data is not needed, or
	 * nfs4_mount() has either taken copies of what it needs or,
	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
	 * whereby nfs4_free_args() will ignore it.
	 */
	nfs4_free_args(nargs);
	kmem_free(nargs, sizeof (struct nfs_args));
}

/*
 * When we finally get into the mounting, we need to add this
 * node to the ephemeral tree.
 *
 * This is called from nfs4_mount().
 *
 * Returns 0 on success, EBUSY if the tree disappeared or the parent's
 * ephemeral node was torn down underneath us (the caller may retry).
 */
int
nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
{
	mntinfo4_t *mi_parent;
	nfs4_ephemeral_t *eph;
	nfs4_ephemeral_tree_t *net;

	nfs4_ephemeral_t *prior;
	nfs4_ephemeral_t *child;

	nfs4_ephemeral_t *peer;

	nfs4_trigger_globals_t *ntg;
	zone_t *zone = curproc->p_zone;

	int rc = 0;

	mi_parent = VTOMI4(mvp);

	/*
	 * Get this before grabbing anything else!
	 */
	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	if (!ntg->ntg_thread_started) {
		nfs4_ephemeral_start_harvester(ntg);
	}

	/* lock order: parent's mi_lock before child's */
	mutex_enter(&mi_parent->mi_lock);
	mutex_enter(&mi->mi_lock);

	net = mi->mi_ephemeral_tree =
	    mi_parent->mi_ephemeral_tree;

	/*
	 * If the mi_ephemeral_tree is NULL, then it
	 * means that either the harvester or a manual
	 * umount has cleared the tree out right before
	 * we got here.
	 *
	 * There is nothing we can do here, so return
	 * to the caller and let them decide whether they
	 * try again.
	 */
	if (net == NULL) {
		mutex_exit(&mi->mi_lock);
		mutex_exit(&mi_parent->mi_lock);

		return (EBUSY);
	}

	nfs4_ephemeral_tree_hold(net);

	/*
	 * We need to tack together the ephemeral mount
	 * with this new mntinfo.
	 */
	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
	eph->ne_mount = mi;
	eph->ne_ref_time = gethrestime_sec();

	/*
	 * We need to tell the ephemeral mount when
	 * to time out.
	 */
	eph->ne_mount_to = ntg->ntg_mount_to;

	mi->mi_flags |= MI4_EPHEMERAL;
	mi->mi_ephemeral = eph;

	/*
	 * If the enclosing mntinfo4 is also ephemeral,
	 * then we need to point to its enclosing parent.
	 * Else the enclosing mntinfo4 is the enclosing parent.
	 *
	 * We also need to weave this ephemeral node
	 * into the tree.
	 */
	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
		/*
		 * We need to decide if we are
		 * the root node of this branch
		 * or if we are a sibling of this
		 * branch.
		 */
		prior = mi_parent->mi_ephemeral;
		if (prior == NULL) {
			/*
			 * Race condition, clean up, and
			 * let caller handle mntinfo.
			 */
			mi->mi_flags &= ~MI4_EPHEMERAL;
			mi->mi_ephemeral = NULL;
			kmem_free(eph, sizeof (*eph));
			rc = EBUSY;
		} else {
			/* push the new node onto the parent's child list */
			if (prior->ne_child == NULL) {
				prior->ne_child = eph;
			} else {
				child = prior->ne_child;

				prior->ne_child = eph;
				eph->ne_peer = child;

				child->ne_prior = eph;
			}

			eph->ne_prior = prior;
		}
	} else {
		/*
		 * The parent mntinfo4 is the non-ephemeral
		 * root of the ephemeral tree. We
		 * need to decide if we are the root
		 * node of that tree or if we are a
		 * sibling of the root node.
		 *
		 * We are the root if there is no
		 * other node.
		 */
		if (net->net_root == NULL) {
			net->net_root = eph;
		} else {
			eph->ne_peer = peer = net->net_root;
			ASSERT(peer != NULL);
			net->net_root = eph;

			peer->ne_prior = eph;
		}

		eph->ne_prior = NULL;
	}

	nfs4_ephemeral_tree_rele(net);

	mutex_exit(&mi->mi_lock);
	mutex_exit(&mi_parent->mi_lock);

	return (rc);
}

/*
 * Commit the changes to the ephemeral tree for removing this node.
 * Unlinks eph from its prior/peer neighbours, or repoints the tree
 * root when eph was the root.
 */
static void
nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
{
	nfs4_ephemeral_t *e = eph;
	nfs4_ephemeral_t *peer;
	nfs4_ephemeral_t *prior;

	peer = eph->ne_peer;
	prior = e->ne_prior;

	/*
	 * If this branch root was not the
	 * tree root, then we need to fix back pointers.
	 */
	if (prior) {
		if (prior->ne_child == e) {
			prior->ne_child = peer;
		} else {
			prior->ne_peer = peer;
		}

		if (peer)
			peer->ne_prior = prior;
	} else if (peer) {
		/* eph was the tree root; promote its first peer */
		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
		peer->ne_prior = NULL;
	} else {
		/* eph was the only node in the tree */
		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
	}
}

/*
 * We want to avoid recursion at all costs. So we need to
 * unroll the tree.
 * We do this by a depth first traversal to
 * leaf nodes. We blast away the leaf and work our way back
 * up and down the tree.
 */
static int
nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
    int isTreeRoot, int flag, cred_t *cr)
{
	nfs4_ephemeral_t *e = eph;
	nfs4_ephemeral_t *prior;
	mntinfo4_t *mi;
	vfs_t *vfsp;
	int error;

	/*
	 * We use the loop while unrolling the ephemeral tree.
	 */
	for (;;) {
		/*
		 * First we walk down the child.
		 */
		if (e->ne_child) {
			prior = e;
			e = e->ne_child;
			continue;
		}

		/*
		 * If we are the root of the branch we are removing,
		 * we end it here. But if the branch is the root of
		 * the tree, we have to forge on. We do not consider
		 * the peer list for the root because while it may
		 * be okay to remove, it is both extra work and a
		 * potential for a false-positive error to stall the
		 * unmount attempt.
		 */
		if (e == eph && isTreeRoot == FALSE)
			return (0);

		/*
		 * Next we walk down the peer list.
		 */
		if (e->ne_peer) {
			prior = e;
			e = e->ne_peer;
			continue;
		}

		/*
		 * We can only remove the node passed in by the
		 * caller if it is the root of the ephemeral tree.
		 * Otherwise, the caller will remove it.
		 */
		if (e == eph && isTreeRoot == FALSE)
			return (0);

		/*
		 * Okay, we have a leaf node, time
		 * to prune it!
		 *
		 * Note that prior can only be NULL if
		 * and only if it is the root of the
		 * ephemeral tree.
		 */
		prior = e->ne_prior;

		mi = e->ne_mount;
		mutex_enter(&mi->mi_lock);
		vfsp = mi->mi_vfsp;

		/*
		 * Cleared by umount2_engine.
		 */
		VFS_HOLD(vfsp);

		/*
		 * Inform nfs4_unmount to not recursively
		 * descend into this node's children when it
		 * gets processed.
		 */
		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
		mutex_exit(&mi->mi_lock);

		error = umount2_engine(vfsp, flag, cr, FALSE);
		if (error) {
			/*
			 * We need to reenable nfs4_unmount's ability
			 * to recursively descend on this node.
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
			mutex_exit(&mi->mi_lock);

			return (error);
		}

		/*
		 * If we are the current node, we do not want to
		 * touch anything else. At this point, the only
		 * way the current node can have survived to here
		 * is if it is the root of the ephemeral tree and
		 * we are unmounting the enclosing mntinfo4.
		 */
		if (e == eph) {
			ASSERT(prior == NULL);
			return (0);
		}

		/*
		 * Stitch up the prior node. Note that since
		 * we have handled the root of the tree, prior
		 * must be non-NULL.
		 */
		ASSERT(prior != NULL);
		if (prior->ne_child == e) {
			prior->ne_child = NULL;
		} else {
			ASSERT(prior->ne_peer == e);

			prior->ne_peer = NULL;
		}

		/* back up to the parent/sibling and keep pruning */
		e = prior;
	}

	/* NOTREACHED */
}

/*
 * Common code to safely release net_cnt_lock and net_tree_lock
 */
void
nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
    bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
{
	nfs4_ephemeral_tree_t *net = *pnet;

	if (*pmust_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
		if (*pmust_rele)
			nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);

		*pmust_unlock = FALSE;
	}
}

/*
 * While we may have removed any child or sibling nodes of this
 * ephemeral node, we can not nuke it until we know that there
 * were no active vnodes on it. This will do that final
 * work once we know it is not busy.
 */
void
nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
    bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
{
	/*
	 * Now we need to get rid of the ephemeral data if it exists.
	 */
	mutex_enter(&mi->mi_lock);
	if (mi->mi_ephemeral) {
		/*
		 * If we are the root node of an ephemeral branch
		 * which is being removed, then we need to fixup
		 * pointers into and out of the node.
		 */
		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);

		ASSERT(mi->mi_ephemeral != NULL);

		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
		mi->mi_ephemeral = NULL;
	}
	mutex_exit(&mi->mi_lock);

	nfs4_ephemeral_umount_unlock(pmust_unlock, pmust_rele, pnet);
}

/*
 * Unmount an ephemeral node.
 *
 * On return, *pmust_unlock/*pmust_rele tell the caller whether it still
 * owes a call to nfs4_ephemeral_umount_unlock() (via _activate()), and
 * *pnet is the tree the flags refer to.
 */
int
nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
    bool_t *pmust_unlock, bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
{
	int error = 0;
	nfs4_ephemeral_t *eph;
	nfs4_ephemeral_tree_t *net;
	int is_derooting = FALSE;
	int is_recursed = FALSE;
	int was_locked = FALSE;

	/*
	 * Make sure to set the default state for cleaning
	 * up the tree in the caller (and on the way out).
	 */
	*pmust_unlock = *pmust_rele = FALSE;

	/*
	 * The active vnodes on this file system may be ephemeral
	 * children. We need to check for and try to unmount them
	 * here. If any can not be unmounted, we are going
	 * to return EBUSY.
	 */
	mutex_enter(&mi->mi_lock);

	/*
	 * If an ephemeral tree, we need to check to see if
	 * the lock is already held. If it is, then we need
	 * to see if we are being called as a result of
	 * the recursive removal of some node of the tree or
	 * if we are another attempt to remove the tree.
	 *
	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
	 * node. mi_ephemeral being non-NULL also does this.
	 *
	 * mi_ephemeral_tree being non-NULL is sufficient
	 * to also indicate either it is an ephemeral node
	 * or the enclosing mntinfo4.
	 *
	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
	 * when we delete the ephemeral node and need to
	 * differentiate from an ephemeral node and the
	 * enclosing root node.
	 */
	*pnet = net = mi->mi_ephemeral_tree;
	if (net == NULL) {
		mutex_exit(&mi->mi_lock);
		return (0);
	}

	eph = mi->mi_ephemeral;
	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
	is_derooting = (eph == NULL);

	/*
	 * If this is not recursion, then we need to
	 * grab a ref count.
	 *
	 * But wait, we also do not want to do that
	 * if a harvester thread has already grabbed
	 * the lock.
	 */
	if (!is_recursed) {
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status &
		    NFS4_EPHEMERAL_TREE_LOCKED) {
			/*
			 * If the tree is locked, we need
			 * to decide whether we are the
			 * harvester or some explicit call
			 * for a umount. The only way that
			 * we are the harvester is if
			 * MS_SYSSPACE is set.
			 *
			 * We only let the harvester through
			 * at this point.
			 *
			 * We return EBUSY so that the
			 * caller knows something is
			 * going on. Note that by that
			 * time, the umount in the other
			 * thread may have already occurred.
			 */
			if (!(flag & MS_SYSSPACE)) {
				mutex_exit(&net->net_cnt_lock);
				mutex_exit(&mi->mi_lock);

				return (EBUSY);
			}

			was_locked = TRUE;
		} else {
			nfs4_ephemeral_tree_incr(net);
			*pmust_rele = TRUE;
		}

		mutex_exit(&net->net_cnt_lock);
	}
	mutex_exit(&mi->mi_lock);

	/*
	 * If we are not the harvester, we need to check
	 * to see if we need to grab the tree lock.
	 */
	if (was_locked == FALSE) {
		/*
		 * If we grab the lock, it means that no other
		 * operation is working on the tree. If we don't
		 * grab it, we need to decide if this is because
		 * we are a recursive call or a new operation.
		 */
		if (mutex_tryenter(&net->net_tree_lock)) {
			*pmust_unlock = TRUE;
		} else {
			/*
			 * If we are a recursive call, we can
			 * proceed without the lock.
			 * Otherwise we have to wait until
			 * the lock becomes free.
			 */
			if (!is_recursed) {
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    (NFS4_EPHEMERAL_TREE_DEROOTING
				    | NFS4_EPHEMERAL_TREE_INVALID)) {
					nfs4_ephemeral_tree_decr(net);
					mutex_exit(&net->net_cnt_lock);
					*pmust_rele = FALSE;
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);

				/*
				 * We can't hold any other locks whilst
				 * we wait on this to free up.
				 */
				mutex_enter(&net->net_tree_lock);

				/*
				 * Note that while mi->mi_ephemeral
				 * may change and thus we have to
				 * update eph, it is the case that
				 * we have tied down net and
				 * do not care if mi->mi_ephemeral_tree
				 * has changed.
				 */
				mutex_enter(&mi->mi_lock);
				eph = mi->mi_ephemeral;
				mutex_exit(&mi->mi_lock);

				/*
				 * Okay, we need to see if either the
				 * tree got nuked or the current node
				 * got nuked. Both of which will cause
				 * an error.
				 *
				 * Note that a subsequent retry of the
				 * umount shall work.
				 */
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    NFS4_EPHEMERAL_TREE_INVALID ||
				    (!is_derooting && eph == NULL)) {
					nfs4_ephemeral_tree_decr(net);
					mutex_exit(&net->net_cnt_lock);
					mutex_exit(&net->net_tree_lock);
					*pmust_rele = FALSE;
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);
				*pmust_unlock = TRUE;
			}
		}
	}

	/*
	 * Only once we have grabbed the lock can we mark what we
	 * are planning on doing to the ephemeral tree.
	 */
	if (*pmust_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;

		/*
		 * Check to see if we are nuking the root.
		 */
		if (is_derooting)
			net->net_status |=
			    NFS4_EPHEMERAL_TREE_DEROOTING;
		mutex_exit(&net->net_cnt_lock);
	}

	if (!is_derooting) {
		/*
		 * Only work on children if the caller has not already
		 * done so.
		 */
		if (!is_recursed) {
			ASSERT(eph != NULL);

			error = nfs4_ephemeral_unmount_engine(eph,
			    FALSE, flag, cr);
			if (error)
				goto is_busy;
		}
	} else {
		eph = net->net_root;

		/*
		 * Only work if there is something there.
		 */
		if (eph) {
			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
			    flag, cr);
			if (error) {
				mutex_enter(&net->net_cnt_lock);
				net->net_status &=
				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
				mutex_exit(&net->net_cnt_lock);
				goto is_busy;
			}

			/*
			 * Nothing else which goes wrong will
			 * invalidate the blowing away of the
			 * ephemeral tree.
			 */
			net->net_root = NULL;
		}

		/*
		 * We have derooted and we have caused the tree to be
		 * invalidated.
		 */
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
		if (was_locked == FALSE)
			nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		if (was_locked == FALSE)
			mutex_exit(&net->net_tree_lock);

		/*
		 * We have just blown away any notation of this
		 * tree being locked. We can't let the caller
		 * try to clean things up.
		 */
		*pmust_unlock = FALSE;

		/*
		 * At this point, the tree should no longer be
		 * associated with the mntinfo4. We need to pull
		 * it off there and let the harvester take
		 * care of it once the refcnt drops.
		 */
		mutex_enter(&mi->mi_lock);
		mi->mi_ephemeral_tree = NULL;
		mutex_exit(&mi->mi_lock);
	}

	return (0);

is_busy:

	nfs4_ephemeral_umount_unlock(pmust_unlock, pmust_rele,
	    pnet);

	return (error);
}

/*
 * Do the umount and record any error in the parent.
 */
static void
nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
    nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
{
	int error;

	error = umount2_engine(vfsp, flag, kcred, FALSE);
	if (error) {
		if (prior) {
			/* tell the prior node which link failed */
			if (prior->ne_child == e)
				prior->ne_state |=
				    NFS4_EPHEMERAL_CHILD_ERROR;
			else
				prior->ne_state |=
				    NFS4_EPHEMERAL_PEER_ERROR;
		}
	}
}

/*
 * For each tree in the forest (where the forest is in
 * effect all of the ephemeral trees for this zone),
 * scan to see if a node can be unmounted. Note that
 * unlike nfs4_ephemeral_unmount_engine(), we do
 * not process the current node before children or
 * siblings. I.e., if a node can be unmounted, we
 * do not recursively check to see if the nodes
 * hanging off of it can also be unmounted.
 *
 * Instead, we delve down deep to try and remove the
 * children first. Then, because we share code with
 * nfs4_ephemeral_unmount_engine(), we will try
 * them again. This could be a performance issue in
 * the future.
 *
 * Also note that unlike nfs4_ephemeral_unmount_engine(),
 * we do not halt on an error. We will not remove the
 * current node, but we will keep on trying to remove
 * the others.
 *
 * force indicates that we want the unmount to occur
 * even if there is something blocking it.
 *
 * time_check indicates that we want to see if the
 * mount has expired past mount_to or not. Typically
 * we want to do this and only on a shutdown of the
 * zone would we want to ignore the check.
 */
static void
nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
    bool_t force, bool_t time_check)
{
	nfs4_ephemeral_tree_t *net;
	nfs4_ephemeral_tree_t *prev = NULL;
	nfs4_ephemeral_tree_t *next;
	nfs4_ephemeral_t *e;
	nfs4_ephemeral_t *prior;
	time_t now = gethrestime_sec();

	/* local list of invalid trees to free after the scan */
	nfs4_ephemeral_tree_t *harvest = NULL;

	int flag;

	mntinfo4_t *mi;
	vfs_t *vfsp;

	if (force)
		flag = MS_FORCE | MS_SYSSPACE;
	else
		flag = MS_SYSSPACE;

	mutex_enter(&ntg->ntg_forest_lock);
	for (net = ntg->ntg_forest; net != NULL; net = next) {
		next = net->net_next;

		nfs4_ephemeral_tree_hold(net);

		mutex_enter(&net->net_tree_lock);

		/*
		 * Let the unmount code know that the
		 * tree is already locked!
		 */
		mutex_enter(&net->net_cnt_lock);
		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
		mutex_exit(&net->net_cnt_lock);

		/*
		 * If the intent is force all ephemeral nodes to
		 * be unmounted in this zone, we can short circuit a
		 * lot of tree traversal and simply zap the root node.
		 */
		if (force) {
			if (net->net_root) {
				mi = net->net_root->ne_mount;
				vfsp = mi->mi_vfsp;

				/*
				 * Cleared by umount2_engine.
				 */
				VFS_HOLD(vfsp);

				(void) umount2_engine(vfsp, flag,
				    kcred, FALSE);

				goto check_done;
			}
		}

		/* explicit-state depth-first walk: child, then peer, then self */
		e = net->net_root;
		if (e)
			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;

		while (e) {
			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
				if (e->ne_child) {
					e = e->ne_child;
					e->ne_state =
					    NFS4_EPHEMERAL_VISIT_CHILD;
				}

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_VISIT_SIBLING) {
				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
				if (e->ne_peer) {
					e = e->ne_peer;
					e->ne_state =
					    NFS4_EPHEMERAL_VISIT_CHILD;
				}

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_CHILD_ERROR) {
				prior = e->ne_prior;

				/*
				 * If a child reported an error, do
				 * not bother trying to unmount.
				 *
				 * If your prior node is a parent,
				 * pass the error up such that they
				 * also do not try to unmount.
				 *
				 * However, if your prior is a sibling,
				 * let them try to unmount if they can.
				 */
				if (prior) {
					if (prior->ne_child == e)
						prior->ne_state |=
						    NFS4_EPHEMERAL_CHILD_ERROR;
					else
						prior->ne_state |=
						    NFS4_EPHEMERAL_PEER_ERROR;
				}

				/*
				 * Clear the error and if needed, process peers.
				 *
				 * Once we mask out the error, we know whether
				 * or not we have to process another node.
				 */
				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
					e = prior;

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_PEER_ERROR) {
				prior = e->ne_prior;

				if (prior) {
					if (prior->ne_child == e)
						prior->ne_state =
						    NFS4_EPHEMERAL_CHILD_ERROR;
					else
						prior->ne_state =
						    NFS4_EPHEMERAL_PEER_ERROR;
				}

				/*
				 * Clear the error from this node and do the
				 * correct processing.
				 */
				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
				continue;
			}

			prior = e->ne_prior;
			e->ne_state = NFS4_EPHEMERAL_OK;

			/*
			 * It must be the case that we need to process
			 * this node.
			 */
			if (!time_check ||
			    now - e->ne_ref_time > e->ne_mount_to) {
				mi = e->ne_mount;
				vfsp = mi->mi_vfsp;

				/*
				 * Cleared by umount2_engine.
				 */
				VFS_HOLD(vfsp);

				/*
				 * Note that we effectively work down to the
				 * leaf nodes first, try to unmount them,
				 * then work our way back up into the leaf
				 * nodes.
				 *
				 * Also note that we deal with a lot of
				 * complexity by sharing the work with
				 * the manual unmount code.
				 */
				nfs4_ephemeral_record_umount(vfsp, flag,
				    e, prior);
			}

			e = prior;
		}

check_done:

		/*
		 * At this point we are done processing this tree.
		 *
		 * If the tree is invalid and we are the only reference
		 * to it, then we push it on the local linked list
		 * to remove it at the end. We avoid that action now
		 * to keep the tree processing going along at a fair clip.
		 *
		 * Else, even if we are the only reference, we drop
		 * our hold on the current tree and allow it to be
		 * reused as needed.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_refcnt == 1 &&
		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
			nfs4_ephemeral_tree_decr(net);
			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			/* unlink from the forest; prev stays put */
			if (prev)
				prev->net_next = net->net_next;
			else
				ntg->ntg_forest = net->net_next;

			net->net_next = harvest;
			harvest = net;
			continue;
		}

		nfs4_ephemeral_tree_decr(net);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
		mutex_exit(&net->net_cnt_lock);
		mutex_exit(&net->net_tree_lock);

		prev = net;
	}
	mutex_exit(&ntg->ntg_forest_lock);

	/* now tear down the trees we collected, outside the forest lock */
	for (net = harvest; net != NULL; net = next) {
		next = net->net_next;

		mutex_destroy(&net->net_tree_lock);
		mutex_destroy(&net->net_cnt_lock);
		kmem_free(net, sizeof (*net));
	}
}

/*
 * This is the thread which decides when the harvesting
 * can proceed and when to kill it off for this zone.
 */
static void
nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
{
	clock_t timeleft;
	zone_t *zone = curproc->p_zone;

	for (;;) {
		/* sleep for the timer period or until the zone shuts down */
		timeleft = zone_status_timedwait(zone, lbolt +
		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);

		/*
		 * zone is exiting...
		 */
		if (timeleft != -1) {
			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
			zthread_exit();
			/* NOTREACHED */
		}

		/*
		 * Only bother scanning if there is potential
		 * work to be done.
		 */
		if (ntg->ntg_forest == NULL)
			continue;

		/*
		 * Now scan the list and get rid of everything which
		 * is old.
		 */
		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
	}

	/* NOTREACHED */
}

/*
 * The zone specific glue needed to start the unmount harvester.
 *
 * Note that we want to avoid holding the mutex as long as possible,
 * hence the multiple checks.
 *
 * The caller should avoid us getting down here in the first
 * place.
 */
static void
nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
{
	/*
	 * It got started before we got here...
	 */
	if (ntg->ntg_thread_started)
		return;

	mutex_enter(&nfs4_ephemeral_thread_lock);

	/*
	 * Re-check under the lock: another thread may have won the
	 * race between our unlocked check above and acquiring the
	 * mutex.
	 */
	if (ntg->ntg_thread_started) {
		mutex_exit(&nfs4_ephemeral_thread_lock);
		return;
	}

	/*
	 * Start the unmounter harvester thread for this zone.
	 */
	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
	    ntg, 0, minclsyspri);

	ntg->ntg_thread_started = TRUE;
	mutex_exit(&nfs4_ephemeral_thread_lock);
}

/*
 * Zone-specific-data constructor: allocate and initialize the
 * per-zone trigger globals.  The harvester thread itself is started
 * lazily via nfs4_ephemeral_start_harvester().
 */
/*ARGSUSED*/
static void *
nfs4_ephemeral_zsd_create(zoneid_t zoneid)
{
	nfs4_trigger_globals_t *ntg;

	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
	ntg->ntg_thread_started = FALSE;

	/*
	 * This is the default....
	 *
	 * NOTE(review): the default is seeded from
	 * nfs4_trigger_thread_timer, which is the harvester wakeup
	 * interval (in seconds) -- confirm this is the intended
	 * default for the mount timeout rather than a separate
	 * mount_to tunable.
	 */
	ntg->ntg_mount_to = nfs4_trigger_thread_timer;

	mutex_init(&ntg->ntg_forest_lock, NULL,
	    MUTEX_DEFAULT, NULL);

	return (ntg);
}

/*
 * Try a nice gentle walk down the forest and convince
 * all of the trees to gracefully give it up.
 */
/*ARGSUSED*/
static void
nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
{
	nfs4_trigger_globals_t *ntg = arg;

	if (!ntg)
		return;

	/*
	 * Best-effort pass; presumably the flags are (force,
	 * time_check) given the calls in the harvester and in
	 * zsd_destroy -- verify against the function definition.
	 */
	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
}

/*
 * Race along the forest and rip all of the trees out by
 * their rootballs!
 */
/*ARGSUSED*/
static void
nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
{
	nfs4_trigger_globals_t *ntg = arg;

	if (!ntg)
		return;

	/*
	 * Forcibly harvest every tree, then tear down the per-zone
	 * globals allocated by nfs4_ephemeral_zsd_create().
	 */
	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);

	mutex_destroy(&ntg->ntg_forest_lock);
	kmem_free(ntg, sizeof (*ntg));
}

/*
 * This is the zone independent cleanup needed for
 * ephemeral mount processing.
 */
void
nfs4_ephemeral_fini(void)
{
	(void) zone_key_delete(nfs4_ephemeral_key);
	mutex_destroy(&nfs4_ephemeral_thread_lock);
}

/*
 * This is the zone independent initialization needed for
 * ephemeral mount processing.
 */
void
nfs4_ephemeral_init(void)
{
	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
	    NULL);

	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
}

/*
 * nfssys() calls this function to set the per-zone
 * value of mount_to to drive when an ephemeral mount is
 * timed out. Each mount will grab a copy of this value
 * when mounted.
 */
void
nfs4_ephemeral_set_mount_to(uint_t mount_to)
{
	nfs4_trigger_globals_t *ntg;
	zone_t *zone = curproc->p_zone;

	/*
	 * NOTE(review): the result is used unchecked; this assumes the
	 * ZSD value always exists once nfs4_ephemeral_init() has
	 * registered the key -- confirm zone_getspecific() cannot
	 * return NULL here.
	 */
	ntg = zone_getspecific(nfs4_ephemeral_key, zone);

	ntg->ntg_mount_to = mount_to;
}

/*
 * Walk the list of v4 mount options; if they are currently set in vfsp,
 * append them to a new comma-separated mount option string, and return it.
 *
 * Caller should free by calling nfs4_trigger_destroy_mntopts().
2492 */ 2493 static char * 2494 nfs4_trigger_create_mntopts(vfs_t *vfsp) 2495 { 2496 uint_t i; 2497 char *mntopts; 2498 struct vfssw *vswp; 2499 mntopts_t *optproto; 2500 2501 mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP); 2502 2503 /* get the list of applicable mount options for v4; locks *vswp */ 2504 vswp = vfs_getvfssw(MNTTYPE_NFS4); 2505 optproto = &vswp->vsw_optproto; 2506 2507 for (i = 0; i < optproto->mo_count; i++) { 2508 struct mntopt *mop = &optproto->mo_list[i]; 2509 2510 if (mop->mo_flags & MO_EMPTY) 2511 continue; 2512 2513 if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) { 2514 kmem_free(mntopts, MAX_MNTOPT_STR); 2515 vfs_unrefvfssw(vswp); 2516 return (NULL); 2517 } 2518 } 2519 2520 vfs_unrefvfssw(vswp); 2521 2522 /* 2523 * MNTOPT_XATTR is not in the v4 mount opt proto list, 2524 * and it may only be passed via MS_OPTIONSTR, so we 2525 * must handle it here. 2526 * 2527 * Ideally, it would be in the list, but NFS does not specify its 2528 * own opt proto list, it uses instead the default one. Since 2529 * not all filesystems support extended attrs, it would not be 2530 * appropriate to add it there. 2531 */ 2532 if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) || 2533 nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) { 2534 kmem_free(mntopts, MAX_MNTOPT_STR); 2535 return (NULL); 2536 } 2537 2538 return (mntopts); 2539 } 2540 2541 static void 2542 nfs4_trigger_destroy_mntopts(char *mntopts) 2543 { 2544 if (mntopts) 2545 kmem_free(mntopts, MAX_MNTOPT_STR); 2546 } 2547 2548 /* 2549 * Check a single mount option (optname). Add to mntopts if it is set in VFS. 
2550 */ 2551 static int 2552 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp) 2553 { 2554 if (mntopts == NULL || optname == NULL || vfsp == NULL) 2555 return (EINVAL); 2556 2557 if (vfs_optionisset(vfsp, optname, NULL)) { 2558 size_t mntoptslen = strlen(mntopts); 2559 size_t optnamelen = strlen(optname); 2560 2561 /* +1 for ',', +1 for NUL */ 2562 if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR) 2563 return (EOVERFLOW); 2564 2565 /* first or subsequent mount option? */ 2566 if (*mntopts != '\0') 2567 (void) strcat(mntopts, ","); 2568 2569 (void) strcat(mntopts, optname); 2570 } 2571 2572 return (0); 2573 } 2574 2575 static enum clnt_stat 2576 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr) 2577 { 2578 int retries, error; 2579 uint_t max_msgsize; 2580 enum clnt_stat status; 2581 CLIENT *cl; 2582 struct timeval timeout; 2583 2584 /* as per recov_newserver() */ 2585 max_msgsize = 0; 2586 retries = 1; 2587 timeout.tv_sec = 2; 2588 timeout.tv_usec = 0; 2589 2590 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM, 2591 NFS_V4, max_msgsize, retries, CRED(), &cl); 2592 if (error) 2593 return (RPC_FAILED); 2594 2595 if (nointr) 2596 cl->cl_nosignal = TRUE; 2597 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL, 2598 timeout); 2599 if (nointr) 2600 cl->cl_nosignal = FALSE; 2601 2602 AUTH_DESTROY(cl->cl_auth); 2603 CLNT_DESTROY(cl); 2604 2605 return (status); 2606 } 2607