1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are 31 * triggered from a "stub" rnode via a special set of vnodeops. 32 */ 33 34 #include <sys/param.h> 35 #include <sys/types.h> 36 #include <sys/systm.h> 37 #include <sys/cred.h> 38 #include <sys/time.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/vfs_opreg.h> 42 #include <sys/file.h> 43 #include <sys/filio.h> 44 #include <sys/uio.h> 45 #include <sys/buf.h> 46 #include <sys/mman.h> 47 #include <sys/pathname.h> 48 #include <sys/dirent.h> 49 #include <sys/debug.h> 50 #include <sys/vmsystm.h> 51 #include <sys/fcntl.h> 52 #include <sys/flock.h> 53 #include <sys/swap.h> 54 #include <sys/errno.h> 55 #include <sys/strsubr.h> 56 #include <sys/sysmacros.h> 57 #include <sys/kmem.h> 58 #include <sys/mount.h> 59 #include <sys/cmn_err.h> 60 #include <sys/pathconf.h> 61 #include <sys/utsname.h> 62 #include <sys/dnlc.h> 63 #include <sys/acl.h> 64 #include <sys/systeminfo.h> 65 #include <sys/policy.h> 66 #include <sys/sdt.h> 67 #include <sys/list.h> 68 #include <sys/stat.h> 69 #include <sys/mntent.h> 70 71 #include <rpc/types.h> 72 #include <rpc/auth.h> 73 #include <rpc/clnt.h> 74 75 #include <nfs/nfs.h> 76 #include <nfs/nfs_clnt.h> 77 #include <nfs/nfs_acl.h> 78 #include <nfs/lm.h> 79 #include <nfs/nfs4.h> 80 #include <nfs/nfs4_kprot.h> 81 #include <nfs/rnode4.h> 82 #include <nfs/nfs4_clnt.h> 83 84 #include <vm/hat.h> 85 #include <vm/as.h> 86 #include <vm/page.h> 87 #include <vm/pvn.h> 88 #include <vm/seg.h> 89 #include <vm/seg_map.h> 90 #include <vm/seg_kpm.h> 91 #include <vm/seg_vn.h> 92 93 #include <fs/fs_subr.h> 94 95 #include <sys/ddi.h> 96 #include <sys/int_fmtio.h> 97 98 #include <sys/sunddi.h> 99 100 /* 101 * The automatic unmounter thread stuff! 102 */ 103 static int nfs4_trigger_thread_timer = 20; /* in seconds */ 104 105 /* 106 * Just a default.... 107 */ 108 static uint_t nfs4_trigger_mount_to = 240; 109 110 typedef struct nfs4_trigger_globals { 111 kmutex_t ntg_forest_lock; 112 uint_t ntg_mount_to; 113 int ntg_thread_started; 114 nfs4_ephemeral_tree_t *ntg_forest; 115 } nfs4_trigger_globals_t; 116 117 kmutex_t nfs4_ephemeral_thread_lock; 118 119 zone_key_t nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED; 120 121 static void nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *); 122 123 /* 124 * Used for ephemeral mounts; contains data either duplicated from 125 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount. 126 * 127 * It's intended that this structure is used solely for ephemeral 128 * mount-type specific data, for passing this data to 129 * nfs4_trigger_nargs_create(). 130 */ 131 typedef struct ephemeral_servinfo { 132 char *esi_hostname; 133 char *esi_netname; 134 char *esi_path; 135 int esi_path_len; 136 int esi_mount_flags; 137 struct netbuf *esi_addr; 138 struct netbuf *esi_syncaddr; 139 struct knetconfig *esi_knconf; 140 } ephemeral_servinfo_t; 141 142 /* 143 * Collect together the mount-type specific and generic data args. 144 */ 145 typedef struct domount_args { 146 ephemeral_servinfo_t *dma_esi; 147 char *dma_hostlist; /* comma-sep. for RO failover */ 148 struct nfs_args *dma_nargs; 149 } domount_args_t; 150 151 152 /* 153 * The vnode ops functions for a trigger stub vnode 154 */ 155 static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *); 156 static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *, 157 caller_context_t *); 158 static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *, 159 caller_context_t *); 160 static int nfs4_trigger_access(vnode_t *, int, int, cred_t *, 161 caller_context_t *); 162 static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *, 163 caller_context_t *); 164 static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **, 165 struct pathname *, int, vnode_t *, cred_t *, caller_context_t *, 166 int *, pathname_t *); 167 static int nfs4_trigger_create(vnode_t *, char *, struct vattr *, 168 enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *, 169 vsecattr_t *); 170 static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *, 171 int); 172 static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *, 173 caller_context_t *, int); 174 static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *, 175 cred_t *, caller_context_t *, int); 176 static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *, 177 vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp); 178 static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 179 caller_context_t *, int); 180 static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *, 181 cred_t *, caller_context_t *, int); 182 static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *); 183 184 /* 185 * Regular NFSv4 vnodeops that we need to reference directly 186 */ 187 extern int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *, 188 caller_context_t *); 189 extern void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *); 190 extern int nfs4_rwlock(vnode_t *, int, caller_context_t *); 191 extern void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 192 extern int nfs4_lookup(vnode_t *, char *, vnode_t **, 193 struct pathname *, int, vnode_t *, cred_t *, 194 caller_context_t *, int *, pathname_t *); 195 extern int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *, 196 caller_context_t *); 197 extern int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 198 caller_context_t *); 199 extern int nfs4_fid(vnode_t *, fid_t *, caller_context_t *); 200 extern int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *); 201 202 static int nfs4_trigger_mount(vnode_t *, vnode_t **); 203 static int nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **, 204 cred_t *); 205 static domount_args_t *nfs4_trigger_domount_args_create(vnode_t *); 206 static void nfs4_trigger_domount_args_destroy(domount_args_t *dma, 207 vnode_t *vp); 208 static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *); 209 static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *); 210 static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *, 211 servinfo4_t *); 212 static struct nfs_args *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *, 213 ephemeral_servinfo_t *); 214 static void nfs4_trigger_nargs_destroy(struct nfs_args *); 215 static char *nfs4_trigger_create_mntopts(vfs_t *); 216 static void nfs4_trigger_destroy_mntopts(char *); 217 static int nfs4_trigger_add_mntopt(char *, char *, vfs_t *); 218 static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int); 219 220 extern int umount2_engine(vfs_t *, int, cred_t *, int); 221 222 223 vnodeops_t *nfs4_trigger_vnodeops; 224 225 /* 226 * These are the vnodeops that we must define for stub vnodes. 227 * 228 * 229 * Many of the VOPs defined for NFSv4 do not need to be defined here, 230 * for various reasons. This will result in the VFS default function being 231 * used: 232 * 233 * - These VOPs require a previous VOP_OPEN to have occurred. That will have 234 * lost the reference to the stub vnode, meaning these should not be called: 235 * close, read, write, ioctl, readdir, seek. 236 * 237 * - These VOPs are meaningless for vnodes without data pages. Since the 238 * stub vnode is of type VDIR, these should not be called: 239 * space, getpage, putpage, map, addmap, delmap, pageio, fsync. 240 * 241 * - These VOPs are otherwise not applicable, and should not be called: 242 * dump, setsecattr. 243 * 244 * 245 * These VOPs we do not want to define, but nor do we want the VFS default 246 * action. Instead, we specify the VFS error function, with fs_error(), but 247 * note that fs_error() is not actually called. Instead it results in the 248 * use of the error function defined for the particular VOP, in vn_ops_table[]: 249 * 250 * - frlock, dispose, shrlock. 251 * 252 * 253 * These VOPs we define to use the corresponding regular NFSv4 vnodeop. 254 * NOTE: if any of these ops involve an OTW call with the stub FH, then 255 * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo() 256 * to protect the security data in the servinfo4_t for the "parent" 257 * filesystem that contains the stub. 258 * 259 * - These VOPs should not trigger a mount, so that "ls -l" does not: 260 * pathconf, getsecattr. 261 * 262 * - These VOPs would not make sense to trigger: 263 * inactive, rwlock, rwunlock, fid, realvp. 264 */ 265 const fs_operation_def_t nfs4_trigger_vnodeops_template[] = { 266 VOPNAME_OPEN, { .vop_open = nfs4_trigger_open }, 267 VOPNAME_GETATTR, { .vop_getattr = nfs4_trigger_getattr }, 268 VOPNAME_SETATTR, { .vop_setattr = nfs4_trigger_setattr }, 269 VOPNAME_ACCESS, { .vop_access = nfs4_trigger_access }, 270 VOPNAME_LOOKUP, { .vop_lookup = nfs4_trigger_lookup }, 271 VOPNAME_CREATE, { .vop_create = nfs4_trigger_create }, 272 VOPNAME_REMOVE, { .vop_remove = nfs4_trigger_remove }, 273 VOPNAME_LINK, { .vop_link = nfs4_trigger_link }, 274 VOPNAME_RENAME, { .vop_rename = nfs4_trigger_rename }, 275 VOPNAME_MKDIR, { .vop_mkdir = nfs4_trigger_mkdir }, 276 VOPNAME_RMDIR, { .vop_rmdir = nfs4_trigger_rmdir }, 277 VOPNAME_SYMLINK, { .vop_symlink = nfs4_trigger_symlink }, 278 VOPNAME_READLINK, { .vop_readlink = nfs4_trigger_readlink }, 279 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive }, 280 VOPNAME_FID, { .vop_fid = nfs4_fid }, 281 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock }, 282 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock }, 283 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp }, 284 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr }, 285 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf }, 286 VOPNAME_FRLOCK, { .error = fs_error }, 287 VOPNAME_DISPOSE, { .error = fs_error }, 288 VOPNAME_SHRLOCK, { .error = fs_error }, 289 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 290 NULL, NULL 291 }; 292 293 /* 294 * Trigger ops for stub vnodes; for mirror mounts, etc. 295 * 296 * The general idea is that a "triggering" op will first call 297 * nfs4_trigger_mount(), which will find out whether a mount has already 298 * been triggered. 299 * 300 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode 301 * of the covering vfs. 302 * 303 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so, 304 * and again set newvp, as above. 305 * 306 * The triggering op may then re-issue the VOP by calling it on newvp. 307 * 308 * Note that some ops may perform custom action, and may or may not need 309 * to trigger a mount. 310 * 311 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We 312 * obviously can't do this with VOP_<whatever>, since it's a stub vnode 313 * and that would just recurse. Instead, we call the v4 op directly, 314 * by name. This is OK, since we know that the vnode is for NFSv4, 315 * otherwise it couldn't be a stub. 316 * 317 */ 318 319 static int 320 nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 321 { 322 int error; 323 vnode_t *newvp; 324 325 error = nfs4_trigger_mount(*vpp, &newvp); 326 if (error) 327 return (error); 328 329 /* Release the stub vnode, as we're losing the reference to it */ 330 VN_RELE(*vpp); 331 332 /* Give the caller the root vnode of the newly-mounted fs */ 333 *vpp = newvp; 334 335 /* return with VN_HELD(newvp) */ 336 return (VOP_OPEN(vpp, flag, cr, ct)); 337 } 338 339 /* 340 * For the majority of cases, nfs4_trigger_getattr() will not trigger 341 * a mount. However, if ATTR_TRIGGER is set, we are being informed 342 * that we need to force the mount before we attempt to determine 343 * the attributes. The intent is an atomic operation for security 344 * testing. 345 */ 346 static int 347 nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 348 caller_context_t *ct) 349 { 350 int error; 351 352 if (flags & ATTR_TRIGGER) { 353 vnode_t *newvp; 354 355 error = nfs4_trigger_mount(vp, &newvp); 356 if (error) 357 return (error); 358 359 error = VOP_GETATTR(newvp, vap, flags, cr, ct); 360 VN_RELE(newvp); 361 } else { 362 error = nfs4_getattr(vp, vap, flags, cr, ct); 363 } 364 365 return (error); 366 } 367 368 static int 369 nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 370 caller_context_t *ct) 371 { 372 int error; 373 vnode_t *newvp; 374 375 error = nfs4_trigger_mount(vp, &newvp); 376 if (error) 377 return (error); 378 379 error = VOP_SETATTR(newvp, vap, flags, cr, ct); 380 VN_RELE(newvp); 381 382 return (error); 383 } 384 385 static int 386 nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr, 387 caller_context_t *ct) 388 { 389 int error; 390 vnode_t *newvp; 391 392 error = nfs4_trigger_mount(vp, &newvp); 393 if (error) 394 return (error); 395 396 error = VOP_ACCESS(newvp, mode, flags, cr, ct); 397 VN_RELE(newvp); 398 399 return (error); 400 } 401 402 static int 403 nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, 404 struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr, 405 caller_context_t *ct, int *deflags, pathname_t *rpnp) 406 { 407 int error; 408 vnode_t *newdvp; 409 rnode4_t *drp = VTOR4(dvp); 410 411 ASSERT(RP_ISSTUB(drp)); 412 413 /* for now, we only support mirror-mounts */ 414 ASSERT(RP_ISSTUB_MIRRORMOUNT(drp)); 415 416 /* 417 * It's not legal to lookup ".." for an fs root, so we mustn't pass 418 * that up. Instead, pass onto the regular op, regardless of whether 419 * we've triggered a mount. 420 */ 421 if (strcmp(nm, "..") == 0) 422 return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 423 ct, deflags, rpnp)); 424 425 error = nfs4_trigger_mount(dvp, &newdvp); 426 if (error) 427 return (error); 428 429 error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct, 430 deflags, rpnp); 431 VN_RELE(newdvp); 432 433 return (error); 434 } 435 436 static int 437 nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va, 438 enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr, 439 int flags, caller_context_t *ct, vsecattr_t *vsecp) 440 { 441 int error; 442 vnode_t *newdvp; 443 444 error = nfs4_trigger_mount(dvp, &newdvp); 445 if (error) 446 return (error); 447 448 error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr, 449 flags, ct, vsecp); 450 VN_RELE(newdvp); 451 452 return (error); 453 } 454 455 static int 456 nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, 457 int flags) 458 { 459 int error; 460 vnode_t *newdvp; 461 462 error = nfs4_trigger_mount(dvp, &newdvp); 463 if (error) 464 return (error); 465 466 error = VOP_REMOVE(newdvp, nm, cr, ct, flags); 467 VN_RELE(newdvp); 468 469 return (error); 470 } 471 472 static int 473 nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 474 caller_context_t *ct, int flags) 475 { 476 int error; 477 vnode_t *newtdvp; 478 479 error = nfs4_trigger_mount(tdvp, &newtdvp); 480 if (error) 481 return (error); 482 483 /* 484 * We don't check whether svp is a stub. Let the NFSv4 code 485 * detect that error, and return accordingly. 486 */ 487 error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags); 488 VN_RELE(newtdvp); 489 490 return (error); 491 } 492 493 static int 494 nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, 495 cred_t *cr, caller_context_t *ct, int flags) 496 { 497 int error; 498 vnode_t *newsdvp; 499 rnode4_t *tdrp = VTOR4(tdvp); 500 501 /* 502 * We know that sdvp is a stub, otherwise we would not be here. 503 * 504 * If tdvp is also be a stub, there are two possibilities: it 505 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)] 506 * or it is a different stub [!VN_CMP(sdvp, tdvp)]. 507 * 508 * In the former case, just trigger sdvp, and treat tdvp as 509 * though it were not a stub. 510 * 511 * In the latter case, it might be a different stub for the 512 * same server fs as sdvp, or for a different server fs. 513 * Regardless, from the client perspective this would still 514 * be a cross-filesystem rename, and should not be allowed, 515 * so return EXDEV, without triggering either mount. 516 */ 517 if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp)) 518 return (EXDEV); 519 520 error = nfs4_trigger_mount(sdvp, &newsdvp); 521 if (error) 522 return (error); 523 524 error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags); 525 526 VN_RELE(newsdvp); 527 528 return (error); 529 } 530 531 /* ARGSUSED */ 532 static int 533 nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, 534 cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) 535 { 536 int error; 537 vnode_t *newdvp; 538 539 error = nfs4_trigger_mount(dvp, &newdvp); 540 if (error) 541 return (error); 542 543 error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp); 544 VN_RELE(newdvp); 545 546 return (error); 547 } 548 549 static int 550 nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, 551 caller_context_t *ct, int flags) 552 { 553 int error; 554 vnode_t *newdvp; 555 556 error = nfs4_trigger_mount(dvp, &newdvp); 557 if (error) 558 return (error); 559 560 error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags); 561 VN_RELE(newdvp); 562 563 return (error); 564 } 565 566 static int 567 nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, 568 cred_t *cr, caller_context_t *ct, int flags) 569 { 570 int error; 571 vnode_t *newdvp; 572 573 error = nfs4_trigger_mount(dvp, &newdvp); 574 if (error) 575 return (error); 576 577 error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags); 578 VN_RELE(newdvp); 579 580 return (error); 581 } 582 583 static int 584 nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, 585 caller_context_t *ct) 586 { 587 int error; 588 vnode_t *newvp; 589 590 error = nfs4_trigger_mount(vp, &newvp); 591 if (error) 592 return (error); 593 594 error = VOP_READLINK(newvp, uiop, cr, ct); 595 VN_RELE(newvp); 596 597 return (error); 598 } 599 600 /* end of trigger vnode ops */ 601 602 603 /* 604 * Mount upon a trigger vnode; for mirror-mounts, etc. 605 * 606 * The mount may have already occurred, via another thread. If not, 607 * assemble the location information - which may require fetching - and 608 * perform the mount. 609 * 610 * Sets newvp to be the root of the fs that is now covering vp. Note 611 * that we return with VN_HELD(*newvp). 612 * 613 * The caller is responsible for passing the VOP onto the covering fs. 614 */ 615 static int 616 nfs4_trigger_mount(vnode_t *vp, vnode_t **newvpp) 617 { 618 int error; 619 vfs_t *vfsp; 620 rnode4_t *rp = VTOR4(vp); 621 mntinfo4_t *mi = VTOMI4(vp); 622 domount_args_t *dma; 623 624 nfs4_ephemeral_tree_t *net; 625 626 bool_t must_unlock = FALSE; 627 bool_t is_building = FALSE; 628 629 cred_t *zcred; 630 631 nfs4_trigger_globals_t *ntg; 632 633 zone_t *zone = curproc->p_zone; 634 635 ASSERT(RP_ISSTUB(rp)); 636 637 /* for now, we only support mirror-mounts */ 638 ASSERT(RP_ISSTUB_MIRRORMOUNT(rp)); 639 640 *newvpp = NULL; 641 642 /* 643 * Has the mount already occurred? 644 */ 645 error = vn_vfsrlock_wait(vp); 646 if (error) 647 goto done; 648 vfsp = vn_mountedvfs(vp); 649 if (vfsp != NULL) { 650 /* the mount has already occurred */ 651 error = VFS_ROOT(vfsp, newvpp); 652 if (!error) { 653 /* need to update the reference time */ 654 mutex_enter(&mi->mi_lock); 655 if (mi->mi_ephemeral) 656 mi->mi_ephemeral->ne_ref_time = 657 gethrestime_sec(); 658 mutex_exit(&mi->mi_lock); 659 } 660 661 vn_vfsunlock(vp); 662 goto done; 663 } 664 vn_vfsunlock(vp); 665 666 ntg = zone_getspecific(nfs4_ephemeral_key, zone); 667 ASSERT(ntg != NULL); 668 669 mutex_enter(&mi->mi_lock); 670 671 /* 672 * We need to lock down the ephemeral tree. 673 */ 674 if (mi->mi_ephemeral_tree == NULL) { 675 net = kmem_zalloc(sizeof (*net), KM_SLEEP); 676 mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL); 677 mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL); 678 net->net_refcnt = 1; 679 net->net_status = NFS4_EPHEMERAL_TREE_BUILDING; 680 is_building = TRUE; 681 682 /* 683 * We need to add it to the zone specific list for 684 * automatic unmounting and harvesting of deadwood. 685 */ 686 mutex_enter(&ntg->ntg_forest_lock); 687 if (ntg->ntg_forest != NULL) 688 net->net_next = ntg->ntg_forest; 689 ntg->ntg_forest = net; 690 mutex_exit(&ntg->ntg_forest_lock); 691 692 /* 693 * No lock order confusion with mi_lock because no 694 * other node could have grabbed net_tree_lock. 695 */ 696 mutex_enter(&net->net_tree_lock); 697 mi->mi_ephemeral_tree = net; 698 net->net_mount = mi; 699 mutex_exit(&mi->mi_lock); 700 } else { 701 net = mi->mi_ephemeral_tree; 702 mutex_exit(&mi->mi_lock); 703 704 mutex_enter(&net->net_cnt_lock); 705 net->net_refcnt++; 706 mutex_exit(&net->net_cnt_lock); 707 708 /* 709 * Note that we do not do any checks to 710 * see if the parent has been nuked. 711 * We count on the vfs layer having protected 712 * us from feet shooters. 713 */ 714 mutex_enter(&net->net_tree_lock); 715 } 716 717 mutex_enter(&net->net_cnt_lock); 718 net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING; 719 mutex_exit(&net->net_cnt_lock); 720 721 must_unlock = TRUE; 722 723 dma = nfs4_trigger_domount_args_create(vp); 724 if (dma == NULL) { 725 error = EINVAL; 726 goto done; 727 } 728 729 /* 730 * Need to be root for this call to make mount work. 731 * Note that since we define mirror mounts to work 732 * for any user, we allow the mount to proceed. And 733 * we realize that the server will perform security 734 * checks to make sure that the client is allowed 735 * access. Finally, once the mount takes place, 736 * directory permissions will ensure that the 737 * content is secure. 738 */ 739 zcred = zone_get_kcred(getzoneid()); 740 ASSERT(zcred != NULL); 741 742 error = nfs4_trigger_domount(vp, dma, &vfsp, zcred); 743 nfs4_trigger_domount_args_destroy(dma, vp); 744 745 crfree(zcred); 746 747 if (!error) 748 error = VFS_ROOT(vfsp, newvpp); 749 done: 750 if (must_unlock) { 751 mutex_enter(&net->net_cnt_lock); 752 net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING; 753 if (is_building) 754 net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING; 755 net->net_refcnt--; 756 mutex_exit(&net->net_cnt_lock); 757 758 mutex_exit(&net->net_tree_lock); 759 } 760 761 if (!error && (newvpp == NULL || *newvpp == NULL)) 762 error = ENOSYS; 763 764 return (error); 765 } 766 767 /* 768 * Collect together both the generic & mount-type specific args. 769 */ 770 static domount_args_t * 771 nfs4_trigger_domount_args_create(vnode_t *vp) 772 { 773 int nointr; 774 char *hostlist; 775 servinfo4_t *svp; 776 struct nfs_args *nargs, *nargs_head; 777 enum clnt_stat status; 778 ephemeral_servinfo_t *esi, *esi_first; 779 domount_args_t *dma; 780 mntinfo4_t *mi = VTOMI4(vp); 781 782 nointr = !(mi->mi_flags & MI4_INT); 783 hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 784 785 svp = mi->mi_curr_serv; 786 /* check if the current server is responding */ 787 status = nfs4_trigger_ping_server(svp, nointr); 788 if (status == RPC_SUCCESS) { 789 esi_first = nfs4_trigger_esi_create(vp, svp); 790 if (esi_first == NULL) { 791 kmem_free(hostlist, MAXPATHLEN); 792 return (NULL); 793 } 794 795 (void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN); 796 797 nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first); 798 } else { 799 /* current server did not respond */ 800 esi_first = NULL; 801 nargs_head = NULL; 802 } 803 nargs = nargs_head; 804 805 /* 806 * NFS RO failover. 807 * 808 * If we have multiple servinfo4 structures, linked via sv_next, 809 * we must create one nfs_args for each, linking the nfs_args via 810 * nfs_ext_u.nfs_extB.next. 811 * 812 * We need to build a corresponding esi for each, too, but that is 813 * used solely for building nfs_args, and may be immediately 814 * discarded, as domount() requires the info from just one esi, 815 * but all the nfs_args. 816 * 817 * Currently, the NFS mount code will hang if not all servers 818 * requested are available. To avoid that, we need to ping each 819 * server, here, and remove it from the list if it is not 820 * responding. This has the side-effect of that server then 821 * being permanently unavailable for this failover mount, even if 822 * it recovers. That's unfortunate, but the best we can do until 823 * the mount code path is fixed. 824 */ 825 826 /* 827 * If the current server was down, loop indefinitely until we find 828 * at least one responsive server. 829 */ 830 do { 831 /* no locking needed for sv_next; it is only set at fs mount */ 832 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 833 struct nfs_args *next; 834 835 /* 836 * nargs_head: the head of the nfs_args list 837 * nargs: the current tail of the list 838 * next: the newly-created element to be added 839 */ 840 841 /* 842 * We've already tried the current server, above; 843 * if it was responding, we have already included it 844 * and it may now be ignored. 845 * 846 * Otherwise, try it again, since it may now have 847 * recovered. 848 */ 849 if (svp == mi->mi_curr_serv && esi_first != NULL) 850 continue; 851 852 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 853 if (svp->sv_flags & SV4_NOTINUSE) { 854 nfs_rw_exit(&svp->sv_lock); 855 continue; 856 } 857 nfs_rw_exit(&svp->sv_lock); 858 859 /* check if the server is responding */ 860 status = nfs4_trigger_ping_server(svp, nointr); 861 /* if the server did not respond, ignore it */ 862 if (status != RPC_SUCCESS) 863 continue; 864 865 esi = nfs4_trigger_esi_create(vp, svp); 866 if (esi == NULL) 867 continue; 868 869 /* 870 * If the original current server (mi_curr_serv) 871 * was down when when we first tried it, 872 * (i.e. esi_first == NULL), 873 * we select this new server (svp) to be the server 874 * that we will actually contact (esi_first). 875 * 876 * Note that it's possible that mi_curr_serv == svp, 877 * if that mi_curr_serv was down but has now recovered. 878 */ 879 next = nfs4_trigger_nargs_create(mi, svp, esi); 880 if (esi_first == NULL) { 881 ASSERT(nargs == NULL); 882 ASSERT(nargs_head == NULL); 883 nargs_head = next; 884 esi_first = esi; 885 (void) strlcpy(hostlist, 886 esi_first->esi_hostname, MAXPATHLEN); 887 } else { 888 ASSERT(nargs_head != NULL); 889 nargs->nfs_ext_u.nfs_extB.next = next; 890 (void) strlcat(hostlist, ",", MAXPATHLEN); 891 (void) strlcat(hostlist, esi->esi_hostname, 892 MAXPATHLEN); 893 /* esi was only needed for hostname & nargs */ 894 nfs4_trigger_esi_destroy(esi, vp); 895 } 896 897 nargs = next; 898 } 899 900 /* if we've had no response at all, wait a second */ 901 if (esi_first == NULL) 902 delay(drv_usectohz(1000000)); 903 904 } while (esi_first == NULL); 905 ASSERT(nargs_head != NULL); 906 907 dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP); 908 dma->dma_esi = esi_first; 909 dma->dma_hostlist = hostlist; 910 dma->dma_nargs = nargs_head; 911 912 return (dma); 913 } 914 915 static void 916 nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp) 917 { 918 if (dma != NULL) { 919 if (dma->dma_esi != NULL && vp != NULL) 920 nfs4_trigger_esi_destroy(dma->dma_esi, vp); 921 922 if (dma->dma_hostlist != NULL) 923 kmem_free(dma->dma_hostlist, MAXPATHLEN); 924 925 if (dma->dma_nargs != NULL) { 926 struct nfs_args *nargs = dma->dma_nargs; 927 928 do { 929 struct nfs_args *next = 930 nargs->nfs_ext_u.nfs_extB.next; 931 932 nfs4_trigger_nargs_destroy(nargs); 933 nargs = next; 934 } while (nargs != NULL); 935 } 936 937 kmem_free(dma, sizeof (domount_args_t)); 938 } 939 } 940 941 /* 942 * The ephemeral_servinfo_t struct contains basic information we will need to 943 * perform the mount. Whilst the structure is generic across different 944 * types of ephemeral mount, the way we gather its contents differs. 945 */ 946 static ephemeral_servinfo_t * 947 nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp) 948 { 949 ephemeral_servinfo_t *esi; 950 rnode4_t *rp = VTOR4(vp); 951 952 ASSERT(RP_ISSTUB(rp)); 953 954 /* Call the ephemeral type-specific routine */ 955 if (RP_ISSTUB_MIRRORMOUNT(rp)) 956 esi = nfs4_trigger_esi_create_mirrormount(vp, svp); 957 else 958 esi = NULL; 959 960 /* for now, we only support mirror-mounts */ 961 ASSERT(esi != NULL); 962 963 return (esi); 964 } 965 966 static void 967 nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp) 968 { 969 rnode4_t *rp = VTOR4(vp); 970 971 ASSERT(RP_ISSTUB(rp)); 972 973 /* for now, we only support mirror-mounts */ 974 ASSERT(RP_ISSTUB_MIRRORMOUNT(rp)); 975 976 /* Currently, no need for an ephemeral type-specific routine */ 977 978 /* 979 * The contents of ephemeral_servinfo_t goes into nfs_args, 980 * and will be handled by nfs4_trigger_nargs_destroy(). 981 * We need only free the structure itself. 982 */ 983 if (esi != NULL) 984 kmem_free(esi, sizeof (ephemeral_servinfo_t)); 985 } 986 987 /* 988 * Some of this may turn out to be common with other ephemeral types, 989 * in which case it should be moved to nfs4_trigger_esi_create(), or a 990 * common function called. 991 */ 992 static ephemeral_servinfo_t * 993 nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp) 994 { 995 char *stubpath; 996 struct knetconfig *sikncp, *svkncp; 997 struct netbuf *bufp; 998 ephemeral_servinfo_t *esi; 999 1000 esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP); 1001 1002 /* initially set to be our type of ephemeral mount; may be added to */ 1003 esi->esi_mount_flags = NFSMNT_MIRRORMOUNT; 1004 1005 /* 1006 * We're copying info from the stub rnode's servinfo4, but 1007 * we must create new copies, not pointers, since this information 1008 * is to be associated with the new mount, which will be 1009 * unmounted (and its structures freed) separately 1010 */ 1011 1012 /* 1013 * Sizes passed to kmem_[z]alloc here must match those freed 1014 * in nfs4_free_args() 1015 */ 1016 1017 /* 1018 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this 1019 * is difficult to avoid: as we need to read svp to calculate the 1020 * sizes to be allocated. 1021 */ 1022 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1023 1024 esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP); 1025 (void) strcat(esi->esi_hostname, svp->sv_hostname); 1026 1027 esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP); 1028 bufp = esi->esi_addr; 1029 bufp->len = svp->sv_addr.len; 1030 bufp->maxlen = svp->sv_addr.maxlen; 1031 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP); 1032 bcopy(svp->sv_addr.buf, bufp->buf, bufp->len); 1033 1034 esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP); 1035 sikncp = esi->esi_knconf; 1036 svkncp = svp->sv_knconf; 1037 sikncp->knc_semantics = svkncp->knc_semantics; 1038 sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP); 1039 (void) strcat((char *)sikncp->knc_protofmly, 1040 (char *)svkncp->knc_protofmly); 1041 sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP); 1042 (void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto); 1043 sikncp->knc_rdev = svkncp->knc_rdev; 1044 1045 /* 1046 * Used when AUTH_DH is negotiated. 1047 * 1048 * This is ephemeral mount-type specific, since it contains the 1049 * server's time-sync syncaddr. 1050 */ 1051 if (svp->sv_dhsec) { 1052 struct netbuf *bufp; 1053 sec_data_t *sdata; 1054 dh_k4_clntdata_t *data; 1055 1056 sdata = svp->sv_dhsec; 1057 data = (dh_k4_clntdata_t *)sdata->data; 1058 ASSERT(sdata->rpcflavor == AUTH_DH); 1059 1060 bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP); 1061 bufp->len = data->syncaddr.len; 1062 bufp->maxlen = data->syncaddr.maxlen; 1063 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP); 1064 bcopy(data->syncaddr.buf, bufp->buf, bufp->len); 1065 esi->esi_syncaddr = bufp; 1066 1067 if (data->netname != NULL) { 1068 int nmlen = data->netnamelen; 1069 1070 /* 1071 * We need to copy from a dh_k4_clntdata_t 1072 * netname/netnamelen pair to a NUL-terminated 1073 * netname string suitable for putting in nfs_args, 1074 * where the latter has no netnamelen field. 1075 */ 1076 esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP); 1077 bcopy(data->netname, esi->esi_netname, nmlen); 1078 } 1079 } else { 1080 esi->esi_syncaddr = NULL; 1081 esi->esi_netname = NULL; 1082 } 1083 1084 stubpath = fn_path(VTOSV(vp)->sv_name); 1085 /* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */ 1086 ASSERT(*stubpath == '.'); 1087 stubpath += 1; 1088 1089 /* for nfs_args->fh */ 1090 esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1; 1091 esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP); 1092 (void) strcat(esi->esi_path, svp->sv_path); 1093 (void) strcat(esi->esi_path, stubpath); 1094 1095 stubpath -= 1; 1096 /* stubpath allocated by fn_path() */ 1097 kmem_free(stubpath, strlen(stubpath) + 1); 1098 1099 nfs_rw_exit(&svp->sv_lock); 1100 1101 return (esi); 1102 } 1103 1104 /* 1105 * Assemble the args, and call the generic VFS mount function to 1106 * finally perform the ephemeral mount. 1107 */ 1108 static int 1109 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp, 1110 cred_t *cr) 1111 { 1112 struct mounta *uap; 1113 char *mntpt, *orig_path, *path; 1114 const char *orig_mntpt; 1115 int retval; 1116 int mntpt_len; 1117 int spec_len; 1118 zone_t *zone = curproc->p_zone; 1119 bool_t has_leading_slash; 1120 1121 vfs_t *stubvfsp = stubvp->v_vfsp; 1122 ephemeral_servinfo_t *esi = dma->dma_esi; 1123 struct nfs_args *nargs = dma->dma_nargs; 1124 1125 /* first, construct the mount point for the ephemeral mount */ 1126 orig_path = path = fn_path(VTOSV(stubvp)->sv_name); 1127 orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt); 1128 1129 if (*orig_path == '.') 1130 orig_path++; 1131 1132 /* 1133 * Get rid of zone's root path 1134 */ 1135 if (zone != global_zone) { 1136 /* 1137 * -1 for trailing '/' and -1 for EOS. 1138 */ 1139 if (strncmp(zone->zone_rootpath, orig_mntpt, 1140 zone->zone_rootpathlen - 1) == 0) { 1141 orig_mntpt += (zone->zone_rootpathlen - 2); 1142 } 1143 } 1144 1145 mntpt_len = strlen(orig_mntpt) + strlen(orig_path); 1146 mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP); 1147 (void) strcat(mntpt, orig_mntpt); 1148 (void) strcat(mntpt, orig_path); 1149 1150 kmem_free(path, strlen(path) + 1); 1151 path = esi->esi_path; 1152 if (*path == '.') 1153 path++; 1154 if (path[0] == '/' && path[1] == '/') 1155 path++; 1156 has_leading_slash = (*path == '/'); 1157 1158 spec_len = strlen(dma->dma_hostlist); 1159 spec_len += strlen(path); 1160 1161 /* We are going to have to add this in */ 1162 if (!has_leading_slash) 1163 spec_len++; 1164 1165 /* We need to get the ':' for dma_hostlist:esi_path */ 1166 spec_len++; 1167 1168 uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP); 1169 uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP); 1170 (void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist, 1171 has_leading_slash ? "" : "/", path); 1172 1173 uap->dir = mntpt; 1174 1175 uap->flags = MS_SYSSPACE | MS_DATA; 1176 /* fstype-independent mount options not covered elsewhere */ 1177 /* copy parent's mount(1M) "-m" flag */ 1178 if (stubvfsp->vfs_flag & VFS_NOMNTTAB) 1179 uap->flags |= MS_NOMNTTAB; 1180 1181 uap->fstype = MNTTYPE_NFS4; 1182 uap->dataptr = (char *)nargs; 1183 /* not needed for MS_SYSSPACE */ 1184 uap->datalen = 0; 1185 1186 /* use optptr to pass in extra mount options */ 1187 uap->flags |= MS_OPTIONSTR; 1188 uap->optptr = nfs4_trigger_create_mntopts(stubvfsp); 1189 if (uap->optptr == NULL) { 1190 retval = EINVAL; 1191 goto done; 1192 } 1193 /* domount() expects us to count the trailing NUL */ 1194 uap->optlen = strlen(uap->optptr) + 1; 1195 1196 retval = domount(NULL, uap, stubvp, cr, vfsp); 1197 if (retval == 0) 1198 VFS_RELE(*vfsp); 1199 done: 1200 if (uap->optptr) 1201 nfs4_trigger_destroy_mntopts(uap->optptr); 1202 1203 kmem_free(uap->spec, spec_len + 1); 1204 kmem_free(uap, sizeof (struct mounta)); 1205 kmem_free(mntpt, mntpt_len + 1); 1206 1207 return (retval); 1208 } 1209 1210 /* 1211 * Build an nfs_args structure for passing to domount(). 1212 * 1213 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t; 1214 * generic data - common to all ephemeral mount types - is read directly 1215 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode. 1216 */ 1217 static struct nfs_args * 1218 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp, 1219 ephemeral_servinfo_t *esi) 1220 { 1221 sec_data_t *secdata; 1222 struct nfs_args *nargs; 1223 1224 /* setup the nfs args */ 1225 nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP); 1226 1227 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1228 1229 nargs->addr = esi->esi_addr; 1230 1231 /* for AUTH_DH by negotiation */ 1232 if (esi->esi_syncaddr || esi->esi_netname) { 1233 nargs->flags |= NFSMNT_SECURE; 1234 nargs->syncaddr = esi->esi_syncaddr; 1235 nargs->netname = esi->esi_netname; 1236 } 1237 1238 nargs->flags |= NFSMNT_KNCONF; 1239 nargs->knconf = esi->esi_knconf; 1240 nargs->flags |= NFSMNT_HOSTNAME; 1241 nargs->hostname = esi->esi_hostname; 1242 nargs->fh = esi->esi_path; 1243 1244 /* general mount settings, all copied from parent mount */ 1245 mutex_enter(&mi->mi_lock); 1246 1247 if (!(mi->mi_flags & MI4_HARD)) 1248 nargs->flags |= NFSMNT_SOFT; 1249 1250 nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO | 1251 NFSMNT_RETRANS; 1252 nargs->wsize = mi->mi_stsize; 1253 nargs->rsize = mi->mi_tsize; 1254 nargs->timeo = mi->mi_timeo; 1255 nargs->retrans = mi->mi_retrans; 1256 1257 if (mi->mi_flags & MI4_INT) 1258 nargs->flags |= NFSMNT_INT; 1259 if (mi->mi_flags & MI4_NOAC) 1260 nargs->flags |= NFSMNT_NOAC; 1261 1262 nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN | 1263 NFSMNT_ACDIRMAX; 1264 nargs->acregmin = HR2SEC(mi->mi_acregmin); 1265 nargs->acregmax = HR2SEC(mi->mi_acregmax); 1266 nargs->acdirmin = HR2SEC(mi->mi_acdirmin); 1267 nargs->acdirmax = HR2SEC(mi->mi_acdirmax); 1268 1269 if (mi->mi_flags & MI4_NOCTO) 1270 nargs->flags |= NFSMNT_NOCTO; 1271 if (mi->mi_flags & MI4_GRPID) 1272 nargs->flags |= NFSMNT_GRPID; 1273 if (mi->mi_flags & MI4_LLOCK) 1274 nargs->flags |= NFSMNT_LLOCK; 1275 if (mi->mi_flags & MI4_NOPRINT) 1276 nargs->flags |= NFSMNT_NOPRINT; 1277 if (mi->mi_flags & MI4_DIRECTIO) 1278 nargs->flags |= NFSMNT_DIRECTIO; 1279 if (mi->mi_flags & MI4_PUBLIC) 1280 nargs->flags |= NFSMNT_PUBLIC; 1281 1282 mutex_exit(&mi->mi_lock); 1283 1284 /* add any specific flags for this type of ephemeral mount */ 1285 nargs->flags |= esi->esi_mount_flags; 1286 1287 /* 1288 * Security data & negotiation policy. 1289 * 1290 * We need to preserve the parent mount's preference for security 1291 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT. 1292 * 1293 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific 1294 * security flavour was requested, with data in sv_secdata, and that 1295 * no negotiation should occur. If this specified flavour fails, that's 1296 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT. 1297 * 1298 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in 1299 * default flavour, in sv_secdata, but then negotiate a new flavour. 1300 * Possible flavours are recorded in an array in sv_secinfo, with 1301 * currently in-use flavour pointed to by sv_currsec. 1302 * 1303 * If sv_currsec is set, i.e. if negotiation has already occurred, 1304 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless, 1305 * we will set NFSMNT_SECDEFAULT, to enable negotiation. 1306 */ 1307 if (svp->sv_flags & SV4_TRYSECDEFAULT) { 1308 /* enable negotiation for ephemeral mount */ 1309 nargs->flags |= NFSMNT_SECDEFAULT; 1310 1311 /* 1312 * As a starting point for negotiation, copy parent 1313 * mount's negotiated flavour (sv_currsec) if available, 1314 * or its passed-in flavour (sv_secdata) if not. 1315 */ 1316 if (svp->sv_currsec != NULL) 1317 secdata = copy_sec_data(svp->sv_currsec); 1318 else if (svp->sv_secdata != NULL) 1319 secdata = copy_sec_data(svp->sv_secdata); 1320 else 1321 secdata = NULL; 1322 } else { 1323 /* do not enable negotiation; copy parent's passed-in flavour */ 1324 if (svp->sv_secdata != NULL) 1325 secdata = copy_sec_data(svp->sv_secdata); 1326 else 1327 secdata = NULL; 1328 } 1329 1330 nfs_rw_exit(&svp->sv_lock); 1331 1332 nargs->flags |= NFSMNT_NEWARGS; 1333 nargs->nfs_args_ext = NFS_ARGS_EXTB; 1334 nargs->nfs_ext_u.nfs_extB.secdata = secdata; 1335 1336 /* for NFS RO failover; caller will set if necessary */ 1337 nargs->nfs_ext_u.nfs_extB.next = NULL; 1338 1339 return (nargs); 1340 } 1341 1342 static void 1343 nfs4_trigger_nargs_destroy(struct nfs_args *nargs) 1344 { 1345 /* 1346 * Either the mount failed, in which case the data is not needed, or 1347 * nfs4_mount() has either taken copies of what it needs or, 1348 * where it has merely copied the ptr, it has set *our* ptr to NULL, 1349 * whereby nfs4_free_args() will ignore it. 1350 */ 1351 nfs4_free_args(nargs); 1352 kmem_free(nargs, sizeof (struct nfs_args)); 1353 } 1354 1355 /* 1356 * When we finally get into the mounting, we need to add this 1357 * node to the ephemeral tree. 1358 * 1359 * This is called from nfs4_mount(). 1360 */ 1361 void 1362 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp) 1363 { 1364 mntinfo4_t *mi_parent; 1365 nfs4_ephemeral_t *eph; 1366 nfs4_ephemeral_tree_t *net; 1367 1368 nfs4_ephemeral_t *prior; 1369 nfs4_ephemeral_t *child; 1370 1371 nfs4_ephemeral_t *peer; 1372 1373 nfs4_trigger_globals_t *ntg; 1374 zone_t *zone = curproc->p_zone; 1375 1376 mi_parent = VTOMI4(mvp); 1377 1378 /* 1379 * Get this before grabbing anything else! 1380 */ 1381 ntg = zone_getspecific(nfs4_ephemeral_key, zone); 1382 if (!ntg->ntg_thread_started) { 1383 nfs4_ephemeral_start_harvester(ntg); 1384 } 1385 1386 mutex_enter(&mi_parent->mi_lock); 1387 mutex_enter(&mi->mi_lock); 1388 1389 /* 1390 * We need to tack together the ephemeral mount 1391 * with this new mntinfo. 1392 */ 1393 eph = kmem_zalloc(sizeof (*eph), KM_SLEEP); 1394 eph->ne_mount = mi; 1395 eph->ne_ref_time = gethrestime_sec(); 1396 1397 /* 1398 * We need to tell the ephemeral mount when 1399 * to time out. 1400 */ 1401 eph->ne_mount_to = ntg->ntg_mount_to; 1402 1403 mi->mi_flags |= MI4_EPHEMERAL; 1404 mi->mi_ephemeral = eph; 1405 1406 net = mi->mi_ephemeral_tree = 1407 mi_parent->mi_ephemeral_tree; 1408 ASSERT(net != NULL); 1409 1410 /* 1411 * If the enclosing mntinfo4 is also ephemeral, 1412 * then we need to point to its enclosing parent. 1413 * Else the enclosing mntinfo4 is the enclosing parent. 1414 * 1415 * We also need to weave this ephemeral node 1416 * into the tree. 1417 */ 1418 if (mi_parent->mi_flags & MI4_EPHEMERAL) { 1419 /* 1420 * We need to decide if we are 1421 * the root node of this branch 1422 * or if we are a sibling of this 1423 * branch. 1424 */ 1425 prior = mi_parent->mi_ephemeral; 1426 ASSERT(prior != NULL); 1427 if (prior->ne_child == NULL) { 1428 prior->ne_child = eph; 1429 } else { 1430 child = prior->ne_child; 1431 1432 prior->ne_child = eph; 1433 eph->ne_peer = child; 1434 1435 child->ne_prior = eph; 1436 } 1437 1438 eph->ne_prior = prior; 1439 } else { 1440 /* 1441 * The parent mntinfo4 is the non-ephemeral 1442 * root of the ephemeral tree. We 1443 * need to decide if we are the root 1444 * node of that tree or if we are a 1445 * sibling of the root node. 1446 * 1447 * We are the root if there is no 1448 * other node. 1449 */ 1450 if (net->net_root == NULL) { 1451 net->net_root = eph; 1452 } else { 1453 eph->ne_peer = peer = net->net_root; 1454 ASSERT(peer != NULL); 1455 net->net_root = eph; 1456 1457 peer->ne_prior = eph; 1458 } 1459 1460 eph->ne_prior = NULL; 1461 } 1462 1463 mutex_exit(&mi->mi_lock); 1464 mutex_exit(&mi_parent->mi_lock); 1465 } 1466 1467 /* 1468 * Commit the changes to the ephemeral tree for removing this node. 1469 */ 1470 static void 1471 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph) 1472 { 1473 nfs4_ephemeral_t *e = eph; 1474 nfs4_ephemeral_t *peer; 1475 nfs4_ephemeral_t *prior; 1476 1477 peer = eph->ne_peer; 1478 prior = e->ne_prior; 1479 1480 /* 1481 * If this branch root was not the 1482 * tree root, then we need to fix back pointers. 1483 */ 1484 if (prior) { 1485 if (prior->ne_child == e) { 1486 prior->ne_child = peer; 1487 } else { 1488 prior->ne_peer = peer; 1489 } 1490 1491 if (peer) 1492 peer->ne_prior = prior; 1493 } else if (peer) { 1494 peer->ne_mount->mi_ephemeral_tree->net_root = peer; 1495 peer->ne_prior = NULL; 1496 } else { 1497 e->ne_mount->mi_ephemeral_tree->net_root = NULL; 1498 } 1499 } 1500 1501 /* 1502 * We want to avoid recursion at all costs. So we need to 1503 * unroll the tree. We do this by a depth first traversal to 1504 * leaf nodes. We blast away the leaf and work our way back 1505 * up and down the tree. 1506 */ 1507 static int 1508 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph, 1509 int isTreeRoot, int flag, cred_t *cr) 1510 { 1511 nfs4_ephemeral_t *e = eph; 1512 nfs4_ephemeral_t *prior; 1513 mntinfo4_t *mi; 1514 vfs_t *vfsp; 1515 int error; 1516 1517 /* 1518 * We use the loop while unrolling the ephemeral tree. 1519 */ 1520 for (;;) { 1521 /* 1522 * First we walk down the child. 1523 */ 1524 if (e->ne_child) { 1525 prior = e; 1526 e = e->ne_child; 1527 continue; 1528 } 1529 1530 /* 1531 * If we are the root of the branch we are removing, 1532 * we end it here. But if the branch is the root of 1533 * the tree, we have to forge on. We do not consider 1534 * the peer list for the root because while it may 1535 * be okay to remove, it is both extra work and a 1536 * potential for a false-positive error to stall the 1537 * unmount attempt. 1538 */ 1539 if (e == eph && isTreeRoot == FALSE) 1540 return (0); 1541 1542 /* 1543 * Next we walk down the peer list. 1544 */ 1545 if (e->ne_peer) { 1546 prior = e; 1547 e = e->ne_peer; 1548 continue; 1549 } 1550 1551 /* 1552 * We can only remove the node passed in by the 1553 * caller if it is the root of the ephemeral tree. 1554 * Otherwise, the caller will remove it. 1555 */ 1556 if (e == eph && isTreeRoot == FALSE) 1557 return (0); 1558 1559 /* 1560 * Okay, we have a leaf node, time 1561 * to prune it! 1562 * 1563 * Note that prior can only be NULL if 1564 * and only if it is the root of the 1565 * ephemeral tree. 1566 */ 1567 prior = e->ne_prior; 1568 1569 mi = e->ne_mount; 1570 mutex_enter(&mi->mi_lock); 1571 vfsp = mi->mi_vfsp; 1572 1573 /* 1574 * Cleared by umount2_engine. 1575 */ 1576 VFS_HOLD(vfsp); 1577 1578 /* 1579 * Inform nfs4_unmount to not recursively 1580 * descend into this node's children when it 1581 * gets processed. 1582 */ 1583 mi->mi_flags |= MI4_EPHEMERAL_RECURSED; 1584 mutex_exit(&mi->mi_lock); 1585 1586 error = umount2_engine(vfsp, flag, cr, FALSE); 1587 if (error) { 1588 /* 1589 * We need to reenable nfs4_unmount's ability 1590 * to recursively descend on this node. 1591 */ 1592 mutex_enter(&mi->mi_lock); 1593 mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED; 1594 mutex_exit(&mi->mi_lock); 1595 1596 return (error); 1597 } 1598 1599 /* 1600 * If we are the current node, we do not want to 1601 * touch anything else. At this point, the only 1602 * way the current node can have survived to here 1603 * is if it is the root of the ephemeral tree and 1604 * we are unmounting the enclosing mntinfo4. 1605 */ 1606 if (e == eph) { 1607 ASSERT(prior == NULL); 1608 return (0); 1609 } 1610 1611 /* 1612 * Stitch up the prior node. Note that since 1613 * we have handled the root of the tree, prior 1614 * must be non-NULL. 1615 */ 1616 ASSERT(prior != NULL); 1617 if (prior->ne_child == e) { 1618 prior->ne_child = NULL; 1619 } else { 1620 ASSERT(prior->ne_peer == e); 1621 1622 prior->ne_peer = NULL; 1623 } 1624 1625 e = prior; 1626 } 1627 1628 /* NOTREACHED */ 1629 } 1630 1631 /* 1632 * Common code to safely release net_cnt_lock and net_tree_lock 1633 */ 1634 void 1635 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock, 1636 nfs4_ephemeral_tree_t **pnet) 1637 { 1638 nfs4_ephemeral_tree_t *net = *pnet; 1639 1640 if (*pmust_unlock) { 1641 mutex_enter(&net->net_cnt_lock); 1642 net->net_refcnt--; 1643 net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING; 1644 mutex_exit(&net->net_cnt_lock); 1645 1646 mutex_exit(&net->net_tree_lock); 1647 1648 *pmust_unlock = FALSE; 1649 } 1650 } 1651 1652 /* 1653 * While we may have removed any child or sibling nodes of this 1654 * ephemeral node, we can not nuke it until we know that there 1655 * were no actived vnodes on it. This will do that final 1656 * work once we know it is not busy. 1657 */ 1658 void 1659 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock, 1660 nfs4_ephemeral_tree_t **pnet) 1661 { 1662 /* 1663 * Now we need to get rid of the ephemeral data if it exists. 1664 */ 1665 mutex_enter(&mi->mi_lock); 1666 if (mi->mi_ephemeral) { 1667 /* 1668 * If we are the root node of an ephemeral branch 1669 * which is being removed, then we need to fixup 1670 * pointers into and out of the node. 1671 */ 1672 if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED)) 1673 nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral); 1674 1675 ASSERT(mi->mi_ephemeral != NULL); 1676 1677 kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral)); 1678 mi->mi_ephemeral = NULL; 1679 } 1680 mutex_exit(&mi->mi_lock); 1681 1682 nfs4_ephemeral_umount_unlock(pmust_unlock, pnet); 1683 } 1684 1685 /* 1686 * Unmount an ephemeral node. 1687 */ 1688 int 1689 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr, 1690 bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet) 1691 { 1692 int error = 0; 1693 nfs4_ephemeral_t *eph; 1694 nfs4_ephemeral_tree_t *net; 1695 int is_derooting = FALSE; 1696 int is_recursed = FALSE; 1697 1698 /* 1699 * The active vnodes on this file system may be ephemeral 1700 * children. We need to check for and try to unmount them 1701 * here. If any can not be unmounted, we are going 1702 * to return EBUSY. 1703 */ 1704 mutex_enter(&mi->mi_lock); 1705 1706 /* 1707 * If an ephemeral tree, we need to check to see if 1708 * the lock is already held. If it is, then we need 1709 * to see if we are being called as a result of 1710 * the recursive removal of some node of the tree or 1711 * if we are another attempt to remove the tree. 1712 * 1713 * mi_flags & MI4_EPHEMERAL indicates an ephemeral 1714 * node. mi_ephemeral being non-NULL also does this. 1715 * 1716 * mi_ephemeral_tree being non-NULL is sufficient 1717 * to also indicate either it is an ephemeral node 1718 * or the enclosing mntinfo4. 1719 * 1720 * Do we need MI4_EPHEMERAL? Yes, it is useful for 1721 * when we delete the ephemeral node and need to 1722 * differentiate from an ephemeral node and the 1723 * enclosing root node. 1724 */ 1725 *pnet = net = mi->mi_ephemeral_tree; 1726 if (net == NULL) { 1727 mutex_exit(&mi->mi_lock); 1728 return (0); 1729 } 1730 1731 eph = mi->mi_ephemeral; 1732 is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED; 1733 is_derooting = (eph == NULL); 1734 1735 /* 1736 * If this is not recursion, then we need to 1737 * grab a ref count. 1738 * 1739 * But wait, we also do not want to do that 1740 * if a harvester thread has already grabbed 1741 * the lock. 1742 */ 1743 if (!is_recursed) { 1744 mutex_enter(&net->net_cnt_lock); 1745 if (net->net_status & 1746 NFS4_EPHEMERAL_TREE_LOCKED) { 1747 mutex_exit(&net->net_cnt_lock); 1748 mutex_exit(&mi->mi_lock); 1749 1750 /* 1751 * Someone is already working on 1752 * it. We need to back off and 1753 * let them proceed. 1754 * 1755 * We return EBUSY so that the 1756 * caller knows something is 1757 * going on. Note that by that 1758 * time, the umount in the other 1759 * thread may have already occured. 1760 */ 1761 return (EBUSY); 1762 } else 1763 net->net_refcnt++; 1764 mutex_exit(&net->net_cnt_lock); 1765 } 1766 mutex_exit(&mi->mi_lock); 1767 1768 /* 1769 * If we grab the lock, it means that no other 1770 * operation is working on the tree. If we don't 1771 * grab it, we need to decide if this is because 1772 * we are a recursive call or a new operation. 1773 * 1774 * If we are a recursive call, we proceed without 1775 * the lock. 1776 * 1777 * Else we have to wait until the lock becomes free. 1778 */ 1779 if (!mutex_tryenter(&net->net_tree_lock)) { 1780 if (!is_recursed) { 1781 mutex_enter(&net->net_cnt_lock); 1782 if (net->net_status & 1783 (NFS4_EPHEMERAL_TREE_DEROOTING 1784 | NFS4_EPHEMERAL_TREE_INVALID)) { 1785 net->net_refcnt--; 1786 mutex_exit(&net->net_cnt_lock); 1787 goto is_busy; 1788 } 1789 mutex_exit(&net->net_cnt_lock); 1790 1791 /* 1792 * We can't hold any other locks whilst 1793 * we wait on this to free up. 1794 */ 1795 mutex_enter(&net->net_tree_lock); 1796 1797 /* 1798 * Note that while mi->mi_ephemeral 1799 * may change and thus we have to 1800 * update eph, it is the case that 1801 * we have tied down net and 1802 * do not care if mi->mi_ephemeral_tree 1803 * has changed. 1804 */ 1805 mutex_enter(&mi->mi_lock); 1806 eph = mi->mi_ephemeral; 1807 mutex_exit(&mi->mi_lock); 1808 1809 /* 1810 * Okay, we need to see if either the 1811 * tree got nuked or the current node 1812 * got nuked. Both of which will cause 1813 * an error. 1814 * 1815 * Note that a subsequent retry of the 1816 * umount shall work. 1817 */ 1818 mutex_enter(&net->net_cnt_lock); 1819 if (net->net_status & 1820 NFS4_EPHEMERAL_TREE_INVALID || 1821 (!is_derooting && eph == NULL)) { 1822 net->net_refcnt--; 1823 mutex_exit(&net->net_cnt_lock); 1824 mutex_exit(&net->net_tree_lock); 1825 goto is_busy; 1826 } 1827 mutex_exit(&net->net_cnt_lock); 1828 *pmust_unlock = TRUE; 1829 } 1830 } else { 1831 /* 1832 * If we grab it right away, everything must 1833 * be great! 1834 */ 1835 *pmust_unlock = TRUE; 1836 } 1837 1838 /* 1839 * Only once we have grabbed the lock can we mark what we 1840 * are planning on doing to the ephemeral tree. 1841 */ 1842 if (*pmust_unlock) { 1843 mutex_enter(&net->net_cnt_lock); 1844 net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING; 1845 1846 /* 1847 * Check to see if we are nuking the root. 1848 */ 1849 if (is_derooting) 1850 net->net_status |= 1851 NFS4_EPHEMERAL_TREE_DEROOTING; 1852 mutex_exit(&net->net_cnt_lock); 1853 } 1854 1855 if (!is_derooting) { 1856 /* 1857 * Only work on children if the caller has not already 1858 * done so. 1859 */ 1860 if (!is_recursed) { 1861 ASSERT(eph != NULL); 1862 1863 error = nfs4_ephemeral_unmount_engine(eph, 1864 FALSE, flag, cr); 1865 if (error) 1866 goto is_busy; 1867 } 1868 } else { 1869 eph = net->net_root; 1870 1871 /* 1872 * Only work if there is something there. 1873 */ 1874 if (eph) { 1875 error = nfs4_ephemeral_unmount_engine(eph, TRUE, 1876 flag, cr); 1877 if (error) { 1878 mutex_enter(&net->net_cnt_lock); 1879 net->net_status &= 1880 ~NFS4_EPHEMERAL_TREE_DEROOTING; 1881 mutex_exit(&net->net_cnt_lock); 1882 goto is_busy; 1883 } 1884 1885 /* 1886 * Nothing else which goes wrong will 1887 * invalidate the blowing away of the 1888 * ephmeral tree. 1889 */ 1890 net->net_root = NULL; 1891 } 1892 1893 /* 1894 * We have derooted and we have caused the tree to be 1895 * invalid. 1896 */ 1897 mutex_enter(&net->net_cnt_lock); 1898 net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING; 1899 net->net_status |= NFS4_EPHEMERAL_TREE_INVALID; 1900 net->net_refcnt--; 1901 mutex_exit(&net->net_cnt_lock); 1902 1903 /* 1904 * At this point, the tree should no 1905 * longer be associated with the 1906 * mntinfo4. We need to pull it off 1907 * there and let the harvester take 1908 * care of it once the refcnt drops. 1909 */ 1910 mutex_enter(&mi->mi_lock); 1911 mi->mi_ephemeral_tree = NULL; 1912 mutex_exit(&mi->mi_lock); 1913 } 1914 1915 return (0); 1916 1917 is_busy: 1918 1919 nfs4_ephemeral_umount_unlock(pmust_unlock, pnet); 1920 1921 return (error); 1922 } 1923 1924 /* 1925 * Do the umount and record any error in the parent. 1926 */ 1927 static void 1928 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag, 1929 nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior) 1930 { 1931 int error; 1932 1933 error = umount2_engine(vfsp, flag, kcred, FALSE); 1934 if (error) { 1935 if (prior) { 1936 if (prior->ne_child == e) 1937 prior->ne_state |= 1938 NFS4_EPHEMERAL_CHILD_ERROR; 1939 else 1940 prior->ne_state |= 1941 NFS4_EPHEMERAL_PEER_ERROR; 1942 } 1943 } 1944 } 1945 1946 /* 1947 * For each tree in the forest (where the forest is in 1948 * effect all of the ephemeral trees for this zone), 1949 * scan to see if a node can be unmounted. Note that 1950 * unlike nfs4_ephemeral_unmount_engine(), we do 1951 * not process the current node before children or 1952 * siblings. I.e., if a node can be unmounted, we 1953 * do not recursively check to see if the nodes 1954 * hanging off of it can also be unmounted. 1955 * 1956 * Instead, we delve down deep to try and remove the 1957 * children first. Then, because we share code with 1958 * nfs4_ephemeral_unmount_engine(), we will try 1959 * them again. This could be a performance issue in 1960 * the future. 1961 * 1962 * Also note that unlike nfs4_ephemeral_unmount_engine(), 1963 * we do not halt on an error. We will not remove the 1964 * current node, but we will keep on trying to remove 1965 * the others. 1966 * 1967 * force indicates that we want the unmount to occur 1968 * even if there is something blocking it. 1969 * 1970 * time_check indicates that we want to see if the 1971 * mount has expired past mount_to or not. Typically 1972 * we want to do this and only on a shutdown of the 1973 * zone would we want to ignore the check. 1974 */ 1975 static void 1976 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg, 1977 bool_t force, bool_t time_check) 1978 { 1979 nfs4_ephemeral_tree_t *net; 1980 nfs4_ephemeral_tree_t *prev = NULL; 1981 nfs4_ephemeral_tree_t *next; 1982 nfs4_ephemeral_t *e; 1983 nfs4_ephemeral_t *prior; 1984 time_t now = gethrestime_sec(); 1985 1986 nfs4_ephemeral_tree_t *harvest = NULL; 1987 1988 int flag; 1989 1990 mntinfo4_t *mi; 1991 vfs_t *vfsp; 1992 1993 if (force) 1994 flag = MS_FORCE; 1995 else 1996 flag = 0; 1997 1998 mutex_enter(&ntg->ntg_forest_lock); 1999 for (net = ntg->ntg_forest; net != NULL; net = next) { 2000 next = net->net_next; 2001 2002 mutex_enter(&net->net_cnt_lock); 2003 net->net_refcnt++; 2004 mutex_exit(&net->net_cnt_lock); 2005 2006 mutex_enter(&net->net_tree_lock); 2007 2008 /* 2009 * Let the unmount code know that the 2010 * tree is already locked! 2011 */ 2012 mutex_enter(&net->net_cnt_lock); 2013 net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED; 2014 mutex_exit(&net->net_cnt_lock); 2015 2016 /* 2017 * If the intent is force all ephemeral nodes to 2018 * be unmounted in this zone, we can short circuit a 2019 * lot of tree traversal and simply zap the root node. 2020 */ 2021 if (force) { 2022 if (net->net_root) { 2023 mi = net->net_root->ne_mount; 2024 vfsp = mi->mi_vfsp; 2025 2026 /* 2027 * Cleared by umount2_engine. 2028 */ 2029 VFS_HOLD(vfsp); 2030 2031 (void) umount2_engine(vfsp, flag, 2032 kcred, FALSE); 2033 2034 goto check_done; 2035 } 2036 } 2037 2038 e = net->net_root; 2039 if (e) 2040 e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD; 2041 2042 while (e) { 2043 if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) { 2044 e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING; 2045 if (e->ne_child) { 2046 e = e->ne_child; 2047 e->ne_state = 2048 NFS4_EPHEMERAL_VISIT_CHILD; 2049 } 2050 2051 continue; 2052 } else if (e->ne_state == 2053 NFS4_EPHEMERAL_VISIT_SIBLING) { 2054 e->ne_state = NFS4_EPHEMERAL_PROCESS_ME; 2055 if (e->ne_peer) { 2056 e = e->ne_peer; 2057 e->ne_state = 2058 NFS4_EPHEMERAL_VISIT_CHILD; 2059 } 2060 2061 continue; 2062 } else if (e->ne_state == 2063 NFS4_EPHEMERAL_CHILD_ERROR) { 2064 prior = e->ne_prior; 2065 2066 /* 2067 * If a child reported an error, do 2068 * not bother trying to unmount. 2069 * 2070 * If your prior node is a parent, 2071 * pass the error up such that they 2072 * also do not try to unmount. 2073 * 2074 * However, if your prior is a sibling, 2075 * let them try to unmount if they can. 2076 */ 2077 if (prior) { 2078 if (prior->ne_child == e) 2079 prior->ne_state |= 2080 NFS4_EPHEMERAL_CHILD_ERROR; 2081 else 2082 prior->ne_state |= 2083 NFS4_EPHEMERAL_PEER_ERROR; 2084 } 2085 2086 /* 2087 * Clear the error and if needed, process peers. 2088 * 2089 * Once we mask out the error, we know whether 2090 * or we have to process another node. 2091 */ 2092 e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR; 2093 if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME) 2094 e = prior; 2095 2096 continue; 2097 } else if (e->ne_state == 2098 NFS4_EPHEMERAL_PEER_ERROR) { 2099 prior = e->ne_prior; 2100 2101 if (prior) { 2102 if (prior->ne_child == e) 2103 prior->ne_state = 2104 NFS4_EPHEMERAL_CHILD_ERROR; 2105 else 2106 prior->ne_state = 2107 NFS4_EPHEMERAL_PEER_ERROR; 2108 } 2109 2110 /* 2111 * Clear the error from this node and do the 2112 * correct processing. 2113 */ 2114 e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR; 2115 continue; 2116 } 2117 2118 prior = e->ne_prior; 2119 e->ne_state = NFS4_EPHEMERAL_OK; 2120 2121 /* 2122 * It must be the case that we need to process 2123 * this node. 2124 */ 2125 if (!time_check || 2126 now - e->ne_ref_time > e->ne_mount_to) { 2127 mi = e->ne_mount; 2128 vfsp = mi->mi_vfsp; 2129 2130 /* 2131 * Cleared by umount2_engine. 2132 */ 2133 VFS_HOLD(vfsp); 2134 2135 /* 2136 * Note that we effectively work down to the 2137 * leaf nodes first, try to unmount them, 2138 * then work our way back up into the leaf 2139 * nodes. 2140 * 2141 * Also note that we deal with a lot of 2142 * complexity by sharing the work with 2143 * the manual unmount code. 2144 */ 2145 nfs4_ephemeral_record_umount(vfsp, flag, 2146 e, prior); 2147 } 2148 2149 e = prior; 2150 } 2151 2152 check_done: 2153 2154 /* 2155 * Are we done with this tree? 2156 */ 2157 mutex_enter(&net->net_cnt_lock); 2158 if (net->net_refcnt == 1 && 2159 net->net_status & NFS4_EPHEMERAL_TREE_INVALID) { 2160 net->net_refcnt--; 2161 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED; 2162 mutex_exit(&net->net_cnt_lock); 2163 mutex_exit(&net->net_tree_lock); 2164 2165 if (prev) 2166 prev->net_next = net->net_next; 2167 else 2168 ntg->ntg_forest = net->net_next; 2169 2170 net->net_next = harvest; 2171 harvest = net; 2172 continue; 2173 } 2174 2175 net->net_refcnt--; 2176 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED; 2177 mutex_exit(&net->net_cnt_lock); 2178 mutex_exit(&net->net_tree_lock); 2179 2180 prev = net; 2181 } 2182 mutex_exit(&ntg->ntg_forest_lock); 2183 2184 for (net = harvest; net != NULL; net = next) { 2185 next = net->net_next; 2186 2187 mutex_destroy(&net->net_tree_lock); 2188 mutex_destroy(&net->net_cnt_lock); 2189 kmem_free(net, sizeof (*net)); 2190 } 2191 } 2192 2193 /* 2194 * This is the thread which decides when the harvesting 2195 * can proceed and when to kill it off for this zone. 2196 */ 2197 static void 2198 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg) 2199 { 2200 clock_t timeleft; 2201 zone_t *zone = curproc->p_zone; 2202 2203 for (;;) { 2204 timeleft = zone_status_timedwait(zone, lbolt + 2205 nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN); 2206 2207 /* 2208 * zone is exiting... 2209 */ 2210 if (timeleft != -1) { 2211 ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN); 2212 zthread_exit(); 2213 /* NOTREACHED */ 2214 } 2215 2216 /* 2217 * Only bother scanning if there is potential 2218 * work to be done. 2219 */ 2220 if (ntg->ntg_forest == NULL) 2221 continue; 2222 2223 /* 2224 * Now scan the list and get rid of everything which 2225 * is old. 2226 */ 2227 nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE); 2228 } 2229 2230 /* NOTREACHED */ 2231 } 2232 2233 /* 2234 * The zone specific glue needed to start the unmount harvester. 2235 * 2236 * Note that we want to avoid holding the mutex as long as possible, 2237 * hence the multiple checks. 2238 * 2239 * The caller should avoid us getting down here in the first 2240 * place. 2241 */ 2242 static void 2243 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg) 2244 { 2245 /* 2246 * It got started before we got here... 2247 */ 2248 if (ntg->ntg_thread_started) 2249 return; 2250 2251 mutex_enter(&nfs4_ephemeral_thread_lock); 2252 2253 if (ntg->ntg_thread_started) { 2254 mutex_exit(&nfs4_ephemeral_thread_lock); 2255 return; 2256 } 2257 2258 /* 2259 * Start the unmounter harvester thread for this zone. 2260 */ 2261 (void) zthread_create(NULL, 0, nfs4_ephemeral_harvester, 2262 ntg, 0, minclsyspri); 2263 2264 ntg->ntg_thread_started = TRUE; 2265 mutex_exit(&nfs4_ephemeral_thread_lock); 2266 } 2267 2268 /*ARGSUSED*/ 2269 static void * 2270 nfs4_ephemeral_zsd_create(zoneid_t zoneid) 2271 { 2272 nfs4_trigger_globals_t *ntg; 2273 2274 ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP); 2275 ntg->ntg_thread_started = FALSE; 2276 2277 /* 2278 * This is the default.... 2279 */ 2280 ntg->ntg_mount_to = nfs4_trigger_thread_timer; 2281 2282 mutex_init(&ntg->ntg_forest_lock, NULL, 2283 MUTEX_DEFAULT, NULL); 2284 2285 return (ntg); 2286 } 2287 2288 /* 2289 * Try a nice gentle walk down the forest and convince 2290 * all of the trees to gracefully give it up. 2291 */ 2292 /*ARGSUSED*/ 2293 static void 2294 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg) 2295 { 2296 nfs4_trigger_globals_t *ntg = arg; 2297 2298 if (!ntg) 2299 return; 2300 2301 nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE); 2302 } 2303 2304 /* 2305 * Race along the forest and rip all of the trees out by 2306 * their rootballs! 2307 */ 2308 /*ARGSUSED*/ 2309 static void 2310 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg) 2311 { 2312 nfs4_trigger_globals_t *ntg = arg; 2313 2314 if (!ntg) 2315 return; 2316 2317 nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE); 2318 2319 mutex_destroy(&ntg->ntg_forest_lock); 2320 kmem_free(ntg, sizeof (*ntg)); 2321 } 2322 2323 /* 2324 * This is the zone independent cleanup needed for 2325 * emphemeral mount processing. 2326 */ 2327 void 2328 nfs4_ephemeral_fini(void) 2329 { 2330 (void) zone_key_delete(nfs4_ephemeral_key); 2331 mutex_destroy(&nfs4_ephemeral_thread_lock); 2332 } 2333 2334 /* 2335 * This is the zone independent initialization needed for 2336 * emphemeral mount processing. 2337 */ 2338 void 2339 nfs4_ephemeral_init(void) 2340 { 2341 mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT, 2342 NULL); 2343 2344 zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create, 2345 nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy); 2346 } 2347 2348 /* 2349 * nfssys() calls this function to set the per-zone 2350 * value of mount_to to drive when an ephemeral mount is 2351 * timed out. Each mount will grab a copy of this value 2352 * when mounted. 2353 */ 2354 void 2355 nfs4_ephemeral_set_mount_to(uint_t mount_to) 2356 { 2357 nfs4_trigger_globals_t *ntg; 2358 zone_t *zone = curproc->p_zone; 2359 2360 ntg = zone_getspecific(nfs4_ephemeral_key, zone); 2361 2362 ntg->ntg_mount_to = mount_to; 2363 } 2364 2365 /* 2366 * Walk the list of v4 mount options; if they are currently set in vfsp, 2367 * append them to a new comma-separated mount option string, and return it. 2368 * 2369 * Caller should free by calling nfs4_trigger_destroy_mntopts(). 2370 */ 2371 static char * 2372 nfs4_trigger_create_mntopts(vfs_t *vfsp) 2373 { 2374 uint_t i; 2375 char *mntopts; 2376 struct vfssw *vswp; 2377 mntopts_t *optproto; 2378 2379 mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP); 2380 2381 /* get the list of applicable mount options for v4; locks *vswp */ 2382 vswp = vfs_getvfssw(MNTTYPE_NFS4); 2383 optproto = &vswp->vsw_optproto; 2384 2385 for (i = 0; i < optproto->mo_count; i++) { 2386 struct mntopt *mop = &optproto->mo_list[i]; 2387 2388 if (mop->mo_flags & MO_EMPTY) 2389 continue; 2390 2391 if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) { 2392 kmem_free(mntopts, MAX_MNTOPT_STR); 2393 vfs_unrefvfssw(vswp); 2394 return (NULL); 2395 } 2396 } 2397 2398 vfs_unrefvfssw(vswp); 2399 2400 /* 2401 * MNTOPT_XATTR is not in the v4 mount opt proto list, 2402 * and it may only be passed via MS_OPTIONSTR, so we 2403 * must handle it here. 2404 * 2405 * Ideally, it would be in the list, but NFS does not specify its 2406 * own opt proto list, it uses instead the default one. Since 2407 * not all filesystems support extended attrs, it would not be 2408 * appropriate to add it there. 2409 */ 2410 if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) || 2411 nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) { 2412 kmem_free(mntopts, MAX_MNTOPT_STR); 2413 return (NULL); 2414 } 2415 2416 return (mntopts); 2417 } 2418 2419 static void 2420 nfs4_trigger_destroy_mntopts(char *mntopts) 2421 { 2422 if (mntopts) 2423 kmem_free(mntopts, MAX_MNTOPT_STR); 2424 } 2425 2426 /* 2427 * Check a single mount option (optname). Add to mntopts if it is set in VFS. 2428 */ 2429 static int 2430 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp) 2431 { 2432 if (mntopts == NULL || optname == NULL || vfsp == NULL) 2433 return (EINVAL); 2434 2435 if (vfs_optionisset(vfsp, optname, NULL)) { 2436 size_t mntoptslen = strlen(mntopts); 2437 size_t optnamelen = strlen(optname); 2438 2439 /* +1 for ',', +1 for NUL */ 2440 if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR) 2441 return (EOVERFLOW); 2442 2443 /* first or subsequent mount option? */ 2444 if (*mntopts != '\0') 2445 (void) strcat(mntopts, ","); 2446 2447 (void) strcat(mntopts, optname); 2448 } 2449 2450 return (0); 2451 } 2452 2453 static enum clnt_stat 2454 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr) 2455 { 2456 int retries, error; 2457 uint_t max_msgsize; 2458 enum clnt_stat status; 2459 CLIENT *cl; 2460 struct timeval timeout; 2461 2462 /* as per recov_newserver() */ 2463 max_msgsize = 0; 2464 retries = 1; 2465 timeout.tv_sec = 2; 2466 timeout.tv_usec = 0; 2467 2468 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM, 2469 NFS_V4, max_msgsize, retries, CRED(), &cl); 2470 if (error) 2471 return (RPC_FAILED); 2472 2473 if (nointr) 2474 cl->cl_nosignal = TRUE; 2475 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL, 2476 timeout); 2477 if (nointr) 2478 cl->cl_nosignal = FALSE; 2479 2480 AUTH_DESTROY(cl->cl_auth); 2481 CLNT_DESTROY(cl); 2482 2483 return (status); 2484 } 2485