/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
 * triggered from a "stub" rnode via a special set of vnodeops.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/list.h>
#include <sys/stat.h>
#include <sys/mntent.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>
#include <nfs/nfs4.h>
#include <nfs/nfs4_kprot.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>
#include <sys/int_fmtio.h>

#include <sys/sunddi.h>

/*
 * The automatic unmounter thread stuff!
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Just a default, in seconds: how long an ephemeral mount may sit
 * unreferenced before the harvester may unmount it.
 */
static uint_t nfs4_trigger_mount_to = 240;

typedef struct nfs4_trigger_globals {
	kmutex_t		ntg_forest_lock;
	uint_t			ntg_mount_to;
	int			ntg_thread_started;
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;

kmutex_t	nfs4_ephemeral_thread_lock;

zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);

/*
 * Used for ephemeral mounts; contains data either duplicated from
 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
 *
 * It's intended that this structure is used solely for ephemeral
 * mount-type specific data, for passing this data to
 * nfs4_trigger_nargs_create().
 */
typedef struct ephemeral_servinfo {
	char			*esi_hostname;
	char			*esi_netname;
	char			*esi_path;
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;

/*
 * Collect together the mount-type specific and generic data args.
 */
typedef struct domount_args {
	ephemeral_servinfo_t	*dma_esi;
	char			*dma_hostlist;	/* comma-sep. for RO failover */
	struct nfs_args		*dma_nargs;
} domount_args_t;


/*
 * The vnode ops functions for a trigger stub vnode
 */
static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
    caller_context_t *);
static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
    struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
    int *, pathname_t *);
static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
    enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
    vsecattr_t *);
static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
    int);
static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
    vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);

/*
 * Regular NFSv4 vnodeops that we need to reference directly
 */
extern int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
extern void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
extern int nfs4_rwlock(vnode_t *, int, caller_context_t *);
extern void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
extern int nfs4_lookup(vnode_t *, char *, vnode_t **,
    struct pathname *, int, vnode_t *, cred_t *,
    caller_context_t *, int *, pathname_t *);
extern int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
    caller_context_t *);
extern int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
    caller_context_t *);
extern int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
extern int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);

static int nfs4_trigger_mount(vnode_t *, vnode_t **);
static int nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
    cred_t *);
static domount_args_t *nfs4_trigger_domount_args_create(vnode_t *);
static void nfs4_trigger_domount_args_destroy(domount_args_t *dma,
    vnode_t *vp);
static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *);
static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
    servinfo4_t *);
static struct nfs_args *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
    ephemeral_servinfo_t *);
static void nfs4_trigger_nargs_destroy(struct nfs_args *);
static char *nfs4_trigger_create_mntopts(vfs_t *);
static void nfs4_trigger_destroy_mntopts(char *);
static int nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);

extern int umount2_engine(vfs_t *, int, cred_t *, int);


vnodeops_t *nfs4_trigger_vnodeops;

/*
 * These are the vnodeops that we must define for stub vnodes.
 *
 *
 * Many of the VOPs defined for NFSv4 do not need to be defined here,
 * for various reasons. This will result in the VFS default function being
 * used:
 *
 * - These VOPs require a previous VOP_OPEN to have occurred. That will have
 *   lost the reference to the stub vnode, meaning these should not be called:
 *	close, read, write, ioctl, readdir, seek.
 *
 * - These VOPs are meaningless for vnodes without data pages. Since the
 *   stub vnode is of type VDIR, these should not be called:
 *	space, getpage, putpage, map, addmap, delmap, pageio, fsync.
 *
 * - These VOPs are otherwise not applicable, and should not be called:
 *	dump, setsecattr.
 *
 *
 * These VOPs we do not want to define, but nor do we want the VFS default
 * action. Instead, we specify the VFS error function, with fs_error(), but
 * note that fs_error() is not actually called. Instead it results in the
 * use of the error function defined for the particular VOP, in vn_ops_table[]:
 *
 * - frlock, dispose, shrlock.
 *
 *
 * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
 * NOTE: if any of these ops involve an OTW call with the stub FH, then
 * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
 * to protect the security data in the servinfo4_t for the "parent"
 * filesystem that contains the stub.
 *
 * - These VOPs should not trigger a mount, so that "ls -l" does not:
 *	pathconf, getsecattr.
 *
 * - These VOPs would not make sense to trigger:
 *	inactive, rwlock, rwunlock, fid, realvp.
 */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL, NULL
};

static void
nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	net->net_refcnt++;
	ASSERT(net->net_refcnt != 0);
	mutex_exit(&net->net_cnt_lock);
}

/*
 * We need a safe way to decrement the refcnt whilst the
 * lock is being held.
 */
static void
nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	ASSERT(net->net_refcnt != 0);
	net->net_refcnt--;
}

static void
nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	nfs4_ephemeral_tree_decr(net);
	mutex_exit(&net->net_cnt_lock);
}

/*
 * Trigger ops for stub vnodes; for mirror mounts, etc.
 *
 * The general idea is that a "triggering" op will first call
 * nfs4_trigger_mount(), which will find out whether a mount has already
 * been triggered.
 *
 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
 * of the covering vfs.
 *
 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
 * and again set newvp, as above.
 *
 * The triggering op may then re-issue the VOP by calling it on newvp.
 *
 * Note that some ops may perform custom action, and may or may not need
 * to trigger a mount.
 *
 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
 * obviously can't do this with VOP_<whatever>, since it's a stub vnode
 * and that would just recurse. Instead, we call the v4 op directly,
 * by name. This is OK, since we know that the vnode is for NFSv4,
 * otherwise it couldn't be a stub.
 *
 */
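
/*
 * Illustrative sketch (added commentary, not compiled code): the common
 * shape of a triggering VOP, as implemented by the functions below, with
 * VOP_ACCESS standing in for whichever operation is re-issued:
 *
 *	vnode_t *newvp;
 *	int error;
 *
 *	error = nfs4_trigger_mount(vp, &newvp);
 *	if (error)
 *		return (error);
 *
 *	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
 *	VN_RELE(newvp);		(drops the hold taken via VFS_ROOT())
 *	return (error);
 */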

static int
nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(*vpp, &newvp);
	if (error)
		return (error);

	/* Release the stub vnode, as we're losing the reference to it */
	VN_RELE(*vpp);

	/* Give the caller the root vnode of the newly-mounted fs */
	*vpp = newvp;

	/* return with VN_HELD(newvp) */
	return (VOP_OPEN(vpp, flag, cr, ct));
}

/*
 * For the majority of cases, nfs4_trigger_getattr() will not trigger
 * a mount. However, if ATTR_TRIGGER is set, we are being informed
 * that we need to force the mount before we attempt to determine
 * the attributes. The intent is an atomic operation for security
 * testing.
 */
static int
nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (flags & ATTR_TRIGGER) {
		vnode_t *newvp;

		error = nfs4_trigger_mount(vp, &newvp);
		if (error)
			return (error);

		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
		VN_RELE(newvp);
	} else {
		error = nfs4_getattr(vp, vap, flags, cr, ct);
	}

	return (error);
}

static int
nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, &newvp);
	if (error)
		return (error);

	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
	VN_RELE(newvp);

	return (error);
}

static int
nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, &newvp);
	if (error)
		return (error);

	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
	VN_RELE(newvp);

	return (error);
}

static int
nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
    struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
    caller_context_t *ct, int *deflags, pathname_t *rpnp)
{
	int error;
	vnode_t *newdvp;
	rnode4_t *drp = VTOR4(dvp);

	ASSERT(RP_ISSTUB(drp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(drp));

	/*
	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
	 * that up. Instead, pass onto the regular op, regardless of whether
	 * we've triggered a mount.
	 */
	if (strcmp(nm, "..") == 0)
		return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
		    ct, deflags, rpnp));

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
	    deflags, rpnp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
    enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
    int flags, caller_context_t *ct, vsecattr_t *vsecp)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
	    flags, ct, vsecp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
    int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newtdvp;

	error = nfs4_trigger_mount(tdvp, &newtdvp);
	if (error)
		return (error);

	/*
	 * We don't check whether svp is a stub. Let the NFSv4 code
	 * detect that error, and return accordingly.
	 */
	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
	VN_RELE(newtdvp);

	return (error);
}

static int
nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newsdvp;
	rnode4_t *tdrp = VTOR4(tdvp);

	/*
	 * We know that sdvp is a stub, otherwise we would not be here.
	 *
	 * If tdvp is also a stub, there are two possibilities: it
	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
	 *
	 * In the former case, just trigger sdvp, and treat tdvp as
	 * though it were not a stub.
	 *
	 * In the latter case, it might be a different stub for the
	 * same server fs as sdvp, or for a different server fs.
	 * Regardless, from the client perspective this would still
	 * be a cross-filesystem rename, and should not be allowed,
	 * so return EXDEV, without triggering either mount.
	 */
	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
		return (EXDEV);

	error = nfs4_trigger_mount(sdvp, &newsdvp);
	if (error)
		return (error);

	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);

	VN_RELE(newsdvp);

	return (error);
}

/* ARGSUSED */
static int
nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
    cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, &newvp);
	if (error)
		return (error);

	error = VOP_READLINK(newvp, uiop, cr, ct);
	VN_RELE(newvp);

	return (error);
}

/* end of trigger vnode ops */
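
/*
 * A note on lock ordering in the ephemeral-mount code below, as read
 * from this file (not an authoritative statement of the wider NFSv4
 * client locking hierarchy): net_tree_lock is taken before
 * net_cnt_lock, and mi_lock is dropped before blocking on
 * net_tree_lock, except in the tree-creation path, where the new tree
 * is not yet visible to any other thread.
 */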

/*
 * Mount upon a trigger vnode; for mirror-mounts, etc.
 *
 * The mount may have already occurred, via another thread. If not,
 * assemble the location information - which may require fetching - and
 * perform the mount.
 *
 * Sets newvp to be the root of the fs that is now covering vp. Note
 * that we return with VN_HELD(*newvp).
 *
 * The caller is responsible for passing the VOP onto the covering fs.
 */
static int
nfs4_trigger_mount(vnode_t *vp, vnode_t **newvpp)
{
	int error;
	vfs_t *vfsp;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	domount_args_t *dma;

	nfs4_ephemeral_tree_t *net;

	bool_t must_unlock = FALSE;
	bool_t is_building = FALSE;

	cred_t *zcred;

	nfs4_trigger_globals_t *ntg;

	zone_t *zone = curproc->p_zone;

	ASSERT(RP_ISSTUB(rp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));

	*newvpp = NULL;

	/*
	 * Has the mount already occurred?
	 */
	error = vn_vfsrlock_wait(vp);
	if (error)
		goto done;
	vfsp = vn_mountedvfs(vp);
	if (vfsp != NULL) {
		/* the mount has already occurred */
		error = VFS_ROOT(vfsp, newvpp);
		if (!error) {
			/* need to update the reference time */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_ephemeral)
				mi->mi_ephemeral->ne_ref_time =
				    gethrestime_sec();
			mutex_exit(&mi->mi_lock);
		}

		vn_vfsunlock(vp);
		goto done;
	}
	vn_vfsunlock(vp);

	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);
	} else {
		net = mi->mi_ephemeral_tree;
		mutex_exit(&mi->mi_lock);

		nfs4_ephemeral_tree_hold(net);

		mutex_enter(&net->net_tree_lock);

		/*
		 * We can only proceed if the tree is neither locked
		 * nor being torn down.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
			nfs4_ephemeral_tree_decr(net);
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			return (EIO);
		}
		mutex_exit(&net->net_cnt_lock);
	}

	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	must_unlock = TRUE;

	dma = nfs4_trigger_domount_args_create(vp);
	if (dma == NULL) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Need to be root for this call to make mount work.
	 * Note that since we define mirror mounts to work
	 * for any user, we allow the mount to proceed. And
	 * we realize that the server will perform security
	 * checks to make sure that the client is allowed
	 * access. Finally, once the mount takes place,
	 * directory permissions will ensure that the
	 * content is secure.
	 */
	zcred = zone_get_kcred(getzoneid());
	ASSERT(zcred != NULL);

	error = nfs4_trigger_domount(vp, dma, &vfsp, zcred);
	nfs4_trigger_domount_args_destroy(dma, vp);

	crfree(zcred);

	if (!error)
		error = VFS_ROOT(vfsp, newvpp);
done:
	if (must_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}

/*
 * Collect together both the generic & mount-type specific args.
 */
static domount_args_t *
nfs4_trigger_domount_args_create(vnode_t *vp)
{
	int nointr;
	char *hostlist;
	servinfo4_t *svp;
	struct nfs_args *nargs, *nargs_head;
	enum clnt_stat status;
	ephemeral_servinfo_t *esi, *esi_first;
	domount_args_t *dma;
	mntinfo4_t *mi = VTOMI4(vp);

	nointr = !(mi->mi_flags & MI4_INT);
	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	svp = mi->mi_curr_serv;
	/* check if the current server is responding */
	status = nfs4_trigger_ping_server(svp, nointr);
	if (status == RPC_SUCCESS) {
		esi_first = nfs4_trigger_esi_create(vp, svp);
		if (esi_first == NULL) {
			kmem_free(hostlist, MAXPATHLEN);
			return (NULL);
		}

		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);

		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
	} else {
		/* current server did not respond */
		esi_first = NULL;
		nargs_head = NULL;
	}
	nargs = nargs_head;

	/*
	 * NFS RO failover.
	 *
	 * If we have multiple servinfo4 structures, linked via sv_next,
	 * we must create one nfs_args for each, linking the nfs_args via
	 * nfs_ext_u.nfs_extB.next.
	 *
	 * We need to build a corresponding esi for each, too, but that is
	 * used solely for building nfs_args, and may be immediately
	 * discarded, as domount() requires the info from just one esi,
	 * but all the nfs_args.
	 *
	 * Currently, the NFS mount code will hang if not all servers
	 * requested are available. To avoid that, we need to ping each
	 * server, here, and remove it from the list if it is not
	 * responding. This has the side-effect of that server then
	 * being permanently unavailable for this failover mount, even if
	 * it recovers. That's unfortunate, but the best we can do until
	 * the mount code path is fixed.
	 */

	/*
	 * If the current server was down, loop indefinitely until we find
	 * at least one responsive server.
	 */
	do {
		/* no locking needed for sv_next; it is only set at fs mount */
		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
			struct nfs_args *next;

			/*
			 * nargs_head: the head of the nfs_args list
			 * nargs: the current tail of the list
			 * next: the newly-created element to be added
			 */

			/*
			 * We've already tried the current server, above;
			 * if it was responding, we have already included it
			 * and it may now be ignored.
			 *
			 * Otherwise, try it again, since it may now have
			 * recovered.
			 */
			if (svp == mi->mi_curr_serv && esi_first != NULL)
				continue;

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			/* check if the server is responding */
			status = nfs4_trigger_ping_server(svp, nointr);
			/* if the server did not respond, ignore it */
			if (status != RPC_SUCCESS)
				continue;

			esi = nfs4_trigger_esi_create(vp, svp);
			if (esi == NULL)
				continue;

			/*
			 * If the original current server (mi_curr_serv)
			 * was down when we first tried it,
			 * (i.e. esi_first == NULL),
			 * we select this new server (svp) to be the server
			 * that we will actually contact (esi_first).
			 *
			 * Note that it's possible that mi_curr_serv == svp,
			 * if that mi_curr_serv was down but has now recovered.
			 */
			next = nfs4_trigger_nargs_create(mi, svp, esi);
			if (esi_first == NULL) {
				ASSERT(nargs == NULL);
				ASSERT(nargs_head == NULL);
				nargs_head = next;
				esi_first = esi;
				(void) strlcpy(hostlist,
				    esi_first->esi_hostname, MAXPATHLEN);
			} else {
				ASSERT(nargs_head != NULL);
				nargs->nfs_ext_u.nfs_extB.next = next;
				(void) strlcat(hostlist, ",", MAXPATHLEN);
				(void) strlcat(hostlist, esi->esi_hostname,
				    MAXPATHLEN);
				/* esi was only needed for hostname & nargs */
				nfs4_trigger_esi_destroy(esi, vp);
			}

			nargs = next;
		}

		/* if we've had no response at all, wait a second */
		if (esi_first == NULL)
			delay(drv_usectohz(1000000));

	} while (esi_first == NULL);
	ASSERT(nargs_head != NULL);

	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
	dma->dma_esi = esi_first;
	dma->dma_hostlist = hostlist;
	dma->dma_nargs = nargs_head;

	return (dma);
}
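
/*
 * Lifecycle sketch (added commentary; this simply restates how
 * nfs4_trigger_mount() above drives the helpers in this part of the
 * file):
 *
 *	dma = nfs4_trigger_domount_args_create(vp);
 *	if (dma == NULL)
 *		... fail with EINVAL ...
 *	error = nfs4_trigger_domount(vp, dma, &vfsp, zcred);
 *	nfs4_trigger_domount_args_destroy(dma, vp);
 */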

static void
nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
{
	if (dma != NULL) {
		if (dma->dma_esi != NULL && vp != NULL)
			nfs4_trigger_esi_destroy(dma->dma_esi, vp);

		if (dma->dma_hostlist != NULL)
			kmem_free(dma->dma_hostlist, MAXPATHLEN);

		if (dma->dma_nargs != NULL) {
			struct nfs_args *nargs = dma->dma_nargs;

			do {
				struct nfs_args *next =
				    nargs->nfs_ext_u.nfs_extB.next;

				nfs4_trigger_nargs_destroy(nargs);
				nargs = next;
			} while (nargs != NULL);
		}

		kmem_free(dma, sizeof (domount_args_t));
	}
}

/*
 * The ephemeral_servinfo_t struct contains basic information we will need to
 * perform the mount. Whilst the structure is generic across different
 * types of ephemeral mount, the way we gather its contents differs.
 */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp)
{
	ephemeral_servinfo_t *esi;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(RP_ISSTUB(rp));

	/* Call the ephemeral type-specific routine */
	if (RP_ISSTUB_MIRRORMOUNT(rp))
		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
	else
		esi = NULL;

	/* for now, we only support mirror-mounts */
	ASSERT(esi != NULL);

	return (esi);
}

static void
nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	ASSERT(RP_ISSTUB(rp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));

	/* Currently, no need for an ephemeral type-specific routine */

	/*
	 * The contents of ephemeral_servinfo_t goes into nfs_args,
	 * and will be handled by nfs4_trigger_nargs_destroy().
	 * We need only free the structure itself.
	 */
	if (esi != NULL)
		kmem_free(esi, sizeof (ephemeral_servinfo_t));
}

/*
 * Some of this may turn out to be common with other ephemeral types,
 * in which case it should be moved to nfs4_trigger_esi_create(), or a
 * common function called.
 */
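
/*
 * Ownership note (a summary of the comments above and in
 * nfs4_trigger_esi_destroy()): every field hung off an esi is a deep
 * copy. The pointers are handed over to nfs_args by
 * nfs4_trigger_nargs_create() and are ultimately freed by
 * nfs4_free_args(); only the ephemeral_servinfo_t itself is freed
 * directly.
 */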

static ephemeral_servinfo_t *
nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
{
	char			*stubpath;
	struct knetconfig	*sikncp, *svkncp;
	struct netbuf		*bufp;
	ephemeral_servinfo_t	*esi;

	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);

	/* initially set to be our type of ephemeral mount; may be added to */
	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;

	/*
	 * We're copying info from the stub rnode's servinfo4, but
	 * we must create new copies, not pointers, since this information
	 * is to be associated with the new mount, which will be
	 * unmounted (and its structures freed) separately
	 */

	/*
	 * Sizes passed to kmem_[z]alloc here must match those freed
	 * in nfs4_free_args()
	 */

	/*
	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
	 * is difficult to avoid: as we need to read svp to calculate the
	 * sizes to be allocated.
	 */
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
	(void) strcat(esi->esi_hostname, svp->sv_hostname);

	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
	bufp = esi->esi_addr;
	bufp->len = svp->sv_addr.len;
	bufp->maxlen = svp->sv_addr.maxlen;
	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);

	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
	sikncp = esi->esi_knconf;
	svkncp = svp->sv_knconf;
	sikncp->knc_semantics = svkncp->knc_semantics;
	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_protofmly,
	    (char *)svkncp->knc_protofmly);
	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
	sikncp->knc_rdev = svkncp->knc_rdev;

	/*
	 * Used when AUTH_DH is negotiated.
	 *
	 * This is ephemeral mount-type specific, since it contains the
	 * server's time-sync syncaddr.
	 */
	if (svp->sv_dhsec) {
		struct netbuf *bufp;
		sec_data_t *sdata;
		dh_k4_clntdata_t *data;

		sdata = svp->sv_dhsec;
		data = (dh_k4_clntdata_t *)sdata->data;
		ASSERT(sdata->rpcflavor == AUTH_DH);

		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
		bufp->len = data->syncaddr.len;
		bufp->maxlen = data->syncaddr.maxlen;
		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
		esi->esi_syncaddr = bufp;

		if (data->netname != NULL) {
			int nmlen = data->netnamelen;

			/*
			 * We need to copy from a dh_k4_clntdata_t
			 * netname/netnamelen pair to a NUL-terminated
			 * netname string suitable for putting in nfs_args,
			 * where the latter has no netnamelen field.
			 */
			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
			bcopy(data->netname, esi->esi_netname, nmlen);
		}
	} else {
		esi->esi_syncaddr = NULL;
		esi->esi_netname = NULL;
	}

	stubpath = fn_path(VTOSV(vp)->sv_name);
	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
	ASSERT(*stubpath == '.');
	stubpath += 1;

	/* for nfs_args->fh */
	esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1;
	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
	(void) strcat(esi->esi_path, svp->sv_path);
	(void) strcat(esi->esi_path, stubpath);

	stubpath -= 1;
	/* stubpath allocated by fn_path() */
	kmem_free(stubpath, strlen(stubpath) + 1);

	nfs_rw_exit(&svp->sv_lock);

	return (esi);
}
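
/*
 * Worked example (illustrative values only): if the parent mount's
 * sv_path is "/tank" and fn_path() returns "./ws" for the stub, then
 * stepping over the leading '.' leaves "/ws", and esi_path becomes
 * "/tank/ws", with esi_path_len = 5 + 3 + 1 covering the trailing NUL.
 */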
"" : "/", path); 1205 1206 uap->dir = mntpt; 1207 1208 uap->flags = MS_SYSSPACE | MS_DATA; 1209 /* fstype-independent mount options not covered elsewhere */ 1210 /* copy parent's mount(1M) "-m" flag */ 1211 if (stubvfsp->vfs_flag & VFS_NOMNTTAB) 1212 uap->flags |= MS_NOMNTTAB; 1213 1214 uap->fstype = MNTTYPE_NFS4; 1215 uap->dataptr = (char *)nargs; 1216 /* not needed for MS_SYSSPACE */ 1217 uap->datalen = 0; 1218 1219 /* use optptr to pass in extra mount options */ 1220 uap->flags |= MS_OPTIONSTR; 1221 uap->optptr = nfs4_trigger_create_mntopts(stubvfsp); 1222 if (uap->optptr == NULL) { 1223 retval = EINVAL; 1224 goto done; 1225 } 1226 /* domount() expects us to count the trailing NUL */ 1227 uap->optlen = strlen(uap->optptr) + 1; 1228 1229 retval = domount(NULL, uap, stubvp, cr, vfsp); 1230 if (retval == 0) 1231 VFS_RELE(*vfsp); 1232 done: 1233 if (uap->optptr) 1234 nfs4_trigger_destroy_mntopts(uap->optptr); 1235 1236 kmem_free(uap->spec, spec_len + 1); 1237 kmem_free(uap, sizeof (struct mounta)); 1238 kmem_free(mntpt, mntpt_len + 1); 1239 1240 return (retval); 1241 } 1242 1243 /* 1244 * Build an nfs_args structure for passing to domount(). 1245 * 1246 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t; 1247 * generic data - common to all ephemeral mount types - is read directly 1248 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode. 1249 */ 1250 static struct nfs_args * 1251 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp, 1252 ephemeral_servinfo_t *esi) 1253 { 1254 sec_data_t *secdata; 1255 struct nfs_args *nargs; 1256 1257 /* setup the nfs args */ 1258 nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP); 1259 1260 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1261 1262 nargs->addr = esi->esi_addr; 1263 1264 /* for AUTH_DH by negotiation */ 1265 if (esi->esi_syncaddr || esi->esi_netname) { 1266 nargs->flags |= NFSMNT_SECURE; 1267 nargs->syncaddr = esi->esi_syncaddr; 1268 nargs->netname = esi->esi_netname; 1269 } 1270 1271 nargs->flags |= NFSMNT_KNCONF; 1272 nargs->knconf = esi->esi_knconf; 1273 nargs->flags |= NFSMNT_HOSTNAME; 1274 nargs->hostname = esi->esi_hostname; 1275 nargs->fh = esi->esi_path; 1276 1277 /* general mount settings, all copied from parent mount */ 1278 mutex_enter(&mi->mi_lock); 1279 1280 if (!(mi->mi_flags & MI4_HARD)) 1281 nargs->flags |= NFSMNT_SOFT; 1282 1283 nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO | 1284 NFSMNT_RETRANS; 1285 nargs->wsize = mi->mi_stsize; 1286 nargs->rsize = mi->mi_tsize; 1287 nargs->timeo = mi->mi_timeo; 1288 nargs->retrans = mi->mi_retrans; 1289 1290 if (mi->mi_flags & MI4_INT) 1291 nargs->flags |= NFSMNT_INT; 1292 if (mi->mi_flags & MI4_NOAC) 1293 nargs->flags |= NFSMNT_NOAC; 1294 1295 nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN | 1296 NFSMNT_ACDIRMAX; 1297 nargs->acregmin = HR2SEC(mi->mi_acregmin); 1298 nargs->acregmax = HR2SEC(mi->mi_acregmax); 1299 nargs->acdirmin = HR2SEC(mi->mi_acdirmin); 1300 nargs->acdirmax = HR2SEC(mi->mi_acdirmax); 1301 1302 if (mi->mi_flags & MI4_NOCTO) 1303 nargs->flags |= NFSMNT_NOCTO; 1304 if (mi->mi_flags & MI4_GRPID) 1305 nargs->flags |= NFSMNT_GRPID; 1306 if (mi->mi_flags & MI4_LLOCK) 1307 nargs->flags |= NFSMNT_LLOCK; 1308 if (mi->mi_flags & MI4_NOPRINT) 1309 nargs->flags |= NFSMNT_NOPRINT; 1310 if (mi->mi_flags & MI4_DIRECTIO) 1311 nargs->flags |= NFSMNT_DIRECTIO; 1312 if (mi->mi_flags & MI4_PUBLIC) 1313 nargs->flags |= NFSMNT_PUBLIC; 1314 1315 mutex_exit(&mi->mi_lock); 1316 1317 /* add any specific 

/*
 * Build an nfs_args structure for passing to domount().
 *
 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
 * generic data - common to all ephemeral mount types - is read directly
 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
 */
static struct nfs_args *
nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
    ephemeral_servinfo_t *esi)
{
	sec_data_t *secdata;
	struct nfs_args *nargs;

	/* setup the nfs args */
	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);

	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	nargs->addr = esi->esi_addr;

	/* for AUTH_DH by negotiation */
	if (esi->esi_syncaddr || esi->esi_netname) {
		nargs->flags |= NFSMNT_SECURE;
		nargs->syncaddr = esi->esi_syncaddr;
		nargs->netname = esi->esi_netname;
	}

	nargs->flags |= NFSMNT_KNCONF;
	nargs->knconf = esi->esi_knconf;
	nargs->flags |= NFSMNT_HOSTNAME;
	nargs->hostname = esi->esi_hostname;
	nargs->fh = esi->esi_path;

	/* general mount settings, all copied from parent mount */
	mutex_enter(&mi->mi_lock);

	if (!(mi->mi_flags & MI4_HARD))
		nargs->flags |= NFSMNT_SOFT;

	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
	    NFSMNT_RETRANS;
	nargs->wsize = mi->mi_stsize;
	nargs->rsize = mi->mi_tsize;
	nargs->timeo = mi->mi_timeo;
	nargs->retrans = mi->mi_retrans;

	if (mi->mi_flags & MI4_INT)
		nargs->flags |= NFSMNT_INT;
	if (mi->mi_flags & MI4_NOAC)
		nargs->flags |= NFSMNT_NOAC;

	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
	    NFSMNT_ACDIRMAX;
	nargs->acregmin = HR2SEC(mi->mi_acregmin);
	nargs->acregmax = HR2SEC(mi->mi_acregmax);
	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);

	if (mi->mi_flags & MI4_NOCTO)
		nargs->flags |= NFSMNT_NOCTO;
	if (mi->mi_flags & MI4_GRPID)
		nargs->flags |= NFSMNT_GRPID;
	if (mi->mi_flags & MI4_LLOCK)
		nargs->flags |= NFSMNT_LLOCK;
	if (mi->mi_flags & MI4_NOPRINT)
		nargs->flags |= NFSMNT_NOPRINT;
	if (mi->mi_flags & MI4_DIRECTIO)
		nargs->flags |= NFSMNT_DIRECTIO;
	if (mi->mi_flags & MI4_PUBLIC)
		nargs->flags |= NFSMNT_PUBLIC;

	mutex_exit(&mi->mi_lock);

	/* add any specific flags for this type of ephemeral mount */
	nargs->flags |= esi->esi_mount_flags;

	/*
	 * Security data & negotiation policy.
	 *
	 * We need to preserve the parent mount's preference for security
	 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT.
	 *
	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
	 * security flavour was requested, with data in sv_secdata, and that
	 * no negotiation should occur. If this specified flavour fails, that's
	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
	 *
	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
	 * default flavour, in sv_secdata, but then negotiate a new flavour.
	 * Possible flavours are recorded in an array in sv_secinfo, with
	 * currently in-use flavour pointed to by sv_currsec.
	 *
	 * If sv_currsec is set, i.e. if negotiation has already occurred,
	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
	 */
	if (svp->sv_flags & SV4_TRYSECDEFAULT) {
		/* enable negotiation for ephemeral mount */
		nargs->flags |= NFSMNT_SECDEFAULT;

		/*
		 * As a starting point for negotiation, copy parent
		 * mount's negotiated flavour (sv_currsec) if available,
		 * or its passed-in flavour (sv_secdata) if not.
		 */
		if (svp->sv_currsec != NULL)
			secdata = copy_sec_data(svp->sv_currsec);
		else if (svp->sv_secdata != NULL)
			secdata = copy_sec_data(svp->sv_secdata);
		else
			secdata = NULL;
	} else {
		/* do not enable negotiation; copy parent's passed-in flavour */
		if (svp->sv_secdata != NULL)
			secdata = copy_sec_data(svp->sv_secdata);
		else
			secdata = NULL;
	}

	nfs_rw_exit(&svp->sv_lock);

	nargs->flags |= NFSMNT_NEWARGS;
	nargs->nfs_args_ext = NFS_ARGS_EXTB;
	nargs->nfs_ext_u.nfs_extB.secdata = secdata;

	/* for NFS RO failover; caller will set if necessary */
	nargs->nfs_ext_u.nfs_extB.next = NULL;

	return (nargs);
}

static void
nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
{
	/*
	 * Either the mount failed, in which case the data is not needed, or
	 * nfs4_mount() has either taken copies of what it needs or,
	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
	 * whereby nfs4_free_args() will ignore it.
	 */
	nfs4_free_args(nargs);
	kmem_free(nargs, sizeof (struct nfs_args));
}
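
/*
 * Illustrative diagram (added commentary) of the linkage woven by
 * nfs4_record_ephemeral_mount() below: ne_child points to the first
 * child, ne_peer chains siblings, and ne_prior points back to either
 * the parent (for a first child) or the previous sibling. New children
 * are prepended at ne_child:
 *
 *	parent
 *	  |  ne_child
 *	  v
 *	newest --ne_peer--> older --ne_peer--> oldest
 */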

/*
 * When we finally get into the mounting, we need to add this
 * node to the ephemeral tree.
 *
 * This is called from nfs4_mount().
 */
int
nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
{
	mntinfo4_t *mi_parent;
	nfs4_ephemeral_t *eph;
	nfs4_ephemeral_tree_t *net;

	nfs4_ephemeral_t *prior;
	nfs4_ephemeral_t *child;

	nfs4_ephemeral_t *peer;

	nfs4_trigger_globals_t *ntg;
	zone_t *zone = curproc->p_zone;

	int rc = 0;

	mi_parent = VTOMI4(mvp);

	/*
	 * Get this before grabbing anything else!
	 */
	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	if (!ntg->ntg_thread_started) {
		nfs4_ephemeral_start_harvester(ntg);
	}

	mutex_enter(&mi_parent->mi_lock);
	mutex_enter(&mi->mi_lock);

	net = mi->mi_ephemeral_tree =
	    mi_parent->mi_ephemeral_tree;

	/*
	 * If the mi_ephemeral_tree is NULL, then it
	 * means that either the harvester or a manual
	 * umount has cleared the tree out right before
	 * we got here.
	 *
	 * There is nothing we can do here, so return
	 * to the caller and let them decide whether they
	 * try again.
	 */
	if (net == NULL) {
		mutex_exit(&mi->mi_lock);
		mutex_exit(&mi_parent->mi_lock);

		return (EBUSY);
	}

	nfs4_ephemeral_tree_hold(net);

	/*
	 * We need to tack together the ephemeral mount
	 * with this new mntinfo.
	 */
	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
	eph->ne_mount = mi;
	eph->ne_ref_time = gethrestime_sec();

	/*
	 * We need to tell the ephemeral mount when
	 * to time out.
	 */
	eph->ne_mount_to = ntg->ntg_mount_to;

	mi->mi_flags |= MI4_EPHEMERAL;
	mi->mi_ephemeral = eph;

	/*
	 * If the enclosing mntinfo4 is also ephemeral,
	 * then we need to point to its enclosing parent.
	 * Else the enclosing mntinfo4 is the enclosing parent.
	 *
	 * We also need to weave this ephemeral node
	 * into the tree.
	 */
	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
		/*
		 * We need to decide if we are
		 * the root node of this branch
		 * or if we are a sibling of this
		 * branch.
		 */
		prior = mi_parent->mi_ephemeral;
		if (prior == NULL) {
			/*
			 * Race condition, clean up, and
			 * let caller handle mntinfo.
			 */
			mi->mi_flags &= ~MI4_EPHEMERAL;
			mi->mi_ephemeral = NULL;
			kmem_free(eph, sizeof (*eph));
			rc = EBUSY;
		} else {
			if (prior->ne_child == NULL) {
				prior->ne_child = eph;
			} else {
				child = prior->ne_child;

				prior->ne_child = eph;
				eph->ne_peer = child;

				child->ne_prior = eph;
			}

			eph->ne_prior = prior;
		}
	} else {
		/*
		 * The parent mntinfo4 is the non-ephemeral
		 * root of the ephemeral tree. We
		 * need to decide if we are the root
		 * node of that tree or if we are a
		 * sibling of the root node.
		 *
		 * We are the root if there is no
		 * other node.
		 */
		if (net->net_root == NULL) {
			net->net_root = eph;
		} else {
			eph->ne_peer = peer = net->net_root;
			ASSERT(peer != NULL);
			net->net_root = eph;

			peer->ne_prior = eph;
		}

		eph->ne_prior = NULL;
	}

	nfs4_ephemeral_tree_rele(net);

	mutex_exit(&mi->mi_lock);
	mutex_exit(&mi_parent->mi_lock);

	return (rc);
}

/*
 * Commit the changes to the ephemeral tree for removing this node.
 */
static void
nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
{
	nfs4_ephemeral_t *e = eph;
	nfs4_ephemeral_t *peer;
	nfs4_ephemeral_t *prior;

	peer = eph->ne_peer;
	prior = e->ne_prior;

	/*
	 * If this branch root was not the
	 * tree root, then we need to fix back pointers.
	 */
	if (prior) {
		if (prior->ne_child == e) {
			prior->ne_child = peer;
		} else {
			prior->ne_peer = peer;
		}

		if (peer)
			peer->ne_prior = prior;
	} else if (peer) {
		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
		peer->ne_prior = NULL;
	} else {
		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
	}
}

/*
 * We want to avoid recursion at all costs. So we need to
 * unroll the tree. We do this by a depth first traversal to
 * leaf nodes. We blast away the leaf and work our way back
 * up and down the tree.
 */
static int
nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
    int isTreeRoot, int flag, cred_t *cr)
{
	nfs4_ephemeral_t *e = eph;
	nfs4_ephemeral_t *prior;
	mntinfo4_t *mi;
	vfs_t *vfsp;
	int error;

	/*
	 * We use the loop while unrolling the ephemeral tree.
	 */
	for (;;) {
		/*
		 * First we walk down the child.
		 */
		if (e->ne_child) {
			prior = e;
			e = e->ne_child;
			continue;
		}

		/*
		 * If we are the root of the branch we are removing,
		 * we end it here. But if the branch is the root of
		 * the tree, we have to forge on. We do not consider
		 * the peer list for the root because while it may
		 * be okay to remove, it is both extra work and a
		 * potential for a false-positive error to stall the
		 * unmount attempt.
		 */
		if (e == eph && isTreeRoot == FALSE)
			return (0);

		/*
		 * Next we walk down the peer list.
		 */
		if (e->ne_peer) {
			prior = e;
			e = e->ne_peer;
			continue;
		}

		/*
		 * We can only remove the node passed in by the
		 * caller if it is the root of the ephemeral tree.
		 * Otherwise, the caller will remove it.
		 */
		if (e == eph && isTreeRoot == FALSE)
			return (0);

		/*
		 * Okay, we have a leaf node, time
		 * to prune it!
		 *
		 * Note that prior can only be NULL if
		 * and only if it is the root of the
		 * ephemeral tree.
		 */
		prior = e->ne_prior;

		mi = e->ne_mount;
		mutex_enter(&mi->mi_lock);
		vfsp = mi->mi_vfsp;

		/*
		 * Cleared by umount2_engine.
		 */
		VFS_HOLD(vfsp);

		/*
		 * Inform nfs4_unmount to not recursively
		 * descend into this node's children when it
		 * gets processed.
		 */
		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
		mutex_exit(&mi->mi_lock);

		error = umount2_engine(vfsp, flag, cr, FALSE);
		if (error) {
			/*
			 * We need to reenable nfs4_unmount's ability
			 * to recursively descend on this node.
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
			mutex_exit(&mi->mi_lock);

			return (error);
		}

		/*
		 * If we are the current node, we do not want to
		 * touch anything else. At this point, the only
		 * way the current node can have survived to here
		 * is if it is the root of the ephemeral tree and
		 * we are unmounting the enclosing mntinfo4.
		 */
		if (e == eph) {
			ASSERT(prior == NULL);
			return (0);
		}

		/*
		 * Stitch up the prior node. Note that since
		 * we have handled the root of the tree, prior
		 * must be non-NULL.
		 */
		ASSERT(prior != NULL);
		if (prior->ne_child == e) {
			prior->ne_child = NULL;
		} else {
			ASSERT(prior->ne_peer == e);

			prior->ne_peer = NULL;
		}

		e = prior;
	}

	/* NOTREACHED */
}
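
/*
 * Worked example (added commentary): suppose eph is R, R's ne_child is
 * A, A's ne_peer is B, and A has a single child A1. The loop above
 * walks R -> A -> A1 (a leaf: unmount A1, step back to A), then
 * A -> B (a leaf: unmount B, step back to A), then finds A itself a
 * leaf (unmount A, step back to R). R is unmounted only if isTreeRoot
 * is TRUE.
 */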

/*
 * Common code to safely release net_cnt_lock and net_tree_lock
 */
void
nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
    nfs4_ephemeral_tree_t **pnet)
{
	nfs4_ephemeral_tree_t *net = *pnet;

	if (*pmust_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
		nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);

		*pmust_unlock = FALSE;
	}
}

/*
 * While we may have removed any child or sibling nodes of this
 * ephemeral node, we can not nuke it until we know that there
 * are no active vnodes on it. This will do that final
 * work once we know it is not busy.
 */
void
nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
    nfs4_ephemeral_tree_t **pnet)
{
	/*
	 * Now we need to get rid of the ephemeral data if it exists.
	 */
	mutex_enter(&mi->mi_lock);
	if (mi->mi_ephemeral) {
		/*
		 * If we are the root node of an ephemeral branch
		 * which is being removed, then we need to fixup
		 * pointers into and out of the node.
		 */
		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);

		ASSERT(mi->mi_ephemeral != NULL);

		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
		mi->mi_ephemeral = NULL;
	}
	mutex_exit(&mi->mi_lock);

	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
}

/*
 * Unmount an ephemeral node.
 */
int
nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
    bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
{
	int error = 0;
	nfs4_ephemeral_t *eph;
	nfs4_ephemeral_tree_t *net;
	int is_derooting = FALSE;
	int is_recursed = FALSE;
	int was_locked = FALSE;

	/*
	 * Make sure to set the default state for cleaning
	 * up the tree in the caller (and on the way out).
	 */
	*pmust_unlock = FALSE;

	/*
	 * The active vnodes on this file system may be ephemeral
	 * children. We need to check for and try to unmount them
	 * here. If any can not be unmounted, we are going
	 * to return EBUSY.
	 */
	mutex_enter(&mi->mi_lock);

	/*
	 * If an ephemeral tree, we need to check to see if
	 * the lock is already held. If it is, then we need
	 * to see if we are being called as a result of
	 * the recursive removal of some node of the tree or
	 * if we are another attempt to remove the tree.
	 *
	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
	 * node. mi_ephemeral being non-NULL also does this.
	 *
	 * mi_ephemeral_tree being non-NULL is sufficient
	 * to also indicate either it is an ephemeral node
	 * or the enclosing mntinfo4.
	 *
	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
	 * when we delete the ephemeral node and need to
	 * differentiate from an ephemeral node and the
	 * enclosing root node.
	 */
	*pnet = net = mi->mi_ephemeral_tree;
	if (net == NULL) {
		mutex_exit(&mi->mi_lock);
		return (0);
	}

	eph = mi->mi_ephemeral;
	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
	is_derooting = (eph == NULL);

	/*
	 * If this is not recursion, then we need to
	 * grab a ref count.
	 *
	 * But wait, we also do not want to do that
	 * if a harvester thread has already grabbed
	 * the lock.
	 */
	if (!is_recursed) {
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status &
		    NFS4_EPHEMERAL_TREE_LOCKED) {
			/*
			 * If the tree is locked, we need
			 * to decide whether we are the
			 * harvester or some explicit call
			 * for a umount. The only way that
			 * we are the harvester is if
			 * MS_SYSSPACE is set.
			 *
			 * We only let the harvester through
			 * at this point.
			 *
			 * We return EBUSY so that the
			 * caller knows something is
			 * going on. Note that by that
			 * time, the umount in the other
			 * thread may have already occurred.
			 */
			if (!(flag & MS_SYSSPACE)) {
				mutex_exit(&net->net_cnt_lock);
				mutex_exit(&mi->mi_lock);

				return (EBUSY);
			}

			was_locked = TRUE;
		} else {
			net->net_refcnt++;
			ASSERT(net->net_refcnt != 0);
		}

		mutex_exit(&net->net_cnt_lock);
	}
	mutex_exit(&mi->mi_lock);

	/*
	 * If we are not the harvester, we need to check
	 * to see if we need to grab the tree lock.
	 */
	if (was_locked == FALSE) {
		/*
		 * If we grab the lock, it means that no other
		 * operation is working on the tree. If we don't
		 * grab it, we need to decide if this is because
		 * we are a recursive call or a new operation.
		 */
		if (mutex_tryenter(&net->net_tree_lock)) {
			*pmust_unlock = TRUE;
		} else {
			/*
			 * If we are a recursive call, we can
			 * proceed without the lock.
			 * Otherwise we have to wait until
			 * the lock becomes free.
			 */
			if (!is_recursed) {
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    (NFS4_EPHEMERAL_TREE_DEROOTING
				    | NFS4_EPHEMERAL_TREE_INVALID)) {
					nfs4_ephemeral_tree_decr(net);
					mutex_exit(&net->net_cnt_lock);
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);

				/*
				 * We can't hold any other locks whilst
				 * we wait on this to free up.
				 */
				mutex_enter(&net->net_tree_lock);

				/*
				 * Note that while mi->mi_ephemeral
				 * may change and thus we have to
				 * update eph, it is the case that
				 * we have tied down net and
				 * do not care if mi->mi_ephemeral_tree
				 * has changed.
				 */
				mutex_enter(&mi->mi_lock);
				eph = mi->mi_ephemeral;
				mutex_exit(&mi->mi_lock);

				/*
				 * Okay, we need to see if either the
				 * tree got nuked or the current node
				 * got nuked. Both of which will cause
				 * an error.
				 *
				 * Note that a subsequent retry of the
				 * umount shall work.
				 */
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    NFS4_EPHEMERAL_TREE_INVALID ||
				    (!is_derooting && eph == NULL)) {
					nfs4_ephemeral_tree_decr(net);
					mutex_exit(&net->net_cnt_lock);
					mutex_exit(&net->net_tree_lock);
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);
				*pmust_unlock = TRUE;
			}
		}
	}

	/*
	 * Only once we have grabbed the lock can we mark what we
	 * are planning on doing to the ephemeral tree.
	 */
	if (*pmust_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;

		/*
		 * Check to see if we are nuking the root.
		 */
		if (is_derooting)
			net->net_status |=
			    NFS4_EPHEMERAL_TREE_DEROOTING;
		mutex_exit(&net->net_cnt_lock);
	}

	if (!is_derooting) {
		/*
		 * Only work on children if the caller has not already
		 * done so.
		 */
		if (!is_recursed) {
			ASSERT(eph != NULL);

			error = nfs4_ephemeral_unmount_engine(eph,
			    FALSE, flag, cr);
			if (error)
				goto is_busy;
		}
	} else {
		eph = net->net_root;

		/*
		 * Only work if there is something there.
		 */
		if (eph) {
			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
			    flag, cr);
			if (error) {
				mutex_enter(&net->net_cnt_lock);
				net->net_status &=
				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
				mutex_exit(&net->net_cnt_lock);
				goto is_busy;
			}

			/*
			 * Nothing else which goes wrong will
			 * invalidate the blowing away of the
			 * ephemeral tree.
			 */
			net->net_root = NULL;
		}

		/*
		 * We have derooted and we have caused the tree to be
		 * invalidated.
		 */
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
		nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		if (was_locked == FALSE)
			mutex_exit(&net->net_tree_lock);

		/*
		 * We have just blown away any notation of this
		 * tree being locked. We can't let the caller
		 * try to clean things up.
		 */
		*pmust_unlock = FALSE;

		/*
		 * At this point, the tree should no
		 * longer be associated with the
		 * mntinfo4. We need to pull it off
		 * there and let the harvester take
		 * care of it once the refcnt drops.
		 */
		mutex_enter(&mi->mi_lock);
		mi->mi_ephemeral_tree = NULL;
		mutex_exit(&mi->mi_lock);
	}

	return (0);

is_busy:

	nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);

	return (error);
}

/*
 * Do the umount and record any error in the parent.
 */
static void
nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
    nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
{
	int error;

	error = umount2_engine(vfsp, flag, kcred, FALSE);
	if (error) {
		if (prior) {
			if (prior->ne_child == e)
				prior->ne_state |=
				    NFS4_EPHEMERAL_CHILD_ERROR;
			else
				prior->ne_state |=
				    NFS4_EPHEMERAL_PEER_ERROR;
		}
	}
}
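
/*
 * Illustrative calling pattern (a sketch of how the unmount path is
 * presumed to use these helpers, based on the comments above;
 * presumably from nfs4_unmount()):
 *
 *	bool_t must_unlock;
 *	nfs4_ephemeral_tree_t *net;
 *
 *	error = nfs4_ephemeral_umount(mi, flag, cr, &must_unlock, &net);
 *	if (error == 0) {
 *		... the real unmount work ...
 *		nfs4_ephemeral_umount_activate(mi, &must_unlock, &net);
 *	}
 *
 * On failure, nfs4_ephemeral_umount() has already released the tree
 * locks via nfs4_ephemeral_umount_unlock().
 */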
/*
 * For each tree in the forest (where the forest is in
 * effect all of the ephemeral trees for this zone),
 * scan to see if a node can be unmounted. Note that
 * unlike nfs4_ephemeral_unmount_engine(), we do
 * not process the current node before children or
 * siblings. I.e., if a node can be unmounted, we
 * do not recursively check to see if the nodes
 * hanging off of it can also be unmounted.
 *
 * Instead, we delve down deep to try and remove the
 * children first. Then, because we share code with
 * nfs4_ephemeral_unmount_engine(), we will try
 * them again. This could be a performance issue in
 * the future.
 *
 * Also note that unlike nfs4_ephemeral_unmount_engine(),
 * we do not halt on an error. We will not remove the
 * current node, but we will keep on trying to remove
 * the others.
 *
 * force indicates that we want the unmount to occur
 * even if there is something blocking it.
 *
 * time_check indicates that we want to see if the
 * mount has expired past mount_to or not. Typically
 * we want to do this; only on a shutdown of the
 * zone would we want to ignore the check.
 */
static void
nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
    bool_t force, bool_t time_check)
{
        nfs4_ephemeral_tree_t *net;
        nfs4_ephemeral_tree_t *prev = NULL;
        nfs4_ephemeral_tree_t *next;
        nfs4_ephemeral_t *e;
        nfs4_ephemeral_t *prior;
        time_t now = gethrestime_sec();

        nfs4_ephemeral_tree_t *harvest = NULL;

        int flag;

        mntinfo4_t *mi;
        vfs_t *vfsp;

        if (force)
                flag = MS_FORCE | MS_SYSSPACE;
        else
                flag = MS_SYSSPACE;

        mutex_enter(&ntg->ntg_forest_lock);
        for (net = ntg->ntg_forest; net != NULL; net = next) {
                next = net->net_next;

                nfs4_ephemeral_tree_hold(net);

                mutex_enter(&net->net_tree_lock);

                /*
                 * Let the unmount code know that the
                 * tree is already locked!
                 */
                mutex_enter(&net->net_cnt_lock);
                net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
                mutex_exit(&net->net_cnt_lock);

                /*
                 * If the intent is to force all ephemeral nodes to
                 * be unmounted in this zone, we can short circuit a
                 * lot of tree traversal and simply zap the root node.
                 */
                if (force) {
                        if (net->net_root) {
                                mi = net->net_root->ne_mount;
                                vfsp = mi->mi_vfsp;

                                /*
                                 * Cleared by umount2_engine.
                                 */
                                VFS_HOLD(vfsp);

                                (void) umount2_engine(vfsp, flag,
                                    kcred, FALSE);

                                goto check_done;
                        }
                }

                e = net->net_root;
                if (e)
                        e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;

                while (e) {
                        if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
                                e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
                                if (e->ne_child) {
                                        e = e->ne_child;
                                        e->ne_state =
                                            NFS4_EPHEMERAL_VISIT_CHILD;
                                }

                                continue;
                        } else if (e->ne_state ==
                            NFS4_EPHEMERAL_VISIT_SIBLING) {
                                e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
                                if (e->ne_peer) {
                                        e = e->ne_peer;
                                        e->ne_state =
                                            NFS4_EPHEMERAL_VISIT_CHILD;
                                }

                                continue;
                        } else if (e->ne_state ==
                            NFS4_EPHEMERAL_CHILD_ERROR) {
                                prior = e->ne_prior;

                                /*
                                 * If a child reported an error, do
                                 * not bother trying to unmount.
                                 *
                                 * If your prior node is a parent,
                                 * pass the error up such that they
                                 * also do not try to unmount.
                                 *
                                 * However, if your prior is a sibling,
                                 * let them try to unmount if they can.
                                 */
                                if (prior) {
                                        if (prior->ne_child == e)
                                                prior->ne_state |=
                                                    NFS4_EPHEMERAL_CHILD_ERROR;
                                        else
                                                prior->ne_state |=
                                                    NFS4_EPHEMERAL_PEER_ERROR;
                                }

                                /*
                                 * Clear the error and, if needed,
                                 * process peers.
                                 *
                                 * Once we mask out the error, we know
                                 * whether or not we have to process
                                 * another node.
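                                 *
                                 * (Editorial note:) If PROCESS_ME is
                                 * all that remains, the children are
                                 * done but one of them failed, so we
                                 * skip unmounting this node and back
                                 * up to the prior node. Otherwise a
                                 * sibling visit is still pending and
                                 * the next pass through the loop will
                                 * take the VISIT_SIBLING arm.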
                                 */
                                e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
                                if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
                                        e = prior;

                                continue;
                        } else if (e->ne_state ==
                            NFS4_EPHEMERAL_PEER_ERROR) {
                                prior = e->ne_prior;

                                if (prior) {
                                        if (prior->ne_child == e)
                                                prior->ne_state =
                                                    NFS4_EPHEMERAL_CHILD_ERROR;
                                        else
                                                prior->ne_state =
                                                    NFS4_EPHEMERAL_PEER_ERROR;
                                }

                                /*
                                 * Clear the error from this node and
                                 * do the correct processing.
                                 */
                                e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
                                continue;
                        }

                        prior = e->ne_prior;
                        e->ne_state = NFS4_EPHEMERAL_OK;

                        /*
                         * It must be the case that we need to process
                         * this node.
                         */
                        if (!time_check ||
                            now - e->ne_ref_time > e->ne_mount_to) {
                                mi = e->ne_mount;
                                vfsp = mi->mi_vfsp;

                                /*
                                 * Cleared by umount2_engine.
                                 */
                                VFS_HOLD(vfsp);

                                /*
                                 * Note that we effectively work down
                                 * to the leaf nodes first, try to
                                 * unmount them, then work our way back
                                 * up into the interior nodes.
                                 *
                                 * Also note that we deal with a lot of
                                 * complexity by sharing the work with
                                 * the manual unmount code.
                                 */
                                nfs4_ephemeral_record_umount(vfsp, flag,
                                    e, prior);
                        }

                        e = prior;
                }

check_done:

                /*
                 * At this point we are done processing this tree.
                 *
                 * If the tree is invalid and we are the only reference
                 * to it, then we push it on the local linked list
                 * to remove it at the end. We avoid that action now
                 * to keep the tree processing going along at a fair
                 * clip.
                 *
                 * Else, even if we are the only reference, we drop
                 * our hold on the current tree and allow it to be
                 * reused as needed.
                 */
                mutex_enter(&net->net_cnt_lock);
                if (net->net_refcnt == 1 &&
                    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
                        nfs4_ephemeral_tree_decr(net);
                        net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
                        mutex_exit(&net->net_cnt_lock);
                        mutex_exit(&net->net_tree_lock);

                        if (prev)
                                prev->net_next = net->net_next;
                        else
                                ntg->ntg_forest = net->net_next;

                        net->net_next = harvest;
                        harvest = net;
                        continue;
                }

                nfs4_ephemeral_tree_decr(net);
                net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
                mutex_exit(&net->net_cnt_lock);
                mutex_exit(&net->net_tree_lock);

                prev = net;
        }
        mutex_exit(&ntg->ntg_forest_lock);

        for (net = harvest; net != NULL; net = next) {
                next = net->net_next;

                mutex_destroy(&net->net_tree_lock);
                mutex_destroy(&net->net_cnt_lock);
                kmem_free(net, sizeof (*net));
        }
}
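
/*
 * For reference, an editorial summary of the callers below:
 *
 *      caller                          force   time_check
 *      ----------------------------    -----   ----------
 *      nfs4_ephemeral_harvester()      FALSE   TRUE
 *      nfs4_ephemeral_zsd_shutdown()   FALSE   FALSE
 *      nfs4_ephemeral_zsd_destroy()    TRUE    FALSE
 */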
/*
 * This is the thread which decides when the harvesting
 * can proceed and when to kill it off for this zone.
 */
static void
nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
{
        clock_t timeleft;
        zone_t *zone = curproc->p_zone;

        for (;;) {
                timeleft = zone_status_timedwait(zone, lbolt +
                    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);

                /*
                 * zone is exiting...
                 */
                if (timeleft != -1) {
                        ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
                        zthread_exit();
                        /* NOTREACHED */
                }

                /*
                 * Only bother scanning if there is potential
                 * work to be done.
                 */
                if (ntg->ntg_forest == NULL)
                        continue;

                /*
                 * Now scan the list and get rid of everything which
                 * is old.
                 */
                nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
        }

        /* NOTREACHED */
}

/*
 * The zone-specific glue needed to start the unmount harvester.
 *
 * Note that we want to avoid taking the mutex wherever possible,
 * hence the multiple checks.
 *
 * The caller should avoid us getting down here in the first
 * place.
 */
static void
nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
{
        /*
         * It got started before we got here...
         */
        if (ntg->ntg_thread_started)
                return;

        mutex_enter(&nfs4_ephemeral_thread_lock);

        if (ntg->ntg_thread_started) {
                mutex_exit(&nfs4_ephemeral_thread_lock);
                return;
        }

        /*
         * Start the unmount harvester thread for this zone.
         */
        (void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
            ntg, 0, minclsyspri);

        ntg->ntg_thread_started = TRUE;
        mutex_exit(&nfs4_ephemeral_thread_lock);
}

/*ARGSUSED*/
static void *
nfs4_ephemeral_zsd_create(zoneid_t zoneid)
{
        nfs4_trigger_globals_t *ntg;

        ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
        ntg->ntg_thread_started = FALSE;

        /*
         * This is the default mount timeout. Note that it is
         * nfs4_trigger_mount_to, not the harvester's poll
         * interval (nfs4_trigger_thread_timer).
         */
        ntg->ntg_mount_to = nfs4_trigger_mount_to;

        mutex_init(&ntg->ntg_forest_lock, NULL,
            MUTEX_DEFAULT, NULL);

        return (ntg);
}

/*
 * Try a nice gentle walk down the forest and convince
 * all of the trees to gracefully give it up.
 */
/*ARGSUSED*/
static void
nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
{
        nfs4_trigger_globals_t *ntg = arg;

        if (!ntg)
                return;

        nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
}

/*
 * Race along the forest and rip all of the trees out by
 * their rootballs!
 */
/*ARGSUSED*/
static void
nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
{
        nfs4_trigger_globals_t *ntg = arg;

        if (!ntg)
                return;

        nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);

        mutex_destroy(&ntg->ntg_forest_lock);
        kmem_free(ntg, sizeof (*ntg));
}

/*
 * This is the zone-independent cleanup needed for
 * ephemeral mount processing.
 */
void
nfs4_ephemeral_fini(void)
{
        (void) zone_key_delete(nfs4_ephemeral_key);
        mutex_destroy(&nfs4_ephemeral_thread_lock);
}

/*
 * This is the zone-independent initialization needed for
 * ephemeral mount processing.
 */
void
nfs4_ephemeral_init(void)
{
        mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
            NULL);

        zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
            nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
}
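
/*
 * Illustrative note (an editorial assumption, not from the original
 * source): the expectation is that the client module calls
 * nfs4_ephemeral_init() exactly once at load time, before any
 * ephemeral mount can be triggered, and nfs4_ephemeral_fini() at
 * unload, after the zone shutdown/destroy callbacks above have torn
 * down each zone's forest.
 */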
/*
 * nfssys() calls this function to set the per-zone
 * value of mount_to, which drives when an ephemeral
 * mount times out. Each mount grabs a copy of this
 * value when mounted.
 */
void
nfs4_ephemeral_set_mount_to(uint_t mount_to)
{
        nfs4_trigger_globals_t *ntg;
        zone_t *zone = curproc->p_zone;

        ntg = zone_getspecific(nfs4_ephemeral_key, zone);

        ntg->ntg_mount_to = mount_to;
}

/*
 * Walk the list of v4 mount options; if they are currently set in
 * vfsp, append them to a new comma-separated mount option string,
 * and return it.
 *
 * Caller should free by calling nfs4_trigger_destroy_mntopts().
 */
static char *
nfs4_trigger_create_mntopts(vfs_t *vfsp)
{
        uint_t i;
        char *mntopts;
        struct vfssw *vswp;
        mntopts_t *optproto;

        mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);

        /* get the list of applicable mount options for v4; locks *vswp */
        vswp = vfs_getvfssw(MNTTYPE_NFS4);
        optproto = &vswp->vsw_optproto;

        for (i = 0; i < optproto->mo_count; i++) {
                struct mntopt *mop = &optproto->mo_list[i];

                if (mop->mo_flags & MO_EMPTY)
                        continue;

                if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
                        kmem_free(mntopts, MAX_MNTOPT_STR);
                        vfs_unrefvfssw(vswp);
                        return (NULL);
                }
        }

        vfs_unrefvfssw(vswp);

        /*
         * MNTOPT_XATTR is not in the v4 mount opt proto list,
         * and it may only be passed via MS_OPTIONSTR, so we
         * must handle it here.
         *
         * Ideally, it would be in the list, but NFS does not specify
         * its own opt proto list; it instead uses the default one.
         * Since not all filesystems support extended attrs, it would
         * not be appropriate to add it there.
         */
        if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
            nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
                kmem_free(mntopts, MAX_MNTOPT_STR);
                return (NULL);
        }

        return (mntopts);
}

static void
nfs4_trigger_destroy_mntopts(char *mntopts)
{
        if (mntopts)
                kmem_free(mntopts, MAX_MNTOPT_STR);
}
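
/*
 * For illustration (hypothetical values, an editorial addition): for
 * a vfs_t that was mounted read-only with extended attributes
 * enabled, nfs4_trigger_create_mntopts() would return a string such
 * as "ro,xattr", suitable for passing to the mount code via
 * MS_OPTIONSTR.
 */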
/*
 * Check a single mount option (optname). Add to mntopts if it is set
 * in VFS.
 */
static int
nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
{
        if (mntopts == NULL || optname == NULL || vfsp == NULL)
                return (EINVAL);

        if (vfs_optionisset(vfsp, optname, NULL)) {
                size_t mntoptslen = strlen(mntopts);
                size_t optnamelen = strlen(optname);

                /* +1 for ',', +1 for NUL */
                if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
                        return (EOVERFLOW);

                /* first or subsequent mount option? */
                if (*mntopts != '\0')
                        (void) strcat(mntopts, ",");

                (void) strcat(mntopts, optname);
        }

        return (0);
}

static enum clnt_stat
nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
{
        int retries, error;
        uint_t max_msgsize;
        enum clnt_stat status;
        CLIENT *cl;
        struct timeval timeout;

        /* as per recov_newserver() */
        max_msgsize = 0;
        retries = 1;
        timeout.tv_sec = 2;
        timeout.tv_usec = 0;

        error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM,
            NFS_V4, max_msgsize, retries, CRED(), &cl);
        if (error)
                return (RPC_FAILED);

        if (nointr)
                cl->cl_nosignal = TRUE;
        status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
            timeout);
        if (nointr)
                cl->cl_nosignal = FALSE;

        AUTH_DESTROY(cl->cl_auth);
        CLNT_DESTROY(cl);

        return (status);
}
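
/*
 * A hypothetical caller (for illustration only; an editorial
 * addition, not part of the original source) would treat anything
 * other than RPC_SUCCESS as "server unreachable" before committing
 * to the ephemeral mount:
 *
 *      if (nfs4_trigger_ping_server(svp, nointr) != RPC_SUCCESS)
 *              return (EIO);
 */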