/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
 * triggered from a "stub" rnode via a special set of vnodeops.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/list.h>
#include <sys/stat.h>
#include <sys/mntent.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>
#include <nfs/nfs4.h>
#include <nfs/nfs4_kprot.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>
#include <sys/int_fmtio.h>

#include <sys/sunddi.h>

/*
 * The automatic unmounter thread stuff!
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Just a default....
 */
static uint_t nfs4_trigger_mount_to = 240;

typedef struct nfs4_trigger_globals {
	kmutex_t		ntg_forest_lock;
	uint_t			ntg_mount_to;
	int			ntg_thread_started;
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;

kmutex_t	nfs4_ephemeral_thread_lock;

zone_key_t	nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

static void	nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);

/*
 * Used for ephemeral mounts; contains data either duplicated from
 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
 *
 * It's intended that this structure is used solely for ephemeral
 * mount-type specific data, for passing this data to
 * nfs4_trigger_nargs_create().
 */
typedef struct ephemeral_servinfo {
	char			*esi_hostname;
	char			*esi_netname;
	char			*esi_path;
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;

/*
 * Collect together the mount-type specific and generic data args.
 */
typedef struct domount_args {
	ephemeral_servinfo_t	*dma_esi;
	char			*dma_hostlist;	/* comma-sep. for RO failover */
	struct nfs_args		*dma_nargs;
} domount_args_t;


/*
 * The vnode ops functions for a trigger stub vnode
 */
static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
    caller_context_t *);
static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
    struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
    int *, pathname_t *);
static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
    enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
    vsecattr_t *);
static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
    int);
static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
    vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);

/*
 * Regular NFSv4 vnodeops that we need to reference directly
 */
extern int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
extern void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
extern int nfs4_rwlock(vnode_t *, int, caller_context_t *);
extern void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
extern int nfs4_lookup(vnode_t *, char *, vnode_t **,
    struct pathname *, int, vnode_t *, cred_t *,
    caller_context_t *, int *, pathname_t *);
extern int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
    caller_context_t *);
extern int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
    caller_context_t *);
extern int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
extern int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);

static int nfs4_trigger_mount(vnode_t *, vnode_t **);
static int nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
    cred_t *);
static domount_args_t *nfs4_trigger_domount_args_create(vnode_t *);
static void nfs4_trigger_domount_args_destroy(domount_args_t *dma,
    vnode_t *vp);
static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *);
static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
    servinfo4_t *);
static struct nfs_args *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
    ephemeral_servinfo_t *);
static void nfs4_trigger_nargs_destroy(struct nfs_args *);
static char *nfs4_trigger_create_mntopts(vfs_t *);
static void nfs4_trigger_destroy_mntopts(char *);
static int nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);

extern int umount2_engine(vfs_t *, int, cred_t *, int);


vnodeops_t *nfs4_trigger_vnodeops;

/*
 * These are the vnodeops that we must define for stub vnodes.
 *
 *
 * Many of the VOPs defined for NFSv4 do not need to be defined here,
 * for various reasons. This will result in the VFS default function being
 * used:
 *
 * - These VOPs require a previous VOP_OPEN to have occurred. That will have
 *   lost the reference to the stub vnode, meaning these should not be called:
 *       close, read, write, ioctl, readdir, seek.
 *
 * - These VOPs are meaningless for vnodes without data pages. Since the
 *   stub vnode is of type VDIR, these should not be called:
 *       space, getpage, putpage, map, addmap, delmap, pageio, fsync.
 *
 * - These VOPs are otherwise not applicable, and should not be called:
 *       dump, setsecattr.
 *
 *
 * These VOPs we do not want to define, but nor do we want the VFS default
 * action. Instead, we specify the VFS error function, with fs_error(), but
 * note that fs_error() is not actually called. Instead it results in the
 * use of the error function defined for the particular VOP, in vn_ops_table[]:
 *
 *   - frlock, dispose, shrlock.
 *
 *
 * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
 * NOTE: if any of these ops involve an OTW call with the stub FH, then
 * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
 * to protect the security data in the servinfo4_t for the "parent"
 * filesystem that contains the stub.
 *
 * - These VOPs should not trigger a mount, so that "ls -l" does not:
 *       pathconf, getsecattr.
 *
 * - These VOPs would not make sense to trigger:
 *       inactive, rwlock, rwunlock, fid, realvp.
 */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL, NULL
};

static void
nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	net->net_refcnt++;
	ASSERT(net->net_refcnt != 0);
}

static void
nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	nfs4_ephemeral_tree_incr(net);
	mutex_exit(&net->net_cnt_lock);
}

/*
 * We need a safe way to decrement the refcnt whilst the
 * lock is being held.
 */
static void
nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	ASSERT(net->net_refcnt != 0);
	net->net_refcnt--;
}

static void
nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	nfs4_ephemeral_tree_decr(net);
	mutex_exit(&net->net_cnt_lock);
}

/*
 * Trigger ops for stub vnodes; for mirror mounts, etc.
 *
 * The general idea is that a "triggering" op will first call
 * nfs4_trigger_mount(), which will find out whether a mount has already
 * been triggered.
 *
 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
 * of the covering vfs.
 *
 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
 * and again set newvp, as above.
 *
 * The triggering op may then re-issue the VOP by calling it on newvp.
 *
 * Note that some ops may perform custom action, and may or may not need
 * to trigger a mount.
 *
 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
 * obviously can't do this with VOP_<whatever>, since it's a stub vnode
 * and that would just recurse. Instead, we call the v4 op directly,
 * by name. This is OK, since we know that the vnode is for NFSv4,
 * otherwise it couldn't be a stub.
 */
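
/*
 * For reference, the triggering ops below all share the same basic
 * shape:
 *
 *	error = nfs4_trigger_mount(vp, &newvp);
 *	if (error)
 *		return (error);
 *	error = VOP_<whatever>(newvp, ...);
 *	VN_RELE(newvp);
 *	return (error);
 *
 * nfs4_trigger_open() is the exception: it hands the held root vnode
 * back to its caller via *vpp, rather than releasing it.
 */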

static int
nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(*vpp, &newvp);
	if (error)
		return (error);

	/* Release the stub vnode, as we're losing the reference to it */
	VN_RELE(*vpp);

	/* Give the caller the root vnode of the newly-mounted fs */
	*vpp = newvp;

	/* return with VN_HELD(newvp) */
	return (VOP_OPEN(vpp, flag, cr, ct));
}

/*
 * For the majority of cases, nfs4_trigger_getattr() will not trigger
 * a mount. However, if ATTR_TRIGGER is set, we are being informed
 * that we need to force the mount before we attempt to determine
 * the attributes. The intent is an atomic operation for security
 * testing.
 */
static int
nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (flags & ATTR_TRIGGER) {
		vnode_t *newvp;

		error = nfs4_trigger_mount(vp, &newvp);
		if (error)
			return (error);

		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
		VN_RELE(newvp);
	} else {
		error = nfs4_getattr(vp, vap, flags, cr, ct);
	}

	return (error);
}

static int
nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, &newvp);
	if (error)
		return (error);

	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
	VN_RELE(newvp);

	return (error);
}

static int
nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, &newvp);
	if (error)
		return (error);

	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
	VN_RELE(newvp);

	return (error);
}

static int
nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
    struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
    caller_context_t *ct, int *deflags, pathname_t *rpnp)
{
	int error;
	vnode_t *newdvp;
	rnode4_t *drp = VTOR4(dvp);

	ASSERT(RP_ISSTUB(drp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(drp));

	/*
	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
	 * that up. Instead, pass onto the regular op, regardless of whether
	 * we've triggered a mount.
	 */
	if (strcmp(nm, "..") == 0)
		return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
		    ct, deflags, rpnp));

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
	    deflags, rpnp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
    enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
    int flags, caller_context_t *ct, vsecattr_t *vsecp)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
	    flags, ct, vsecp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
    int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newtdvp;

	error = nfs4_trigger_mount(tdvp, &newtdvp);
	if (error)
		return (error);

	/*
	 * We don't check whether svp is a stub. Let the NFSv4 code
	 * detect that error, and return accordingly.
	 */
	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
	VN_RELE(newtdvp);

	return (error);
}

static int
nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newsdvp;
	rnode4_t *tdrp = VTOR4(tdvp);

	/*
	 * We know that sdvp is a stub, otherwise we would not be here.
	 *
	 * If tdvp is also a stub, there are two possibilities: it
	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
	 *
	 * In the former case, just trigger sdvp, and treat tdvp as
	 * though it were not a stub.
	 *
	 * In the latter case, it might be a different stub for the
	 * same server fs as sdvp, or for a different server fs.
	 * Regardless, from the client perspective this would still
	 * be a cross-filesystem rename, and should not be allowed,
	 * so return EXDEV, without triggering either mount.
	 */
	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
		return (EXDEV);

	error = nfs4_trigger_mount(sdvp, &newsdvp);
	if (error)
		return (error);

	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);

	VN_RELE(newsdvp);

	return (error);
}

/* ARGSUSED */
static int
nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
    cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, &newdvp);
	if (error)
		return (error);

	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, &newvp);
	if (error)
		return (error);

	error = VOP_READLINK(newvp, uiop, cr, ct);
	VN_RELE(newvp);

	return (error);
}

/* end of trigger vnode ops */


/*
 * Mount upon a trigger vnode; for mirror-mounts, etc.
 *
 * The mount may have already occurred, via another thread. If not,
 * assemble the location information - which may require fetching - and
 * perform the mount.
 *
 * Sets newvp to be the root of the fs that is now covering vp. Note
 * that we return with VN_HELD(*newvp).
 *
 * The caller is responsible for passing the VOP onto the covering fs.
 */
static int
nfs4_trigger_mount(vnode_t *vp, vnode_t **newvpp)
{
	int error;
	vfs_t *vfsp;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	domount_args_t *dma;

	nfs4_ephemeral_tree_t *net;

	bool_t must_unlock = FALSE;
	bool_t is_building = FALSE;

	cred_t *zcred;

	nfs4_trigger_globals_t *ntg;

	zone_t *zone = curproc->p_zone;

	ASSERT(RP_ISSTUB(rp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));

	*newvpp = NULL;

	/*
	 * Has the mount already occurred?
	 */
	error = vn_vfsrlock_wait(vp);
	if (error)
		goto done;
	vfsp = vn_mountedvfs(vp);
	if (vfsp != NULL) {
		/* the mount has already occurred */
		error = VFS_ROOT(vfsp, newvpp);
		if (!error) {
			/* need to update the reference time */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_ephemeral)
				mi->mi_ephemeral->ne_ref_time =
				    gethrestime_sec();
			mutex_exit(&mi->mi_lock);
		}

		vn_vfsunlock(vp);
		goto done;
	}
	vn_vfsunlock(vp);

	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);
	} else {
		net = mi->mi_ephemeral_tree;
		nfs4_ephemeral_tree_hold(net);

		mutex_exit(&mi->mi_lock);

		mutex_enter(&net->net_tree_lock);

		/*
		 * We can only proceed if the tree is neither locked
		 * nor being torn down.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
			nfs4_ephemeral_tree_decr(net);
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			return (EIO);
		}
		mutex_exit(&net->net_cnt_lock);
	}

	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	must_unlock = TRUE;

	dma = nfs4_trigger_domount_args_create(vp);
	if (dma == NULL) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Need to be root for this call to make mount work.
	 * Note that since we define mirror mounts to work
	 * for any user, we allow the mount to proceed. And
	 * we realize that the server will perform security
	 * checks to make sure that the client is allowed
	 * access. Finally, once the mount takes place,
	 * directory permissions will ensure that the
	 * content is secure.
	 */
	zcred = zone_get_kcred(getzoneid());
	ASSERT(zcred != NULL);

	error = nfs4_trigger_domount(vp, dma, &vfsp, zcred);
	nfs4_trigger_domount_args_destroy(dma, vp);

	crfree(zcred);

	if (!error)
		error = VFS_ROOT(vfsp, newvpp);
done:
	if (must_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}
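
/*
 * A note on the lock ordering observed by nfs4_trigger_mount() above:
 * mi_lock is taken first, then net_tree_lock, with net_cnt_lock
 * innermost, protecting only net_refcnt and net_status. On the
 * tree-creation path mi_lock is still held when net_tree_lock is
 * acquired, which is safe only because no other thread can yet see
 * the new tree; on the existing-tree path mi_lock is dropped first.
 */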

/*
 * Collect together both the generic & mount-type specific args.
 */
static domount_args_t *
nfs4_trigger_domount_args_create(vnode_t *vp)
{
	int nointr;
	char *hostlist;
	servinfo4_t *svp;
	struct nfs_args *nargs, *nargs_head;
	enum clnt_stat status;
	ephemeral_servinfo_t *esi, *esi_first;
	domount_args_t *dma;
	mntinfo4_t *mi = VTOMI4(vp);

	nointr = !(mi->mi_flags & MI4_INT);
	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	svp = mi->mi_curr_serv;
	/* check if the current server is responding */
	status = nfs4_trigger_ping_server(svp, nointr);
	if (status == RPC_SUCCESS) {
		esi_first = nfs4_trigger_esi_create(vp, svp);
		if (esi_first == NULL) {
			kmem_free(hostlist, MAXPATHLEN);
			return (NULL);
		}

		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);

		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
	} else {
		/* current server did not respond */
		esi_first = NULL;
		nargs_head = NULL;
	}
	nargs = nargs_head;

	/*
	 * NFS RO failover.
	 *
	 * If we have multiple servinfo4 structures, linked via sv_next,
	 * we must create one nfs_args for each, linking the nfs_args via
	 * nfs_ext_u.nfs_extB.next.
	 *
	 * We need to build a corresponding esi for each, too, but that is
	 * used solely for building nfs_args, and may be immediately
	 * discarded, as domount() requires the info from just one esi,
	 * but all the nfs_args.
	 *
	 * Currently, the NFS mount code will hang if not all servers
	 * requested are available. To avoid that, we need to ping each
	 * server, here, and remove it from the list if it is not
	 * responding. This has the side-effect that such a server is
	 * then permanently unavailable for this failover mount, even if
	 * it recovers. That's unfortunate, but the best we can do until
	 * the mount code path is fixed.
	 */

	/*
	 * If the current server was down, loop indefinitely until we find
	 * at least one responsive server.
	 */
	do {
		/* no locking needed for sv_next; it is only set at fs mount */
		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
			struct nfs_args *next;

			/*
			 * nargs_head: the head of the nfs_args list
			 * nargs: the current tail of the list
			 * next: the newly-created element to be added
			 */

			/*
			 * We've already tried the current server, above;
			 * if it was responding, we have already included it
			 * and it may now be ignored.
			 *
			 * Otherwise, try it again, since it may now have
			 * recovered.
			 */
			if (svp == mi->mi_curr_serv && esi_first != NULL)
				continue;

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			/* check if the server is responding */
			status = nfs4_trigger_ping_server(svp, nointr);
			/* if the server did not respond, ignore it */
			if (status != RPC_SUCCESS)
				continue;

			esi = nfs4_trigger_esi_create(vp, svp);
			if (esi == NULL)
				continue;

			/*
			 * If the original current server (mi_curr_serv)
			 * was down when we first tried it,
			 * (i.e. esi_first == NULL),
			 * we select this new server (svp) to be the server
			 * that we will actually contact (esi_first).
			 *
			 * Note that it's possible that mi_curr_serv == svp,
			 * if that mi_curr_serv was down but has now recovered.
			 */
			next = nfs4_trigger_nargs_create(mi, svp, esi);
			if (esi_first == NULL) {
				ASSERT(nargs == NULL);
				ASSERT(nargs_head == NULL);
				nargs_head = next;
				esi_first = esi;
				(void) strlcpy(hostlist,
				    esi_first->esi_hostname, MAXPATHLEN);
			} else {
				ASSERT(nargs_head != NULL);
				nargs->nfs_ext_u.nfs_extB.next = next;
				(void) strlcat(hostlist, ",", MAXPATHLEN);
				(void) strlcat(hostlist, esi->esi_hostname,
				    MAXPATHLEN);
				/* esi was only needed for hostname & nargs */
				nfs4_trigger_esi_destroy(esi, vp);
			}

			nargs = next;
		}

		/* if we've had no response at all, wait a second */
		if (esi_first == NULL)
			delay(drv_usectohz(1000000));

	} while (esi_first == NULL);
	ASSERT(nargs_head != NULL);

	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
	dma->dma_esi = esi_first;
	dma->dma_hostlist = hostlist;
	dma->dma_nargs = nargs_head;

	return (dma);
}
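
/*
 * To illustrate the result (with hypothetical server names): for a
 * failover mount with responsive servers "serverA" and "serverB", the
 * domount_args_t built above would hold
 *
 *	dma_esi:      the esi for serverA (the server we will contact)
 *	dma_hostlist: "serverA,serverB"
 *	dma_nargs:    the nfs_args for serverA, chained via
 *	              nfs_ext_u.nfs_extB.next to the nfs_args for serverB
 */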

static void
nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
{
	if (dma != NULL) {
		if (dma->dma_esi != NULL && vp != NULL)
			nfs4_trigger_esi_destroy(dma->dma_esi, vp);

		if (dma->dma_hostlist != NULL)
			kmem_free(dma->dma_hostlist, MAXPATHLEN);

		if (dma->dma_nargs != NULL) {
			struct nfs_args *nargs = dma->dma_nargs;

			do {
				struct nfs_args *next =
				    nargs->nfs_ext_u.nfs_extB.next;

				nfs4_trigger_nargs_destroy(nargs);
				nargs = next;
			} while (nargs != NULL);
		}

		kmem_free(dma, sizeof (domount_args_t));
	}
}

/*
 * The ephemeral_servinfo_t struct contains basic information we will need to
 * perform the mount. Whilst the structure is generic across different
 * types of ephemeral mount, the way we gather its contents differs.
 */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp)
{
	ephemeral_servinfo_t *esi;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(RP_ISSTUB(rp));

	/* Call the ephemeral type-specific routine */
	if (RP_ISSTUB_MIRRORMOUNT(rp))
		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
	else
		esi = NULL;

	/* for now, we only support mirror-mounts */
	ASSERT(esi != NULL);

	return (esi);
}

static void
nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	ASSERT(RP_ISSTUB(rp));

	/* for now, we only support mirror-mounts */
	ASSERT(RP_ISSTUB_MIRRORMOUNT(rp));

	/* Currently, no need for an ephemeral type-specific routine */

	/*
	 * The contents of the ephemeral_servinfo_t go into nfs_args,
	 * and will be handled by nfs4_trigger_nargs_destroy().
	 * We need only free the structure itself.
	 */
	if (esi != NULL)
		kmem_free(esi, sizeof (ephemeral_servinfo_t));
}

/*
 * Some of this may turn out to be common with other ephemeral types,
 * in which case it should be moved to nfs4_trigger_esi_create(), or a
 * common function called.
 */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
{
	char *stubpath;
	struct knetconfig *sikncp, *svkncp;
	struct netbuf *bufp;
	ephemeral_servinfo_t *esi;

	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);

	/* initially set to be our type of ephemeral mount; may be added to */
	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;

	/*
	 * We're copying info from the stub rnode's servinfo4, but
	 * we must create new copies, not pointers, since this information
	 * is to be associated with the new mount, which will be
	 * unmounted (and its structures freed) separately
	 */

	/*
	 * Sizes passed to kmem_[z]alloc here must match those freed
	 * in nfs4_free_args()
	 */

	/*
	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but this
	 * is difficult to avoid, as we need to read svp to calculate the
	 * sizes to be allocated.
	 */
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
	(void) strcat(esi->esi_hostname, svp->sv_hostname);

	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
	bufp = esi->esi_addr;
	bufp->len = svp->sv_addr.len;
	bufp->maxlen = svp->sv_addr.maxlen;
	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);

	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
	sikncp = esi->esi_knconf;
	svkncp = svp->sv_knconf;
	sikncp->knc_semantics = svkncp->knc_semantics;
	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_protofmly,
	    (char *)svkncp->knc_protofmly);
	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
	sikncp->knc_rdev = svkncp->knc_rdev;

	/*
	 * Used when AUTH_DH is negotiated.
	 *
	 * This is ephemeral mount-type specific, since it contains the
	 * server's time-sync syncaddr.
	 */
	if (svp->sv_dhsec) {
		struct netbuf *bufp;
		sec_data_t *sdata;
		dh_k4_clntdata_t *data;

		sdata = svp->sv_dhsec;
		data = (dh_k4_clntdata_t *)sdata->data;
		ASSERT(sdata->rpcflavor == AUTH_DH);

		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
		bufp->len = data->syncaddr.len;
		bufp->maxlen = data->syncaddr.maxlen;
		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
		esi->esi_syncaddr = bufp;

		if (data->netname != NULL) {
			int nmlen = data->netnamelen;

			/*
			 * We need to copy from a dh_k4_clntdata_t
			 * netname/netnamelen pair to a NUL-terminated
			 * netname string suitable for putting in nfs_args,
			 * where the latter has no netnamelen field.
			 */
			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
			bcopy(data->netname, esi->esi_netname, nmlen);
		}
	} else {
		esi->esi_syncaddr = NULL;
		esi->esi_netname = NULL;
	}

	stubpath = fn_path(VTOSV(vp)->sv_name);
	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
	ASSERT(*stubpath == '.');
	stubpath += 1;

	/* for nfs_args->fh */
	esi->esi_path_len = strlen(svp->sv_path) + strlen(stubpath) + 1;
	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
	(void) strcat(esi->esi_path, svp->sv_path);
	(void) strcat(esi->esi_path, stubpath);

	stubpath -= 1;
	/* stubpath allocated by fn_path() */
	kmem_free(stubpath, strlen(stubpath) + 1);

	nfs_rw_exit(&svp->sv_lock);

	return (esi);
}
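
/*
 * A worked example of the path construction above, with hypothetical
 * names: if the parent mount's sv_path is "/tank" and fn_path() for
 * the stub returns "./ws", stepping over the initial '.' gives "/ws",
 * and the concatenation yields an esi_path of "/tank/ws" rather than
 * the bogus "/tank./ws".
 */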
"" : "/", path); 1212 1213 uap->dir = mntpt; 1214 1215 uap->flags = MS_SYSSPACE | MS_DATA; 1216 /* fstype-independent mount options not covered elsewhere */ 1217 /* copy parent's mount(1M) "-m" flag */ 1218 if (stubvfsp->vfs_flag & VFS_NOMNTTAB) 1219 uap->flags |= MS_NOMNTTAB; 1220 1221 uap->fstype = MNTTYPE_NFS4; 1222 uap->dataptr = (char *)nargs; 1223 /* not needed for MS_SYSSPACE */ 1224 uap->datalen = 0; 1225 1226 /* use optptr to pass in extra mount options */ 1227 uap->flags |= MS_OPTIONSTR; 1228 uap->optptr = nfs4_trigger_create_mntopts(stubvfsp); 1229 if (uap->optptr == NULL) { 1230 retval = EINVAL; 1231 goto done; 1232 } 1233 /* domount() expects us to count the trailing NUL */ 1234 uap->optlen = strlen(uap->optptr) + 1; 1235 1236 retval = domount(NULL, uap, stubvp, cr, vfsp); 1237 if (retval == 0) 1238 VFS_RELE(*vfsp); 1239 done: 1240 if (uap->optptr) 1241 nfs4_trigger_destroy_mntopts(uap->optptr); 1242 1243 kmem_free(uap->spec, spec_len + 1); 1244 kmem_free(uap, sizeof (struct mounta)); 1245 kmem_free(mntpt, mntpt_len + 1); 1246 1247 return (retval); 1248 } 1249 1250 /* 1251 * Build an nfs_args structure for passing to domount(). 1252 * 1253 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t; 1254 * generic data - common to all ephemeral mount types - is read directly 1255 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode. 1256 */ 1257 static struct nfs_args * 1258 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp, 1259 ephemeral_servinfo_t *esi) 1260 { 1261 sec_data_t *secdata; 1262 struct nfs_args *nargs; 1263 1264 /* setup the nfs args */ 1265 nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP); 1266 1267 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1268 1269 nargs->addr = esi->esi_addr; 1270 1271 /* for AUTH_DH by negotiation */ 1272 if (esi->esi_syncaddr || esi->esi_netname) { 1273 nargs->flags |= NFSMNT_SECURE; 1274 nargs->syncaddr = esi->esi_syncaddr; 1275 nargs->netname = esi->esi_netname; 1276 } 1277 1278 nargs->flags |= NFSMNT_KNCONF; 1279 nargs->knconf = esi->esi_knconf; 1280 nargs->flags |= NFSMNT_HOSTNAME; 1281 nargs->hostname = esi->esi_hostname; 1282 nargs->fh = esi->esi_path; 1283 1284 /* general mount settings, all copied from parent mount */ 1285 mutex_enter(&mi->mi_lock); 1286 1287 if (!(mi->mi_flags & MI4_HARD)) 1288 nargs->flags |= NFSMNT_SOFT; 1289 1290 nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO | 1291 NFSMNT_RETRANS; 1292 nargs->wsize = mi->mi_stsize; 1293 nargs->rsize = mi->mi_tsize; 1294 nargs->timeo = mi->mi_timeo; 1295 nargs->retrans = mi->mi_retrans; 1296 1297 if (mi->mi_flags & MI4_INT) 1298 nargs->flags |= NFSMNT_INT; 1299 if (mi->mi_flags & MI4_NOAC) 1300 nargs->flags |= NFSMNT_NOAC; 1301 1302 nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN | 1303 NFSMNT_ACDIRMAX; 1304 nargs->acregmin = HR2SEC(mi->mi_acregmin); 1305 nargs->acregmax = HR2SEC(mi->mi_acregmax); 1306 nargs->acdirmin = HR2SEC(mi->mi_acdirmin); 1307 nargs->acdirmax = HR2SEC(mi->mi_acdirmax); 1308 1309 if (mi->mi_flags & MI4_NOCTO) 1310 nargs->flags |= NFSMNT_NOCTO; 1311 if (mi->mi_flags & MI4_GRPID) 1312 nargs->flags |= NFSMNT_GRPID; 1313 if (mi->mi_flags & MI4_LLOCK) 1314 nargs->flags |= NFSMNT_LLOCK; 1315 if (mi->mi_flags & MI4_NOPRINT) 1316 nargs->flags |= NFSMNT_NOPRINT; 1317 if (mi->mi_flags & MI4_DIRECTIO) 1318 nargs->flags |= NFSMNT_DIRECTIO; 1319 if (mi->mi_flags & MI4_PUBLIC) 1320 nargs->flags |= NFSMNT_PUBLIC; 1321 1322 mutex_exit(&mi->mi_lock); 1323 1324 /* add any specific 

/*
 * Build an nfs_args structure for passing to domount().
 *
 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
 * generic data - common to all ephemeral mount types - is read directly
 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
 */
static struct nfs_args *
nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
    ephemeral_servinfo_t *esi)
{
	sec_data_t *secdata;
	struct nfs_args *nargs;

	/* setup the nfs args */
	nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);

	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	nargs->addr = esi->esi_addr;

	/* for AUTH_DH by negotiation */
	if (esi->esi_syncaddr || esi->esi_netname) {
		nargs->flags |= NFSMNT_SECURE;
		nargs->syncaddr = esi->esi_syncaddr;
		nargs->netname = esi->esi_netname;
	}

	nargs->flags |= NFSMNT_KNCONF;
	nargs->knconf = esi->esi_knconf;
	nargs->flags |= NFSMNT_HOSTNAME;
	nargs->hostname = esi->esi_hostname;
	nargs->fh = esi->esi_path;

	/* general mount settings, all copied from parent mount */
	mutex_enter(&mi->mi_lock);

	if (!(mi->mi_flags & MI4_HARD))
		nargs->flags |= NFSMNT_SOFT;

	nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
	    NFSMNT_RETRANS;
	nargs->wsize = mi->mi_stsize;
	nargs->rsize = mi->mi_tsize;
	nargs->timeo = mi->mi_timeo;
	nargs->retrans = mi->mi_retrans;

	if (mi->mi_flags & MI4_INT)
		nargs->flags |= NFSMNT_INT;
	if (mi->mi_flags & MI4_NOAC)
		nargs->flags |= NFSMNT_NOAC;

	nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
	    NFSMNT_ACDIRMAX;
	nargs->acregmin = HR2SEC(mi->mi_acregmin);
	nargs->acregmax = HR2SEC(mi->mi_acregmax);
	nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
	nargs->acdirmax = HR2SEC(mi->mi_acdirmax);

	if (mi->mi_flags & MI4_NOCTO)
		nargs->flags |= NFSMNT_NOCTO;
	if (mi->mi_flags & MI4_GRPID)
		nargs->flags |= NFSMNT_GRPID;
	if (mi->mi_flags & MI4_LLOCK)
		nargs->flags |= NFSMNT_LLOCK;
	if (mi->mi_flags & MI4_NOPRINT)
		nargs->flags |= NFSMNT_NOPRINT;
	if (mi->mi_flags & MI4_DIRECTIO)
		nargs->flags |= NFSMNT_DIRECTIO;
	if (mi->mi_flags & MI4_PUBLIC)
		nargs->flags |= NFSMNT_PUBLIC;

	mutex_exit(&mi->mi_lock);

	/* add any specific flags for this type of ephemeral mount */
	nargs->flags |= esi->esi_mount_flags;

	/*
	 * Security data & negotiation policy.
	 *
	 * We need to preserve the parent mount's preference for security
	 * negotiation, translating SV4_TRYSECDEFAULT -> NFSMNT_SECDEFAULT.
	 *
	 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
	 * security flavour was requested, with data in sv_secdata, and that
	 * no negotiation should occur. If this specified flavour fails, that's
	 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
	 *
	 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
	 * default flavour, in sv_secdata, but then negotiate a new flavour.
	 * Possible flavours are recorded in an array in sv_secinfo, with
	 * currently in-use flavour pointed to by sv_currsec.
	 *
	 * If sv_currsec is set, i.e. if negotiation has already occurred,
	 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
	 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
	 */
	if (svp->sv_flags & SV4_TRYSECDEFAULT) {
		/* enable negotiation for ephemeral mount */
		nargs->flags |= NFSMNT_SECDEFAULT;

		/*
		 * As a starting point for negotiation, copy parent
		 * mount's negotiated flavour (sv_currsec) if available,
		 * or its passed-in flavour (sv_secdata) if not.
		 */
		if (svp->sv_currsec != NULL)
			secdata = copy_sec_data(svp->sv_currsec);
		else if (svp->sv_secdata != NULL)
			secdata = copy_sec_data(svp->sv_secdata);
		else
			secdata = NULL;
	} else {
		/* do not enable negotiation; copy parent's passed-in flavour */
		if (svp->sv_secdata != NULL)
			secdata = copy_sec_data(svp->sv_secdata);
		else
			secdata = NULL;
	}

	nfs_rw_exit(&svp->sv_lock);

	nargs->flags |= NFSMNT_NEWARGS;
	nargs->nfs_args_ext = NFS_ARGS_EXTB;
	nargs->nfs_ext_u.nfs_extB.secdata = secdata;

	/* for NFS RO failover; caller will set if necessary */
	nargs->nfs_ext_u.nfs_extB.next = NULL;

	return (nargs);
}
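
/*
 * The secdata selection above, in tabular form:
 *
 *	SV4_TRYSECDEFAULT	sv_currsec	secdata copied from
 *	-----------------	----------	-------------------
 *	set			non-NULL	sv_currsec (+ NFSMNT_SECDEFAULT)
 *	set			NULL		sv_secdata (+ NFSMNT_SECDEFAULT)
 *	clear			-		sv_secdata
 *
 * In every case a NULL source simply yields a NULL secdata.
 */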

static void
nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
{
	/*
	 * Either the mount failed, in which case the data is not needed, or
	 * nfs4_mount() has either taken copies of what it needs or,
	 * where it has merely copied the ptr, it has set *our* ptr to NULL,
	 * whereby nfs4_free_args() will ignore it.
	 */
	nfs4_free_args(nargs);
	kmem_free(nargs, sizeof (struct nfs_args));
}

/*
 * When we finally get into the mounting, we need to add this
 * node to the ephemeral tree.
 *
 * This is called from nfs4_mount().
 */
int
nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
{
	mntinfo4_t *mi_parent;
	nfs4_ephemeral_t *eph;
	nfs4_ephemeral_tree_t *net;

	nfs4_ephemeral_t *prior;
	nfs4_ephemeral_t *child;

	nfs4_ephemeral_t *peer;

	nfs4_trigger_globals_t *ntg;
	zone_t *zone = curproc->p_zone;

	int rc = 0;

	mi_parent = VTOMI4(mvp);

	/*
	 * Get this before grabbing anything else!
	 */
	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	if (!ntg->ntg_thread_started) {
		nfs4_ephemeral_start_harvester(ntg);
	}

	mutex_enter(&mi_parent->mi_lock);
	mutex_enter(&mi->mi_lock);

	net = mi->mi_ephemeral_tree =
	    mi_parent->mi_ephemeral_tree;

	/*
	 * If the mi_ephemeral_tree is NULL, then it
	 * means that either the harvester or a manual
	 * umount has cleared the tree out right before
	 * we got here.
	 *
	 * There is nothing we can do here, so return
	 * to the caller and let them decide whether to
	 * try again.
	 */
	if (net == NULL) {
		mutex_exit(&mi->mi_lock);
		mutex_exit(&mi_parent->mi_lock);

		return (EBUSY);
	}

	nfs4_ephemeral_tree_hold(net);

	/*
	 * We need to tack together the ephemeral mount
	 * with this new mntinfo.
	 */
	eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
	eph->ne_mount = mi;
	eph->ne_ref_time = gethrestime_sec();

	/*
	 * We need to tell the ephemeral mount when
	 * to time out.
	 */
	eph->ne_mount_to = ntg->ntg_mount_to;

	mi->mi_flags |= MI4_EPHEMERAL;
	mi->mi_ephemeral = eph;

	/*
	 * If the enclosing mntinfo4 is also ephemeral,
	 * then we need to point to its enclosing parent.
	 * Else the enclosing mntinfo4 is the enclosing parent.
	 *
	 * We also need to weave this ephemeral node
	 * into the tree.
	 */
	if (mi_parent->mi_flags & MI4_EPHEMERAL) {
		/*
		 * We need to decide if we are
		 * the root node of this branch
		 * or if we are a sibling of this
		 * branch.
		 */
		prior = mi_parent->mi_ephemeral;
		if (prior == NULL) {
			/*
			 * Race condition, clean up, and
			 * let caller handle mntinfo.
			 */
			mi->mi_flags &= ~MI4_EPHEMERAL;
			mi->mi_ephemeral = NULL;
			kmem_free(eph, sizeof (*eph));
			rc = EBUSY;
		} else {
			if (prior->ne_child == NULL) {
				prior->ne_child = eph;
			} else {
				child = prior->ne_child;

				prior->ne_child = eph;
				eph->ne_peer = child;

				child->ne_prior = eph;
			}

			eph->ne_prior = prior;
		}
	} else {
		/*
		 * The parent mntinfo4 is the non-ephemeral
		 * root of the ephemeral tree. We
		 * need to decide if we are the root
		 * node of that tree or if we are a
		 * sibling of the root node.
		 *
		 * We are the root if there is no
		 * other node.
		 */
		if (net->net_root == NULL) {
			net->net_root = eph;
		} else {
			eph->ne_peer = peer = net->net_root;
			ASSERT(peer != NULL);
			net->net_root = eph;

			peer->ne_prior = eph;
		}

		eph->ne_prior = NULL;
	}

	nfs4_ephemeral_tree_rele(net);

	mutex_exit(&mi->mi_lock);
	mutex_exit(&mi_parent->mi_lock);

	return (rc);
}

/*
 * Commit the changes to the ephemeral tree for removing this node.
 */
static void
nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
{
	nfs4_ephemeral_t *e = eph;
	nfs4_ephemeral_t *peer;
	nfs4_ephemeral_t *prior;

	peer = eph->ne_peer;
	prior = e->ne_prior;

	/*
	 * If this branch root was not the
	 * tree root, then we need to fix back pointers.
	 */
	if (prior) {
		if (prior->ne_child == e) {
			prior->ne_child = peer;
		} else {
			prior->ne_peer = peer;
		}

		if (peer)
			peer->ne_prior = prior;
	} else if (peer) {
		peer->ne_mount->mi_ephemeral_tree->net_root = peer;
		peer->ne_prior = NULL;
	} else {
		e->ne_mount->mi_ephemeral_tree->net_root = NULL;
	}
}

/*
 * We want to avoid recursion at all costs. So we need to
 * unroll the tree. We do this by a depth first traversal to
 * leaf nodes. We blast away the leaf and work our way back
 * up and down the tree.
 */
static int
nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
    int isTreeRoot, int flag, cred_t *cr)
{
	nfs4_ephemeral_t *e = eph;
	nfs4_ephemeral_t *prior;
	mntinfo4_t *mi;
	vfs_t *vfsp;
	int error;

	/*
	 * We use this loop to unroll the ephemeral tree.
	 */
	for (;;) {
		/*
		 * First we walk down the child.
		 */
		if (e->ne_child) {
			prior = e;
			e = e->ne_child;
			continue;
		}

		/*
		 * If we are the root of the branch we are removing,
		 * we end it here. But if the branch is the root of
		 * the tree, we have to forge on. We do not consider
		 * the peer list for the root because while it may
		 * be okay to remove, it is both extra work and a
		 * potential for a false-positive error to stall the
		 * unmount attempt.
		 */
		if (e == eph && isTreeRoot == FALSE)
			return (0);

		/*
		 * Next we walk down the peer list.
		 */
		if (e->ne_peer) {
			prior = e;
			e = e->ne_peer;
			continue;
		}

		/*
		 * We can only remove the node passed in by the
		 * caller if it is the root of the ephemeral tree.
		 * Otherwise, the caller will remove it.
		 */
		if (e == eph && isTreeRoot == FALSE)
			return (0);

		/*
		 * Okay, we have a leaf node, time
		 * to prune it!
		 *
		 * Note that prior is NULL if and only if
		 * e is the root of the ephemeral tree.
		 */
		prior = e->ne_prior;

		mi = e->ne_mount;
		mutex_enter(&mi->mi_lock);
		vfsp = mi->mi_vfsp;

		/*
		 * Cleared by umount2_engine.
		 */
		VFS_HOLD(vfsp);

		/*
		 * Inform nfs4_unmount to not recursively
		 * descend into this node's children when it
		 * gets processed.
		 */
		mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
		mutex_exit(&mi->mi_lock);

		error = umount2_engine(vfsp, flag, cr, FALSE);
		if (error) {
			/*
			 * We need to reenable nfs4_unmount's ability
			 * to recursively descend on this node.
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
			mutex_exit(&mi->mi_lock);

			return (error);
		}

		/*
		 * If we are the current node, we do not want to
		 * touch anything else. At this point, the only
		 * way the current node can have survived to here
		 * is if it is the root of the ephemeral tree and
		 * we are unmounting the enclosing mntinfo4.
		 */
		if (e == eph) {
			ASSERT(prior == NULL);
			return (0);
		}

		/*
		 * Stitch up the prior node. Note that since
		 * we have handled the root of the tree, prior
		 * must be non-NULL.
		 */
		ASSERT(prior != NULL);
		if (prior->ne_child == e) {
			prior->ne_child = NULL;
		} else {
			ASSERT(prior->ne_peer == e);

			prior->ne_peer = NULL;
		}

		e = prior;
	}

	/* NOTREACHED */
}
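
/*
 * A sketch of the traversal above, for a hypothetical tree rooted at
 * A, where A's child list is B then C, and B has a child D:
 *
 *	A -> child B -> peer C
 *	     B -> child D
 *
 * The loop descends A, B, D; unmounts the leaf D and steps back to B
 * via ne_prior; then descends B's peer C and unmounts it; then unmounts
 * B, and so on back up the tree. Each iteration either descends one
 * link or prunes one leaf, so no recursion (and no kernel stack growth)
 * is needed.
 */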

/*
 * Common code to safely release net_cnt_lock and net_tree_lock
 */
void
nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
    bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
{
	nfs4_ephemeral_tree_t *net = *pnet;

	if (*pmust_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
		if (*pmust_rele)
			nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);

		*pmust_unlock = FALSE;
	}
}

/*
 * While we may have removed any child or sibling nodes of this
 * ephemeral node, we can not nuke it until we know that there
 * were no active vnodes on it. This will do that final
 * work once we know it is not busy.
 */
void
nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
    bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
{
	/*
	 * Now we need to get rid of the ephemeral data if it exists.
	 */
	mutex_enter(&mi->mi_lock);
	if (mi->mi_ephemeral) {
		/*
		 * If we are the root node of an ephemeral branch
		 * which is being removed, then we need to fixup
		 * pointers into and out of the node.
		 */
		if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
			nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);

		ASSERT(mi->mi_ephemeral != NULL);

		kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
		mi->mi_ephemeral = NULL;
	}
	mutex_exit(&mi->mi_lock);

	nfs4_ephemeral_umount_unlock(pmust_unlock, pmust_rele, pnet);
}

/*
 * Unmount an ephemeral node.
 */
int
nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
    bool_t *pmust_unlock, bool_t *pmust_rele, nfs4_ephemeral_tree_t **pnet)
{
	int error = 0;
	nfs4_ephemeral_t *eph;
	nfs4_ephemeral_tree_t *net;
	int is_derooting = FALSE;
	int is_recursed = FALSE;
	int was_locked = FALSE;

	/*
	 * Make sure to set the default state for cleaning
	 * up the tree in the caller (and on the way out).
	 */
	*pmust_unlock = *pmust_rele = FALSE;

	/*
	 * The active vnodes on this file system may be ephemeral
	 * children. We need to check for and try to unmount them
	 * here. If any cannot be unmounted, we are going
	 * to return EBUSY.
	 */
	mutex_enter(&mi->mi_lock);

	/*
	 * If an ephemeral tree, we need to check to see if
	 * the lock is already held. If it is, then we need
	 * to see if we are being called as a result of
	 * the recursive removal of some node of the tree or
	 * if we are another attempt to remove the tree.
	 *
	 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
	 * node. mi_ephemeral being non-NULL also does this.
	 *
	 * mi_ephemeral_tree being non-NULL is sufficient
	 * to also indicate either it is an ephemeral node
	 * or the enclosing mntinfo4.
	 *
	 * Do we need MI4_EPHEMERAL? Yes, it is useful for
	 * when we delete the ephemeral node and need to
	 * differentiate between an ephemeral node and the
	 * enclosing root node.
	 */
	*pnet = net = mi->mi_ephemeral_tree;
	if (net == NULL) {
		mutex_exit(&mi->mi_lock);
		return (0);
	}

	eph = mi->mi_ephemeral;
	is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
	is_derooting = (eph == NULL);

	/*
	 * If this is not recursion, then we need to
	 * grab a ref count.
	 *
	 * But wait, we also do not want to do that
	 * if a harvester thread has already grabbed
	 * the lock.
	 */
	if (!is_recursed) {
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status &
		    NFS4_EPHEMERAL_TREE_LOCKED) {
			/*
			 * If the tree is locked, we need
			 * to decide whether we are the
			 * harvester or some explicit call
			 * for a umount. The only way that
			 * we are the harvester is if
			 * MS_SYSSPACE is set.
			 *
			 * We only let the harvester through
			 * at this point.
			 *
			 * We return EBUSY so that the
			 * caller knows something is
			 * going on. Note that by that
			 * time, the umount in the other
			 * thread may have already occurred.
			 */
			if (!(flag & MS_SYSSPACE)) {
				mutex_exit(&net->net_cnt_lock);
				mutex_exit(&mi->mi_lock);

				return (EBUSY);
			}

			was_locked = TRUE;
		} else {
			nfs4_ephemeral_tree_incr(net);
			*pmust_rele = TRUE;
		}

		mutex_exit(&net->net_cnt_lock);
	}
	mutex_exit(&mi->mi_lock);

	/*
	 * If we are not the harvester, we need to check
	 * to see if we need to grab the tree lock.
	 */
	if (was_locked == FALSE) {
		/*
		 * If we grab the lock, it means that no other
		 * operation is working on the tree. If we don't
		 * grab it, we need to decide if this is because
		 * we are a recursive call or a new operation.
		 */
		if (mutex_tryenter(&net->net_tree_lock)) {
			*pmust_unlock = TRUE;
		} else {
			/*
			 * If we are a recursive call, we can
			 * proceed without the lock.
			 * Otherwise we have to wait until
			 * the lock becomes free.
			 */
			if (!is_recursed) {
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    (NFS4_EPHEMERAL_TREE_DEROOTING
				    | NFS4_EPHEMERAL_TREE_INVALID)) {
					nfs4_ephemeral_tree_decr(net);
					mutex_exit(&net->net_cnt_lock);
					*pmust_rele = FALSE;
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);

				/*
				 * We can't hold any other locks whilst
				 * we wait on this to free up.
				 */
				mutex_enter(&net->net_tree_lock);

				/*
				 * Note that while mi->mi_ephemeral
				 * may change and thus we have to
				 * update eph, it is the case that
				 * we have tied down net and
				 * do not care if mi->mi_ephemeral_tree
				 * has changed.
				 */
				mutex_enter(&mi->mi_lock);
				eph = mi->mi_ephemeral;
				mutex_exit(&mi->mi_lock);

				/*
				 * Okay, we need to see if either the
				 * tree got nuked or the current node
				 * got nuked. Both of which will cause
				 * an error.
				 *
				 * Note that a subsequent retry of the
				 * umount shall work.
				 */
				mutex_enter(&net->net_cnt_lock);
				if (net->net_status &
				    NFS4_EPHEMERAL_TREE_INVALID ||
				    (!is_derooting && eph == NULL)) {
					nfs4_ephemeral_tree_decr(net);
					mutex_exit(&net->net_cnt_lock);
					mutex_exit(&net->net_tree_lock);
					*pmust_rele = FALSE;
					goto is_busy;
				}
				mutex_exit(&net->net_cnt_lock);
				*pmust_unlock = TRUE;
			}
		}
	}

	/*
	 * Only once we have grabbed the lock can we mark what we
	 * are planning on doing to the ephemeral tree.
	 */
	if (*pmust_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;

		/*
		 * Check to see if we are nuking the root.
		 */
		if (is_derooting)
			net->net_status |=
			    NFS4_EPHEMERAL_TREE_DEROOTING;
		mutex_exit(&net->net_cnt_lock);
	}

	if (!is_derooting) {
		/*
		 * Only work on children if the caller has not already
		 * done so.
		 */
		if (!is_recursed) {
			ASSERT(eph != NULL);

			error = nfs4_ephemeral_unmount_engine(eph,
			    FALSE, flag, cr);
			if (error)
				goto is_busy;
		}
	} else {
		eph = net->net_root;

		/*
		 * Only work if there is something there.
		 */
		if (eph) {
			error = nfs4_ephemeral_unmount_engine(eph, TRUE,
			    flag, cr);
			if (error) {
				mutex_enter(&net->net_cnt_lock);
				net->net_status &=
				    ~NFS4_EPHEMERAL_TREE_DEROOTING;
				mutex_exit(&net->net_cnt_lock);
				goto is_busy;
			}

			/*
			 * Nothing else which goes wrong will
			 * invalidate the blowing away of the
			 * ephemeral tree.
			 */
			net->net_root = NULL;
		}

		/*
		 * We have derooted and we have caused the tree to be
		 * invalidated.
		 */
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
		net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
		if (was_locked == FALSE)
			nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		if (was_locked == FALSE)
			mutex_exit(&net->net_tree_lock);

		/*
		 * We have just blown away any notation of this
		 * tree being locked. We can't let the caller
		 * try to clean things up.
		 */
		*pmust_unlock = FALSE;

		/*
		 * At this point, the tree should no longer be
		 * associated with the mntinfo4. We need to pull
		 * it off there and let the harvester take
		 * care of it once the refcnt drops.
		 */
		mutex_enter(&mi->mi_lock);
		mi->mi_ephemeral_tree = NULL;
		mutex_exit(&mi->mi_lock);
	}

	return (0);

is_busy:

	nfs4_ephemeral_umount_unlock(pmust_unlock, pmust_rele,
	    pnet);

	return (error);
}

/*
 * Do the umount and record any error in the parent.
 */
static void
nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
    nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
{
	int error;

	error = umount2_engine(vfsp, flag, kcred, FALSE);
	if (error) {
		if (prior) {
			if (prior->ne_child == e)
				prior->ne_state |=
				    NFS4_EPHEMERAL_CHILD_ERROR;
			else
				prior->ne_state |=
				    NFS4_EPHEMERAL_PEER_ERROR;
		}
	}
}
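
/*
 * The harvester below walks each tree with a small per-node state
 * machine rather than recursion. Roughly:
 *
 *	VISIT_CHILD	-> descend to ne_child, else fall through
 *	VISIT_SIBLING	-> descend to ne_peer, else fall through
 *	PROCESS_ME	-> attempt the unmount, then step back to ne_prior
 *
 * with the CHILD_ERROR/PEER_ERROR bits recorded by
 * nfs4_ephemeral_record_umount() above stopping a parent from being
 * unmounted after one of its children failed to unmount.
 */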
/*
 * For each tree in the forest (where the forest is in
 * effect all of the ephemeral trees for this zone),
 * scan to see if a node can be unmounted. Note that
 * unlike nfs4_ephemeral_unmount_engine(), we do
 * not process the current node before children or
 * siblings. I.e., if a node can be unmounted, we
 * do not recursively check to see if the nodes
 * hanging off of it can also be unmounted.
 *
 * Instead, we delve down deep to try and remove the
 * children first. Then, because we share code with
 * nfs4_ephemeral_unmount_engine(), we will try
 * them again. This could be a performance issue in
 * the future.
 *
 * Also note that unlike nfs4_ephemeral_unmount_engine(),
 * we do not halt on an error. We will not remove the
 * current node, but we will keep on trying to remove
 * the others.
 *
 * force indicates that we want the unmount to occur
 * even if there is something blocking it.
 *
 * time_check indicates that we want to see if the
 * mount has expired past mount_to or not. Typically
 * we want to make this check; only on a shutdown of
 * the zone would we want to ignore it.
 *
 * (A standalone sketch of the iterative traversal used
 * below follows this function.)
 */
static void
nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
    bool_t force, bool_t time_check)
{
	nfs4_ephemeral_tree_t *net;
	nfs4_ephemeral_tree_t *prev = NULL;
	nfs4_ephemeral_tree_t *next;
	nfs4_ephemeral_t *e;
	nfs4_ephemeral_t *prior;
	time_t now = gethrestime_sec();

	nfs4_ephemeral_tree_t *harvest = NULL;

	int flag;

	mntinfo4_t *mi;
	vfs_t *vfsp;

	if (force)
		flag = MS_FORCE | MS_SYSSPACE;
	else
		flag = MS_SYSSPACE;

	mutex_enter(&ntg->ntg_forest_lock);
	for (net = ntg->ntg_forest; net != NULL; net = next) {
		next = net->net_next;

		nfs4_ephemeral_tree_hold(net);

		mutex_enter(&net->net_tree_lock);

		/*
		 * Let the unmount code know that the
		 * tree is already locked!
		 */
		mutex_enter(&net->net_cnt_lock);
		net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
		mutex_exit(&net->net_cnt_lock);

		/*
		 * If the intent is to force all ephemeral nodes to
		 * be unmounted in this zone, we can short circuit a
		 * lot of tree traversal and simply zap the root node.
		 */
		if (force) {
			if (net->net_root) {
				mi = net->net_root->ne_mount;
				vfsp = mi->mi_vfsp;

				/*
				 * Cleared by umount2_engine.
				 */
				VFS_HOLD(vfsp);

				(void) umount2_engine(vfsp, flag,
				    kcred, FALSE);

				goto check_done;
			}
		}

		e = net->net_root;
		if (e)
			e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;

		while (e) {
			if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
				e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
				if (e->ne_child) {
					e = e->ne_child;
					e->ne_state =
					    NFS4_EPHEMERAL_VISIT_CHILD;
				}

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_VISIT_SIBLING) {
				e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
				if (e->ne_peer) {
					e = e->ne_peer;
					e->ne_state =
					    NFS4_EPHEMERAL_VISIT_CHILD;
				}

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_CHILD_ERROR) {
				prior = e->ne_prior;

				/*
				 * If a child reported an error, do
				 * not bother trying to unmount.
				 *
				 * If your prior node is a parent,
				 * pass the error up such that they
				 * also do not try to unmount.
				 *
				 * However, if your prior is a sibling,
				 * let them try to unmount if they can.
				 */
				if (prior) {
					if (prior->ne_child == e)
						prior->ne_state |=
						    NFS4_EPHEMERAL_CHILD_ERROR;
					else
						prior->ne_state |=
						    NFS4_EPHEMERAL_PEER_ERROR;
				}

				/*
				 * Clear the error and, if needed,
				 * process peers.
				 *
				 * Once we mask out the error, we know
				 * whether or not we have to process
				 * another node.
				 */
				e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
				if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
					e = prior;

				continue;
			} else if (e->ne_state ==
			    NFS4_EPHEMERAL_PEER_ERROR) {
				prior = e->ne_prior;

				if (prior) {
					if (prior->ne_child == e)
						prior->ne_state =
						    NFS4_EPHEMERAL_CHILD_ERROR;
					else
						prior->ne_state =
						    NFS4_EPHEMERAL_PEER_ERROR;
				}

				/*
				 * Clear the error from this node and do the
				 * correct processing.
				 */
				e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
				continue;
			}

			prior = e->ne_prior;
			e->ne_state = NFS4_EPHEMERAL_OK;

			/*
			 * It must be the case that we need to process
			 * this node.
			 */
			if (!time_check ||
			    now - e->ne_ref_time > e->ne_mount_to) {
				mi = e->ne_mount;
				vfsp = mi->mi_vfsp;

				/*
				 * Cleared by umount2_engine.
				 */
				VFS_HOLD(vfsp);

				/*
				 * Note that we effectively work down to the
				 * leaf nodes first, try to unmount them,
				 * then work our way back up from the leaf
				 * nodes.
				 *
				 * Also note that we deal with a lot of
				 * complexity by sharing the work with
				 * the manual unmount code.
				 */
				nfs4_ephemeral_record_umount(vfsp, flag,
				    e, prior);
			}

			e = prior;
		}

check_done:

		/*
		 * At this point we are done processing this tree.
		 *
		 * If the tree is invalid and we are the only reference
		 * to it, then we push it on the local linked list
		 * to remove it at the end. We avoid that action now
		 * to keep the tree processing going along at a fair clip.
		 *
		 * Else, even if we are the only reference, we drop
		 * our hold on the current tree and allow it to be
		 * reused as needed.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_refcnt == 1 &&
		    net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
			nfs4_ephemeral_tree_decr(net);
			net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			if (prev)
				prev->net_next = net->net_next;
			else
				ntg->ntg_forest = net->net_next;

			net->net_next = harvest;
			harvest = net;
			continue;
		}

		nfs4_ephemeral_tree_decr(net);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
		mutex_exit(&net->net_cnt_lock);
		mutex_exit(&net->net_tree_lock);

		prev = net;
	}
	mutex_exit(&ntg->ntg_forest_lock);

	for (net = harvest; net != NULL; net = next) {
		next = net->net_next;

		mutex_destroy(&net->net_tree_lock);
		mutex_destroy(&net->net_cnt_lock);
		kmem_free(net, sizeof (*net));
	}
}
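/*
 * Illustrative sketch of the traversal above, reduced to a toy node
 * type (hypothetical names; not part of the build). Each node carries
 * its own visit state, so the walk needs no explicit stack: children
 * are visited first, then peers, and only then is the node itself
 * processed on the way back up via its prior pointer:
 *
 *	struct toy {
 *		struct toy *child, *peer, *prior;
 *		int state;
 *	};
 *
 *	void
 *	toy_walk(struct toy *e, void (*process)(struct toy *))
 *	{
 *		e->state = VISIT_CHILD;
 *		while (e != NULL) {
 *			if (e->state == VISIT_CHILD) {
 *				e->state = VISIT_SIBLING;
 *				if (e->child) {
 *					e = e->child;
 *					e->state = VISIT_CHILD;
 *				}
 *			} else if (e->state == VISIT_SIBLING) {
 *				e->state = PROCESS_ME;
 *				if (e->peer) {
 *					e = e->peer;
 *					e->state = VISIT_CHILD;
 *				}
 *			} else {
 *				process(e);	(PROCESS_ME)
 *				e = e->prior;
 *			}
 *		}
 *	}
 *
 * The real code layers the CHILD_ERROR/PEER_ERROR states on top of
 * this so that a failed unmount keeps the parent mounted while still
 * letting siblings attempt their own unmounts.
 */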
/*
 * This is the thread which decides when the harvesting
 * can proceed and when to kill it off for this zone.
 */
static void
nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
{
	clock_t timeleft;
	zone_t *zone = curproc->p_zone;

	for (;;) {
		timeleft = zone_status_timedwait(zone, lbolt +
		    nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);

		/*
		 * zone is exiting...
		 */
		if (timeleft != -1) {
			ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
			zthread_exit();
			/* NOTREACHED */
		}

		/*
		 * Only bother scanning if there is potential
		 * work to be done.
		 */
		if (ntg->ntg_forest == NULL)
			continue;

		/*
		 * Now scan the list and get rid of everything which
		 * is old.
		 */
		nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
	}

	/* NOTREACHED */
}

/*
 * The zone-specific glue needed to start the unmount harvester.
 *
 * Note that we want to avoid holding the mutex as long as possible,
 * hence the multiple checks.
 *
 * The caller should avoid us getting down here in the first
 * place.
 */
static void
nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
{
	/*
	 * It got started before we got here...
	 */
	if (ntg->ntg_thread_started)
		return;

	mutex_enter(&nfs4_ephemeral_thread_lock);

	if (ntg->ntg_thread_started) {
		mutex_exit(&nfs4_ephemeral_thread_lock);
		return;
	}

	/*
	 * Start the unmount harvester thread for this zone.
	 */
	(void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
	    ntg, 0, minclsyspri);

	ntg->ntg_thread_started = TRUE;
	mutex_exit(&nfs4_ephemeral_thread_lock);
}

/*ARGSUSED*/
static void *
nfs4_ephemeral_zsd_create(zoneid_t zoneid)
{
	nfs4_trigger_globals_t *ntg;

	ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
	ntg->ntg_thread_started = FALSE;

	/*
	 * This is the default....
	 */
	ntg->ntg_mount_to = nfs4_trigger_mount_to;

	mutex_init(&ntg->ntg_forest_lock, NULL,
	    MUTEX_DEFAULT, NULL);

	return (ntg);
}

/*
 * Try a nice gentle walk down the forest and convince
 * all of the trees to gracefully give it up.
 */
/*ARGSUSED*/
static void
nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
{
	nfs4_trigger_globals_t *ntg = arg;

	if (!ntg)
		return;

	nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
}

/*
 * Race along the forest and rip all of the trees out by
 * their rootballs!
 */
/*ARGSUSED*/
static void
nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
{
	nfs4_trigger_globals_t *ntg = arg;

	if (!ntg)
		return;

	nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);

	mutex_destroy(&ntg->ntg_forest_lock);
	kmem_free(ntg, sizeof (*ntg));
}

/*
 * This is the zone-independent cleanup needed for
 * ephemeral mount processing.
 */
void
nfs4_ephemeral_fini(void)
{
	(void) zone_key_delete(nfs4_ephemeral_key);
	mutex_destroy(&nfs4_ephemeral_thread_lock);
}
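/*
 * A minimal sketch of the ZSD (zone-specific data) pattern the
 * callbacks above plug into (illustrative only; the real registration
 * is done in nfs4_ephemeral_init() below). The kernel runs the
 * callbacks as each zone is created, shut down, and destroyed, and
 * any code running in a zone can then look up its private globals by
 * key:
 *
 *	zone_key_create(&nfs4_ephemeral_key,
 *	    nfs4_ephemeral_zsd_create,		(allocates an ntg)
 *	    nfs4_ephemeral_zsd_shutdown,		(gentle harvest)
 *	    nfs4_ephemeral_zsd_destroy);	(forced harvest + free)
 *
 *	ntg = zone_getspecific(nfs4_ephemeral_key, curproc->p_zone);
 */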
/*
 * This is the zone-independent initialization needed for
 * ephemeral mount processing.
 */
void
nfs4_ephemeral_init(void)
{
	mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
	    NULL);

	zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
	    nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
}

/*
 * nfssys() calls this function to set the per-zone
 * value of mount_to to drive when an ephemeral mount is
 * timed out. Each mount will grab a copy of this value
 * when mounted.
 */
void
nfs4_ephemeral_set_mount_to(uint_t mount_to)
{
	nfs4_trigger_globals_t *ntg;
	zone_t *zone = curproc->p_zone;

	ntg = zone_getspecific(nfs4_ephemeral_key, zone);

	ntg->ntg_mount_to = mount_to;
}

/*
 * Walk the list of v4 mount options; if they are currently set in vfsp,
 * append them to a new comma-separated mount option string, and return it.
 *
 * Caller should free by calling nfs4_trigger_destroy_mntopts().
 */
static char *
nfs4_trigger_create_mntopts(vfs_t *vfsp)
{
	uint_t i;
	char *mntopts;
	struct vfssw *vswp;
	mntopts_t *optproto;

	mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);

	/* get the list of applicable mount options for v4; locks *vswp */
	vswp = vfs_getvfssw(MNTTYPE_NFS4);
	optproto = &vswp->vsw_optproto;

	for (i = 0; i < optproto->mo_count; i++) {
		struct mntopt *mop = &optproto->mo_list[i];

		if (mop->mo_flags & MO_EMPTY)
			continue;

		if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
			kmem_free(mntopts, MAX_MNTOPT_STR);
			vfs_unrefvfssw(vswp);
			return (NULL);
		}
	}

	vfs_unrefvfssw(vswp);

	/*
	 * MNTOPT_XATTR is not in the v4 mount opt proto list,
	 * and it may only be passed via MS_OPTIONSTR, so we
	 * must handle it here.
	 *
	 * Ideally, it would be in the list, but NFS does not specify its
	 * own opt proto list, it uses instead the default one. Since
	 * not all filesystems support extended attrs, it would not be
	 * appropriate to add it there.
	 */
	if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
	    nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
		kmem_free(mntopts, MAX_MNTOPT_STR);
		return (NULL);
	}

	return (mntopts);
}

static void
nfs4_trigger_destroy_mntopts(char *mntopts)
{
	if (mntopts)
		kmem_free(mntopts, MAX_MNTOPT_STR);
}
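/*
 * A minimal usage sketch for the pair above (illustrative only; the
 * variable names are hypothetical). The returned string lives in a
 * MAX_MNTOPT_STR buffer and must be released with the matching
 * destroy routine; a NULL return means an option did not fit:
 *
 *	char *mntopts;
 *
 *	mntopts = nfs4_trigger_create_mntopts(vfsp);
 *	if (mntopts == NULL)
 *		return (EINVAL);
 *	... pass mntopts to the ephemeral mount via MS_OPTIONSTR ...
 *	nfs4_trigger_destroy_mntopts(mntopts);
 */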
/*
 * Check a single mount option (optname). Add to mntopts if it is set in VFS.
 */
static int
nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
{
	if (mntopts == NULL || optname == NULL || vfsp == NULL)
		return (EINVAL);

	if (vfs_optionisset(vfsp, optname, NULL)) {
		size_t mntoptslen = strlen(mntopts);
		size_t optnamelen = strlen(optname);

		/* +1 for ',', +1 for NUL */
		if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
			return (EOVERFLOW);

		/* first or subsequent mount option? */
		if (*mntopts != '\0')
			(void) strcat(mntopts, ",");

		(void) strcat(mntopts, optname);
	}

	return (0);
}

static enum clnt_stat
nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
{
	int retries, error;
	uint_t max_msgsize;
	enum clnt_stat status;
	CLIENT *cl;
	struct timeval timeout;

	/* as per recov_newserver() */
	max_msgsize = 0;
	retries = 1;
	timeout.tv_sec = 2;
	timeout.tv_usec = 0;

	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, NFS_PROGRAM,
	    NFS_V4, max_msgsize, retries, CRED(), &cl);
	if (error)
		return (RPC_FAILED);

	if (nointr)
		cl->cl_nosignal = TRUE;
	status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
	    timeout);
	if (nointr)
		cl->cl_nosignal = FALSE;

	AUTH_DESTROY(cl->cl_auth);
	CLNT_DESTROY(cl);

	return (status);
}
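/*
 * A minimal usage sketch (illustrative only): the NULL procedure is
 * the conventional RPC "are you there?" probe, so a trigger mount can
 * check that the server answers before committing to the more
 * expensive mount work. The error mapping here is hypothetical:
 *
 *	if (nfs4_trigger_ping_server(svp, nointr) != RPC_SUCCESS)
 *		return (EIO);
 */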