/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All rights reserved.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/dnlc.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/acl.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <sys/list.h>
#include <sys/zone.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>

#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>

static void	nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
			cred_t *);
static int	nfs_getattr_cache(vnode_t *, struct vattr *);
static int	nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);

struct mi_globals {
	kmutex_t	mig_lock;	/* lock protecting mig_list */
	list_t		mig_list;	/* list of NFS v2 or v3 mounts in zone */
	boolean_t	mig_destructor_called;
};

static zone_key_t mi_list_key;

/* Debugging flag for PC file shares. */
extern int	share_debug;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_attrtime)
 * which tells whether the attributes are valid. The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached. This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time. There are minimum and maximum
 * timeout values that can be set per mount point.
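 *
 * For example, if the client last detected a change to a regular file
 * 10 seconds before new attributes are cached, those attributes are
 * treated as valid for roughly 10 seconds, clamped to the mount's
 * acregmin/acregmax settings (acdirmin/acdirmax for directories),
 * which typically default to 3 and 60 seconds respectively.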
 */

int
nfs_waitfor_purge_complete(vnode_t *vp)
{
	rnode_t *rp;
	k_sigset_t smask;

	rp = VTOR(vp);
	if (rp->r_serial != NULL && rp->r_serial != curthread) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				sigunintr(&smask);
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		sigunintr(&smask);
		mutex_exit(&rp->r_statelock);
	}
	return (0);
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	struct vattr va;

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	va.va_mask = AT_ALL;
	return (nfs_getattr_otw(vp, &va, cr));
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs3_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	struct vattr va;

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	va.va_mask = AT_ALL;
	return (nfs3_getattr_otw(vp, &va, cr));
}

/*
 * Purge all of the various NFS `data' caches.
 */
void
nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
{
	rnode_t *rp;
	char *contents;
	int size;
	int error;

	/*
	 * Purge the DNLC for any entries which refer to this file.
	 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
	 */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
	    !(rp->r_flags & RINDNLCPURGE)) {
		/*
		 * Set the RINDNLCPURGE flag to prevent recursive entry
		 * into dnlc_purge_vp()
		 */
		if (vp->v_type == VDIR)
			rp->r_flags |= RINDNLCPURGE;
		mutex_exit(&rp->r_statelock);
		dnlc_purge_vp(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & RINDNLCPURGE)
			rp->r_flags &= ~RINDNLCPURGE;
	}

	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	/*
	 * Flush the page cache.
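	 * An ENOSPC or EDQUOT error from the flush is recorded in
	 * r_error below if no error has been recorded yet.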
233 */ 234 if (vn_has_cached_data(vp)) { 235 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 236 if (error && (error == ENOSPC || error == EDQUOT)) { 237 mutex_enter(&rp->r_statelock); 238 if (!rp->r_error) 239 rp->r_error = error; 240 mutex_exit(&rp->r_statelock); 241 } 242 } 243 244 /* 245 * Flush the readdir response cache. 246 */ 247 if (HAVE_RDDIR_CACHE(rp)) 248 nfs_purge_rddir_cache(vp); 249 } 250 251 /* 252 * Purge the readdir cache of all entries 253 */ 254 void 255 nfs_purge_rddir_cache(vnode_t *vp) 256 { 257 rnode_t *rp; 258 rddir_cache *rdc; 259 rddir_cache *nrdc; 260 261 rp = VTOR(vp); 262 top: 263 mutex_enter(&rp->r_statelock); 264 rp->r_direof = NULL; 265 rp->r_flags &= ~RLOOKUP; 266 rp->r_flags |= RREADDIRPLUS; 267 rdc = avl_first(&rp->r_dir); 268 while (rdc != NULL) { 269 nrdc = AVL_NEXT(&rp->r_dir, rdc); 270 avl_remove(&rp->r_dir, rdc); 271 rddir_cache_rele(rdc); 272 rdc = nrdc; 273 } 274 mutex_exit(&rp->r_statelock); 275 } 276 277 /* 278 * Do a cache check based on the post-operation attributes. 279 * Then make them the new cached attributes. If no attributes 280 * were returned, then mark the attributes as timed out. 281 */ 282 void 283 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr) 284 { 285 vattr_t attr; 286 287 if (!poap->attributes) { 288 PURGE_ATTRCACHE(vp); 289 return; 290 } 291 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr); 292 } 293 294 /* 295 * Same as above, but using a vattr 296 */ 297 void 298 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t, 299 cred_t *cr) 300 { 301 if (!poap->attributes) { 302 PURGE_ATTRCACHE(vp); 303 return; 304 } 305 nfs_attr_cache(vp, poap->fres.vap, t, cr); 306 } 307 308 /* 309 * Do a cache check based on the weak cache consistency attributes. 310 * These consist of a small set of pre-operation attributes and the 311 * full set of post-operation attributes. 312 * 313 * If we are given the pre-operation attributes, then use them to 314 * check the validity of the various caches. Then, if we got the 315 * post-operation attributes, make them the new cached attributes. 316 * If we didn't get the post-operation attributes, then mark the 317 * attribute cache as timed out so that the next reference will 318 * cause a GETATTR to the server to refresh with the current 319 * attributes. 320 * 321 * Otherwise, if we didn't get the pre-operation attributes, but 322 * we did get the post-operation attributes, then use these 323 * attributes to check the validity of the various caches. This 324 * will probably cause a flush of the caches because if the 325 * operation succeeded, the attributes of the object were changed 326 * in some way from the old post-operation attributes. This 327 * should be okay because it is the safe thing to do. After 328 * checking the data caches, then we make these the new cached 329 * attributes. 330 * 331 * Otherwise, we didn't get either the pre- or post-operation 332 * attributes. Simply mark the attribute cache as timed out so 333 * the next reference will cause a GETATTR to the server to 334 * refresh with the current attributes. 335 * 336 * If an error occurred trying to convert the over the wire 337 * attributes to a vattr, then simply mark the attribute cache as 338 * timed out. 
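 *
 * In short: with both pre- and post-op attributes, validate the caches
 * against the pre-op attributes and cache the post-op attributes; with
 * post-op attributes only, validate and cache using those; with neither,
 * or on a conversion error, just time out the attribute cache.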
339 */ 340 void 341 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr) 342 { 343 vattr_t bva; 344 vattr_t ava; 345 346 if (wccp->after.attributes) { 347 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) { 348 PURGE_ATTRCACHE(vp); 349 return; 350 } 351 if (wccp->before.attributes) { 352 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds; 353 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds; 354 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds; 355 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds; 356 bva.va_size = wccp->before.attr.size; 357 nfs3_attr_cache(vp, &bva, &ava, t, cr); 358 } else 359 nfs_attr_cache(vp, &ava, t, cr); 360 } else { 361 PURGE_ATTRCACHE(vp); 362 } 363 } 364 365 /* 366 * Set attributes cache for given vnode using nfsattr. 367 * 368 * This routine does not do cache validation with the attributes. 369 * 370 * If an error occurred trying to convert the over the wire 371 * attributes to a vattr, then simply mark the attribute cache as 372 * timed out. 373 */ 374 void 375 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t) 376 { 377 rnode_t *rp; 378 struct vattr va; 379 380 if (!nattr_to_vattr(vp, na, &va)) { 381 rp = VTOR(vp); 382 mutex_enter(&rp->r_statelock); 383 if (rp->r_mtime <= t) 384 nfs_attrcache_va(vp, &va); 385 mutex_exit(&rp->r_statelock); 386 } else { 387 PURGE_ATTRCACHE(vp); 388 } 389 } 390 391 /* 392 * Set attributes cache for given vnode using fattr3. 393 * 394 * This routine does not do cache validation with the attributes. 395 * 396 * If an error occurred trying to convert the over the wire 397 * attributes to a vattr, then simply mark the attribute cache as 398 * timed out. 399 */ 400 void 401 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t) 402 { 403 rnode_t *rp; 404 struct vattr va; 405 406 if (!fattr3_to_vattr(vp, na, &va)) { 407 rp = VTOR(vp); 408 mutex_enter(&rp->r_statelock); 409 if (rp->r_mtime <= t) 410 nfs_attrcache_va(vp, &va); 411 mutex_exit(&rp->r_statelock); 412 } else { 413 PURGE_ATTRCACHE(vp); 414 } 415 } 416 417 /* 418 * Do a cache check based on attributes returned over the wire. The 419 * new attributes are cached. 420 * 421 * If an error occurred trying to convert the over the wire attributes 422 * to a vattr, then just return that error. 423 * 424 * As a side affect, the vattr argument is filled in with the converted 425 * attributes. 426 */ 427 int 428 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t, 429 cred_t *cr) 430 { 431 int error; 432 433 error = nattr_to_vattr(vp, na, vap); 434 if (error) 435 return (error); 436 nfs_attr_cache(vp, vap, t, cr); 437 return (0); 438 } 439 440 /* 441 * Do a cache check based on attributes returned over the wire. The 442 * new attributes are cached. 443 * 444 * If an error occurred trying to convert the over the wire attributes 445 * to a vattr, then just return that error. 446 * 447 * As a side affect, the vattr argument is filled in with the converted 448 * attributes. 449 */ 450 int 451 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr) 452 { 453 int error; 454 455 error = fattr3_to_vattr(vp, na, vap); 456 if (error) 457 return (error); 458 nfs_attr_cache(vp, vap, t, cr); 459 return (0); 460 } 461 462 /* 463 * Use the passed in virtual attributes to check to see whether the 464 * data and metadata caches are valid, cache the new attributes, and 465 * then do the cache invalidation if required. 
466 * 467 * The cache validation and caching of the new attributes is done 468 * atomically via the use of the mutex, r_statelock. If required, 469 * the cache invalidation is done atomically w.r.t. the cache 470 * validation and caching of the attributes via the pseudo lock, 471 * r_serial. 472 * 473 * This routine is used to do cache validation and attributes caching 474 * for operations with a single set of post operation attributes. 475 */ 476 void 477 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr) 478 { 479 rnode_t *rp; 480 int mtime_changed = 0; 481 int ctime_changed = 0; 482 vsecattr_t *vsp; 483 int was_serial; 484 len_t preattr_rsize; 485 boolean_t writeattr_set = B_FALSE; 486 boolean_t cachepurge_set = B_FALSE; 487 488 rp = VTOR(vp); 489 490 mutex_enter(&rp->r_statelock); 491 492 if (rp->r_serial != curthread) { 493 klwp_t *lwp = ttolwp(curthread); 494 495 was_serial = 0; 496 if (lwp != NULL) 497 lwp->lwp_nostop++; 498 while (rp->r_serial != NULL) { 499 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 500 mutex_exit(&rp->r_statelock); 501 if (lwp != NULL) 502 lwp->lwp_nostop--; 503 return; 504 } 505 } 506 if (lwp != NULL) 507 lwp->lwp_nostop--; 508 } else 509 was_serial = 1; 510 511 if (rp->r_mtime > t) { 512 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 513 PURGE_ATTRCACHE_LOCKED(rp); 514 mutex_exit(&rp->r_statelock); 515 return; 516 } 517 518 /* 519 * Write thread after writing data to file on remote server, 520 * will always set RWRITEATTR to indicate that file on remote 521 * server was modified with a WRITE operation and would have 522 * marked attribute cache as timed out. If RWRITEATTR 523 * is set, then do not check for mtime and ctime change. 524 */ 525 if (!(rp->r_flags & RWRITEATTR)) { 526 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 527 mtime_changed = 1; 528 529 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec || 530 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec) 531 ctime_changed = 1; 532 } else { 533 writeattr_set = B_TRUE; 534 } 535 536 preattr_rsize = rp->r_size; 537 538 nfs_attrcache_va(vp, vap); 539 540 /* 541 * If we have updated filesize in nfs_attrcache_va, as soon as we 542 * drop statelock we will be in transition of purging all 543 * our caches and updating them. It is possible for another 544 * thread to pick this new file size and read in zeroed data. 545 * stall other threads till cache purge is complete. 546 */ 547 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) { 548 /* 549 * If RWRITEATTR was set and we have updated the file 550 * size, Server's returned file size need not necessarily 551 * be because of this Client's WRITE. We need to purge 552 * all caches. 
553 */ 554 if (writeattr_set) 555 mtime_changed = 1; 556 557 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) { 558 rp->r_flags |= RINCACHEPURGE; 559 cachepurge_set = B_TRUE; 560 } 561 } 562 563 if (!mtime_changed && !ctime_changed) { 564 mutex_exit(&rp->r_statelock); 565 return; 566 } 567 568 rp->r_serial = curthread; 569 570 mutex_exit(&rp->r_statelock); 571 572 if (mtime_changed) 573 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 574 575 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) { 576 mutex_enter(&rp->r_statelock); 577 rp->r_flags &= ~RINCACHEPURGE; 578 cv_broadcast(&rp->r_cv); 579 mutex_exit(&rp->r_statelock); 580 cachepurge_set = B_FALSE; 581 } 582 583 if (ctime_changed) { 584 (void) nfs_access_purge_rp(rp); 585 if (rp->r_secattr != NULL) { 586 mutex_enter(&rp->r_statelock); 587 vsp = rp->r_secattr; 588 rp->r_secattr = NULL; 589 mutex_exit(&rp->r_statelock); 590 if (vsp != NULL) 591 nfs_acl_free(vsp); 592 } 593 } 594 595 if (!was_serial) { 596 mutex_enter(&rp->r_statelock); 597 rp->r_serial = NULL; 598 cv_broadcast(&rp->r_cv); 599 mutex_exit(&rp->r_statelock); 600 } 601 } 602 603 /* 604 * Use the passed in "before" virtual attributes to check to see 605 * whether the data and metadata caches are valid, cache the "after" 606 * new attributes, and then do the cache invalidation if required. 607 * 608 * The cache validation and caching of the new attributes is done 609 * atomically via the use of the mutex, r_statelock. If required, 610 * the cache invalidation is done atomically w.r.t. the cache 611 * validation and caching of the attributes via the pseudo lock, 612 * r_serial. 613 * 614 * This routine is used to do cache validation and attributes caching 615 * for operations with both pre operation attributes and post operation 616 * attributes. 617 */ 618 static void 619 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t, 620 cred_t *cr) 621 { 622 rnode_t *rp; 623 int mtime_changed = 0; 624 int ctime_changed = 0; 625 vsecattr_t *vsp; 626 int was_serial; 627 len_t preattr_rsize; 628 boolean_t writeattr_set = B_FALSE; 629 boolean_t cachepurge_set = B_FALSE; 630 631 rp = VTOR(vp); 632 633 mutex_enter(&rp->r_statelock); 634 635 if (rp->r_serial != curthread) { 636 klwp_t *lwp = ttolwp(curthread); 637 638 was_serial = 0; 639 if (lwp != NULL) 640 lwp->lwp_nostop++; 641 while (rp->r_serial != NULL) { 642 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 643 mutex_exit(&rp->r_statelock); 644 if (lwp != NULL) 645 lwp->lwp_nostop--; 646 return; 647 } 648 } 649 if (lwp != NULL) 650 lwp->lwp_nostop--; 651 } else 652 was_serial = 1; 653 654 if (rp->r_mtime > t) { 655 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size)) 656 PURGE_ATTRCACHE_LOCKED(rp); 657 mutex_exit(&rp->r_statelock); 658 return; 659 } 660 661 /* 662 * Write thread after writing data to file on remote server, 663 * will always set RWRITEATTR to indicate that file on remote 664 * server was modified with a WRITE operation and would have 665 * marked attribute cache as timed out. If RWRITEATTR 666 * is set, then do not check for mtime and ctime change. 
667 */ 668 if (!(rp->r_flags & RWRITEATTR)) { 669 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size)) 670 mtime_changed = 1; 671 672 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec || 673 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec) 674 ctime_changed = 1; 675 } else { 676 writeattr_set = B_TRUE; 677 } 678 679 preattr_rsize = rp->r_size; 680 681 nfs_attrcache_va(vp, avap); 682 683 /* 684 * If we have updated filesize in nfs_attrcache_va, as soon as we 685 * drop statelock we will be in transition of purging all 686 * our caches and updating them. It is possible for another 687 * thread to pick this new file size and read in zeroed data. 688 * stall other threads till cache purge is complete. 689 */ 690 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) { 691 /* 692 * If RWRITEATTR was set and we have updated the file 693 * size, Server's returned file size need not necessarily 694 * be because of this Client's WRITE. We need to purge 695 * all caches. 696 */ 697 if (writeattr_set) 698 mtime_changed = 1; 699 700 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) { 701 rp->r_flags |= RINCACHEPURGE; 702 cachepurge_set = B_TRUE; 703 } 704 } 705 706 if (!mtime_changed && !ctime_changed) { 707 mutex_exit(&rp->r_statelock); 708 return; 709 } 710 711 rp->r_serial = curthread; 712 713 mutex_exit(&rp->r_statelock); 714 715 if (mtime_changed) 716 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 717 718 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) { 719 mutex_enter(&rp->r_statelock); 720 rp->r_flags &= ~RINCACHEPURGE; 721 cv_broadcast(&rp->r_cv); 722 mutex_exit(&rp->r_statelock); 723 cachepurge_set = B_FALSE; 724 } 725 726 if (ctime_changed) { 727 (void) nfs_access_purge_rp(rp); 728 if (rp->r_secattr != NULL) { 729 mutex_enter(&rp->r_statelock); 730 vsp = rp->r_secattr; 731 rp->r_secattr = NULL; 732 mutex_exit(&rp->r_statelock); 733 if (vsp != NULL) 734 nfs_acl_free(vsp); 735 } 736 } 737 738 if (!was_serial) { 739 mutex_enter(&rp->r_statelock); 740 rp->r_serial = NULL; 741 cv_broadcast(&rp->r_cv); 742 mutex_exit(&rp->r_statelock); 743 } 744 } 745 746 /* 747 * Set attributes cache for given vnode using virtual attributes. 748 * 749 * Set the timeout value on the attribute cache and fill it 750 * with the passed in attributes. 751 * 752 * The caller must be holding r_statelock. 753 */ 754 void 755 nfs_attrcache_va(vnode_t *vp, struct vattr *va) 756 { 757 rnode_t *rp; 758 mntinfo_t *mi; 759 hrtime_t delta; 760 hrtime_t now; 761 762 rp = VTOR(vp); 763 764 ASSERT(MUTEX_HELD(&rp->r_statelock)); 765 766 now = gethrtime(); 767 768 mi = VTOMI(vp); 769 770 /* 771 * Delta is the number of nanoseconds that we will 772 * cache the attributes of the file. It is based on 773 * the number of nanoseconds since the last time that 774 * we detected a change. The assumption is that files 775 * that changed recently are likely to change again. 776 * There is a minimum and a maximum for regular files 777 * and for directories which is enforced though. 778 * 779 * Using the time since last change was detected 780 * eliminates direct comparison or calculation 781 * using mixed client and server times. NFS does 782 * not make any assumptions regarding the client 783 * and server clocks being synchronized. 
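	 *
	 * Illustrative numbers: if the last detected change was 10
	 * seconds ago, delta starts out as 10 seconds; for a regular
	 * file it is then clamped into [acregmin, acregmax], for a
	 * directory into [acdirmin, acdirmax], and it is forced to
	 * zero when attribute caching is disabled (MI_NOAC or VNOCACHE).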
	 */
	if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    va->va_size != rp->r_attr.va_size)
		rp->r_mtime = now;

	if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_mtime;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_attrtime = now + delta;
	rp->r_attr = *va;
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != va->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
		rp->r_size = va->va_size;
	nfs_setswaplike(vp, va);
	rp->r_flags &= ~RWRITEATTR;
}

/*
 * Fill in attribute from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
{
	rnode_t *rp;

	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (ATTRCACHE_VALID(vp)) {
		/*
		 * Cached attributes are valid
		 */
		*vap = rp->r_attr;
		mutex_exit(&rp->r_statelock);
		return (0);
	}
	mutex_exit(&rp->r_statelock);
	return (1);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	struct nfsattrstat ns;
	int douprintf;
	mntinfo_t *mi;
	failinfo_t fi;
	hrtime_t t;

	mi = VTOMI(vp);
	fi.vp = vp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	if (mi->mi_flags & MI_ACL) {
		error = acl_getattr2_otw(vp, vap, cr);
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(mi, RFS_GETATTR,
	    xdr_fhandle, (caddr_t)VTOFH(vp),
	    xdr_attrstat, (caddr_t)&ns, cr,
	    &douprintf, &ns.ns_status, 0, &fi);

	if (!error) {
		error = geterrno(ns.ns_status);
		if (!error)
			error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
		else {
			PURGE_STALE_FH(error, vp, cr);
		}
	}

	return (error);
}

/*
 * Return either cached or remote attributes. If we get remote
 * attributes, use them to check and invalidate the caches, then
 * cache the new attributes.
 */
int
nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	rnode_t *rp;

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
908 */ 909 error = nfs_getattr_cache(vp, vap); 910 if (error) 911 error = nfs_getattr_otw(vp, vap, cr); 912 913 /* Return the client's view of file size */ 914 rp = VTOR(vp); 915 mutex_enter(&rp->r_statelock); 916 vap->va_size = rp->r_size; 917 mutex_exit(&rp->r_statelock); 918 919 return (error); 920 } 921 922 /* 923 * Get attributes over-the-wire and update attributes cache 924 * if no error occurred in the over-the-wire operation. 925 * Return 0 if successful, otherwise error. 926 */ 927 int 928 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 929 { 930 int error; 931 GETATTR3args args; 932 GETATTR3vres res; 933 int douprintf; 934 failinfo_t fi; 935 hrtime_t t; 936 937 args.object = *VTOFH3(vp); 938 fi.vp = vp; 939 fi.fhp = (caddr_t)&args.object; 940 fi.copyproc = nfs3copyfh; 941 fi.lookupproc = nfs3lookup; 942 fi.xattrdirproc = acl_getxattrdir3; 943 res.fres.vp = vp; 944 res.fres.vap = vap; 945 946 douprintf = 1; 947 948 t = gethrtime(); 949 950 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR, 951 xdr_nfs_fh3, (caddr_t)&args, 952 xdr_GETATTR3vres, (caddr_t)&res, cr, 953 &douprintf, &res.status, 0, &fi); 954 955 if (error) 956 return (error); 957 958 error = geterrno3(res.status); 959 if (error) { 960 PURGE_STALE_FH(error, vp, cr); 961 return (error); 962 } 963 964 /* 965 * Catch status codes that indicate fattr3 to vattr translation failure 966 */ 967 if (res.fres.status) 968 return (res.fres.status); 969 970 nfs_attr_cache(vp, vap, t, cr); 971 return (0); 972 } 973 974 /* 975 * Return either cached or remote attributes. If get remote attr 976 * use them to check and invalidate caches, then cache the new attributes. 977 */ 978 int 979 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 980 { 981 int error; 982 rnode_t *rp; 983 984 /* 985 * If we've got cached attributes, we're done, otherwise go 986 * to the server to get attributes, which will update the cache 987 * in the process. 988 */ 989 error = nfs_getattr_cache(vp, vap); 990 if (error) 991 error = nfs3_getattr_otw(vp, vap, cr); 992 993 /* Return the client's view of file size */ 994 rp = VTOR(vp); 995 mutex_enter(&rp->r_statelock); 996 vap->va_size = rp->r_size; 997 mutex_exit(&rp->r_statelock); 998 999 return (error); 1000 } 1001 1002 vtype_t nf_to_vt[] = { 1003 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK 1004 }; 1005 /* 1006 * Convert NFS Version 2 over the network attributes to the local 1007 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY 1008 * network representation and the local representation is done here. 1009 * Returns 0 for success, error if failed due to overflow. 1010 */ 1011 int 1012 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap) 1013 { 1014 /* overflow in time attributes? */ 1015 #ifndef _LP64 1016 if (!NFS2_FATTR_TIME_OK(na)) 1017 return (EOVERFLOW); 1018 #endif 1019 1020 if (na->na_type < NFNON || na->na_type > NFSOC) 1021 vap->va_type = VBAD; 1022 else 1023 vap->va_type = nf_to_vt[na->na_type]; 1024 vap->va_mode = na->na_mode; 1025 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid; 1026 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid; 1027 vap->va_fsid = vp->v_vfsp->vfs_dev; 1028 vap->va_nodeid = na->na_nodeid; 1029 vap->va_nlink = na->na_nlink; 1030 vap->va_size = na->na_size; /* keep for cache validation */ 1031 /* 1032 * nfs protocol defines times as unsigned so don't extend sign, 1033 * unless sysadmin set nfs_allow_preepoch_time. 
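	 * (NFS Version 2 carries times at microsecond resolution; the
	 * tv_usec values are scaled to nanoseconds below.)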
	 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
	vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
	vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
	vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
	/*
	 * Shannon's law - uncompress the received dev_t
	 * if the top half of it is zero, indicating a response
	 * from an `older style' OS. Except for when it is a
	 * `new style' OS sending the maj device of zero,
	 * in which case the algorithm still works because the
	 * fact that it is a new style server
	 * is hidden by the minor device not being greater
	 * than 255 (a requirement in this case).
	 */
	if ((na->na_rdev & 0xffff0000) == 0)
		vap->va_rdev = nfsv2_expdev(na->na_rdev);
	else
		vap->va_rdev = expldev(na->na_rdev);

	vap->va_nblocks = na->na_blocks;
	switch (na->na_type) {
	case NFBLK:
		vap->va_blksize = DEV_BSIZE;
		break;

	case NFCHR:
		vap->va_blksize = MAXBSIZE;
		break;

	case NFSOC:
	default:
		vap->va_blksize = na->na_blocksize;
		break;
	}
	/*
	 * This bit of ugliness is a hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.
	 * It remaps the special over-the-wire type to the
	 * VFIFO type. (see note in nfs.h)
	 */
	if (NA_ISFIFO(na)) {
		vap->va_type = VFIFO;
		vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
		vap->va_rdev = 0;
		vap->va_blksize = na->na_blocksize;
	}
	vap->va_seq = 0;
	return (0);
}

/*
 * Convert NFS Version 3 over the network attributes to the local
 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 */
vtype_t nf3_to_vt[] = {
	VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
};

int
fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
{

#ifndef _LP64
	/* overflow in time attributes? */
	if (!NFS3_FATTR_TIME_OK(na))
		return (EOVERFLOW);
#endif
	if (!NFS3_SIZE_OK(na->size))
		/* file too big */
		return (EFBIG);

	vap->va_mask = AT_ALL;

	if (na->type < NF3REG || na->type > NF3FIFO)
		vap->va_type = VBAD;
	else
		vap->va_type = nf3_to_vt[na->type];
	vap->va_mode = na->mode;
	vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
	vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_nodeid = na->fileid;
	vap->va_nlink = na->nlink;
	vap->va_size = na->size;

	/*
	 * nfs protocol defines times as unsigned so don't extend sign,
	 * unless sysadmin set nfs_allow_preepoch_time.
	 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
	vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
	vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
	vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;

	switch (na->type) {
	case NF3BLK:
		vap->va_rdev = makedevice(na->rdev.specdata1,
		    na->rdev.specdata2);
		vap->va_blksize = DEV_BSIZE;
		vap->va_nblocks = 0;
		break;
	case NF3CHR:
		vap->va_rdev = makedevice(na->rdev.specdata1,
		    na->rdev.specdata2);
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = 0;
		break;
	case NF3REG:
	case NF3DIR:
	case NF3LNK:
		vap->va_rdev = 0;
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = (u_longlong_t)
		    ((na->used + (size3)DEV_BSIZE - (size3)1) /
		    (size3)DEV_BSIZE);
		break;
	case NF3SOCK:
	case NF3FIFO:
	default:
		vap->va_rdev = 0;
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = 0;
		break;
	}
	vap->va_seq = 0;
	return (0);
}

/*
 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount. The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies. See nfs_async_putpage and nfs_async_start.
 */

int nfs_async_timeout = -1;	/* uninitialized */

static void	nfs_async_start(struct vfs *);

static void
free_async_args(struct nfs_async_reqs *args)
{
	rnode_t *rp;

	if (args->a_io != NFS_INACTIVE) {
		rp = VTOR(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS_PUTAPAGE ||
		    args->a_io == NFS_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done. It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
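 *
 * Rough flow, as implemented below: the nfs_async_*() routines queue a
 * request on mi_async_reqs[] and signal mi_async_reqs_cv; this manager
 * thread wakes up, creates a worker (nfs_async_start) if allowed, and
 * signals mi_async_work_cv; a worker then dequeues the request and
 * performs the i/o.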
 */
void
nfs_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo_t *mi;
	uint_t max_threads;

	mi = VFTOMI(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	mutex_enter(&mi->mi_lock);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount/zone is really going away.
	 *
	 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0. Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	while (!(mi->mi_flags & MI_ASYNC_MGR_STOP) ||
	    mi->mi_async_req_count > 0) {
		mutex_exit(&mi->mi_lock);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value. This means that future
			 * (debugger-induced) lowerings of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads <
			    MAX(mi->mi_max_threads, max_threads)) {
				mi->mi_threads++;
				mutex_exit(&mi->mi_async_lock);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0, nfs_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			cv_signal(&mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	cv_broadcast(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);	/* release thread's hold */
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
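 * This amounts to setting MI_ASYNC_MGR_STOP under mi_lock, broadcasting
 * mi_async_reqs_cv, and then waiting on mi_async_cv until the manager
 * thread has cleared mi_manager_thread.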
1317 */ 1318 void 1319 nfs_async_manager_stop(vfs_t *vfsp) 1320 { 1321 mntinfo_t *mi = VFTOMI(vfsp); 1322 1323 mutex_enter(&mi->mi_async_lock); 1324 mutex_enter(&mi->mi_lock); 1325 mi->mi_flags |= MI_ASYNC_MGR_STOP; 1326 mutex_exit(&mi->mi_lock); 1327 cv_broadcast(&mi->mi_async_reqs_cv); 1328 while (mi->mi_manager_thread != NULL) 1329 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1330 mutex_exit(&mi->mi_async_lock); 1331 } 1332 1333 int 1334 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1335 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1336 u_offset_t, caddr_t, struct seg *, cred_t *)) 1337 { 1338 rnode_t *rp; 1339 mntinfo_t *mi; 1340 struct nfs_async_reqs *args; 1341 1342 rp = VTOR(vp); 1343 ASSERT(rp->r_freef == NULL); 1344 1345 mi = VTOMI(vp); 1346 1347 /* 1348 * If addr falls in a different segment, don't bother doing readahead. 1349 */ 1350 if (addr >= seg->s_base + seg->s_size) 1351 return (-1); 1352 1353 /* 1354 * If we can't allocate a request structure, punt on the readahead. 1355 */ 1356 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1357 return (-1); 1358 1359 /* 1360 * If a lock operation is pending, don't initiate any new 1361 * readaheads. Otherwise, bump r_count to indicate the new 1362 * asynchronous I/O. 1363 */ 1364 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1365 kmem_free(args, sizeof (*args)); 1366 return (-1); 1367 } 1368 mutex_enter(&rp->r_statelock); 1369 rp->r_count++; 1370 mutex_exit(&rp->r_statelock); 1371 nfs_rw_exit(&rp->r_lkserlock); 1372 1373 args->a_next = NULL; 1374 #ifdef DEBUG 1375 args->a_queuer = curthread; 1376 #endif 1377 VN_HOLD(vp); 1378 args->a_vp = vp; 1379 ASSERT(cr != NULL); 1380 crhold(cr); 1381 args->a_cred = cr; 1382 args->a_io = NFS_READ_AHEAD; 1383 args->a_nfs_readahead = readahead; 1384 args->a_nfs_blkoff = blkoff; 1385 args->a_nfs_seg = seg; 1386 args->a_nfs_addr = addr; 1387 1388 mutex_enter(&mi->mi_async_lock); 1389 1390 /* 1391 * If asyncio has been disabled, don't bother readahead. 1392 */ 1393 if (mi->mi_max_threads == 0) { 1394 mutex_exit(&mi->mi_async_lock); 1395 goto noasync; 1396 } 1397 1398 /* 1399 * Link request structure into the async list and 1400 * wakeup async thread to do the i/o. 
1401 */ 1402 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) { 1403 mi->mi_async_reqs[NFS_READ_AHEAD] = args; 1404 mi->mi_async_tail[NFS_READ_AHEAD] = args; 1405 } else { 1406 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args; 1407 mi->mi_async_tail[NFS_READ_AHEAD] = args; 1408 } 1409 1410 if (mi->mi_io_kstats) { 1411 mutex_enter(&mi->mi_lock); 1412 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1413 mutex_exit(&mi->mi_lock); 1414 } 1415 1416 mi->mi_async_req_count++; 1417 ASSERT(mi->mi_async_req_count != 0); 1418 cv_signal(&mi->mi_async_reqs_cv); 1419 mutex_exit(&mi->mi_async_lock); 1420 return (0); 1421 1422 noasync: 1423 mutex_enter(&rp->r_statelock); 1424 rp->r_count--; 1425 cv_broadcast(&rp->r_cv); 1426 mutex_exit(&rp->r_statelock); 1427 VN_RELE(vp); 1428 crfree(cr); 1429 kmem_free(args, sizeof (*args)); 1430 return (-1); 1431 } 1432 1433 int 1434 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 1435 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *, 1436 u_offset_t, size_t, int, cred_t *)) 1437 { 1438 rnode_t *rp; 1439 mntinfo_t *mi; 1440 struct nfs_async_reqs *args; 1441 1442 ASSERT(flags & B_ASYNC); 1443 ASSERT(vp->v_vfsp != NULL); 1444 1445 rp = VTOR(vp); 1446 ASSERT(rp->r_count > 0); 1447 1448 mi = VTOMI(vp); 1449 1450 /* 1451 * If we can't allocate a request structure, do the putpage 1452 * operation synchronously in this thread's context. 1453 */ 1454 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1455 goto noasync; 1456 1457 args->a_next = NULL; 1458 #ifdef DEBUG 1459 args->a_queuer = curthread; 1460 #endif 1461 VN_HOLD(vp); 1462 args->a_vp = vp; 1463 ASSERT(cr != NULL); 1464 crhold(cr); 1465 args->a_cred = cr; 1466 args->a_io = NFS_PUTAPAGE; 1467 args->a_nfs_putapage = putapage; 1468 args->a_nfs_pp = pp; 1469 args->a_nfs_off = off; 1470 args->a_nfs_len = (uint_t)len; 1471 args->a_nfs_flags = flags; 1472 1473 mutex_enter(&mi->mi_async_lock); 1474 1475 /* 1476 * If asyncio has been disabled, then make a synchronous request. 1477 * This check is done a second time in case async io was diabled 1478 * while this thread was blocked waiting for memory pressure to 1479 * reduce or for the queue to drain. 1480 */ 1481 if (mi->mi_max_threads == 0) { 1482 mutex_exit(&mi->mi_async_lock); 1483 goto noasync; 1484 } 1485 1486 /* 1487 * Link request structure into the async list and 1488 * wakeup async thread to do the i/o. 1489 */ 1490 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) { 1491 mi->mi_async_reqs[NFS_PUTAPAGE] = args; 1492 mi->mi_async_tail[NFS_PUTAPAGE] = args; 1493 } else { 1494 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args; 1495 mi->mi_async_tail[NFS_PUTAPAGE] = args; 1496 } 1497 1498 mutex_enter(&rp->r_statelock); 1499 rp->r_count++; 1500 rp->r_awcount++; 1501 mutex_exit(&rp->r_statelock); 1502 1503 if (mi->mi_io_kstats) { 1504 mutex_enter(&mi->mi_lock); 1505 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1506 mutex_exit(&mi->mi_lock); 1507 } 1508 1509 mi->mi_async_req_count++; 1510 ASSERT(mi->mi_async_req_count != 0); 1511 cv_signal(&mi->mi_async_reqs_cv); 1512 mutex_exit(&mi->mi_async_lock); 1513 return (0); 1514 1515 noasync: 1516 if (args != NULL) { 1517 VN_RELE(vp); 1518 crfree(cr); 1519 kmem_free(args, sizeof (*args)); 1520 } 1521 1522 if (curproc == proc_pageout || curproc == proc_fsflush) { 1523 /* 1524 * If we get here in the context of the pageout/fsflush, 1525 * we refuse to do a sync write, because this may hang 1526 * pageout (and the machine). 
In this case, we just 1527 * re-mark the page as dirty and punt on the page. 1528 * 1529 * Make sure B_FORCE isn't set. We can re-mark the 1530 * pages as dirty and unlock the pages in one swoop by 1531 * passing in B_ERROR to pvn_write_done(). However, 1532 * we should make sure B_FORCE isn't set - we don't 1533 * want the page tossed before it gets written out. 1534 */ 1535 if (flags & B_FORCE) 1536 flags &= ~(B_INVAL | B_FORCE); 1537 pvn_write_done(pp, flags | B_ERROR); 1538 return (0); 1539 } 1540 if (nfs_zone() != mi->mi_zone) { 1541 /* 1542 * So this was a cross-zone sync putpage. We pass in B_ERROR 1543 * to pvn_write_done() to re-mark the pages as dirty and unlock 1544 * them. 1545 * 1546 * We don't want to clear B_FORCE here as the caller presumably 1547 * knows what they're doing if they set it. 1548 */ 1549 pvn_write_done(pp, flags | B_ERROR); 1550 return (EPERM); 1551 } 1552 return ((*putapage)(vp, pp, off, len, flags, cr)); 1553 } 1554 1555 int 1556 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 1557 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t, 1558 size_t, int, cred_t *)) 1559 { 1560 rnode_t *rp; 1561 mntinfo_t *mi; 1562 struct nfs_async_reqs *args; 1563 1564 ASSERT(flags & B_ASYNC); 1565 ASSERT(vp->v_vfsp != NULL); 1566 1567 rp = VTOR(vp); 1568 ASSERT(rp->r_count > 0); 1569 1570 mi = VTOMI(vp); 1571 1572 /* 1573 * If we can't allocate a request structure, do the pageio 1574 * request synchronously in this thread's context. 1575 */ 1576 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1577 goto noasync; 1578 1579 args->a_next = NULL; 1580 #ifdef DEBUG 1581 args->a_queuer = curthread; 1582 #endif 1583 VN_HOLD(vp); 1584 args->a_vp = vp; 1585 ASSERT(cr != NULL); 1586 crhold(cr); 1587 args->a_cred = cr; 1588 args->a_io = NFS_PAGEIO; 1589 args->a_nfs_pageio = pageio; 1590 args->a_nfs_pp = pp; 1591 args->a_nfs_off = io_off; 1592 args->a_nfs_len = (uint_t)io_len; 1593 args->a_nfs_flags = flags; 1594 1595 mutex_enter(&mi->mi_async_lock); 1596 1597 /* 1598 * If asyncio has been disabled, then make a synchronous request. 1599 * This check is done a second time in case async io was diabled 1600 * while this thread was blocked waiting for memory pressure to 1601 * reduce or for the queue to drain. 1602 */ 1603 if (mi->mi_max_threads == 0) { 1604 mutex_exit(&mi->mi_async_lock); 1605 goto noasync; 1606 } 1607 1608 /* 1609 * Link request structure into the async list and 1610 * wakeup async thread to do the i/o. 
1611 */ 1612 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) { 1613 mi->mi_async_reqs[NFS_PAGEIO] = args; 1614 mi->mi_async_tail[NFS_PAGEIO] = args; 1615 } else { 1616 mi->mi_async_tail[NFS_PAGEIO]->a_next = args; 1617 mi->mi_async_tail[NFS_PAGEIO] = args; 1618 } 1619 1620 mutex_enter(&rp->r_statelock); 1621 rp->r_count++; 1622 rp->r_awcount++; 1623 mutex_exit(&rp->r_statelock); 1624 1625 if (mi->mi_io_kstats) { 1626 mutex_enter(&mi->mi_lock); 1627 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1628 mutex_exit(&mi->mi_lock); 1629 } 1630 1631 mi->mi_async_req_count++; 1632 ASSERT(mi->mi_async_req_count != 0); 1633 cv_signal(&mi->mi_async_reqs_cv); 1634 mutex_exit(&mi->mi_async_lock); 1635 return (0); 1636 1637 noasync: 1638 if (args != NULL) { 1639 VN_RELE(vp); 1640 crfree(cr); 1641 kmem_free(args, sizeof (*args)); 1642 } 1643 1644 /* 1645 * If we can't do it ASYNC, for reads we do nothing (but cleanup 1646 * the page list), for writes we do it synchronously, except for 1647 * proc_pageout/proc_fsflush as described below. 1648 */ 1649 if (flags & B_READ) { 1650 pvn_read_done(pp, flags | B_ERROR); 1651 return (0); 1652 } 1653 1654 if (curproc == proc_pageout || curproc == proc_fsflush) { 1655 /* 1656 * If we get here in the context of the pageout/fsflush, 1657 * we refuse to do a sync write, because this may hang 1658 * pageout/fsflush (and the machine). In this case, we just 1659 * re-mark the page as dirty and punt on the page. 1660 * 1661 * Make sure B_FORCE isn't set. We can re-mark the 1662 * pages as dirty and unlock the pages in one swoop by 1663 * passing in B_ERROR to pvn_write_done(). However, 1664 * we should make sure B_FORCE isn't set - we don't 1665 * want the page tossed before it gets written out. 1666 */ 1667 if (flags & B_FORCE) 1668 flags &= ~(B_INVAL | B_FORCE); 1669 pvn_write_done(pp, flags | B_ERROR); 1670 return (0); 1671 } 1672 1673 if (nfs_zone() != mi->mi_zone) { 1674 /* 1675 * So this was a cross-zone sync pageio. We pass in B_ERROR 1676 * to pvn_write_done() to re-mark the pages as dirty and unlock 1677 * them. 1678 * 1679 * We don't want to clear B_FORCE here as the caller presumably 1680 * knows what they're doing if they set it. 1681 */ 1682 pvn_write_done(pp, flags | B_ERROR); 1683 return (EPERM); 1684 } 1685 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1686 } 1687 1688 void 1689 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr, 1690 int (*readdir)(vnode_t *, rddir_cache *, cred_t *)) 1691 { 1692 rnode_t *rp; 1693 mntinfo_t *mi; 1694 struct nfs_async_reqs *args; 1695 1696 rp = VTOR(vp); 1697 ASSERT(rp->r_freef == NULL); 1698 1699 mi = VTOMI(vp); 1700 1701 /* 1702 * If we can't allocate a request structure, do the readdir 1703 * operation synchronously in this thread's context. 1704 */ 1705 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1706 goto noasync; 1707 1708 args->a_next = NULL; 1709 #ifdef DEBUG 1710 args->a_queuer = curthread; 1711 #endif 1712 VN_HOLD(vp); 1713 args->a_vp = vp; 1714 ASSERT(cr != NULL); 1715 crhold(cr); 1716 args->a_cred = cr; 1717 args->a_io = NFS_READDIR; 1718 args->a_nfs_readdir = readdir; 1719 args->a_nfs_rdc = rdc; 1720 1721 mutex_enter(&mi->mi_async_lock); 1722 1723 /* 1724 * If asyncio has been disabled, then make a synchronous request. 1725 */ 1726 if (mi->mi_max_threads == 0) { 1727 mutex_exit(&mi->mi_async_lock); 1728 goto noasync; 1729 } 1730 1731 /* 1732 * Link request structure into the async list and 1733 * wakeup async thread to do the i/o. 
1734 */ 1735 if (mi->mi_async_reqs[NFS_READDIR] == NULL) { 1736 mi->mi_async_reqs[NFS_READDIR] = args; 1737 mi->mi_async_tail[NFS_READDIR] = args; 1738 } else { 1739 mi->mi_async_tail[NFS_READDIR]->a_next = args; 1740 mi->mi_async_tail[NFS_READDIR] = args; 1741 } 1742 1743 mutex_enter(&rp->r_statelock); 1744 rp->r_count++; 1745 mutex_exit(&rp->r_statelock); 1746 1747 if (mi->mi_io_kstats) { 1748 mutex_enter(&mi->mi_lock); 1749 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1750 mutex_exit(&mi->mi_lock); 1751 } 1752 1753 mi->mi_async_req_count++; 1754 ASSERT(mi->mi_async_req_count != 0); 1755 cv_signal(&mi->mi_async_reqs_cv); 1756 mutex_exit(&mi->mi_async_lock); 1757 return; 1758 1759 noasync: 1760 if (args != NULL) { 1761 VN_RELE(vp); 1762 crfree(cr); 1763 kmem_free(args, sizeof (*args)); 1764 } 1765 1766 rdc->entries = NULL; 1767 mutex_enter(&rp->r_statelock); 1768 ASSERT(rdc->flags & RDDIR); 1769 rdc->flags &= ~RDDIR; 1770 rdc->flags |= RDDIRREQ; 1771 /* 1772 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT 1773 * is set, wakeup the thread sleeping in cv_wait_sig(). 1774 * The woken up thread will reset the flag to RDDIR and will 1775 * continue with the readdir opeartion. 1776 */ 1777 if (rdc->flags & RDDIRWAIT) { 1778 rdc->flags &= ~RDDIRWAIT; 1779 cv_broadcast(&rdc->cv); 1780 } 1781 mutex_exit(&rp->r_statelock); 1782 rddir_cache_rele(rdc); 1783 } 1784 1785 void 1786 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 1787 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, 1788 cred_t *)) 1789 { 1790 rnode_t *rp; 1791 mntinfo_t *mi; 1792 struct nfs_async_reqs *args; 1793 page_t *pp; 1794 1795 rp = VTOR(vp); 1796 mi = VTOMI(vp); 1797 1798 /* 1799 * If we can't allocate a request structure, do the commit 1800 * operation synchronously in this thread's context. 1801 */ 1802 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1803 goto noasync; 1804 1805 args->a_next = NULL; 1806 #ifdef DEBUG 1807 args->a_queuer = curthread; 1808 #endif 1809 VN_HOLD(vp); 1810 args->a_vp = vp; 1811 ASSERT(cr != NULL); 1812 crhold(cr); 1813 args->a_cred = cr; 1814 args->a_io = NFS_COMMIT; 1815 args->a_nfs_commit = commit; 1816 args->a_nfs_plist = plist; 1817 args->a_nfs_offset = offset; 1818 args->a_nfs_count = count; 1819 1820 mutex_enter(&mi->mi_async_lock); 1821 1822 /* 1823 * If asyncio has been disabled, then make a synchronous request. 1824 * This check is done a second time in case async io was diabled 1825 * while this thread was blocked waiting for memory pressure to 1826 * reduce or for the queue to drain. 1827 */ 1828 if (mi->mi_max_threads == 0) { 1829 mutex_exit(&mi->mi_async_lock); 1830 goto noasync; 1831 } 1832 1833 /* 1834 * Link request structure into the async list and 1835 * wakeup async thread to do the i/o. 
1836 */ 1837 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) { 1838 mi->mi_async_reqs[NFS_COMMIT] = args; 1839 mi->mi_async_tail[NFS_COMMIT] = args; 1840 } else { 1841 mi->mi_async_tail[NFS_COMMIT]->a_next = args; 1842 mi->mi_async_tail[NFS_COMMIT] = args; 1843 } 1844 1845 mutex_enter(&rp->r_statelock); 1846 rp->r_count++; 1847 mutex_exit(&rp->r_statelock); 1848 1849 if (mi->mi_io_kstats) { 1850 mutex_enter(&mi->mi_lock); 1851 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1852 mutex_exit(&mi->mi_lock); 1853 } 1854 1855 mi->mi_async_req_count++; 1856 ASSERT(mi->mi_async_req_count != 0); 1857 cv_signal(&mi->mi_async_reqs_cv); 1858 mutex_exit(&mi->mi_async_lock); 1859 return; 1860 1861 noasync: 1862 if (args != NULL) { 1863 VN_RELE(vp); 1864 crfree(cr); 1865 kmem_free(args, sizeof (*args)); 1866 } 1867 1868 if (curproc == proc_pageout || curproc == proc_fsflush || 1869 nfs_zone() != mi->mi_zone) { 1870 while (plist != NULL) { 1871 pp = plist; 1872 page_sub(&plist, pp); 1873 pp->p_fsdata = C_COMMIT; 1874 page_unlock(pp); 1875 } 1876 return; 1877 } 1878 (*commit)(vp, plist, offset, count, cr); 1879 } 1880 1881 void 1882 nfs_async_inactive(vnode_t *vp, cred_t *cr, 1883 void (*inactive)(vnode_t *, cred_t *, caller_context_t *)) 1884 { 1885 mntinfo_t *mi; 1886 struct nfs_async_reqs *args; 1887 1888 mi = VTOMI(vp); 1889 1890 args = kmem_alloc(sizeof (*args), KM_SLEEP); 1891 args->a_next = NULL; 1892 #ifdef DEBUG 1893 args->a_queuer = curthread; 1894 #endif 1895 args->a_vp = vp; 1896 ASSERT(cr != NULL); 1897 crhold(cr); 1898 args->a_cred = cr; 1899 args->a_io = NFS_INACTIVE; 1900 args->a_nfs_inactive = inactive; 1901 1902 /* 1903 * Note that we don't check mi->mi_max_threads here, since we 1904 * *need* to get rid of this vnode regardless of whether someone 1905 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system. 1906 * 1907 * The manager thread knows about this and is willing to create 1908 * at least one thread to accommodate us. 1909 */ 1910 mutex_enter(&mi->mi_async_lock); 1911 if (mi->mi_manager_thread == NULL) { 1912 rnode_t *rp = VTOR(vp); 1913 1914 mutex_exit(&mi->mi_async_lock); 1915 crfree(cr); /* drop our reference */ 1916 kmem_free(args, sizeof (*args)); 1917 /* 1918 * We can't do an over-the-wire call since we're in the wrong 1919 * zone, so we need to clean up state as best we can and then 1920 * throw away the vnode. 1921 */ 1922 mutex_enter(&rp->r_statelock); 1923 if (rp->r_unldvp != NULL) { 1924 vnode_t *unldvp; 1925 char *unlname; 1926 cred_t *unlcred; 1927 1928 unldvp = rp->r_unldvp; 1929 rp->r_unldvp = NULL; 1930 unlname = rp->r_unlname; 1931 rp->r_unlname = NULL; 1932 unlcred = rp->r_unlcred; 1933 rp->r_unlcred = NULL; 1934 mutex_exit(&rp->r_statelock); 1935 1936 VN_RELE(unldvp); 1937 kmem_free(unlname, MAXNAMELEN); 1938 crfree(unlcred); 1939 } else { 1940 mutex_exit(&rp->r_statelock); 1941 } 1942 /* 1943 * No need to explicitly throw away any cached pages. The 1944 * eventual rinactive() will attempt a synchronous 1945 * VOP_PUTPAGE() which will immediately fail since the request 1946 * is coming from the wrong zone, and then will proceed to call 1947 * nfs_invalidate_pages() which will clean things up for us. 1948 */ 1949 rp_addfree(VTOR(vp), cr); 1950 return; 1951 } 1952 1953 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) { 1954 mi->mi_async_reqs[NFS_INACTIVE] = args; 1955 } else { 1956 mi->mi_async_tail[NFS_INACTIVE]->a_next = args; 1957 } 1958 mi->mi_async_tail[NFS_INACTIVE] = args; 1959 /* 1960 * Don't increment r_count, since we're trying to get rid of the vnode. 
1961 */ 1962 1963 mi->mi_async_req_count++; 1964 ASSERT(mi->mi_async_req_count != 0); 1965 cv_signal(&mi->mi_async_reqs_cv); 1966 mutex_exit(&mi->mi_async_lock); 1967 } 1968 1969 /* 1970 * The async queues for each mounted file system are arranged as a 1971 * set of queues, one for each async i/o type. Requests are taken 1972 * from the queues in a round-robin fashion. A number of consecutive 1973 * requests are taken from each queue before moving on to the next 1974 * queue. This functionality may allow the NFS Version 2 server to do 1975 * write clustering, even if the client is mixing writes and reads 1976 * because it will take multiple write requests from the queue 1977 * before processing any of the other async i/o types. 1978 * 1979 * XXX The nfs_async_start thread is unsafe in the light of the present 1980 * model defined by cpr to suspend the system. Specifically over the 1981 * wire calls are cpr-unsafe. The thread should be reevaluated in 1982 * case of future updates to the cpr model. 1983 */ 1984 static void 1985 nfs_async_start(struct vfs *vfsp) 1986 { 1987 struct nfs_async_reqs *args; 1988 mntinfo_t *mi = VFTOMI(vfsp); 1989 clock_t time_left = 1; 1990 callb_cpr_t cprinfo; 1991 int i; 1992 1993 /* 1994 * Dynamic initialization of nfs_async_timeout to allow nfs to be 1995 * built in an implementation independent manner. 1996 */ 1997 if (nfs_async_timeout == -1) 1998 nfs_async_timeout = NFS_ASYNC_TIMEOUT; 1999 2000 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas"); 2001 2002 mutex_enter(&mi->mi_async_lock); 2003 for (;;) { 2004 /* 2005 * Find the next queue containing an entry. We start 2006 * at the current queue pointer and then round robin 2007 * through all of them until we either find a non-empty 2008 * queue or have looked through all of them. 2009 */ 2010 for (i = 0; i < NFS_ASYNC_TYPES; i++) { 2011 args = *mi->mi_async_curr; 2012 if (args != NULL) 2013 break; 2014 mi->mi_async_curr++; 2015 if (mi->mi_async_curr == 2016 &mi->mi_async_reqs[NFS_ASYNC_TYPES]) 2017 mi->mi_async_curr = &mi->mi_async_reqs[0]; 2018 } 2019 /* 2020 * If we didn't find a entry, then block until woken up 2021 * again and then look through the queues again. 2022 */ 2023 if (args == NULL) { 2024 /* 2025 * Exiting is considered to be safe for CPR as well 2026 */ 2027 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2028 2029 /* 2030 * Wakeup thread waiting to unmount the file 2031 * system only if all async threads are inactive. 2032 * 2033 * If we've timed-out and there's nothing to do, 2034 * then get rid of this thread. 2035 */ 2036 if (mi->mi_max_threads == 0 || time_left <= 0) { 2037 if (--mi->mi_threads == 0) 2038 cv_signal(&mi->mi_async_cv); 2039 CALLB_CPR_EXIT(&cprinfo); 2040 VFS_RELE(vfsp); /* release thread's hold */ 2041 zthread_exit(); 2042 /* NOTREACHED */ 2043 } 2044 time_left = cv_timedwait(&mi->mi_async_work_cv, 2045 &mi->mi_async_lock, nfs_async_timeout + lbolt); 2046 2047 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 2048 2049 continue; 2050 } 2051 time_left = 1; 2052 2053 /* 2054 * Remove the request from the async queue and then 2055 * update the current async request queue pointer. If 2056 * the current queue is empty or we have removed enough 2057 * consecutive entries from it, then reset the counter 2058 * for this queue and then move the current pointer to 2059 * the next queue. 
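		 *
		 * For example, if mi_async_init_clusters is N, up to N
		 * consecutive requests of a single i/o type (say
		 * NFS_PUTAPAGE) are serviced before the current queue
		 * pointer advances to the next type's queue.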
2060 */ 2061 *mi->mi_async_curr = args->a_next; 2062 if (*mi->mi_async_curr == NULL || 2063 --mi->mi_async_clusters[args->a_io] == 0) { 2064 mi->mi_async_clusters[args->a_io] = 2065 mi->mi_async_init_clusters; 2066 mi->mi_async_curr++; 2067 if (mi->mi_async_curr == 2068 &mi->mi_async_reqs[NFS_ASYNC_TYPES]) 2069 mi->mi_async_curr = &mi->mi_async_reqs[0]; 2070 } 2071 2072 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) { 2073 mutex_enter(&mi->mi_lock); 2074 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 2075 mutex_exit(&mi->mi_lock); 2076 } 2077 2078 mutex_exit(&mi->mi_async_lock); 2079 2080 /* 2081 * Obtain arguments from the async request structure. 2082 */ 2083 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) { 2084 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff, 2085 args->a_nfs_addr, args->a_nfs_seg, 2086 args->a_cred); 2087 } else if (args->a_io == NFS_PUTAPAGE) { 2088 (void) (*args->a_nfs_putapage)(args->a_vp, 2089 args->a_nfs_pp, args->a_nfs_off, 2090 args->a_nfs_len, args->a_nfs_flags, 2091 args->a_cred); 2092 } else if (args->a_io == NFS_PAGEIO) { 2093 (void) (*args->a_nfs_pageio)(args->a_vp, 2094 args->a_nfs_pp, args->a_nfs_off, 2095 args->a_nfs_len, args->a_nfs_flags, 2096 args->a_cred); 2097 } else if (args->a_io == NFS_READDIR) { 2098 (void) ((*args->a_nfs_readdir)(args->a_vp, 2099 args->a_nfs_rdc, args->a_cred)); 2100 } else if (args->a_io == NFS_COMMIT) { 2101 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist, 2102 args->a_nfs_offset, args->a_nfs_count, 2103 args->a_cred); 2104 } else if (args->a_io == NFS_INACTIVE) { 2105 (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL); 2106 } 2107 2108 /* 2109 * Now, release the vnode and free the credentials 2110 * structure. 2111 */ 2112 free_async_args(args); 2113 /* 2114 * Reacquire the mutex because it will be needed above. 2115 */ 2116 mutex_enter(&mi->mi_async_lock); 2117 } 2118 } 2119 2120 void 2121 nfs_async_stop(struct vfs *vfsp) 2122 { 2123 mntinfo_t *mi = VFTOMI(vfsp); 2124 2125 /* 2126 * Wait for all outstanding async operations to complete and for the 2127 * worker threads to exit. 2128 */ 2129 mutex_enter(&mi->mi_async_lock); 2130 mi->mi_max_threads = 0; 2131 cv_broadcast(&mi->mi_async_work_cv); 2132 while (mi->mi_threads != 0) 2133 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2134 mutex_exit(&mi->mi_async_lock); 2135 } 2136 2137 /* 2138 * nfs_async_stop_sig: 2139 * Wait for all outstanding putpage operation to complete. If a signal 2140 * is deliver we will abort and return non-zero. If we can put all the 2141 * pages we will return 0. This routine is called from nfs_unmount and 2142 * nfs3_unmount to make these operations interruptible. 2143 */ 2144 int 2145 nfs_async_stop_sig(struct vfs *vfsp) 2146 { 2147 mntinfo_t *mi = VFTOMI(vfsp); 2148 ushort_t omax; 2149 int rval; 2150 2151 /* 2152 * Wait for all outstanding async operations to complete and for the 2153 * worker threads to exit. 2154 */ 2155 mutex_enter(&mi->mi_async_lock); 2156 omax = mi->mi_max_threads; 2157 mi->mi_max_threads = 0; 2158 /* 2159 * Tell all the worker threads to exit. 
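 */

/*
 * nfs_async_stop() above and this routine share the same shutdown
 * handshake.  Reduced to its essentials (sketch only, hypothetical
 * names; the real counters live in mntinfo_t):
 */
#if 0	/* illustrative only */
static void
ex_stop_workers(kmutex_t *lk, kcondvar_t *work_cv, kcondvar_t *exit_cv,
    uint_t *max_threads, uint_t *nthreads)
{
	mutex_enter(lk);
	*max_threads = 0;		/* forbid any further work */
	cv_broadcast(work_cv);		/* wake idle workers so they exit */
	while (*nthreads != 0)		/* last worker out signals exit_cv */
		cv_wait(exit_cv, lk);
	mutex_exit(lk);
}
#endif

/*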
2160 */ 2161 cv_broadcast(&mi->mi_async_work_cv); 2162 while (mi->mi_threads != 0) { 2163 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) 2164 break; 2165 } 2166 rval = (mi->mi_threads != 0); /* Interrupted */ 2167 if (rval) 2168 mi->mi_max_threads = omax; 2169 mutex_exit(&mi->mi_async_lock); 2170 2171 return (rval); 2172 } 2173 2174 int 2175 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2176 { 2177 int pagecreate; 2178 int n; 2179 int saved_n; 2180 caddr_t saved_base; 2181 u_offset_t offset; 2182 int error; 2183 int sm_error; 2184 vnode_t *vp = RTOV(rp); 2185 2186 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2187 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2188 if (!vpm_enable) { 2189 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2190 } 2191 2192 /* 2193 * Move bytes in at most PAGESIZE chunks. We must avoid 2194 * spanning pages in uiomove() because page faults may cause 2195 * the cache to be invalidated out from under us. The r_size is not 2196 * updated until after the uiomove. If we push the last page of a 2197 * file before r_size is correct, we will lose the data written past 2198 * the current (and invalid) r_size. 2199 */ 2200 do { 2201 offset = uio->uio_loffset; 2202 pagecreate = 0; 2203 2204 /* 2205 * n is the number of bytes required to satisfy the request 2206 * or the number of bytes to fill out the page. 2207 */ 2208 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2209 2210 /* 2211 * Check to see if we can skip reading in the page 2212 * and just allocate the memory. We can do this 2213 * if we are going to rewrite the entire mapping 2214 * or if we are going to write to or beyond the current 2215 * end of file from the beginning of the mapping. 2216 * 2217 * The read of r_size is now protected by r_statelock. 2218 */ 2219 mutex_enter(&rp->r_statelock); 2220 /* 2221 * When pgcreated is nonzero the caller has already done 2222 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2223 * segkpm this means we already have at least one page 2224 * created and mapped at base. 2225 */ 2226 pagecreate = pgcreated || 2227 ((offset & PAGEOFFSET) == 0 && 2228 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2229 2230 mutex_exit(&rp->r_statelock); 2231 if (!vpm_enable && pagecreate) { 2232 /* 2233 * The last argument tells segmap_pagecreate() to 2234 * always lock the page, as opposed to sometimes 2235 * returning with the page locked. This way we avoid a 2236 * fault on the ensuing uiomove(), but also 2237 * more importantly (to fix bug 1094402) we can 2238 * call segmap_fault() to unlock the page in all 2239 * cases. An alternative would be to modify 2240 * segmap_pagecreate() to tell us when it is 2241 * locking a page, but that's a fairly major 2242 * interface change. 2243 */ 2244 if (pgcreated == 0) 2245 (void) segmap_pagecreate(segkmap, base, 2246 (uint_t)n, 1); 2247 saved_base = base; 2248 saved_n = n; 2249 } 2250 2251 /* 2252 * The number of bytes of data in the last page can not 2253 * be accurately be determined while page is being 2254 * uiomove'd to and the size of the file being updated. 2255 * Thus, inform threads which need to know accurately 2256 * how much data is in the last page of the file. They 2257 * will not do the i/o immediately, but will arrange for 2258 * the i/o to happen later when this modify operation 2259 * will have finished. 
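 */

/*
 * Sketch (not from the original file) of the per-iteration transfer
 * size computed earlier in this loop: it is either what remains of
 * the request or what remains of the current page, whichever is
 * smaller, so a single uiomove() never spans a page boundary.  With
 * an 8K page, an offset of 8000 and 4000 bytes left to write, the
 * first chunk is 192 bytes and the next one starts page aligned.
 */
#if 0	/* illustrative only */
static size_t
ex_chunk(u_offset_t offset, size_t resid)
{
	size_t in_page = PAGESIZE - (size_t)(offset & PAGEOFFSET);

	return (in_page < resid ? in_page : resid);
}
#endif

/*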
2260 */ 2261 ASSERT(!(rp->r_flags & RMODINPROGRESS)); 2262 mutex_enter(&rp->r_statelock); 2263 rp->r_flags |= RMODINPROGRESS; 2264 rp->r_modaddr = (offset & MAXBMASK); 2265 mutex_exit(&rp->r_statelock); 2266 2267 if (vpm_enable) { 2268 /* 2269 * Copy data. If new pages are created, part of 2270 * the page that is not written will be initizliazed 2271 * with zeros. 2272 */ 2273 error = vpm_data_copy(vp, offset, n, uio, 2274 !pagecreate, NULL, 0, S_WRITE); 2275 } else { 2276 error = uiomove(base, n, UIO_WRITE, uio); 2277 } 2278 2279 /* 2280 * r_size is the maximum number of 2281 * bytes known to be in the file. 2282 * Make sure it is at least as high as the 2283 * first unwritten byte pointed to by uio_loffset. 2284 */ 2285 mutex_enter(&rp->r_statelock); 2286 if (rp->r_size < uio->uio_loffset) 2287 rp->r_size = uio->uio_loffset; 2288 rp->r_flags &= ~RMODINPROGRESS; 2289 rp->r_flags |= RDIRTY; 2290 mutex_exit(&rp->r_statelock); 2291 2292 /* n = # of bytes written */ 2293 n = (int)(uio->uio_loffset - offset); 2294 2295 if (!vpm_enable) { 2296 base += n; 2297 } 2298 tcount -= n; 2299 /* 2300 * If we created pages w/o initializing them completely, 2301 * we need to zero the part that wasn't set up. 2302 * This happens on a most EOF write cases and if 2303 * we had some sort of error during the uiomove. 2304 */ 2305 if (!vpm_enable && pagecreate) { 2306 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2307 (void) kzero(base, PAGESIZE - n); 2308 2309 if (pgcreated) { 2310 /* 2311 * Caller is responsible for this page, 2312 * it was not created in this loop. 2313 */ 2314 pgcreated = 0; 2315 } else { 2316 /* 2317 * For bug 1094402: segmap_pagecreate locks 2318 * page. Unlock it. This also unlocks the 2319 * pages allocated by page_create_va() in 2320 * segmap_pagecreate(). 2321 */ 2322 sm_error = segmap_fault(kas.a_hat, segkmap, 2323 saved_base, saved_n, 2324 F_SOFTUNLOCK, S_WRITE); 2325 if (error == 0) 2326 error = sm_error; 2327 } 2328 } 2329 } while (tcount > 0 && error == 0); 2330 2331 return (error); 2332 } 2333 2334 int 2335 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2336 { 2337 rnode_t *rp; 2338 page_t *pp; 2339 u_offset_t eoff; 2340 u_offset_t io_off; 2341 size_t io_len; 2342 int error; 2343 int rdirty; 2344 int err; 2345 2346 rp = VTOR(vp); 2347 ASSERT(rp->r_count > 0); 2348 2349 if (!vn_has_cached_data(vp)) 2350 return (0); 2351 2352 ASSERT(vp->v_type != VCHR); 2353 2354 /* 2355 * If ROUTOFSPACE is set, then all writes turn into B_INVAL 2356 * writes. B_FORCE is set to force the VM system to actually 2357 * invalidate the pages, even if the i/o failed. The pages 2358 * need to get invalidated because they can't be written out 2359 * because there isn't any space left on either the server's 2360 * file system or in the user's disk quota. The B_FREE bit 2361 * is cleared to avoid confusion as to whether this is a 2362 * request to place the page on the freelist or to destroy 2363 * it. 2364 */ 2365 if ((rp->r_flags & ROUTOFSPACE) || 2366 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2367 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2368 2369 if (len == 0) { 2370 /* 2371 * If doing a full file synchronous operation, then clear 2372 * the RDIRTY bit. If a page gets dirtied while the flush 2373 * is happening, then RDIRTY will get set again. The 2374 * RDIRTY bit must get cleared before the flush so that 2375 * we don't lose this information. 2376 * 2377 * If there are no full file async write operations 2378 * pending and RDIRTY bit is set, clear it. 
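 */

/*
 * The RDIRTY handling below follows a save/clear/restore-on-failure
 * pattern: remember whether the flag was set, clear it before the
 * flush so concurrent writers can re-dirty the file, and put it back
 * if the flush fails and the pages are not being invalidated anyway.
 * Condensed sketch with hypothetical names (not the real routine):
 */
#if 0	/* illustrative only */
static int
ex_flush_dirty(uint_t *flagsp, uint_t dirtybit, kmutex_t *lk,
    int (*flush)(void *), void *arg)
{
	int was_dirty, error;

	mutex_enter(lk);
	was_dirty = (*flagsp & dirtybit);
	*flagsp &= ~dirtybit;		/* may be re-set by new writers */
	mutex_exit(lk);

	error = flush(arg);

	if (error && was_dirty) {
		mutex_enter(lk);
		*flagsp |= dirtybit;	/* flush failed; still dirty */
		mutex_exit(lk);
	}
	return (error);
}
#endif

/*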
2379 */ 2380 if (off == (u_offset_t)0 && 2381 !(flags & B_ASYNC) && 2382 (rp->r_flags & RDIRTY)) { 2383 mutex_enter(&rp->r_statelock); 2384 rdirty = (rp->r_flags & RDIRTY); 2385 rp->r_flags &= ~RDIRTY; 2386 mutex_exit(&rp->r_statelock); 2387 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2388 mutex_enter(&rp->r_statelock); 2389 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) { 2390 rdirty = (rp->r_flags & RDIRTY); 2391 rp->r_flags &= ~RDIRTY; 2392 } 2393 mutex_exit(&rp->r_statelock); 2394 } else 2395 rdirty = 0; 2396 2397 /* 2398 * Search the entire vp list for pages >= off, and flush 2399 * the dirty pages. 2400 */ 2401 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2402 flags, cr); 2403 2404 /* 2405 * If an error occurred and the file was marked as dirty 2406 * before and we aren't forcibly invalidating pages, then 2407 * reset the RDIRTY flag. 2408 */ 2409 if (error && rdirty && 2410 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2411 mutex_enter(&rp->r_statelock); 2412 rp->r_flags |= RDIRTY; 2413 mutex_exit(&rp->r_statelock); 2414 } 2415 } else { 2416 /* 2417 * Do a range from [off...off + len) looking for pages 2418 * to deal with. 2419 */ 2420 error = 0; 2421 #ifdef lint 2422 io_len = 0; 2423 #endif 2424 eoff = off + len; 2425 mutex_enter(&rp->r_statelock); 2426 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2427 io_off += io_len) { 2428 mutex_exit(&rp->r_statelock); 2429 /* 2430 * If we are not invalidating, synchronously 2431 * freeing or writing pages use the routine 2432 * page_lookup_nowait() to prevent reclaiming 2433 * them from the free list. 2434 */ 2435 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2436 pp = page_lookup(vp, io_off, 2437 (flags & (B_INVAL | B_FREE)) ? 2438 SE_EXCL : SE_SHARED); 2439 } else { 2440 pp = page_lookup_nowait(vp, io_off, 2441 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2442 } 2443 2444 if (pp == NULL || !pvn_getdirty(pp, flags)) 2445 io_len = PAGESIZE; 2446 else { 2447 err = (*rp->r_putapage)(vp, pp, &io_off, 2448 &io_len, flags, cr); 2449 if (!error) 2450 error = err; 2451 /* 2452 * "io_off" and "io_len" are returned as 2453 * the range of pages we actually wrote. 2454 * This allows us to skip ahead more quickly 2455 * since several pages may've been dealt 2456 * with by this iteration of the loop. 2457 */ 2458 } 2459 mutex_enter(&rp->r_statelock); 2460 } 2461 mutex_exit(&rp->r_statelock); 2462 } 2463 2464 return (error); 2465 } 2466 2467 void 2468 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2469 { 2470 rnode_t *rp; 2471 2472 rp = VTOR(vp); 2473 mutex_enter(&rp->r_statelock); 2474 while (rp->r_flags & RTRUNCATE) 2475 cv_wait(&rp->r_cv, &rp->r_statelock); 2476 rp->r_flags |= RTRUNCATE; 2477 if (off == (u_offset_t)0) { 2478 rp->r_flags &= ~RDIRTY; 2479 if (!(rp->r_flags & RSTALE)) 2480 rp->r_error = 0; 2481 } 2482 rp->r_truncaddr = off; 2483 mutex_exit(&rp->r_statelock); 2484 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2485 B_INVAL | B_TRUNC, cr); 2486 mutex_enter(&rp->r_statelock); 2487 rp->r_flags &= ~RTRUNCATE; 2488 cv_broadcast(&rp->r_cv); 2489 mutex_exit(&rp->r_statelock); 2490 } 2491 2492 static int nfs_write_error_to_cons_only = 0; 2493 #define MSG(x) (nfs_write_error_to_cons_only ? 
(x) : (x) + 1) 2494 2495 /* 2496 * Print a file handle 2497 */ 2498 void 2499 nfs_printfhandle(nfs_fhandle *fhp) 2500 { 2501 int *ip; 2502 char *buf; 2503 size_t bufsize; 2504 char *cp; 2505 2506 /* 2507 * 13 == "(file handle:" 2508 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times 2509 * 1 == ' ' 2510 * 8 == maximum strlen of "%x" 2511 * 3 == ")\n\0" 2512 */ 2513 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3; 2514 buf = kmem_alloc(bufsize, KM_NOSLEEP); 2515 if (buf == NULL) 2516 return; 2517 2518 cp = buf; 2519 (void) strcpy(cp, "(file handle:"); 2520 while (*cp != '\0') 2521 cp++; 2522 for (ip = (int *)fhp->fh_buf; 2523 ip < (int *)&fhp->fh_buf[fhp->fh_len]; 2524 ip++) { 2525 (void) sprintf(cp, " %x", *ip); 2526 while (*cp != '\0') 2527 cp++; 2528 } 2529 (void) strcpy(cp, ")\n"); 2530 2531 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf); 2532 2533 kmem_free(buf, bufsize); 2534 } 2535 2536 /* 2537 * Notify the system administrator that an NFS write error has 2538 * occurred. 2539 */ 2540 2541 /* seconds between ENOSPC/EDQUOT messages */ 2542 clock_t nfs_write_error_interval = 5; 2543 2544 void 2545 nfs_write_error(vnode_t *vp, int error, cred_t *cr) 2546 { 2547 mntinfo_t *mi; 2548 2549 mi = VTOMI(vp); 2550 /* 2551 * In case of forced unmount or zone shutdown, do not print any 2552 * messages since it can flood the console with error messages. 2553 */ 2554 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) 2555 return; 2556 2557 /* 2558 * No use in flooding the console with ENOSPC 2559 * messages from the same file system. 2560 */ 2561 if ((error != ENOSPC && error != EDQUOT) || 2562 lbolt - mi->mi_printftime > 0) { 2563 zoneid_t zoneid = mi->mi_zone->zone_id; 2564 2565 #ifdef DEBUG 2566 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2567 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL); 2568 #else 2569 nfs_perror(error, "NFS write error on host %s: %m.\n", 2570 VTOR(vp)->r_server->sv_hostname, NULL); 2571 #endif 2572 if (error == ENOSPC || error == EDQUOT) { 2573 zcmn_err(zoneid, CE_CONT, 2574 MSG("^File: userid=%d, groupid=%d\n"), 2575 crgetuid(cr), crgetgid(cr)); 2576 if (crgetuid(CRED()) != crgetuid(cr) || 2577 crgetgid(CRED()) != crgetgid(cr)) { 2578 zcmn_err(zoneid, CE_CONT, 2579 MSG("^User: userid=%d, groupid=%d\n"), 2580 crgetuid(CRED()), crgetgid(CRED())); 2581 } 2582 mi->mi_printftime = lbolt + 2583 nfs_write_error_interval * hz; 2584 } 2585 nfs_printfhandle(&VTOR(vp)->r_fh); 2586 #ifdef DEBUG 2587 if (error == EACCES) { 2588 zcmn_err(zoneid, CE_CONT, 2589 MSG("^nfs_bio: cred is%s kcred\n"), 2590 cr == kcred ? "" : " not"); 2591 } 2592 #endif 2593 } 2594 } 2595 2596 /* ARGSUSED */ 2597 static void * 2598 nfs_mi_init(zoneid_t zoneid) 2599 { 2600 struct mi_globals *mig; 2601 2602 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2603 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2604 list_create(&mig->mig_list, sizeof (mntinfo_t), 2605 offsetof(mntinfo_t, mi_zone_node)); 2606 mig->mig_destructor_called = B_FALSE; 2607 return (mig); 2608 } 2609 2610 /* 2611 * Callback routine to tell all NFS mounts in the zone to stop creating new 2612 * threads. Existing threads should exit. 
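 */

/*
 * nfs_mi_shutdown() below cannot hold the zone's mount-list lock
 * across the per-mount work (the VFS_RELE() may change the list), so
 * it marks each mount it has visited and restarts the scan from the
 * head.  A sketch of that idiom with hypothetical names (assumes the
 * list was created over struct ex_node with offsetof(.., n_node)):
 */
#if 0	/* illustrative only */
struct ex_node {
	list_node_t	n_node;
	int		n_done;
};

static void
ex_restart_scan(list_t *lst, kmutex_t *lk)
{
	struct ex_node *np;

again:
	mutex_enter(lk);
	for (np = list_head(lst); np != NULL; np = list_next(lst, np)) {
		if (np->n_done)
			continue;	/* already handled; keep scanning */
		np->n_done = 1;
		mutex_exit(lk);
		/* ... per-node work that may sleep or change the list ... */
		goto again;
	}
	mutex_exit(lk);
}
#endif

/*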
2613 */ 2614 /* ARGSUSED */ 2615 static void 2616 nfs_mi_shutdown(zoneid_t zoneid, void *data) 2617 { 2618 struct mi_globals *mig = data; 2619 mntinfo_t *mi; 2620 2621 ASSERT(mig != NULL); 2622 again: 2623 mutex_enter(&mig->mig_lock); 2624 for (mi = list_head(&mig->mig_list); mi != NULL; 2625 mi = list_next(&mig->mig_list, mi)) { 2626 2627 /* 2628 * If we've done the shutdown work for this FS, skip. 2629 * Once we go off the end of the list, we're done. 2630 */ 2631 if (mi->mi_flags & MI_DEAD) 2632 continue; 2633 2634 /* 2635 * We will do work, so not done. Get a hold on the FS. 2636 */ 2637 VFS_HOLD(mi->mi_vfsp); 2638 2639 /* 2640 * purge the DNLC for this filesystem 2641 */ 2642 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2643 2644 mutex_enter(&mi->mi_async_lock); 2645 /* 2646 * Tell existing async worker threads to exit. 2647 */ 2648 mi->mi_max_threads = 0; 2649 cv_broadcast(&mi->mi_async_work_cv); 2650 /* 2651 * Set MI_ASYNC_MGR_STOP so the async manager thread starts 2652 * getting ready to exit when it's done with its current work. 2653 * Also set MI_DEAD to note we've acted on this FS. 2654 */ 2655 mutex_enter(&mi->mi_lock); 2656 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD); 2657 mutex_exit(&mi->mi_lock); 2658 /* 2659 * Wake up the async manager thread. 2660 */ 2661 cv_broadcast(&mi->mi_async_reqs_cv); 2662 mutex_exit(&mi->mi_async_lock); 2663 2664 /* 2665 * Drop lock and release FS, which may change list, then repeat. 2666 * We're done when every mi has been done or the list is empty. 2667 */ 2668 mutex_exit(&mig->mig_lock); 2669 VFS_RELE(mi->mi_vfsp); 2670 goto again; 2671 } 2672 mutex_exit(&mig->mig_lock); 2673 } 2674 2675 static void 2676 nfs_mi_free_globals(struct mi_globals *mig) 2677 { 2678 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2679 mutex_destroy(&mig->mig_lock); 2680 kmem_free(mig, sizeof (*mig)); 2681 2682 } 2683 2684 /* ARGSUSED */ 2685 static void 2686 nfs_mi_destroy(zoneid_t zoneid, void *data) 2687 { 2688 struct mi_globals *mig = data; 2689 2690 ASSERT(mig != NULL); 2691 mutex_enter(&mig->mig_lock); 2692 if (list_head(&mig->mig_list) != NULL) { 2693 /* Still waiting for VFS_FREEVFS() */ 2694 mig->mig_destructor_called = B_TRUE; 2695 mutex_exit(&mig->mig_lock); 2696 return; 2697 } 2698 nfs_mi_free_globals(mig); 2699 } 2700 2701 /* 2702 * Add an NFS mount to the per-zone list of NFS mounts. 2703 */ 2704 void 2705 nfs_mi_zonelist_add(mntinfo_t *mi) 2706 { 2707 struct mi_globals *mig; 2708 2709 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2710 mutex_enter(&mig->mig_lock); 2711 list_insert_head(&mig->mig_list, mi); 2712 mutex_exit(&mig->mig_lock); 2713 } 2714 2715 /* 2716 * Remove an NFS mount from the per-zone list of NFS mounts. 2717 */ 2718 static void 2719 nfs_mi_zonelist_remove(mntinfo_t *mi) 2720 { 2721 struct mi_globals *mig; 2722 2723 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2724 mutex_enter(&mig->mig_lock); 2725 list_remove(&mig->mig_list, mi); 2726 /* 2727 * We can be called asynchronously by VFS_FREEVFS() after the zone 2728 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2729 * mi globals. 2730 */ 2731 if (list_head(&mig->mig_list) == NULL && 2732 mig->mig_destructor_called == B_TRUE) { 2733 nfs_mi_free_globals(mig); 2734 return; 2735 } 2736 mutex_exit(&mig->mig_lock); 2737 } 2738 2739 /* 2740 * NFS Client initialization routine. This routine should only be called 2741 * once. 
It performs the following tasks: 2742 * - Initalize all global locks 2743 * - Call sub-initialization routines (localize access to variables) 2744 */ 2745 int 2746 nfs_clntinit(void) 2747 { 2748 #ifdef DEBUG 2749 static boolean_t nfs_clntup = B_FALSE; 2750 #endif 2751 int error; 2752 2753 #ifdef DEBUG 2754 ASSERT(nfs_clntup == B_FALSE); 2755 #endif 2756 2757 error = nfs_subrinit(); 2758 if (error) 2759 return (error); 2760 2761 error = nfs_vfsinit(); 2762 if (error) { 2763 /* 2764 * Cleanup nfs_subrinit() work 2765 */ 2766 nfs_subrfini(); 2767 return (error); 2768 } 2769 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown, 2770 nfs_mi_destroy); 2771 2772 nfs4_clnt_init(); 2773 2774 #ifdef DEBUG 2775 nfs_clntup = B_TRUE; 2776 #endif 2777 2778 return (0); 2779 } 2780 2781 /* 2782 * This routine is only called if the NFS Client has been initialized but 2783 * the module failed to be installed. This routine will cleanup the previously 2784 * allocated/initialized work. 2785 */ 2786 void 2787 nfs_clntfini(void) 2788 { 2789 (void) zone_key_delete(mi_list_key); 2790 nfs_subrfini(); 2791 nfs_vfsfini(); 2792 nfs4_clnt_fini(); 2793 } 2794 2795 /* 2796 * nfs_lockrelease: 2797 * 2798 * Release any locks on the given vnode that are held by the current 2799 * process. 2800 */ 2801 void 2802 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 2803 { 2804 flock64_t ld; 2805 struct shrlock shr; 2806 char *buf; 2807 int remote_lock_possible; 2808 int ret; 2809 2810 ASSERT((uintptr_t)vp > KERNELBASE); 2811 2812 /* 2813 * Generate an explicit unlock operation for the entire file. As a 2814 * partial optimization, only generate the unlock if there is a 2815 * lock registered for the file. We could check whether this 2816 * particular process has any locks on the file, but that would 2817 * require the local locking code to provide yet another query 2818 * routine. Note that no explicit synchronization is needed here. 2819 * At worst, flk_has_remote_locks() will return a false positive, 2820 * in which case the unlock call wastes time but doesn't harm 2821 * correctness. 2822 * 2823 * In addition, an unlock request is generated if the process 2824 * is listed as possibly having a lock on the file because the 2825 * server and client lock managers may have gotten out of sync. 2826 * N.B. It is important to make sure nfs_remove_locking_id() is 2827 * called here even if flk_has_remote_locks(vp) reports true. 2828 * If it is not called and there is an entry on the process id 2829 * list, that entry will never get removed. 2830 */ 2831 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID, 2832 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2833 if (remote_lock_possible || flk_has_remote_locks(vp)) { 2834 ld.l_type = F_UNLCK; /* set to unlock entire file */ 2835 ld.l_whence = 0; /* unlock from start of file */ 2836 ld.l_start = 0; 2837 ld.l_len = 0; /* do entire file */ 2838 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr, 2839 NULL); 2840 2841 if (ret != 0) { 2842 /* 2843 * If VOP_FRLOCK fails, make sure we unregister 2844 * local locks before we continue. 2845 */ 2846 ld.l_pid = ttoproc(curthread)->p_pid; 2847 lm_register_lock_locally(vp, NULL, &ld, flag, offset); 2848 #ifdef DEBUG 2849 nfs_perror(ret, 2850 "NFS lock release error on vp %p: %m.\n", 2851 (void *)vp, NULL); 2852 #endif 2853 } 2854 2855 /* 2856 * The call to VOP_FRLOCK may put the pid back on the 2857 * list. We need to remove it. 
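 */

/*
 * Sketch (not part of the original file) of the whole-file unlock
 * record built above: F_UNLCK starting at offset 0 with a length of
 * zero, which by convention means "through end of file".
 */
#if 0	/* illustrative only */
static void
ex_init_unlock_all(flock64_t *ld)
{
	bzero(ld, sizeof (*ld));
	ld->l_type = F_UNLCK;	/* release, do not acquire */
	ld->l_whence = 0;	/* offsets relative to start of file */
	ld->l_start = 0;
	ld->l_len = 0;		/* zero length == to EOF */
}
#endif

/*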
2858 */ 2859 (void) nfs_remove_locking_id(vp, RLMPL_PID, 2860 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2861 } 2862 2863 /* 2864 * As long as the vp has a share matching our pid, 2865 * pluck it off and unshare it. There are circumstances in 2866 * which the call to nfs_remove_locking_id() may put the 2867 * owner back on the list, in which case we simply do a 2868 * redundant and harmless unshare. 2869 */ 2870 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP); 2871 while (nfs_remove_locking_id(vp, RLMPL_OWNER, 2872 (char *)NULL, buf, &shr.s_own_len)) { 2873 shr.s_owner = buf; 2874 shr.s_access = 0; 2875 shr.s_deny = 0; 2876 shr.s_sysid = 0; 2877 shr.s_pid = curproc->p_pid; 2878 2879 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL); 2880 #ifdef DEBUG 2881 if (ret != 0) { 2882 nfs_perror(ret, 2883 "NFS share release error on vp %p: %m.\n", 2884 (void *)vp, NULL); 2885 } 2886 #endif 2887 } 2888 kmem_free(buf, MAX_SHR_OWNER_LEN); 2889 } 2890 2891 /* 2892 * nfs_lockcompletion: 2893 * 2894 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2895 * as non cachable (set VNOCACHE bit). 2896 */ 2897 2898 void 2899 nfs_lockcompletion(vnode_t *vp, int cmd) 2900 { 2901 #ifdef DEBUG 2902 rnode_t *rp = VTOR(vp); 2903 2904 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2905 #endif 2906 2907 if (cmd == F_SETLK || cmd == F_SETLKW) { 2908 if (!lm_safemap(vp)) { 2909 mutex_enter(&vp->v_lock); 2910 vp->v_flag |= VNOCACHE; 2911 mutex_exit(&vp->v_lock); 2912 } else { 2913 mutex_enter(&vp->v_lock); 2914 vp->v_flag &= ~VNOCACHE; 2915 mutex_exit(&vp->v_lock); 2916 } 2917 } 2918 /* 2919 * The cached attributes of the file are stale after acquiring 2920 * the lock on the file. They were updated when the file was 2921 * opened, but not updated when the lock was acquired. Therefore the 2922 * cached attributes are invalidated after the lock is obtained. 2923 */ 2924 PURGE_ATTRCACHE(vp); 2925 } 2926 2927 /* 2928 * The lock manager holds state making it possible for the client 2929 * and server to be out of sync. For example, if the response from 2930 * the server granting a lock request is lost, the server will think 2931 * the lock is granted and the client will think the lock is lost. 2932 * The client can tell when it is not positive if it is in sync with 2933 * the server. 2934 * 2935 * To deal with this, a list of processes for which the client is 2936 * not sure if the server holds a lock is attached to the rnode. 2937 * When such a process closes the rnode, an unlock request is sent 2938 * to the server to unlock the entire file. 2939 * 2940 * The list is kept as a singularly linked NULL terminated list. 2941 * Because it is only added to under extreme error conditions, the 2942 * list shouldn't get very big. DEBUG kernels print a message if 2943 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily 2944 * choosen to be 8, but can be tuned at runtime. 2945 */ 2946 #ifdef DEBUG 2947 /* int nfs_lmpl_high_water = 8; */ 2948 int nfs_lmpl_high_water = 128; 2949 int nfs_cnt_add_locking_id = 0; 2950 int nfs_len_add_locking_id = 0; 2951 #endif /* DEBUG */ 2952 2953 /* 2954 * Record that the nfs lock manager server may be holding a lock on 2955 * a vnode for a process. 2956 * 2957 * Because the nfs lock manager server holds state, it is possible 2958 * for the server to get out of sync with the client. This routine is called 2959 * from the client when it is no longer sure if the server is in sync 2960 * with the client. 
nfs_lockrelease() will then notice this and send 2961 * an unlock request when the file is closed 2962 */ 2963 void 2964 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len) 2965 { 2966 rnode_t *rp; 2967 lmpl_t *new; 2968 lmpl_t *cur; 2969 lmpl_t **lmplp; 2970 #ifdef DEBUG 2971 int list_len = 1; 2972 #endif /* DEBUG */ 2973 2974 #ifdef DEBUG 2975 ++nfs_cnt_add_locking_id; 2976 #endif /* DEBUG */ 2977 /* 2978 * allocate new lmpl_t now so we don't sleep 2979 * later after grabbing mutexes 2980 */ 2981 ASSERT(len < MAX_SHR_OWNER_LEN); 2982 new = kmem_alloc(sizeof (*new), KM_SLEEP); 2983 new->lmpl_type = type; 2984 new->lmpl_pid = pid; 2985 new->lmpl_owner = kmem_alloc(len, KM_SLEEP); 2986 bcopy(id, new->lmpl_owner, len); 2987 new->lmpl_own_len = len; 2988 new->lmpl_next = (lmpl_t *)NULL; 2989 #ifdef DEBUG 2990 if (type == RLMPL_PID) { 2991 ASSERT(len == sizeof (pid_t)); 2992 ASSERT(pid == *(pid_t *)new->lmpl_owner); 2993 } else { 2994 ASSERT(type == RLMPL_OWNER); 2995 } 2996 #endif 2997 2998 rp = VTOR(vp); 2999 mutex_enter(&rp->r_statelock); 3000 3001 /* 3002 * Add this id to the list for this rnode only if the 3003 * rnode is active and the id is not already there. 3004 */ 3005 ASSERT(rp->r_flags & RHASHED); 3006 lmplp = &(rp->r_lmpl); 3007 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3008 if (cur->lmpl_pid == pid && 3009 cur->lmpl_type == type && 3010 cur->lmpl_own_len == len && 3011 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) { 3012 kmem_free(new->lmpl_owner, len); 3013 kmem_free(new, sizeof (*new)); 3014 break; 3015 } 3016 lmplp = &cur->lmpl_next; 3017 #ifdef DEBUG 3018 ++list_len; 3019 #endif /* DEBUG */ 3020 } 3021 if (cur == (lmpl_t *)NULL) { 3022 *lmplp = new; 3023 #ifdef DEBUG 3024 if (list_len > nfs_len_add_locking_id) { 3025 nfs_len_add_locking_id = list_len; 3026 } 3027 if (list_len > nfs_lmpl_high_water) { 3028 cmn_err(CE_WARN, "nfs_add_locking_id: long list " 3029 "vp=%p is %d", (void *)vp, list_len); 3030 } 3031 #endif /* DEBUG */ 3032 } 3033 3034 #ifdef DEBUG 3035 if (share_debug) { 3036 int nitems = 0; 3037 int npids = 0; 3038 int nowners = 0; 3039 3040 /* 3041 * Count the number of things left on r_lmpl after the remove. 3042 */ 3043 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3044 cur = cur->lmpl_next) { 3045 nitems++; 3046 if (cur->lmpl_type == RLMPL_PID) { 3047 npids++; 3048 } else if (cur->lmpl_type == RLMPL_OWNER) { 3049 nowners++; 3050 } else { 3051 cmn_err(CE_PANIC, "nfs_add_locking_id: " 3052 "unrecognized lmpl_type %d", 3053 cur->lmpl_type); 3054 } 3055 } 3056 3057 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d " 3058 "OWNs = %d items left on r_lmpl\n", 3059 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems); 3060 } 3061 #endif 3062 3063 mutex_exit(&rp->r_statelock); 3064 } 3065 3066 /* 3067 * Remove an id from the lock manager id list. 3068 * 3069 * If the id is not in the list return 0. If it was found and 3070 * removed, return 1. 3071 */ 3072 static int 3073 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen) 3074 { 3075 lmpl_t *cur; 3076 lmpl_t **lmplp; 3077 rnode_t *rp; 3078 int rv = 0; 3079 3080 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER); 3081 3082 rp = VTOR(vp); 3083 3084 mutex_enter(&rp->r_statelock); 3085 ASSERT(rp->r_flags & RHASHED); 3086 lmplp = &(rp->r_lmpl); 3087 3088 /* 3089 * Search through the list and remove the entry for this id 3090 * if it is there. 
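 */

/*
 * Both the add and remove paths walk the singly linked r_lmpl list
 * with a pointer-to-pointer cursor, which lets the matching element
 * be spliced out without tracking a separate "previous" pointer.
 * Generic sketch (hypothetical types, not lmpl_t):
 */
#if 0	/* illustrative only */
struct ex_elem {
	struct ex_elem	*e_next;
	int		e_key;
};

static struct ex_elem *
ex_unlink(struct ex_elem **headp, int key)
{
	struct ex_elem **pp, *ep;

	for (pp = headp; (ep = *pp) != NULL; pp = &ep->e_next) {
		if (ep->e_key == key) {
			*pp = ep->e_next;	/* splice it out */
			return (ep);
		}
	}
	return (NULL);
}
#endif

/*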
The special case id == NULL allows removal 3091 * of the first share on the r_lmpl list belonging to the 3092 * current process (if any), without regard to further details 3093 * of its identity. 3094 */ 3095 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3096 if (cur->lmpl_type == type && 3097 cur->lmpl_pid == curproc->p_pid && 3098 (id == (char *)NULL || 3099 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) { 3100 *lmplp = cur->lmpl_next; 3101 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN); 3102 if (rid != NULL) { 3103 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len); 3104 *rlen = cur->lmpl_own_len; 3105 } 3106 kmem_free(cur->lmpl_owner, cur->lmpl_own_len); 3107 kmem_free(cur, sizeof (*cur)); 3108 rv = 1; 3109 break; 3110 } 3111 lmplp = &cur->lmpl_next; 3112 } 3113 3114 #ifdef DEBUG 3115 if (share_debug) { 3116 int nitems = 0; 3117 int npids = 0; 3118 int nowners = 0; 3119 3120 /* 3121 * Count the number of things left on r_lmpl after the remove. 3122 */ 3123 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3124 cur = cur->lmpl_next) { 3125 nitems++; 3126 if (cur->lmpl_type == RLMPL_PID) { 3127 npids++; 3128 } else if (cur->lmpl_type == RLMPL_OWNER) { 3129 nowners++; 3130 } else { 3131 cmn_err(CE_PANIC, 3132 "nrli: unrecognized lmpl_type %d", 3133 cur->lmpl_type); 3134 } 3135 } 3136 3137 cmn_err(CE_CONT, 3138 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n", 3139 (type == RLMPL_PID) ? "P" : "O", 3140 npids, 3141 nowners, 3142 nitems); 3143 } 3144 #endif 3145 3146 mutex_exit(&rp->r_statelock); 3147 return (rv); 3148 } 3149 3150 void 3151 nfs_free_mi(mntinfo_t *mi) 3152 { 3153 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP); 3154 ASSERT(mi->mi_manager_thread == NULL); 3155 ASSERT(mi->mi_threads == 0); 3156 3157 /* 3158 * Remove the node from the global list before we start tearing it down. 3159 */ 3160 nfs_mi_zonelist_remove(mi); 3161 if (mi->mi_klmconfig) { 3162 lm_free_config(mi->mi_klmconfig); 3163 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig)); 3164 } 3165 mutex_destroy(&mi->mi_lock); 3166 mutex_destroy(&mi->mi_remap_lock); 3167 mutex_destroy(&mi->mi_async_lock); 3168 cv_destroy(&mi->mi_failover_cv); 3169 cv_destroy(&mi->mi_async_work_cv); 3170 cv_destroy(&mi->mi_async_reqs_cv); 3171 cv_destroy(&mi->mi_async_cv); 3172 zone_rele(mi->mi_zone); 3173 kmem_free(mi, sizeof (*mi)); 3174 } 3175 3176 static int 3177 mnt_kstat_update(kstat_t *ksp, int rw) 3178 { 3179 mntinfo_t *mi; 3180 struct mntinfo_kstat *mik; 3181 vfs_t *vfsp; 3182 int i; 3183 3184 /* this is a read-only kstat. Bail out on a write */ 3185 if (rw == KSTAT_WRITE) 3186 return (EACCES); 3187 3188 /* 3189 * We don't want to wait here as kstat_chain_lock could be held by 3190 * dounmount(). dounmount() takes vfs_reflock before the chain lock 3191 * and thus could lead to a deadlock. 
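 */

/*
 * This routine follows the standard read-only ks_update contract:
 * refuse KSTAT_WRITE, then refresh ks_data from the state reachable
 * through ks_private.  Generic skeleton (sketch only; struct ex_stats
 * is a hypothetical raw-kstat layout):
 */
#if 0	/* illustrative only */
static int
ex_kstat_update(kstat_t *ksp, int rw)
{
	struct ex_stats *esp;
	void *state;

	if (rw == KSTAT_WRITE)
		return (EACCES);	/* read-only kstat */

	esp = (struct ex_stats *)ksp->ks_data;
	state = ksp->ks_private;
	/* ... copy the interesting fields of *state into *esp ... */

	return (0);
}
#endif

/*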
3192 */ 3193 vfsp = (struct vfs *)ksp->ks_private; 3194 3195 3196 mi = VFTOMI(vfsp); 3197 3198 mik = (struct mntinfo_kstat *)ksp->ks_data; 3199 3200 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 3201 mik->mik_vers = (uint32_t)mi->mi_vers; 3202 mik->mik_flags = mi->mi_flags; 3203 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod; 3204 mik->mik_curread = (uint32_t)mi->mi_curread; 3205 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 3206 mik->mik_retrans = mi->mi_retrans; 3207 mik->mik_timeo = mi->mi_timeo; 3208 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 3209 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 3210 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 3211 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 3212 for (i = 0; i < NFS_CALLTYPES + 1; i++) { 3213 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt; 3214 mik->mik_timers[i].deviate = 3215 (uint32_t)mi->mi_timers[i].rt_deviate; 3216 mik->mik_timers[i].rtxcur = 3217 (uint32_t)mi->mi_timers[i].rt_rtxcur; 3218 } 3219 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 3220 mik->mik_failover = (uint32_t)mi->mi_failover; 3221 mik->mik_remap = (uint32_t)mi->mi_remap; 3222 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 3223 3224 return (0); 3225 } 3226 3227 void 3228 nfs_mnt_kstat_init(struct vfs *vfsp) 3229 { 3230 mntinfo_t *mi = VFTOMI(vfsp); 3231 3232 /* 3233 * Create the version specific kstats. 3234 * 3235 * PSARC 2001/697 Contract Private Interface 3236 * All nfs kstats are under SunMC contract 3237 * Please refer to the PSARC listed above and contact 3238 * SunMC before making any changes! 3239 * 3240 * Changes must be reviewed by Solaris File Sharing 3241 * Changes must be communicated to contract-2001-697@sun.com 3242 * 3243 */ 3244 3245 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 3246 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 3247 if (mi->mi_io_kstats) { 3248 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3249 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 3250 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 3251 kstat_install(mi->mi_io_kstats); 3252 } 3253 3254 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 3255 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 3256 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 3257 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3258 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 3259 mi->mi_ro_kstats->ks_update = mnt_kstat_update; 3260 mi->mi_ro_kstats->ks_private = (void *)vfsp; 3261 kstat_install(mi->mi_ro_kstats); 3262 } 3263 } 3264 3265 nfs_delmapcall_t * 3266 nfs_init_delmapcall() 3267 { 3268 nfs_delmapcall_t *delmap_call; 3269 3270 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP); 3271 delmap_call->call_id = curthread; 3272 delmap_call->error = 0; 3273 3274 return (delmap_call); 3275 } 3276 3277 void 3278 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call) 3279 { 3280 kmem_free(delmap_call, sizeof (nfs_delmapcall_t)); 3281 } 3282 3283 /* 3284 * Searches for the current delmap caller (based on curthread) in the list of 3285 * callers. If it is found, we remove it and free the delmap caller. 3286 * Returns: 3287 * 0 if the caller wasn't found 3288 * 1 if the caller was found, removed and freed. *errp is set to what 3289 * the result of the delmap was. 
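 */

/*
 * Hypothetical caller of the delmap bookkeeping above (sketch only;
 * the real delmap code is not part of this excerpt): a re-entered
 * call first looks for a record left by an earlier, already completed
 * call, while a first call registers itself before dispatching the
 * real work.  List insertion and the async dispatch are elided.
 */
#if 0	/* illustrative only */
static int
ex_delmap(rnode_t *rp)
{
	nfs_delmapcall_t *dmc;
	int error;

	if (nfs_find_and_delete_delmapcall(rp, &error))
		return (error);		/* earlier call already finished */

	dmc = nfs_init_delmapcall();
	/* ... link dmc onto rp->r_indelmap and start the real work ... */

	return (0);
}
#endif

/*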
3290 */ 3291 int 3292 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp) 3293 { 3294 nfs_delmapcall_t *delmap_call; 3295 3296 /* 3297 * If the list doesn't exist yet, we create it and return 3298 * that the caller wasn't found. No list = no callers. 3299 */ 3300 mutex_enter(&rp->r_statelock); 3301 if (!(rp->r_flags & RDELMAPLIST)) { 3302 /* The list does not exist */ 3303 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t), 3304 offsetof(nfs_delmapcall_t, call_node)); 3305 rp->r_flags |= RDELMAPLIST; 3306 mutex_exit(&rp->r_statelock); 3307 return (0); 3308 } else { 3309 /* The list exists so search it */ 3310 for (delmap_call = list_head(&rp->r_indelmap); 3311 delmap_call != NULL; 3312 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 3313 if (delmap_call->call_id == curthread) { 3314 /* current caller is in the list */ 3315 *errp = delmap_call->error; 3316 list_remove(&rp->r_indelmap, delmap_call); 3317 mutex_exit(&rp->r_statelock); 3318 nfs_free_delmapcall(delmap_call); 3319 return (1); 3320 } 3321 } 3322 } 3323 mutex_exit(&rp->r_statelock); 3324 return (0); 3325 } 3326