1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 26 * All rights reserved. 27 */ 28 29 #include <sys/param.h> 30 #include <sys/types.h> 31 #include <sys/systm.h> 32 #include <sys/thread.h> 33 #include <sys/t_lock.h> 34 #include <sys/time.h> 35 #include <sys/vnode.h> 36 #include <sys/vfs.h> 37 #include <sys/errno.h> 38 #include <sys/buf.h> 39 #include <sys/stat.h> 40 #include <sys/cred.h> 41 #include <sys/kmem.h> 42 #include <sys/debug.h> 43 #include <sys/dnlc.h> 44 #include <sys/vmsystm.h> 45 #include <sys/flock.h> 46 #include <sys/share.h> 47 #include <sys/cmn_err.h> 48 #include <sys/tiuser.h> 49 #include <sys/sysmacros.h> 50 #include <sys/callb.h> 51 #include <sys/acl.h> 52 #include <sys/kstat.h> 53 #include <sys/signal.h> 54 #include <sys/list.h> 55 #include <sys/zone.h> 56 57 #include <rpc/types.h> 58 #include <rpc/xdr.h> 59 #include <rpc/auth.h> 60 #include <rpc/clnt.h> 61 62 #include <nfs/nfs.h> 63 #include <nfs/nfs_clnt.h> 64 65 #include <nfs/rnode.h> 66 #include <nfs/nfs_acl.h> 67 #include <nfs/lm.h> 68 69 #include <vm/hat.h> 70 #include <vm/as.h> 71 #include <vm/page.h> 72 #include <vm/pvn.h> 73 #include <vm/seg.h> 74 #include <vm/seg_map.h> 75 #include <vm/seg_vn.h> 76 77 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t, 78 cred_t *); 79 static int nfs_getattr_cache(vnode_t *, struct vattr *); 80 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *); 81 82 struct mi_globals { 83 kmutex_t mig_lock; /* lock protecting mig_list */ 84 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */ 85 boolean_t mig_destructor_called; 86 }; 87 88 static zone_key_t mi_list_key; 89 90 /* Debugging flag for PC file shares. */ 91 extern int share_debug; 92 93 /* 94 * Attributes caching: 95 * 96 * Attributes are cached in the rnode in struct vattr form. 97 * There is a time associated with the cached attributes (r_attrtime) 98 * which tells whether the attributes are valid. The time is initialized 99 * to the difference between current time and the modify time of the vnode 100 * when new attributes are cached. This allows the attributes for 101 * files that have changed recently to be timed out sooner than for files 102 * that have not changed for a long time. There are minimum and maximum 103 * timeout values that can be set per mount point. 
104 */ 105 106 int 107 nfs_waitfor_purge_complete(vnode_t *vp) 108 { 109 rnode_t *rp; 110 k_sigset_t smask; 111 112 rp = VTOR(vp); 113 if (rp->r_serial != NULL && rp->r_serial != curthread) { 114 mutex_enter(&rp->r_statelock); 115 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT); 116 while (rp->r_serial != NULL) { 117 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 118 sigunintr(&smask); 119 mutex_exit(&rp->r_statelock); 120 return (EINTR); 121 } 122 } 123 sigunintr(&smask); 124 mutex_exit(&rp->r_statelock); 125 } 126 return (0); 127 } 128 129 /* 130 * Validate caches by checking cached attributes. If the cached 131 * attributes have timed out, then get new attributes from the server. 132 * As a side affect, this will do cache invalidation if the attributes 133 * have changed. 134 * 135 * If the attributes have not timed out and if there is a cache 136 * invalidation being done by some other thread, then wait until that 137 * thread has completed the cache invalidation. 138 */ 139 int 140 nfs_validate_caches(vnode_t *vp, cred_t *cr) 141 { 142 int error; 143 struct vattr va; 144 145 if (ATTRCACHE_VALID(vp)) { 146 error = nfs_waitfor_purge_complete(vp); 147 if (error) 148 return (error); 149 return (0); 150 } 151 152 va.va_mask = AT_ALL; 153 return (nfs_getattr_otw(vp, &va, cr)); 154 } 155 156 /* 157 * Validate caches by checking cached attributes. If the cached 158 * attributes have timed out, then get new attributes from the server. 159 * As a side affect, this will do cache invalidation if the attributes 160 * have changed. 161 * 162 * If the attributes have not timed out and if there is a cache 163 * invalidation being done by some other thread, then wait until that 164 * thread has completed the cache invalidation. 165 */ 166 int 167 nfs3_validate_caches(vnode_t *vp, cred_t *cr) 168 { 169 int error; 170 struct vattr va; 171 172 if (ATTRCACHE_VALID(vp)) { 173 error = nfs_waitfor_purge_complete(vp); 174 if (error) 175 return (error); 176 return (0); 177 } 178 179 va.va_mask = AT_ALL; 180 return (nfs3_getattr_otw(vp, &va, cr)); 181 } 182 183 /* 184 * Purge all of the various NFS `data' caches. 185 */ 186 void 187 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr) 188 { 189 rnode_t *rp; 190 char *contents; 191 int size; 192 int error; 193 194 /* 195 * Purge the DNLC for any entries which refer to this file. 196 * Avoid recursive entry into dnlc_purge_vp() in case of a directory. 197 */ 198 rp = VTOR(vp); 199 mutex_enter(&rp->r_statelock); 200 if (vp->v_count > 1 && 201 (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) && 202 !(rp->r_flags & RINDNLCPURGE)) { 203 /* 204 * Set the RINDNLCPURGE flag to prevent recursive entry 205 * into dnlc_purge_vp() 206 */ 207 if (vp->v_type == VDIR) 208 rp->r_flags |= RINDNLCPURGE; 209 mutex_exit(&rp->r_statelock); 210 dnlc_purge_vp(vp); 211 mutex_enter(&rp->r_statelock); 212 if (rp->r_flags & RINDNLCPURGE) 213 rp->r_flags &= ~RINDNLCPURGE; 214 } 215 216 /* 217 * Clear any readdir state bits and purge the readlink response cache. 218 */ 219 contents = rp->r_symlink.contents; 220 size = rp->r_symlink.size; 221 rp->r_symlink.contents = NULL; 222 mutex_exit(&rp->r_statelock); 223 224 if (contents != NULL) { 225 226 kmem_free((void *)contents, size); 227 } 228 229 /* 230 * Flush the page cache. 
231 */ 232 if (vn_has_cached_data(vp)) { 233 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 234 if (error && (error == ENOSPC || error == EDQUOT)) { 235 mutex_enter(&rp->r_statelock); 236 if (!rp->r_error) 237 rp->r_error = error; 238 mutex_exit(&rp->r_statelock); 239 } 240 } 241 242 /* 243 * Flush the readdir response cache. 244 */ 245 if (HAVE_RDDIR_CACHE(rp)) 246 nfs_purge_rddir_cache(vp); 247 } 248 249 /* 250 * Purge the readdir cache of all entries 251 */ 252 void 253 nfs_purge_rddir_cache(vnode_t *vp) 254 { 255 rnode_t *rp; 256 rddir_cache *rdc; 257 rddir_cache *nrdc; 258 259 rp = VTOR(vp); 260 top: 261 mutex_enter(&rp->r_statelock); 262 rp->r_direof = NULL; 263 rp->r_flags &= ~RLOOKUP; 264 rp->r_flags |= RREADDIRPLUS; 265 rdc = avl_first(&rp->r_dir); 266 while (rdc != NULL) { 267 nrdc = AVL_NEXT(&rp->r_dir, rdc); 268 avl_remove(&rp->r_dir, rdc); 269 rddir_cache_rele(rdc); 270 rdc = nrdc; 271 } 272 mutex_exit(&rp->r_statelock); 273 } 274 275 /* 276 * Do a cache check based on the post-operation attributes. 277 * Then make them the new cached attributes. If no attributes 278 * were returned, then mark the attributes as timed out. 279 */ 280 void 281 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr) 282 { 283 vattr_t attr; 284 285 if (!poap->attributes) { 286 PURGE_ATTRCACHE(vp); 287 return; 288 } 289 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr); 290 } 291 292 /* 293 * Same as above, but using a vattr 294 */ 295 void 296 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t, 297 cred_t *cr) 298 { 299 if (!poap->attributes) { 300 PURGE_ATTRCACHE(vp); 301 return; 302 } 303 nfs_attr_cache(vp, poap->fres.vap, t, cr); 304 } 305 306 /* 307 * Do a cache check based on the weak cache consistency attributes. 308 * These consist of a small set of pre-operation attributes and the 309 * full set of post-operation attributes. 310 * 311 * If we are given the pre-operation attributes, then use them to 312 * check the validity of the various caches. Then, if we got the 313 * post-operation attributes, make them the new cached attributes. 314 * If we didn't get the post-operation attributes, then mark the 315 * attribute cache as timed out so that the next reference will 316 * cause a GETATTR to the server to refresh with the current 317 * attributes. 318 * 319 * Otherwise, if we didn't get the pre-operation attributes, but 320 * we did get the post-operation attributes, then use these 321 * attributes to check the validity of the various caches. This 322 * will probably cause a flush of the caches because if the 323 * operation succeeded, the attributes of the object were changed 324 * in some way from the old post-operation attributes. This 325 * should be okay because it is the safe thing to do. After 326 * checking the data caches, then we make these the new cached 327 * attributes. 328 * 329 * Otherwise, we didn't get either the pre- or post-operation 330 * attributes. Simply mark the attribute cache as timed out so 331 * the next reference will cause a GETATTR to the server to 332 * refresh with the current attributes. 333 * 334 * If an error occurred trying to convert the over the wire 335 * attributes to a vattr, then simply mark the attribute cache as 336 * timed out. 
337 */ 338 void 339 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr) 340 { 341 vattr_t bva; 342 vattr_t ava; 343 344 if (wccp->after.attributes) { 345 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) { 346 PURGE_ATTRCACHE(vp); 347 return; 348 } 349 if (wccp->before.attributes) { 350 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds; 351 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds; 352 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds; 353 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds; 354 bva.va_size = wccp->before.attr.size; 355 nfs3_attr_cache(vp, &bva, &ava, t, cr); 356 } else 357 nfs_attr_cache(vp, &ava, t, cr); 358 } else { 359 PURGE_ATTRCACHE(vp); 360 } 361 } 362 363 /* 364 * Set attributes cache for given vnode using nfsattr. 365 * 366 * This routine does not do cache validation with the attributes. 367 * 368 * If an error occurred trying to convert the over the wire 369 * attributes to a vattr, then simply mark the attribute cache as 370 * timed out. 371 */ 372 void 373 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t) 374 { 375 rnode_t *rp; 376 struct vattr va; 377 378 if (!nattr_to_vattr(vp, na, &va)) { 379 rp = VTOR(vp); 380 mutex_enter(&rp->r_statelock); 381 if (rp->r_mtime <= t) 382 nfs_attrcache_va(vp, &va); 383 mutex_exit(&rp->r_statelock); 384 } else { 385 PURGE_ATTRCACHE(vp); 386 } 387 } 388 389 /* 390 * Set attributes cache for given vnode using fattr3. 391 * 392 * This routine does not do cache validation with the attributes. 393 * 394 * If an error occurred trying to convert the over the wire 395 * attributes to a vattr, then simply mark the attribute cache as 396 * timed out. 397 */ 398 void 399 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t) 400 { 401 rnode_t *rp; 402 struct vattr va; 403 404 if (!fattr3_to_vattr(vp, na, &va)) { 405 rp = VTOR(vp); 406 mutex_enter(&rp->r_statelock); 407 if (rp->r_mtime <= t) 408 nfs_attrcache_va(vp, &va); 409 mutex_exit(&rp->r_statelock); 410 } else { 411 PURGE_ATTRCACHE(vp); 412 } 413 } 414 415 /* 416 * Do a cache check based on attributes returned over the wire. The 417 * new attributes are cached. 418 * 419 * If an error occurred trying to convert the over the wire attributes 420 * to a vattr, then just return that error. 421 * 422 * As a side affect, the vattr argument is filled in with the converted 423 * attributes. 424 */ 425 int 426 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t, 427 cred_t *cr) 428 { 429 int error; 430 431 error = nattr_to_vattr(vp, na, vap); 432 if (error) 433 return (error); 434 nfs_attr_cache(vp, vap, t, cr); 435 return (0); 436 } 437 438 /* 439 * Do a cache check based on attributes returned over the wire. The 440 * new attributes are cached. 441 * 442 * If an error occurred trying to convert the over the wire attributes 443 * to a vattr, then just return that error. 444 * 445 * As a side affect, the vattr argument is filled in with the converted 446 * attributes. 447 */ 448 int 449 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr) 450 { 451 int error; 452 453 error = fattr3_to_vattr(vp, na, vap); 454 if (error) 455 return (error); 456 nfs_attr_cache(vp, vap, t, cr); 457 return (0); 458 } 459 460 /* 461 * Use the passed in virtual attributes to check to see whether the 462 * data and metadata caches are valid, cache the new attributes, and 463 * then do the cache invalidation if required. 
464 * 465 * The cache validation and caching of the new attributes is done 466 * atomically via the use of the mutex, r_statelock. If required, 467 * the cache invalidation is done atomically w.r.t. the cache 468 * validation and caching of the attributes via the pseudo lock, 469 * r_serial. 470 * 471 * This routine is used to do cache validation and attributes caching 472 * for operations with a single set of post operation attributes. 473 */ 474 void 475 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr) 476 { 477 rnode_t *rp; 478 int mtime_changed = 0; 479 int ctime_changed = 0; 480 vsecattr_t *vsp; 481 int was_serial; 482 len_t preattr_rsize; 483 boolean_t writeattr_set = B_FALSE; 484 boolean_t cachepurge_set = B_FALSE; 485 486 rp = VTOR(vp); 487 488 mutex_enter(&rp->r_statelock); 489 490 if (rp->r_serial != curthread) { 491 klwp_t *lwp = ttolwp(curthread); 492 493 was_serial = 0; 494 if (lwp != NULL) 495 lwp->lwp_nostop++; 496 while (rp->r_serial != NULL) { 497 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 498 mutex_exit(&rp->r_statelock); 499 if (lwp != NULL) 500 lwp->lwp_nostop--; 501 return; 502 } 503 } 504 if (lwp != NULL) 505 lwp->lwp_nostop--; 506 } else 507 was_serial = 1; 508 509 if (rp->r_mtime > t) { 510 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 511 PURGE_ATTRCACHE_LOCKED(rp); 512 mutex_exit(&rp->r_statelock); 513 return; 514 } 515 516 /* 517 * Write thread after writing data to file on remote server, 518 * will always set RWRITEATTR to indicate that file on remote 519 * server was modified with a WRITE operation and would have 520 * marked attribute cache as timed out. If RWRITEATTR 521 * is set, then do not check for mtime and ctime change. 522 */ 523 if (!(rp->r_flags & RWRITEATTR)) { 524 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 525 mtime_changed = 1; 526 527 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec || 528 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec) 529 ctime_changed = 1; 530 } else { 531 writeattr_set = B_TRUE; 532 } 533 534 preattr_rsize = rp->r_size; 535 536 nfs_attrcache_va(vp, vap); 537 538 /* 539 * If we have updated filesize in nfs_attrcache_va, as soon as we 540 * drop statelock we will be in transition of purging all 541 * our caches and updating them. It is possible for another 542 * thread to pick this new file size and read in zeroed data. 543 * stall other threads till cache purge is complete. 544 */ 545 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) { 546 /* 547 * If RWRITEATTR was set and we have updated the file 548 * size, Server's returned file size need not necessarily 549 * be because of this Client's WRITE. We need to purge 550 * all caches. 
551 */ 552 if (writeattr_set) 553 mtime_changed = 1; 554 555 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) { 556 rp->r_flags |= RINCACHEPURGE; 557 cachepurge_set = B_TRUE; 558 } 559 } 560 561 if (!mtime_changed && !ctime_changed) { 562 mutex_exit(&rp->r_statelock); 563 return; 564 } 565 566 rp->r_serial = curthread; 567 568 mutex_exit(&rp->r_statelock); 569 570 if (mtime_changed) 571 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 572 573 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) { 574 mutex_enter(&rp->r_statelock); 575 rp->r_flags &= ~RINCACHEPURGE; 576 cv_broadcast(&rp->r_cv); 577 mutex_exit(&rp->r_statelock); 578 cachepurge_set = B_FALSE; 579 } 580 581 if (ctime_changed) { 582 (void) nfs_access_purge_rp(rp); 583 if (rp->r_secattr != NULL) { 584 mutex_enter(&rp->r_statelock); 585 vsp = rp->r_secattr; 586 rp->r_secattr = NULL; 587 mutex_exit(&rp->r_statelock); 588 if (vsp != NULL) 589 nfs_acl_free(vsp); 590 } 591 } 592 593 if (!was_serial) { 594 mutex_enter(&rp->r_statelock); 595 rp->r_serial = NULL; 596 cv_broadcast(&rp->r_cv); 597 mutex_exit(&rp->r_statelock); 598 } 599 } 600 601 /* 602 * Use the passed in "before" virtual attributes to check to see 603 * whether the data and metadata caches are valid, cache the "after" 604 * new attributes, and then do the cache invalidation if required. 605 * 606 * The cache validation and caching of the new attributes is done 607 * atomically via the use of the mutex, r_statelock. If required, 608 * the cache invalidation is done atomically w.r.t. the cache 609 * validation and caching of the attributes via the pseudo lock, 610 * r_serial. 611 * 612 * This routine is used to do cache validation and attributes caching 613 * for operations with both pre operation attributes and post operation 614 * attributes. 615 */ 616 static void 617 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t, 618 cred_t *cr) 619 { 620 rnode_t *rp; 621 int mtime_changed = 0; 622 int ctime_changed = 0; 623 vsecattr_t *vsp; 624 int was_serial; 625 len_t preattr_rsize; 626 boolean_t writeattr_set = B_FALSE; 627 boolean_t cachepurge_set = B_FALSE; 628 629 rp = VTOR(vp); 630 631 mutex_enter(&rp->r_statelock); 632 633 if (rp->r_serial != curthread) { 634 klwp_t *lwp = ttolwp(curthread); 635 636 was_serial = 0; 637 if (lwp != NULL) 638 lwp->lwp_nostop++; 639 while (rp->r_serial != NULL) { 640 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 641 mutex_exit(&rp->r_statelock); 642 if (lwp != NULL) 643 lwp->lwp_nostop--; 644 return; 645 } 646 } 647 if (lwp != NULL) 648 lwp->lwp_nostop--; 649 } else 650 was_serial = 1; 651 652 if (rp->r_mtime > t) { 653 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size)) 654 PURGE_ATTRCACHE_LOCKED(rp); 655 mutex_exit(&rp->r_statelock); 656 return; 657 } 658 659 /* 660 * Write thread after writing data to file on remote server, 661 * will always set RWRITEATTR to indicate that file on remote 662 * server was modified with a WRITE operation and would have 663 * marked attribute cache as timed out. If RWRITEATTR 664 * is set, then do not check for mtime and ctime change. 
665 */ 666 if (!(rp->r_flags & RWRITEATTR)) { 667 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size)) 668 mtime_changed = 1; 669 670 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec || 671 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec) 672 ctime_changed = 1; 673 } else { 674 writeattr_set = B_TRUE; 675 } 676 677 preattr_rsize = rp->r_size; 678 679 nfs_attrcache_va(vp, avap); 680 681 /* 682 * If we have updated filesize in nfs_attrcache_va, as soon as we 683 * drop statelock we will be in transition of purging all 684 * our caches and updating them. It is possible for another 685 * thread to pick this new file size and read in zeroed data. 686 * stall other threads till cache purge is complete. 687 */ 688 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) { 689 /* 690 * If RWRITEATTR was set and we have updated the file 691 * size, Server's returned file size need not necessarily 692 * be because of this Client's WRITE. We need to purge 693 * all caches. 694 */ 695 if (writeattr_set) 696 mtime_changed = 1; 697 698 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) { 699 rp->r_flags |= RINCACHEPURGE; 700 cachepurge_set = B_TRUE; 701 } 702 } 703 704 if (!mtime_changed && !ctime_changed) { 705 mutex_exit(&rp->r_statelock); 706 return; 707 } 708 709 rp->r_serial = curthread; 710 711 mutex_exit(&rp->r_statelock); 712 713 if (mtime_changed) 714 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 715 716 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) { 717 mutex_enter(&rp->r_statelock); 718 rp->r_flags &= ~RINCACHEPURGE; 719 cv_broadcast(&rp->r_cv); 720 mutex_exit(&rp->r_statelock); 721 cachepurge_set = B_FALSE; 722 } 723 724 if (ctime_changed) { 725 (void) nfs_access_purge_rp(rp); 726 if (rp->r_secattr != NULL) { 727 mutex_enter(&rp->r_statelock); 728 vsp = rp->r_secattr; 729 rp->r_secattr = NULL; 730 mutex_exit(&rp->r_statelock); 731 if (vsp != NULL) 732 nfs_acl_free(vsp); 733 } 734 } 735 736 if (!was_serial) { 737 mutex_enter(&rp->r_statelock); 738 rp->r_serial = NULL; 739 cv_broadcast(&rp->r_cv); 740 mutex_exit(&rp->r_statelock); 741 } 742 } 743 744 /* 745 * Set attributes cache for given vnode using virtual attributes. 746 * 747 * Set the timeout value on the attribute cache and fill it 748 * with the passed in attributes. 749 * 750 * The caller must be holding r_statelock. 751 */ 752 void 753 nfs_attrcache_va(vnode_t *vp, struct vattr *va) 754 { 755 rnode_t *rp; 756 mntinfo_t *mi; 757 hrtime_t delta; 758 hrtime_t now; 759 760 rp = VTOR(vp); 761 762 ASSERT(MUTEX_HELD(&rp->r_statelock)); 763 764 now = gethrtime(); 765 766 mi = VTOMI(vp); 767 768 /* 769 * Delta is the number of nanoseconds that we will 770 * cache the attributes of the file. It is based on 771 * the number of nanoseconds since the last time that 772 * we detected a change. The assumption is that files 773 * that changed recently are likely to change again. 774 * There is a minimum and a maximum for regular files 775 * and for directories which is enforced though. 776 * 777 * Using the time since last change was detected 778 * eliminates direct comparison or calculation 779 * using mixed client and server times. NFS does 780 * not make any assumptions regarding the client 781 * and server clocks being synchronized. 
782 */ 783 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec || 784 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec || 785 va->va_size != rp->r_attr.va_size) 786 rp->r_mtime = now; 787 788 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE)) 789 delta = 0; 790 else { 791 delta = now - rp->r_mtime; 792 if (vp->v_type == VDIR) { 793 if (delta < mi->mi_acdirmin) 794 delta = mi->mi_acdirmin; 795 else if (delta > mi->mi_acdirmax) 796 delta = mi->mi_acdirmax; 797 } else { 798 if (delta < mi->mi_acregmin) 799 delta = mi->mi_acregmin; 800 else if (delta > mi->mi_acregmax) 801 delta = mi->mi_acregmax; 802 } 803 } 804 rp->r_attrtime = now + delta; 805 rp->r_attr = *va; 806 /* 807 * Update the size of the file if there is no cached data or if 808 * the cached data is clean and there is no data being written 809 * out. 810 */ 811 if (rp->r_size != va->va_size && 812 (!vn_has_cached_data(vp) || 813 (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) 814 rp->r_size = va->va_size; 815 nfs_setswaplike(vp, va); 816 rp->r_flags &= ~RWRITEATTR; 817 } 818 819 /* 820 * Fill in attribute from the cache. 821 * If valid, then return 0 to indicate that no error occurred, 822 * otherwise return 1 to indicate that an error occurred. 823 */ 824 static int 825 nfs_getattr_cache(vnode_t *vp, struct vattr *vap) 826 { 827 rnode_t *rp; 828 uint_t mask = vap->va_mask; 829 830 rp = VTOR(vp); 831 mutex_enter(&rp->r_statelock); 832 if (ATTRCACHE_VALID(vp)) { 833 /* 834 * Cached attributes are valid 835 */ 836 *vap = rp->r_attr; 837 /* 838 * Set the caller's va_mask to the set of attributes 839 * that were requested ANDed with the attributes that 840 * are available. If attributes were requested that 841 * are not available, those bits must be turned off 842 * in the callers va_mask. 843 */ 844 vap->va_mask &= mask; 845 mutex_exit(&rp->r_statelock); 846 return (0); 847 } 848 mutex_exit(&rp->r_statelock); 849 return (1); 850 } 851 852 /* 853 * Get attributes over-the-wire and update attributes cache 854 * if no error occurred in the over-the-wire operation. 855 * Return 0 if successful, otherwise error. 856 */ 857 int 858 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 859 { 860 int error; 861 struct nfsattrstat ns; 862 int douprintf; 863 mntinfo_t *mi; 864 failinfo_t fi; 865 hrtime_t t; 866 867 mi = VTOMI(vp); 868 fi.vp = vp; 869 fi.fhp = NULL; /* no need to update, filehandle not copied */ 870 fi.copyproc = nfscopyfh; 871 fi.lookupproc = nfslookup; 872 fi.xattrdirproc = acl_getxattrdir2; 873 874 if (mi->mi_flags & MI_ACL) { 875 error = acl_getattr2_otw(vp, vap, cr); 876 if (mi->mi_flags & MI_ACL) 877 return (error); 878 } 879 880 douprintf = 1; 881 882 t = gethrtime(); 883 884 error = rfs2call(mi, RFS_GETATTR, 885 xdr_fhandle, (caddr_t)VTOFH(vp), 886 xdr_attrstat, (caddr_t)&ns, cr, 887 &douprintf, &ns.ns_status, 0, &fi); 888 889 if (!error) { 890 error = geterrno(ns.ns_status); 891 if (!error) 892 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr); 893 else { 894 PURGE_STALE_FH(error, vp, cr); 895 } 896 } 897 898 return (error); 899 } 900 901 /* 902 * Return either cached ot remote attributes. If get remote attr 903 * use them to check and invalidate caches, then cache the new attributes. 904 */ 905 int 906 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 907 { 908 int error; 909 rnode_t *rp; 910 911 /* 912 * If we've got cached attributes, we're done, otherwise go 913 * to the server to get attributes, which will update the cache 914 * in the process. 
915 */ 916 error = nfs_getattr_cache(vp, vap); 917 if (error) 918 error = nfs_getattr_otw(vp, vap, cr); 919 920 /* Return the client's view of file size */ 921 rp = VTOR(vp); 922 mutex_enter(&rp->r_statelock); 923 vap->va_size = rp->r_size; 924 mutex_exit(&rp->r_statelock); 925 926 return (error); 927 } 928 929 /* 930 * Get attributes over-the-wire and update attributes cache 931 * if no error occurred in the over-the-wire operation. 932 * Return 0 if successful, otherwise error. 933 */ 934 int 935 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 936 { 937 int error; 938 GETATTR3args args; 939 GETATTR3vres res; 940 int douprintf; 941 failinfo_t fi; 942 hrtime_t t; 943 944 args.object = *VTOFH3(vp); 945 fi.vp = vp; 946 fi.fhp = (caddr_t)&args.object; 947 fi.copyproc = nfs3copyfh; 948 fi.lookupproc = nfs3lookup; 949 fi.xattrdirproc = acl_getxattrdir3; 950 res.fres.vp = vp; 951 res.fres.vap = vap; 952 953 douprintf = 1; 954 955 t = gethrtime(); 956 957 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR, 958 xdr_nfs_fh3, (caddr_t)&args, 959 xdr_GETATTR3vres, (caddr_t)&res, cr, 960 &douprintf, &res.status, 0, &fi); 961 962 if (error) 963 return (error); 964 965 error = geterrno3(res.status); 966 if (error) { 967 PURGE_STALE_FH(error, vp, cr); 968 return (error); 969 } 970 971 /* 972 * Catch status codes that indicate fattr3 to vattr translation failure 973 */ 974 if (res.fres.status) 975 return (res.fres.status); 976 977 nfs_attr_cache(vp, vap, t, cr); 978 return (0); 979 } 980 981 /* 982 * Return either cached or remote attributes. If get remote attr 983 * use them to check and invalidate caches, then cache the new attributes. 984 */ 985 int 986 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 987 { 988 int error; 989 rnode_t *rp; 990 991 /* 992 * If we've got cached attributes, we're done, otherwise go 993 * to the server to get attributes, which will update the cache 994 * in the process. 995 */ 996 error = nfs_getattr_cache(vp, vap); 997 if (error) 998 error = nfs3_getattr_otw(vp, vap, cr); 999 1000 /* Return the client's view of file size */ 1001 rp = VTOR(vp); 1002 mutex_enter(&rp->r_statelock); 1003 vap->va_size = rp->r_size; 1004 mutex_exit(&rp->r_statelock); 1005 1006 return (error); 1007 } 1008 1009 vtype_t nf_to_vt[] = { 1010 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK 1011 }; 1012 /* 1013 * Convert NFS Version 2 over the network attributes to the local 1014 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY 1015 * network representation and the local representation is done here. 1016 * Returns 0 for success, error if failed due to overflow. 1017 */ 1018 int 1019 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap) 1020 { 1021 /* overflow in time attributes? */ 1022 #ifndef _LP64 1023 if (!NFS2_FATTR_TIME_OK(na)) 1024 return (EOVERFLOW); 1025 #endif 1026 1027 vap->va_mask = AT_ALL; 1028 1029 if (na->na_type < NFNON || na->na_type > NFSOC) 1030 vap->va_type = VBAD; 1031 else 1032 vap->va_type = nf_to_vt[na->na_type]; 1033 vap->va_mode = na->na_mode; 1034 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid; 1035 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid; 1036 vap->va_fsid = vp->v_vfsp->vfs_dev; 1037 vap->va_nodeid = na->na_nodeid; 1038 vap->va_nlink = na->na_nlink; 1039 vap->va_size = na->na_size; /* keep for cache validation */ 1040 /* 1041 * nfs protocol defines times as unsigned so don't extend sign, 1042 * unless sysadmin set nfs_allow_preepoch_time. 
1043 */ 1044 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec); 1045 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000); 1046 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec); 1047 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000); 1048 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec); 1049 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000); 1050 /* 1051 * Shannon's law - uncompress the received dev_t 1052 * if the top half of is zero indicating a response 1053 * from an `older style' OS. Except for when it is a 1054 * `new style' OS sending the maj device of zero, 1055 * in which case the algorithm still works because the 1056 * fact that it is a new style server 1057 * is hidden by the minor device not being greater 1058 * than 255 (a requirement in this case). 1059 */ 1060 if ((na->na_rdev & 0xffff0000) == 0) 1061 vap->va_rdev = nfsv2_expdev(na->na_rdev); 1062 else 1063 vap->va_rdev = expldev(na->na_rdev); 1064 1065 vap->va_nblocks = na->na_blocks; 1066 switch (na->na_type) { 1067 case NFBLK: 1068 vap->va_blksize = DEV_BSIZE; 1069 break; 1070 1071 case NFCHR: 1072 vap->va_blksize = MAXBSIZE; 1073 break; 1074 1075 case NFSOC: 1076 default: 1077 vap->va_blksize = na->na_blocksize; 1078 break; 1079 } 1080 /* 1081 * This bit of ugliness is a hack to preserve the 1082 * over-the-wire protocols for named-pipe vnodes. 1083 * It remaps the special over-the-wire type to the 1084 * VFIFO type. (see note in nfs.h) 1085 */ 1086 if (NA_ISFIFO(na)) { 1087 vap->va_type = VFIFO; 1088 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO; 1089 vap->va_rdev = 0; 1090 vap->va_blksize = na->na_blocksize; 1091 } 1092 vap->va_seq = 0; 1093 return (0); 1094 } 1095 1096 /* 1097 * Convert NFS Version 3 over the network attributes to the local 1098 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY 1099 * network representation and the local representation is done here. 1100 */ 1101 vtype_t nf3_to_vt[] = { 1102 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO 1103 }; 1104 1105 int 1106 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap) 1107 { 1108 1109 #ifndef _LP64 1110 /* overflow in time attributes? */ 1111 if (!NFS3_FATTR_TIME_OK(na)) 1112 return (EOVERFLOW); 1113 #endif 1114 if (!NFS3_SIZE_OK(na->size)) 1115 /* file too big */ 1116 return (EFBIG); 1117 1118 vap->va_mask = AT_ALL; 1119 1120 if (na->type < NF3REG || na->type > NF3FIFO) 1121 vap->va_type = VBAD; 1122 else 1123 vap->va_type = nf3_to_vt[na->type]; 1124 vap->va_mode = na->mode; 1125 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid; 1126 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid; 1127 vap->va_fsid = vp->v_vfsp->vfs_dev; 1128 vap->va_nodeid = na->fileid; 1129 vap->va_nlink = na->nlink; 1130 vap->va_size = na->size; 1131 1132 /* 1133 * nfs protocol defines times as unsigned so don't extend sign, 1134 * unless sysadmin set nfs_allow_preepoch_time. 
1135 */ 1136 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds); 1137 vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds; 1138 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds); 1139 vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds; 1140 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds); 1141 vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds; 1142 1143 switch (na->type) { 1144 case NF3BLK: 1145 vap->va_rdev = makedevice(na->rdev.specdata1, 1146 na->rdev.specdata2); 1147 vap->va_blksize = DEV_BSIZE; 1148 vap->va_nblocks = 0; 1149 break; 1150 case NF3CHR: 1151 vap->va_rdev = makedevice(na->rdev.specdata1, 1152 na->rdev.specdata2); 1153 vap->va_blksize = MAXBSIZE; 1154 vap->va_nblocks = 0; 1155 break; 1156 case NF3REG: 1157 case NF3DIR: 1158 case NF3LNK: 1159 vap->va_rdev = 0; 1160 vap->va_blksize = MAXBSIZE; 1161 vap->va_nblocks = (u_longlong_t) 1162 ((na->used + (size3)DEV_BSIZE - (size3)1) / 1163 (size3)DEV_BSIZE); 1164 break; 1165 case NF3SOCK: 1166 case NF3FIFO: 1167 default: 1168 vap->va_rdev = 0; 1169 vap->va_blksize = MAXBSIZE; 1170 vap->va_nblocks = 0; 1171 break; 1172 } 1173 vap->va_seq = 0; 1174 return (0); 1175 } 1176 1177 /* 1178 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1179 * for the demand-based allocation of async threads per-mount. The 1180 * nfs_async_timeout is the amount of time a thread will live after it 1181 * becomes idle, unless new I/O requests are received before the thread 1182 * dies. See nfs_async_putpage and nfs_async_start. 1183 */ 1184 1185 int nfs_async_timeout = -1; /* uninitialized */ 1186 1187 static void nfs_async_start(struct vfs *); 1188 static void nfs_async_pgops_start(struct vfs *); 1189 static void nfs_async_common_start(struct vfs *, int); 1190 1191 static void 1192 free_async_args(struct nfs_async_reqs *args) 1193 { 1194 rnode_t *rp; 1195 1196 if (args->a_io != NFS_INACTIVE) { 1197 rp = VTOR(args->a_vp); 1198 mutex_enter(&rp->r_statelock); 1199 rp->r_count--; 1200 if (args->a_io == NFS_PUTAPAGE || 1201 args->a_io == NFS_PAGEIO) 1202 rp->r_awcount--; 1203 cv_broadcast(&rp->r_cv); 1204 mutex_exit(&rp->r_statelock); 1205 VN_RELE(args->a_vp); 1206 } 1207 crfree(args->a_cred); 1208 kmem_free(args, sizeof (*args)); 1209 } 1210 1211 /* 1212 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and 1213 * pageout(), running in the global zone, have legitimate reasons to do 1214 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by 1215 * use of a a per-mount "asynchronous requests manager thread" which is 1216 * signaled by the various asynchronous work routines when there is 1217 * asynchronous work to be done. It is responsible for creating new 1218 * worker threads if necessary, and notifying existing worker threads 1219 * that there is work to be done. 1220 * 1221 * In other words, it will "take the specifications from the customers and 1222 * give them to the engineers." 1223 * 1224 * Worker threads die off of their own accord if they are no longer 1225 * needed. 1226 * 1227 * This thread is killed when the zone is going away or the filesystem 1228 * is being unmounted. 
1229 */ 1230 void 1231 nfs_async_manager(vfs_t *vfsp) 1232 { 1233 callb_cpr_t cprinfo; 1234 mntinfo_t *mi; 1235 uint_t max_threads; 1236 1237 mi = VFTOMI(vfsp); 1238 1239 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1240 "nfs_async_manager"); 1241 1242 mutex_enter(&mi->mi_async_lock); 1243 /* 1244 * We want to stash the max number of threads that this mount was 1245 * allowed so we can use it later when the variable is set to zero as 1246 * part of the zone/mount going away. 1247 * 1248 * We want to be able to create at least one thread to handle 1249 * asynchronous inactive calls. 1250 */ 1251 max_threads = MAX(mi->mi_max_threads, 1); 1252 /* 1253 * We don't want to wait for mi_max_threads to go to zero, since that 1254 * happens as part of a failed unmount, but this thread should only 1255 * exit when the mount/zone is really going away. 1256 * 1257 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be 1258 * attempted: the various _async_*() functions know to do things 1259 * inline if mi_max_threads == 0. Henceforth we just drain out the 1260 * outstanding requests. 1261 * 1262 * Note that we still create zthreads even if we notice the zone is 1263 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone 1264 * shutdown sequence to take slightly longer in some cases, but 1265 * doesn't violate the protocol, as all threads will exit as soon as 1266 * they're done processing the remaining requests. 1267 */ 1268 for (;;) { 1269 while (mi->mi_async_req_count > 0) { 1270 /* 1271 * Paranoia: If the mount started out having 1272 * (mi->mi_max_threads == 0), and the value was 1273 * later changed (via a debugger or somesuch), 1274 * we could be confused since we will think we 1275 * can't create any threads, and the calling 1276 * code (which looks at the current value of 1277 * mi->mi_max_threads, now non-zero) thinks we 1278 * can. 1279 * 1280 * So, because we're paranoid, we create threads 1281 * up to the maximum of the original and the 1282 * current value. This means that future 1283 * (debugger-induced) lowerings of 1284 * mi->mi_max_threads are ignored for our 1285 * purposes, but who told them they could change 1286 * random values on a live kernel anyhow? 1287 */ 1288 if (mi->mi_threads[NFS_ASYNC_QUEUE] < 1289 MAX(mi->mi_max_threads, max_threads)) { 1290 mi->mi_threads[NFS_ASYNC_QUEUE]++; 1291 mutex_exit(&mi->mi_async_lock); 1292 VFS_HOLD(vfsp); /* hold for new thread */ 1293 (void) zthread_create(NULL, 0, nfs_async_start, 1294 vfsp, 0, minclsyspri); 1295 mutex_enter(&mi->mi_async_lock); 1296 } else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] < 1297 NUM_ASYNC_PGOPS_THREADS) { 1298 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++; 1299 mutex_exit(&mi->mi_async_lock); 1300 VFS_HOLD(vfsp); /* hold for new thread */ 1301 (void) zthread_create(NULL, 0, 1302 nfs_async_pgops_start, vfsp, 0, 1303 minclsyspri); 1304 mutex_enter(&mi->mi_async_lock); 1305 } 1306 NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv); 1307 ASSERT(mi->mi_async_req_count != 0); 1308 mi->mi_async_req_count--; 1309 } 1310 1311 mutex_enter(&mi->mi_lock); 1312 if (mi->mi_flags & MI_ASYNC_MGR_STOP) { 1313 mutex_exit(&mi->mi_lock); 1314 break; 1315 } 1316 mutex_exit(&mi->mi_lock); 1317 1318 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1319 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock); 1320 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1321 } 1322 /* 1323 * Let everyone know we're done. 
1324 */ 1325 mi->mi_manager_thread = NULL; 1326 cv_broadcast(&mi->mi_async_cv); 1327 1328 /* 1329 * There is no explicit call to mutex_exit(&mi->mi_async_lock) 1330 * since CALLB_CPR_EXIT is actually responsible for releasing 1331 * 'mi_async_lock'. 1332 */ 1333 CALLB_CPR_EXIT(&cprinfo); 1334 VFS_RELE(vfsp); /* release thread's hold */ 1335 zthread_exit(); 1336 } 1337 1338 /* 1339 * Signal (and wait for) the async manager thread to clean up and go away. 1340 */ 1341 void 1342 nfs_async_manager_stop(vfs_t *vfsp) 1343 { 1344 mntinfo_t *mi = VFTOMI(vfsp); 1345 1346 mutex_enter(&mi->mi_async_lock); 1347 mutex_enter(&mi->mi_lock); 1348 mi->mi_flags |= MI_ASYNC_MGR_STOP; 1349 mutex_exit(&mi->mi_lock); 1350 cv_broadcast(&mi->mi_async_reqs_cv); 1351 while (mi->mi_manager_thread != NULL) 1352 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1353 mutex_exit(&mi->mi_async_lock); 1354 } 1355 1356 int 1357 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1358 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1359 u_offset_t, caddr_t, struct seg *, cred_t *)) 1360 { 1361 rnode_t *rp; 1362 mntinfo_t *mi; 1363 struct nfs_async_reqs *args; 1364 1365 rp = VTOR(vp); 1366 ASSERT(rp->r_freef == NULL); 1367 1368 mi = VTOMI(vp); 1369 1370 /* 1371 * If addr falls in a different segment, don't bother doing readahead. 1372 */ 1373 if (addr >= seg->s_base + seg->s_size) 1374 return (-1); 1375 1376 /* 1377 * If we can't allocate a request structure, punt on the readahead. 1378 */ 1379 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1380 return (-1); 1381 1382 /* 1383 * If a lock operation is pending, don't initiate any new 1384 * readaheads. Otherwise, bump r_count to indicate the new 1385 * asynchronous I/O. 1386 */ 1387 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1388 kmem_free(args, sizeof (*args)); 1389 return (-1); 1390 } 1391 mutex_enter(&rp->r_statelock); 1392 rp->r_count++; 1393 mutex_exit(&rp->r_statelock); 1394 nfs_rw_exit(&rp->r_lkserlock); 1395 1396 args->a_next = NULL; 1397 #ifdef DEBUG 1398 args->a_queuer = curthread; 1399 #endif 1400 VN_HOLD(vp); 1401 args->a_vp = vp; 1402 ASSERT(cr != NULL); 1403 crhold(cr); 1404 args->a_cred = cr; 1405 args->a_io = NFS_READ_AHEAD; 1406 args->a_nfs_readahead = readahead; 1407 args->a_nfs_blkoff = blkoff; 1408 args->a_nfs_seg = seg; 1409 args->a_nfs_addr = addr; 1410 1411 mutex_enter(&mi->mi_async_lock); 1412 1413 /* 1414 * If asyncio has been disabled, don't bother readahead. 1415 */ 1416 if (mi->mi_max_threads == 0) { 1417 mutex_exit(&mi->mi_async_lock); 1418 goto noasync; 1419 } 1420 1421 /* 1422 * Link request structure into the async list and 1423 * wakeup async thread to do the i/o. 
1424 */ 1425 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) { 1426 mi->mi_async_reqs[NFS_READ_AHEAD] = args; 1427 mi->mi_async_tail[NFS_READ_AHEAD] = args; 1428 } else { 1429 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args; 1430 mi->mi_async_tail[NFS_READ_AHEAD] = args; 1431 } 1432 1433 if (mi->mi_io_kstats) { 1434 mutex_enter(&mi->mi_lock); 1435 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1436 mutex_exit(&mi->mi_lock); 1437 } 1438 1439 mi->mi_async_req_count++; 1440 ASSERT(mi->mi_async_req_count != 0); 1441 cv_signal(&mi->mi_async_reqs_cv); 1442 mutex_exit(&mi->mi_async_lock); 1443 return (0); 1444 1445 noasync: 1446 mutex_enter(&rp->r_statelock); 1447 rp->r_count--; 1448 cv_broadcast(&rp->r_cv); 1449 mutex_exit(&rp->r_statelock); 1450 VN_RELE(vp); 1451 crfree(cr); 1452 kmem_free(args, sizeof (*args)); 1453 return (-1); 1454 } 1455 1456 int 1457 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 1458 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *, 1459 u_offset_t, size_t, int, cred_t *)) 1460 { 1461 rnode_t *rp; 1462 mntinfo_t *mi; 1463 struct nfs_async_reqs *args; 1464 1465 ASSERT(flags & B_ASYNC); 1466 ASSERT(vp->v_vfsp != NULL); 1467 1468 rp = VTOR(vp); 1469 ASSERT(rp->r_count > 0); 1470 1471 mi = VTOMI(vp); 1472 1473 /* 1474 * If we can't allocate a request structure, do the putpage 1475 * operation synchronously in this thread's context. 1476 */ 1477 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1478 goto noasync; 1479 1480 args->a_next = NULL; 1481 #ifdef DEBUG 1482 args->a_queuer = curthread; 1483 #endif 1484 VN_HOLD(vp); 1485 args->a_vp = vp; 1486 ASSERT(cr != NULL); 1487 crhold(cr); 1488 args->a_cred = cr; 1489 args->a_io = NFS_PUTAPAGE; 1490 args->a_nfs_putapage = putapage; 1491 args->a_nfs_pp = pp; 1492 args->a_nfs_off = off; 1493 args->a_nfs_len = (uint_t)len; 1494 args->a_nfs_flags = flags; 1495 1496 mutex_enter(&mi->mi_async_lock); 1497 1498 /* 1499 * If asyncio has been disabled, then make a synchronous request. 1500 * This check is done a second time in case async io was diabled 1501 * while this thread was blocked waiting for memory pressure to 1502 * reduce or for the queue to drain. 1503 */ 1504 if (mi->mi_max_threads == 0) { 1505 mutex_exit(&mi->mi_async_lock); 1506 goto noasync; 1507 } 1508 1509 /* 1510 * Link request structure into the async list and 1511 * wakeup async thread to do the i/o. 1512 */ 1513 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) { 1514 mi->mi_async_reqs[NFS_PUTAPAGE] = args; 1515 mi->mi_async_tail[NFS_PUTAPAGE] = args; 1516 } else { 1517 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args; 1518 mi->mi_async_tail[NFS_PUTAPAGE] = args; 1519 } 1520 1521 mutex_enter(&rp->r_statelock); 1522 rp->r_count++; 1523 rp->r_awcount++; 1524 mutex_exit(&rp->r_statelock); 1525 1526 if (mi->mi_io_kstats) { 1527 mutex_enter(&mi->mi_lock); 1528 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1529 mutex_exit(&mi->mi_lock); 1530 } 1531 1532 mi->mi_async_req_count++; 1533 ASSERT(mi->mi_async_req_count != 0); 1534 cv_signal(&mi->mi_async_reqs_cv); 1535 mutex_exit(&mi->mi_async_lock); 1536 return (0); 1537 1538 noasync: 1539 if (args != NULL) { 1540 VN_RELE(vp); 1541 crfree(cr); 1542 kmem_free(args, sizeof (*args)); 1543 } 1544 1545 if (curproc == proc_pageout || curproc == proc_fsflush) { 1546 /* 1547 * If we get here in the context of the pageout/fsflush, 1548 * we refuse to do a sync write, because this may hang 1549 * pageout (and the machine). 
In this case, we just 1550 * re-mark the page as dirty and punt on the page. 1551 * 1552 * Make sure B_FORCE isn't set. We can re-mark the 1553 * pages as dirty and unlock the pages in one swoop by 1554 * passing in B_ERROR to pvn_write_done(). However, 1555 * we should make sure B_FORCE isn't set - we don't 1556 * want the page tossed before it gets written out. 1557 */ 1558 if (flags & B_FORCE) 1559 flags &= ~(B_INVAL | B_FORCE); 1560 pvn_write_done(pp, flags | B_ERROR); 1561 return (0); 1562 } 1563 if (nfs_zone() != mi->mi_zone) { 1564 /* 1565 * So this was a cross-zone sync putpage. We pass in B_ERROR 1566 * to pvn_write_done() to re-mark the pages as dirty and unlock 1567 * them. 1568 * 1569 * We don't want to clear B_FORCE here as the caller presumably 1570 * knows what they're doing if they set it. 1571 */ 1572 pvn_write_done(pp, flags | B_ERROR); 1573 return (EPERM); 1574 } 1575 return ((*putapage)(vp, pp, off, len, flags, cr)); 1576 } 1577 1578 int 1579 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 1580 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t, 1581 size_t, int, cred_t *)) 1582 { 1583 rnode_t *rp; 1584 mntinfo_t *mi; 1585 struct nfs_async_reqs *args; 1586 1587 ASSERT(flags & B_ASYNC); 1588 ASSERT(vp->v_vfsp != NULL); 1589 1590 rp = VTOR(vp); 1591 ASSERT(rp->r_count > 0); 1592 1593 mi = VTOMI(vp); 1594 1595 /* 1596 * If we can't allocate a request structure, do the pageio 1597 * request synchronously in this thread's context. 1598 */ 1599 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1600 goto noasync; 1601 1602 args->a_next = NULL; 1603 #ifdef DEBUG 1604 args->a_queuer = curthread; 1605 #endif 1606 VN_HOLD(vp); 1607 args->a_vp = vp; 1608 ASSERT(cr != NULL); 1609 crhold(cr); 1610 args->a_cred = cr; 1611 args->a_io = NFS_PAGEIO; 1612 args->a_nfs_pageio = pageio; 1613 args->a_nfs_pp = pp; 1614 args->a_nfs_off = io_off; 1615 args->a_nfs_len = (uint_t)io_len; 1616 args->a_nfs_flags = flags; 1617 1618 mutex_enter(&mi->mi_async_lock); 1619 1620 /* 1621 * If asyncio has been disabled, then make a synchronous request. 1622 * This check is done a second time in case async io was diabled 1623 * while this thread was blocked waiting for memory pressure to 1624 * reduce or for the queue to drain. 1625 */ 1626 if (mi->mi_max_threads == 0) { 1627 mutex_exit(&mi->mi_async_lock); 1628 goto noasync; 1629 } 1630 1631 /* 1632 * Link request structure into the async list and 1633 * wakeup async thread to do the i/o. 
1634 */ 1635 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) { 1636 mi->mi_async_reqs[NFS_PAGEIO] = args; 1637 mi->mi_async_tail[NFS_PAGEIO] = args; 1638 } else { 1639 mi->mi_async_tail[NFS_PAGEIO]->a_next = args; 1640 mi->mi_async_tail[NFS_PAGEIO] = args; 1641 } 1642 1643 mutex_enter(&rp->r_statelock); 1644 rp->r_count++; 1645 rp->r_awcount++; 1646 mutex_exit(&rp->r_statelock); 1647 1648 if (mi->mi_io_kstats) { 1649 mutex_enter(&mi->mi_lock); 1650 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1651 mutex_exit(&mi->mi_lock); 1652 } 1653 1654 mi->mi_async_req_count++; 1655 ASSERT(mi->mi_async_req_count != 0); 1656 cv_signal(&mi->mi_async_reqs_cv); 1657 mutex_exit(&mi->mi_async_lock); 1658 return (0); 1659 1660 noasync: 1661 if (args != NULL) { 1662 VN_RELE(vp); 1663 crfree(cr); 1664 kmem_free(args, sizeof (*args)); 1665 } 1666 1667 /* 1668 * If we can't do it ASYNC, for reads we do nothing (but cleanup 1669 * the page list), for writes we do it synchronously, except for 1670 * proc_pageout/proc_fsflush as described below. 1671 */ 1672 if (flags & B_READ) { 1673 pvn_read_done(pp, flags | B_ERROR); 1674 return (0); 1675 } 1676 1677 if (curproc == proc_pageout || curproc == proc_fsflush) { 1678 /* 1679 * If we get here in the context of the pageout/fsflush, 1680 * we refuse to do a sync write, because this may hang 1681 * pageout/fsflush (and the machine). In this case, we just 1682 * re-mark the page as dirty and punt on the page. 1683 * 1684 * Make sure B_FORCE isn't set. We can re-mark the 1685 * pages as dirty and unlock the pages in one swoop by 1686 * passing in B_ERROR to pvn_write_done(). However, 1687 * we should make sure B_FORCE isn't set - we don't 1688 * want the page tossed before it gets written out. 1689 */ 1690 if (flags & B_FORCE) 1691 flags &= ~(B_INVAL | B_FORCE); 1692 pvn_write_done(pp, flags | B_ERROR); 1693 return (0); 1694 } 1695 1696 if (nfs_zone() != mi->mi_zone) { 1697 /* 1698 * So this was a cross-zone sync pageio. We pass in B_ERROR 1699 * to pvn_write_done() to re-mark the pages as dirty and unlock 1700 * them. 1701 * 1702 * We don't want to clear B_FORCE here as the caller presumably 1703 * knows what they're doing if they set it. 1704 */ 1705 pvn_write_done(pp, flags | B_ERROR); 1706 return (EPERM); 1707 } 1708 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1709 } 1710 1711 void 1712 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr, 1713 int (*readdir)(vnode_t *, rddir_cache *, cred_t *)) 1714 { 1715 rnode_t *rp; 1716 mntinfo_t *mi; 1717 struct nfs_async_reqs *args; 1718 1719 rp = VTOR(vp); 1720 ASSERT(rp->r_freef == NULL); 1721 1722 mi = VTOMI(vp); 1723 1724 /* 1725 * If we can't allocate a request structure, do the readdir 1726 * operation synchronously in this thread's context. 1727 */ 1728 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1729 goto noasync; 1730 1731 args->a_next = NULL; 1732 #ifdef DEBUG 1733 args->a_queuer = curthread; 1734 #endif 1735 VN_HOLD(vp); 1736 args->a_vp = vp; 1737 ASSERT(cr != NULL); 1738 crhold(cr); 1739 args->a_cred = cr; 1740 args->a_io = NFS_READDIR; 1741 args->a_nfs_readdir = readdir; 1742 args->a_nfs_rdc = rdc; 1743 1744 mutex_enter(&mi->mi_async_lock); 1745 1746 /* 1747 * If asyncio has been disabled, then make a synchronous request. 1748 */ 1749 if (mi->mi_max_threads == 0) { 1750 mutex_exit(&mi->mi_async_lock); 1751 goto noasync; 1752 } 1753 1754 /* 1755 * Link request structure into the async list and 1756 * wakeup async thread to do the i/o. 
1757 */ 1758 if (mi->mi_async_reqs[NFS_READDIR] == NULL) { 1759 mi->mi_async_reqs[NFS_READDIR] = args; 1760 mi->mi_async_tail[NFS_READDIR] = args; 1761 } else { 1762 mi->mi_async_tail[NFS_READDIR]->a_next = args; 1763 mi->mi_async_tail[NFS_READDIR] = args; 1764 } 1765 1766 mutex_enter(&rp->r_statelock); 1767 rp->r_count++; 1768 mutex_exit(&rp->r_statelock); 1769 1770 if (mi->mi_io_kstats) { 1771 mutex_enter(&mi->mi_lock); 1772 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1773 mutex_exit(&mi->mi_lock); 1774 } 1775 1776 mi->mi_async_req_count++; 1777 ASSERT(mi->mi_async_req_count != 0); 1778 cv_signal(&mi->mi_async_reqs_cv); 1779 mutex_exit(&mi->mi_async_lock); 1780 return; 1781 1782 noasync: 1783 if (args != NULL) { 1784 VN_RELE(vp); 1785 crfree(cr); 1786 kmem_free(args, sizeof (*args)); 1787 } 1788 1789 rdc->entries = NULL; 1790 mutex_enter(&rp->r_statelock); 1791 ASSERT(rdc->flags & RDDIR); 1792 rdc->flags &= ~RDDIR; 1793 rdc->flags |= RDDIRREQ; 1794 /* 1795 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT 1796 * is set, wakeup the thread sleeping in cv_wait_sig(). 1797 * The woken up thread will reset the flag to RDDIR and will 1798 * continue with the readdir opeartion. 1799 */ 1800 if (rdc->flags & RDDIRWAIT) { 1801 rdc->flags &= ~RDDIRWAIT; 1802 cv_broadcast(&rdc->cv); 1803 } 1804 mutex_exit(&rp->r_statelock); 1805 rddir_cache_rele(rdc); 1806 } 1807 1808 void 1809 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 1810 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, 1811 cred_t *)) 1812 { 1813 rnode_t *rp; 1814 mntinfo_t *mi; 1815 struct nfs_async_reqs *args; 1816 page_t *pp; 1817 1818 rp = VTOR(vp); 1819 mi = VTOMI(vp); 1820 1821 /* 1822 * If we can't allocate a request structure, do the commit 1823 * operation synchronously in this thread's context. 1824 */ 1825 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1826 goto noasync; 1827 1828 args->a_next = NULL; 1829 #ifdef DEBUG 1830 args->a_queuer = curthread; 1831 #endif 1832 VN_HOLD(vp); 1833 args->a_vp = vp; 1834 ASSERT(cr != NULL); 1835 crhold(cr); 1836 args->a_cred = cr; 1837 args->a_io = NFS_COMMIT; 1838 args->a_nfs_commit = commit; 1839 args->a_nfs_plist = plist; 1840 args->a_nfs_offset = offset; 1841 args->a_nfs_count = count; 1842 1843 mutex_enter(&mi->mi_async_lock); 1844 1845 /* 1846 * If asyncio has been disabled, then make a synchronous request. 1847 * This check is done a second time in case async io was diabled 1848 * while this thread was blocked waiting for memory pressure to 1849 * reduce or for the queue to drain. 1850 */ 1851 if (mi->mi_max_threads == 0) { 1852 mutex_exit(&mi->mi_async_lock); 1853 goto noasync; 1854 } 1855 1856 /* 1857 * Link request structure into the async list and 1858 * wakeup async thread to do the i/o. 
1859 */ 1860 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) { 1861 mi->mi_async_reqs[NFS_COMMIT] = args; 1862 mi->mi_async_tail[NFS_COMMIT] = args; 1863 } else { 1864 mi->mi_async_tail[NFS_COMMIT]->a_next = args; 1865 mi->mi_async_tail[NFS_COMMIT] = args; 1866 } 1867 1868 mutex_enter(&rp->r_statelock); 1869 rp->r_count++; 1870 mutex_exit(&rp->r_statelock); 1871 1872 if (mi->mi_io_kstats) { 1873 mutex_enter(&mi->mi_lock); 1874 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1875 mutex_exit(&mi->mi_lock); 1876 } 1877 1878 mi->mi_async_req_count++; 1879 ASSERT(mi->mi_async_req_count != 0); 1880 cv_signal(&mi->mi_async_reqs_cv); 1881 mutex_exit(&mi->mi_async_lock); 1882 return; 1883 1884 noasync: 1885 if (args != NULL) { 1886 VN_RELE(vp); 1887 crfree(cr); 1888 kmem_free(args, sizeof (*args)); 1889 } 1890 1891 if (curproc == proc_pageout || curproc == proc_fsflush || 1892 nfs_zone() != mi->mi_zone) { 1893 while (plist != NULL) { 1894 pp = plist; 1895 page_sub(&plist, pp); 1896 pp->p_fsdata = C_COMMIT; 1897 page_unlock(pp); 1898 } 1899 return; 1900 } 1901 (*commit)(vp, plist, offset, count, cr); 1902 } 1903 1904 void 1905 nfs_async_inactive(vnode_t *vp, cred_t *cr, 1906 void (*inactive)(vnode_t *, cred_t *, caller_context_t *)) 1907 { 1908 mntinfo_t *mi; 1909 struct nfs_async_reqs *args; 1910 1911 mi = VTOMI(vp); 1912 1913 args = kmem_alloc(sizeof (*args), KM_SLEEP); 1914 args->a_next = NULL; 1915 #ifdef DEBUG 1916 args->a_queuer = curthread; 1917 #endif 1918 args->a_vp = vp; 1919 ASSERT(cr != NULL); 1920 crhold(cr); 1921 args->a_cred = cr; 1922 args->a_io = NFS_INACTIVE; 1923 args->a_nfs_inactive = inactive; 1924 1925 /* 1926 * Note that we don't check mi->mi_max_threads here, since we 1927 * *need* to get rid of this vnode regardless of whether someone 1928 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system. 1929 * 1930 * The manager thread knows about this and is willing to create 1931 * at least one thread to accommodate us. 1932 */ 1933 mutex_enter(&mi->mi_async_lock); 1934 if (mi->mi_manager_thread == NULL) { 1935 rnode_t *rp = VTOR(vp); 1936 1937 mutex_exit(&mi->mi_async_lock); 1938 crfree(cr); /* drop our reference */ 1939 kmem_free(args, sizeof (*args)); 1940 /* 1941 * We can't do an over-the-wire call since we're in the wrong 1942 * zone, so we need to clean up state as best we can and then 1943 * throw away the vnode. 1944 */ 1945 mutex_enter(&rp->r_statelock); 1946 if (rp->r_unldvp != NULL) { 1947 vnode_t *unldvp; 1948 char *unlname; 1949 cred_t *unlcred; 1950 1951 unldvp = rp->r_unldvp; 1952 rp->r_unldvp = NULL; 1953 unlname = rp->r_unlname; 1954 rp->r_unlname = NULL; 1955 unlcred = rp->r_unlcred; 1956 rp->r_unlcred = NULL; 1957 mutex_exit(&rp->r_statelock); 1958 1959 VN_RELE(unldvp); 1960 kmem_free(unlname, MAXNAMELEN); 1961 crfree(unlcred); 1962 } else { 1963 mutex_exit(&rp->r_statelock); 1964 } 1965 /* 1966 * No need to explicitly throw away any cached pages. The 1967 * eventual rinactive() will attempt a synchronous 1968 * VOP_PUTPAGE() which will immediately fail since the request 1969 * is coming from the wrong zone, and then will proceed to call 1970 * nfs_invalidate_pages() which will clean things up for us. 1971 */ 1972 rp_addfree(VTOR(vp), cr); 1973 return; 1974 } 1975 1976 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) { 1977 mi->mi_async_reqs[NFS_INACTIVE] = args; 1978 } else { 1979 mi->mi_async_tail[NFS_INACTIVE]->a_next = args; 1980 } 1981 mi->mi_async_tail[NFS_INACTIVE] = args; 1982 /* 1983 * Don't increment r_count, since we're trying to get rid of the vnode. 
1984 */ 1985 1986 mi->mi_async_req_count++; 1987 ASSERT(mi->mi_async_req_count != 0); 1988 cv_signal(&mi->mi_async_reqs_cv); 1989 mutex_exit(&mi->mi_async_lock); 1990 } 1991 1992 static void 1993 nfs_async_start(struct vfs *vfsp) 1994 { 1995 nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE); 1996 } 1997 1998 static void 1999 nfs_async_pgops_start(struct vfs *vfsp) 2000 { 2001 nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE); 2002 } 2003 2004 /* 2005 * The async queues for each mounted file system are arranged as a 2006 * set of queues, one for each async i/o type. Requests are taken 2007 * from the queues in a round-robin fashion. A number of consecutive 2008 * requests are taken from each queue before moving on to the next 2009 * queue. This functionality may allow the NFS Version 2 server to do 2010 * write clustering, even if the client is mixing writes and reads 2011 * because it will take multiple write requests from the queue 2012 * before processing any of the other async i/o types. 2013 * 2014 * XXX The nfs_async_common_start thread is unsafe in the light of the present 2015 * model defined by cpr to suspend the system. Specifically over the 2016 * wire calls are cpr-unsafe. The thread should be reevaluated in 2017 * case of future updates to the cpr model. 2018 */ 2019 static void 2020 nfs_async_common_start(struct vfs *vfsp, int async_queue) 2021 { 2022 struct nfs_async_reqs *args; 2023 mntinfo_t *mi = VFTOMI(vfsp); 2024 clock_t time_left = 1; 2025 callb_cpr_t cprinfo; 2026 int i; 2027 int async_types; 2028 kcondvar_t *async_work_cv; 2029 2030 if (async_queue == NFS_ASYNC_QUEUE) { 2031 async_types = NFS_ASYNC_TYPES; 2032 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE]; 2033 } else { 2034 async_types = NFS_ASYNC_PGOPS_TYPES; 2035 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]; 2036 } 2037 2038 /* 2039 * Dynamic initialization of nfs_async_timeout to allow nfs to be 2040 * built in an implementation independent manner. 2041 */ 2042 if (nfs_async_timeout == -1) 2043 nfs_async_timeout = NFS_ASYNC_TIMEOUT; 2044 2045 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas"); 2046 2047 mutex_enter(&mi->mi_async_lock); 2048 for (;;) { 2049 /* 2050 * Find the next queue containing an entry. We start 2051 * at the current queue pointer and then round robin 2052 * through all of them until we either find a non-empty 2053 * queue or have looked through all of them. 2054 */ 2055 for (i = 0; i < async_types; i++) { 2056 args = *mi->mi_async_curr[async_queue]; 2057 if (args != NULL) 2058 break; 2059 mi->mi_async_curr[async_queue]++; 2060 if (mi->mi_async_curr[async_queue] == 2061 &mi->mi_async_reqs[async_types]) { 2062 mi->mi_async_curr[async_queue] = 2063 &mi->mi_async_reqs[0]; 2064 } 2065 } 2066 /* 2067 * If we didn't find a entry, then block until woken up 2068 * again and then look through the queues again. 2069 */ 2070 if (args == NULL) { 2071 /* 2072 * Exiting is considered to be safe for CPR as well 2073 */ 2074 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2075 2076 /* 2077 * Wakeup thread waiting to unmount the file 2078 * system only if all async threads are inactive. 2079 * 2080 * If we've timed-out and there's nothing to do, 2081 * then get rid of this thread. 
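 * A mi_max_threads value of zero forces an exit as well; nfs_async_stop()
 * and nfs_async_stop_sig() rely on this to drain the worker threads.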
2082 */ 2083 if (mi->mi_max_threads == 0 || time_left <= 0) { 2084 --mi->mi_threads[async_queue]; 2085 2086 if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 && 2087 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0) 2088 cv_signal(&mi->mi_async_cv); 2089 CALLB_CPR_EXIT(&cprinfo); 2090 VFS_RELE(vfsp); /* release thread's hold */ 2091 zthread_exit(); 2092 /* NOTREACHED */ 2093 } 2094 time_left = cv_reltimedwait(async_work_cv, 2095 &mi->mi_async_lock, nfs_async_timeout, 2096 TR_CLOCK_TICK); 2097 2098 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 2099 2100 continue; 2101 } 2102 time_left = 1; 2103 2104 /* 2105 * Remove the request from the async queue and then 2106 * update the current async request queue pointer. If 2107 * the current queue is empty or we have removed enough 2108 * consecutive entries from it, then reset the counter 2109 * for this queue and then move the current pointer to 2110 * the next queue. 2111 */ 2112 *mi->mi_async_curr[async_queue] = args->a_next; 2113 if (*mi->mi_async_curr[async_queue] == NULL || 2114 --mi->mi_async_clusters[args->a_io] == 0) { 2115 mi->mi_async_clusters[args->a_io] = 2116 mi->mi_async_init_clusters; 2117 mi->mi_async_curr[async_queue]++; 2118 if (mi->mi_async_curr[async_queue] == 2119 &mi->mi_async_reqs[async_types]) { 2120 mi->mi_async_curr[async_queue] = 2121 &mi->mi_async_reqs[0]; 2122 } 2123 } 2124 2125 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) { 2126 mutex_enter(&mi->mi_lock); 2127 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 2128 mutex_exit(&mi->mi_lock); 2129 } 2130 2131 mutex_exit(&mi->mi_async_lock); 2132 2133 /* 2134 * Obtain arguments from the async request structure. 2135 */ 2136 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) { 2137 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff, 2138 args->a_nfs_addr, args->a_nfs_seg, 2139 args->a_cred); 2140 } else if (args->a_io == NFS_PUTAPAGE) { 2141 (void) (*args->a_nfs_putapage)(args->a_vp, 2142 args->a_nfs_pp, args->a_nfs_off, 2143 args->a_nfs_len, args->a_nfs_flags, 2144 args->a_cred); 2145 } else if (args->a_io == NFS_PAGEIO) { 2146 (void) (*args->a_nfs_pageio)(args->a_vp, 2147 args->a_nfs_pp, args->a_nfs_off, 2148 args->a_nfs_len, args->a_nfs_flags, 2149 args->a_cred); 2150 } else if (args->a_io == NFS_READDIR) { 2151 (void) ((*args->a_nfs_readdir)(args->a_vp, 2152 args->a_nfs_rdc, args->a_cred)); 2153 } else if (args->a_io == NFS_COMMIT) { 2154 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist, 2155 args->a_nfs_offset, args->a_nfs_count, 2156 args->a_cred); 2157 } else if (args->a_io == NFS_INACTIVE) { 2158 (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL); 2159 } 2160 2161 /* 2162 * Now, release the vnode and free the credentials 2163 * structure. 2164 */ 2165 free_async_args(args); 2166 /* 2167 * Reacquire the mutex because it will be needed above. 2168 */ 2169 mutex_enter(&mi->mi_async_lock); 2170 } 2171 } 2172 2173 void 2174 nfs_async_stop(struct vfs *vfsp) 2175 { 2176 mntinfo_t *mi = VFTOMI(vfsp); 2177 2178 /* 2179 * Wait for all outstanding async operations to complete and for the 2180 * worker threads to exit. 2181 */ 2182 mutex_enter(&mi->mi_async_lock); 2183 mi->mi_max_threads = 0; 2184 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2185 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2186 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) 2187 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2188 mutex_exit(&mi->mi_async_lock); 2189 } 2190 2191 /* 2192 * nfs_async_stop_sig: 2193 * Wait for all outstanding putpage operation to complete. 
If a signal 2194 * is delivered we will abort and return non-zero. If we can put all the 2195 * pages we will return 0. This routine is called from nfs_unmount and 2196 * nfs3_unmount to make these operations interruptible. 2197 */ 2198 int 2199 nfs_async_stop_sig(struct vfs *vfsp) 2200 { 2201 mntinfo_t *mi = VFTOMI(vfsp); 2202 ushort_t omax; 2203 int rval; 2204 2205 /* 2206 * Wait for all outstanding async operations to complete and for the 2207 * worker threads to exit. 2208 */ 2209 mutex_enter(&mi->mi_async_lock); 2210 omax = mi->mi_max_threads; 2211 mi->mi_max_threads = 0; 2212 /* 2213 * Tell all the worker threads to exit. 2214 */ 2215 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2216 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2217 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) { 2218 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) 2219 break; 2220 } 2221 rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2222 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */ 2223 if (rval) 2224 mi->mi_max_threads = omax; 2225 mutex_exit(&mi->mi_async_lock); 2226 2227 return (rval); 2228 } 2229 2230 int 2231 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2232 { 2233 int pagecreate; 2234 int n; 2235 int saved_n; 2236 caddr_t saved_base; 2237 u_offset_t offset; 2238 int error; 2239 int sm_error; 2240 vnode_t *vp = RTOV(rp); 2241 2242 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2243 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2244 if (!vpm_enable) { 2245 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2246 } 2247 2248 /* 2249 * Move bytes in at most PAGESIZE chunks. We must avoid 2250 * spanning pages in uiomove() because page faults may cause 2251 * the cache to be invalidated out from under us. The r_size is not 2252 * updated until after the uiomove. If we push the last page of a 2253 * file before r_size is correct, we will lose the data written past 2254 * the current (and invalid) r_size. 2255 */ 2256 do { 2257 offset = uio->uio_loffset; 2258 pagecreate = 0; 2259 2260 /* 2261 * n is the number of bytes required to satisfy the request 2262 * or the number of bytes to fill out the page. 2263 */ 2264 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2265 2266 /* 2267 * Check to see if we can skip reading in the page 2268 * and just allocate the memory. We can do this 2269 * if we are going to rewrite the entire mapping 2270 * or if we are going to write to or beyond the current 2271 * end of file from the beginning of the mapping. 2272 * 2273 * The read of r_size is now protected by r_statelock. 2274 */ 2275 mutex_enter(&rp->r_statelock); 2276 /* 2277 * When pgcreated is nonzero the caller has already done 2278 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2279 * segkpm this means we already have at least one page 2280 * created and mapped at base. 2281 */ 2282 pagecreate = pgcreated || 2283 ((offset & PAGEOFFSET) == 0 && 2284 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2285 2286 mutex_exit(&rp->r_statelock); 2287 if (!vpm_enable && pagecreate) { 2288 /* 2289 * The last argument tells segmap_pagecreate() to 2290 * always lock the page, as opposed to sometimes 2291 * returning with the page locked. This way we avoid a 2292 * fault on the ensuing uiomove(), but also 2293 * more importantly (to fix bug 1094402) we can 2294 * call segmap_fault() to unlock the page in all 2295 * cases.
An alternative would be to modify 2296 * segmap_pagecreate() to tell us when it is 2297 * locking a page, but that's a fairly major 2298 * interface change. 2299 */ 2300 if (pgcreated == 0) 2301 (void) segmap_pagecreate(segkmap, base, 2302 (uint_t)n, 1); 2303 saved_base = base; 2304 saved_n = n; 2305 } 2306 2307 /* 2308 * The number of bytes of data in the last page cannot 2309 * be accurately determined while the page is being 2310 * uiomove'd to and the size of the file is being updated. 2311 * Thus, inform threads which need to know accurately 2312 * how much data is in the last page of the file. They 2313 * will not do the i/o immediately, but will arrange for 2314 * the i/o to happen later when this modify operation 2315 * will have finished. 2316 */ 2317 ASSERT(!(rp->r_flags & RMODINPROGRESS)); 2318 mutex_enter(&rp->r_statelock); 2319 rp->r_flags |= RMODINPROGRESS; 2320 rp->r_modaddr = (offset & MAXBMASK); 2321 mutex_exit(&rp->r_statelock); 2322 2323 if (vpm_enable) { 2324 /* 2325 * Copy data. If new pages are created, part of 2326 * the page that is not written will be initialized 2327 * with zeros. 2328 */ 2329 error = vpm_data_copy(vp, offset, n, uio, 2330 !pagecreate, NULL, 0, S_WRITE); 2331 } else { 2332 error = uiomove(base, n, UIO_WRITE, uio); 2333 } 2334 2335 /* 2336 * r_size is the maximum number of 2337 * bytes known to be in the file. 2338 * Make sure it is at least as high as the 2339 * first unwritten byte pointed to by uio_loffset. 2340 */ 2341 mutex_enter(&rp->r_statelock); 2342 if (rp->r_size < uio->uio_loffset) 2343 rp->r_size = uio->uio_loffset; 2344 rp->r_flags &= ~RMODINPROGRESS; 2345 rp->r_flags |= RDIRTY; 2346 mutex_exit(&rp->r_statelock); 2347 2348 /* n = # of bytes written */ 2349 n = (int)(uio->uio_loffset - offset); 2350 2351 if (!vpm_enable) { 2352 base += n; 2353 } 2354 tcount -= n; 2355 /* 2356 * If we created pages w/o initializing them completely, 2357 * we need to zero the part that wasn't set up. 2358 * This happens in most EOF write cases and if 2359 * we had some sort of error during the uiomove. 2360 */ 2361 if (!vpm_enable && pagecreate) { 2362 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2363 (void) kzero(base, PAGESIZE - n); 2364 2365 if (pgcreated) { 2366 /* 2367 * Caller is responsible for this page, 2368 * it was not created in this loop. 2369 */ 2370 pgcreated = 0; 2371 } else { 2372 /* 2373 * For bug 1094402: segmap_pagecreate locks 2374 * page. Unlock it. This also unlocks the 2375 * pages allocated by page_create_va() in 2376 * segmap_pagecreate(). 2377 */ 2378 sm_error = segmap_fault(kas.a_hat, segkmap, 2379 saved_base, saved_n, 2380 F_SOFTUNLOCK, S_WRITE); 2381 if (error == 0) 2382 error = sm_error; 2383 } 2384 } 2385 } while (tcount > 0 && error == 0); 2386 2387 return (error); 2388 } 2389 2390 int 2391 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2392 { 2393 rnode_t *rp; 2394 page_t *pp; 2395 u_offset_t eoff; 2396 u_offset_t io_off; 2397 size_t io_len; 2398 int error; 2399 int rdirty; 2400 int err; 2401 2402 rp = VTOR(vp); 2403 ASSERT(rp->r_count > 0); 2404 2405 if (!vn_has_cached_data(vp)) 2406 return (0); 2407 2408 ASSERT(vp->v_type != VCHR); 2409 2410 /* 2411 * If ROUTOFSPACE is set, then all writes turn into B_INVAL 2412 * writes. B_FORCE is set to force the VM system to actually 2413 * invalidate the pages, even if the i/o failed.
The pages 2414 * need to get invalidated because they can't be written out 2415 * because there isn't any space left on either the server's 2416 * file system or in the user's disk quota. The B_FREE bit 2417 * is cleared to avoid confusion as to whether this is a 2418 * request to place the page on the freelist or to destroy 2419 * it. 2420 */ 2421 if ((rp->r_flags & ROUTOFSPACE) || 2422 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2423 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2424 2425 if (len == 0) { 2426 /* 2427 * If doing a full file synchronous operation, then clear 2428 * the RDIRTY bit. If a page gets dirtied while the flush 2429 * is happening, then RDIRTY will get set again. The 2430 * RDIRTY bit must get cleared before the flush so that 2431 * we don't lose this information. 2432 * 2433 * If there are no full file async write operations 2434 * pending and RDIRTY bit is set, clear it. 2435 */ 2436 if (off == (u_offset_t)0 && 2437 !(flags & B_ASYNC) && 2438 (rp->r_flags & RDIRTY)) { 2439 mutex_enter(&rp->r_statelock); 2440 rdirty = (rp->r_flags & RDIRTY); 2441 rp->r_flags &= ~RDIRTY; 2442 mutex_exit(&rp->r_statelock); 2443 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2444 mutex_enter(&rp->r_statelock); 2445 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) { 2446 rdirty = (rp->r_flags & RDIRTY); 2447 rp->r_flags &= ~RDIRTY; 2448 } 2449 mutex_exit(&rp->r_statelock); 2450 } else 2451 rdirty = 0; 2452 2453 /* 2454 * Search the entire vp list for pages >= off, and flush 2455 * the dirty pages. 2456 */ 2457 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2458 flags, cr); 2459 2460 /* 2461 * If an error occurred and the file was marked as dirty 2462 * before and we aren't forcibly invalidating pages, then 2463 * reset the RDIRTY flag. 2464 */ 2465 if (error && rdirty && 2466 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2467 mutex_enter(&rp->r_statelock); 2468 rp->r_flags |= RDIRTY; 2469 mutex_exit(&rp->r_statelock); 2470 } 2471 } else { 2472 /* 2473 * Do a range from [off...off + len) looking for pages 2474 * to deal with. 2475 */ 2476 error = 0; 2477 #ifdef lint 2478 io_len = 0; 2479 #endif 2480 eoff = off + len; 2481 mutex_enter(&rp->r_statelock); 2482 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2483 io_off += io_len) { 2484 mutex_exit(&rp->r_statelock); 2485 /* 2486 * If we are not invalidating, synchronously 2487 * freeing or writing pages use the routine 2488 * page_lookup_nowait() to prevent reclaiming 2489 * them from the free list. 2490 */ 2491 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2492 pp = page_lookup(vp, io_off, 2493 (flags & (B_INVAL | B_FREE)) ? 2494 SE_EXCL : SE_SHARED); 2495 } else { 2496 pp = page_lookup_nowait(vp, io_off, 2497 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2498 } 2499 2500 if (pp == NULL || !pvn_getdirty(pp, flags)) 2501 io_len = PAGESIZE; 2502 else { 2503 err = (*rp->r_putapage)(vp, pp, &io_off, 2504 &io_len, flags, cr); 2505 if (!error) 2506 error = err; 2507 /* 2508 * "io_off" and "io_len" are returned as 2509 * the range of pages we actually wrote. 2510 * This allows us to skip ahead more quickly 2511 * since several pages may've been dealt 2512 * with by this iteration of the loop. 
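 * Note that r_statelock is reacquired before the loop condition is
 * re-evaluated, so the comparison of io_off against r_size is made
 * with the lock held.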
2513 */ 2514 } 2515 mutex_enter(&rp->r_statelock); 2516 } 2517 mutex_exit(&rp->r_statelock); 2518 } 2519 2520 return (error); 2521 } 2522 2523 void 2524 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2525 { 2526 rnode_t *rp; 2527 2528 rp = VTOR(vp); 2529 mutex_enter(&rp->r_statelock); 2530 while (rp->r_flags & RTRUNCATE) 2531 cv_wait(&rp->r_cv, &rp->r_statelock); 2532 rp->r_flags |= RTRUNCATE; 2533 if (off == (u_offset_t)0) { 2534 rp->r_flags &= ~RDIRTY; 2535 if (!(rp->r_flags & RSTALE)) 2536 rp->r_error = 0; 2537 } 2538 rp->r_truncaddr = off; 2539 mutex_exit(&rp->r_statelock); 2540 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2541 B_INVAL | B_TRUNC, cr); 2542 mutex_enter(&rp->r_statelock); 2543 rp->r_flags &= ~RTRUNCATE; 2544 cv_broadcast(&rp->r_cv); 2545 mutex_exit(&rp->r_statelock); 2546 } 2547 2548 static int nfs_write_error_to_cons_only = 0; 2549 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1) 2550 2551 /* 2552 * Print a file handle 2553 */ 2554 void 2555 nfs_printfhandle(nfs_fhandle *fhp) 2556 { 2557 int *ip; 2558 char *buf; 2559 size_t bufsize; 2560 char *cp; 2561 2562 /* 2563 * 13 == "(file handle:" 2564 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times 2565 * 1 == ' ' 2566 * 8 == maximum strlen of "%x" 2567 * 3 == ")\n\0" 2568 */ 2569 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3; 2570 buf = kmem_alloc(bufsize, KM_NOSLEEP); 2571 if (buf == NULL) 2572 return; 2573 2574 cp = buf; 2575 (void) strcpy(cp, "(file handle:"); 2576 while (*cp != '\0') 2577 cp++; 2578 for (ip = (int *)fhp->fh_buf; 2579 ip < (int *)&fhp->fh_buf[fhp->fh_len]; 2580 ip++) { 2581 (void) sprintf(cp, " %x", *ip); 2582 while (*cp != '\0') 2583 cp++; 2584 } 2585 (void) strcpy(cp, ")\n"); 2586 2587 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf); 2588 2589 kmem_free(buf, bufsize); 2590 } 2591 2592 /* 2593 * Notify the system administrator that an NFS write error has 2594 * occurred. 2595 */ 2596 2597 /* seconds between ENOSPC/EDQUOT messages */ 2598 clock_t nfs_write_error_interval = 5; 2599 2600 void 2601 nfs_write_error(vnode_t *vp, int error, cred_t *cr) 2602 { 2603 mntinfo_t *mi; 2604 clock_t now; 2605 2606 mi = VTOMI(vp); 2607 /* 2608 * In case of forced unmount or zone shutdown, do not print any 2609 * messages since it can flood the console with error messages. 2610 */ 2611 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) 2612 return; 2613 2614 /* 2615 * No use in flooding the console with ENOSPC 2616 * messages from the same file system. 
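 * The mi_printftime check below throttles ENOSPC/EDQUOT reports to
 * one every nfs_write_error_interval seconds per mount.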
2617 */ 2618 now = ddi_get_lbolt(); 2619 if ((error != ENOSPC && error != EDQUOT) || 2620 now - mi->mi_printftime > 0) { 2621 zoneid_t zoneid = mi->mi_zone->zone_id; 2622 2623 #ifdef DEBUG 2624 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2625 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL); 2626 #else 2627 nfs_perror(error, "NFS write error on host %s: %m.\n", 2628 VTOR(vp)->r_server->sv_hostname, NULL); 2629 #endif 2630 if (error == ENOSPC || error == EDQUOT) { 2631 zcmn_err(zoneid, CE_CONT, 2632 MSG("^File: userid=%d, groupid=%d\n"), 2633 crgetuid(cr), crgetgid(cr)); 2634 if (crgetuid(CRED()) != crgetuid(cr) || 2635 crgetgid(CRED()) != crgetgid(cr)) { 2636 zcmn_err(zoneid, CE_CONT, 2637 MSG("^User: userid=%d, groupid=%d\n"), 2638 crgetuid(CRED()), crgetgid(CRED())); 2639 } 2640 mi->mi_printftime = now + 2641 nfs_write_error_interval * hz; 2642 } 2643 nfs_printfhandle(&VTOR(vp)->r_fh); 2644 #ifdef DEBUG 2645 if (error == EACCES) { 2646 zcmn_err(zoneid, CE_CONT, 2647 MSG("^nfs_bio: cred is%s kcred\n"), 2648 cr == kcred ? "" : " not"); 2649 } 2650 #endif 2651 } 2652 } 2653 2654 /* ARGSUSED */ 2655 static void * 2656 nfs_mi_init(zoneid_t zoneid) 2657 { 2658 struct mi_globals *mig; 2659 2660 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2661 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2662 list_create(&mig->mig_list, sizeof (mntinfo_t), 2663 offsetof(mntinfo_t, mi_zone_node)); 2664 mig->mig_destructor_called = B_FALSE; 2665 return (mig); 2666 } 2667 2668 /* 2669 * Callback routine to tell all NFS mounts in the zone to stop creating new 2670 * threads. Existing threads should exit. 2671 */ 2672 /* ARGSUSED */ 2673 static void 2674 nfs_mi_shutdown(zoneid_t zoneid, void *data) 2675 { 2676 struct mi_globals *mig = data; 2677 mntinfo_t *mi; 2678 2679 ASSERT(mig != NULL); 2680 again: 2681 mutex_enter(&mig->mig_lock); 2682 for (mi = list_head(&mig->mig_list); mi != NULL; 2683 mi = list_next(&mig->mig_list, mi)) { 2684 2685 /* 2686 * If we've done the shutdown work for this FS, skip. 2687 * Once we go off the end of the list, we're done. 2688 */ 2689 if (mi->mi_flags & MI_DEAD) 2690 continue; 2691 2692 /* 2693 * We will do work, so not done. Get a hold on the FS. 2694 */ 2695 VFS_HOLD(mi->mi_vfsp); 2696 2697 /* 2698 * purge the DNLC for this filesystem 2699 */ 2700 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2701 2702 mutex_enter(&mi->mi_async_lock); 2703 /* 2704 * Tell existing async worker threads to exit. 2705 */ 2706 mi->mi_max_threads = 0; 2707 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2708 /* 2709 * Set MI_ASYNC_MGR_STOP so the async manager thread starts 2710 * getting ready to exit when it's done with its current work. 2711 * Also set MI_DEAD to note we've acted on this FS. 2712 */ 2713 mutex_enter(&mi->mi_lock); 2714 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD); 2715 mutex_exit(&mi->mi_lock); 2716 /* 2717 * Wake up the async manager thread. 2718 */ 2719 cv_broadcast(&mi->mi_async_reqs_cv); 2720 mutex_exit(&mi->mi_async_lock); 2721 2722 /* 2723 * Drop lock and release FS, which may change list, then repeat. 2724 * We're done when every mi has been done or the list is empty. 
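 * Restarting the scan from the head of the list is safe because
 * entries that have already been handled are marked MI_DEAD and
 * skipped above.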
2725 */ 2726 mutex_exit(&mig->mig_lock); 2727 VFS_RELE(mi->mi_vfsp); 2728 goto again; 2729 } 2730 mutex_exit(&mig->mig_lock); 2731 } 2732 2733 static void 2734 nfs_mi_free_globals(struct mi_globals *mig) 2735 { 2736 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2737 mutex_destroy(&mig->mig_lock); 2738 kmem_free(mig, sizeof (*mig)); 2739 2740 } 2741 2742 /* ARGSUSED */ 2743 static void 2744 nfs_mi_destroy(zoneid_t zoneid, void *data) 2745 { 2746 struct mi_globals *mig = data; 2747 2748 ASSERT(mig != NULL); 2749 mutex_enter(&mig->mig_lock); 2750 if (list_head(&mig->mig_list) != NULL) { 2751 /* Still waiting for VFS_FREEVFS() */ 2752 mig->mig_destructor_called = B_TRUE; 2753 mutex_exit(&mig->mig_lock); 2754 return; 2755 } 2756 nfs_mi_free_globals(mig); 2757 } 2758 2759 /* 2760 * Add an NFS mount to the per-zone list of NFS mounts. 2761 */ 2762 void 2763 nfs_mi_zonelist_add(mntinfo_t *mi) 2764 { 2765 struct mi_globals *mig; 2766 2767 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2768 mutex_enter(&mig->mig_lock); 2769 list_insert_head(&mig->mig_list, mi); 2770 mutex_exit(&mig->mig_lock); 2771 } 2772 2773 /* 2774 * Remove an NFS mount from the per-zone list of NFS mounts. 2775 */ 2776 static void 2777 nfs_mi_zonelist_remove(mntinfo_t *mi) 2778 { 2779 struct mi_globals *mig; 2780 2781 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2782 mutex_enter(&mig->mig_lock); 2783 list_remove(&mig->mig_list, mi); 2784 /* 2785 * We can be called asynchronously by VFS_FREEVFS() after the zone 2786 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2787 * mi globals. 2788 */ 2789 if (list_head(&mig->mig_list) == NULL && 2790 mig->mig_destructor_called == B_TRUE) { 2791 nfs_mi_free_globals(mig); 2792 return; 2793 } 2794 mutex_exit(&mig->mig_lock); 2795 } 2796 2797 /* 2798 * NFS Client initialization routine. This routine should only be called 2799 * once. It performs the following tasks: 2800 * - Initialize all global locks 2801 * - Call sub-initialization routines (localize access to variables) 2802 */ 2803 int 2804 nfs_clntinit(void) 2805 { 2806 #ifdef DEBUG 2807 static boolean_t nfs_clntup = B_FALSE; 2808 #endif 2809 int error; 2810 2811 #ifdef DEBUG 2812 ASSERT(nfs_clntup == B_FALSE); 2813 #endif 2814 2815 error = nfs_subrinit(); 2816 if (error) 2817 return (error); 2818 2819 error = nfs_vfsinit(); 2820 if (error) { 2821 /* 2822 * Cleanup nfs_subrinit() work 2823 */ 2824 nfs_subrfini(); 2825 return (error); 2826 } 2827 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown, 2828 nfs_mi_destroy); 2829 2830 nfs4_clnt_init(); 2831 2832 #ifdef DEBUG 2833 nfs_clntup = B_TRUE; 2834 #endif 2835 2836 return (0); 2837 } 2838 2839 /* 2840 * This routine is only called if the NFS Client has been initialized but 2841 * the module failed to be installed. This routine will clean up the previously 2842 * allocated/initialized work. 2843 */ 2844 void 2845 nfs_clntfini(void) 2846 { 2847 (void) zone_key_delete(mi_list_key); 2848 nfs_subrfini(); 2849 nfs_vfsfini(); 2850 nfs4_clnt_fini(); 2851 } 2852 2853 /* 2854 * nfs_lockrelease: 2855 * 2856 * Release any locks on the given vnode that are held by the current 2857 * process. 2858 */ 2859 void 2860 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 2861 { 2862 flock64_t ld; 2863 struct shrlock shr; 2864 char *buf; 2865 int remote_lock_possible; 2866 int ret; 2867 2868 ASSERT((uintptr_t)vp > KERNELBASE); 2869 2870 /* 2871 * Generate an explicit unlock operation for the entire file.
As a 2872 * partial optimization, only generate the unlock if there is a 2873 * lock registered for the file. We could check whether this 2874 * particular process has any locks on the file, but that would 2875 * require the local locking code to provide yet another query 2876 * routine. Note that no explicit synchronization is needed here. 2877 * At worst, flk_has_remote_locks() will return a false positive, 2878 * in which case the unlock call wastes time but doesn't harm 2879 * correctness. 2880 * 2881 * In addition, an unlock request is generated if the process 2882 * is listed as possibly having a lock on the file because the 2883 * server and client lock managers may have gotten out of sync. 2884 * N.B. It is important to make sure nfs_remove_locking_id() is 2885 * called here even if flk_has_remote_locks(vp) reports true. 2886 * If it is not called and there is an entry on the process id 2887 * list, that entry will never get removed. 2888 */ 2889 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID, 2890 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2891 if (remote_lock_possible || flk_has_remote_locks(vp)) { 2892 ld.l_type = F_UNLCK; /* set to unlock entire file */ 2893 ld.l_whence = 0; /* unlock from start of file */ 2894 ld.l_start = 0; 2895 ld.l_len = 0; /* do entire file */ 2896 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr, 2897 NULL); 2898 2899 if (ret != 0) { 2900 /* 2901 * If VOP_FRLOCK fails, make sure we unregister 2902 * local locks before we continue. 2903 */ 2904 ld.l_pid = ttoproc(curthread)->p_pid; 2905 lm_register_lock_locally(vp, NULL, &ld, flag, offset); 2906 #ifdef DEBUG 2907 nfs_perror(ret, 2908 "NFS lock release error on vp %p: %m.\n", 2909 (void *)vp, NULL); 2910 #endif 2911 } 2912 2913 /* 2914 * The call to VOP_FRLOCK may put the pid back on the 2915 * list. We need to remove it. 2916 */ 2917 (void) nfs_remove_locking_id(vp, RLMPL_PID, 2918 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2919 } 2920 2921 /* 2922 * As long as the vp has a share matching our pid, 2923 * pluck it off and unshare it. There are circumstances in 2924 * which the call to nfs_remove_locking_id() may put the 2925 * owner back on the list, in which case we simply do a 2926 * redundant and harmless unshare. 2927 */ 2928 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP); 2929 while (nfs_remove_locking_id(vp, RLMPL_OWNER, 2930 (char *)NULL, buf, &shr.s_own_len)) { 2931 shr.s_owner = buf; 2932 shr.s_access = 0; 2933 shr.s_deny = 0; 2934 shr.s_sysid = 0; 2935 shr.s_pid = curproc->p_pid; 2936 2937 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL); 2938 #ifdef DEBUG 2939 if (ret != 0) { 2940 nfs_perror(ret, 2941 "NFS share release error on vp %p: %m.\n", 2942 (void *)vp, NULL); 2943 } 2944 #endif 2945 } 2946 kmem_free(buf, MAX_SHR_OWNER_LEN); 2947 } 2948 2949 /* 2950 * nfs_lockcompletion: 2951 * 2952 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2953 * as non cachable (set VNOCACHE bit). 
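 * If lm_safemap() reports that the file can safely be cached, the
 * VNOCACHE bit is cleared instead. The cached attributes are purged
 * in either case.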
2954 */ 2955 2956 void 2957 nfs_lockcompletion(vnode_t *vp, int cmd) 2958 { 2959 #ifdef DEBUG 2960 rnode_t *rp = VTOR(vp); 2961 2962 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2963 #endif 2964 2965 if (cmd == F_SETLK || cmd == F_SETLKW) { 2966 if (!lm_safemap(vp)) { 2967 mutex_enter(&vp->v_lock); 2968 vp->v_flag |= VNOCACHE; 2969 mutex_exit(&vp->v_lock); 2970 } else { 2971 mutex_enter(&vp->v_lock); 2972 vp->v_flag &= ~VNOCACHE; 2973 mutex_exit(&vp->v_lock); 2974 } 2975 } 2976 /* 2977 * The cached attributes of the file are stale after acquiring 2978 * the lock on the file. They were updated when the file was 2979 * opened, but not updated when the lock was acquired. Therefore the 2980 * cached attributes are invalidated after the lock is obtained. 2981 */ 2982 PURGE_ATTRCACHE(vp); 2983 } 2984 2985 /* 2986 * The lock manager holds state making it possible for the client 2987 * and server to be out of sync. For example, if the response from 2988 * the server granting a lock request is lost, the server will think 2989 * the lock is granted and the client will think the lock is lost. 2990 * The client can tell when it is not certain whether it is in sync with 2991 * the server. 2992 * 2993 * To deal with this, a list of processes for which the client is 2994 * not sure if the server holds a lock is attached to the rnode. 2995 * When such a process closes the rnode, an unlock request is sent 2996 * to the server to unlock the entire file. 2997 * 2998 * The list is kept as a singly linked, NULL-terminated list. 2999 * Because it is only added to under extreme error conditions, the 3000 * list shouldn't get very big. DEBUG kernels print a message if 3001 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily 3002 * chosen to be 8, but can be tuned at runtime. 3003 */ 3004 #ifdef DEBUG 3005 /* int nfs_lmpl_high_water = 8; */ 3006 int nfs_lmpl_high_water = 128; 3007 int nfs_cnt_add_locking_id = 0; 3008 int nfs_len_add_locking_id = 0; 3009 #endif /* DEBUG */ 3010 3011 /* 3012 * Record that the nfs lock manager server may be holding a lock on 3013 * a vnode for a process. 3014 * 3015 * Because the nfs lock manager server holds state, it is possible 3016 * for the server to get out of sync with the client. This routine is called 3017 * from the client when it is no longer sure if the server is in sync 3018 * with the client.
nfs_lockrelease() will then notice this and send 3019 * an unlock request when the file is closed 3020 */ 3021 void 3022 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len) 3023 { 3024 rnode_t *rp; 3025 lmpl_t *new; 3026 lmpl_t *cur; 3027 lmpl_t **lmplp; 3028 #ifdef DEBUG 3029 int list_len = 1; 3030 #endif /* DEBUG */ 3031 3032 #ifdef DEBUG 3033 ++nfs_cnt_add_locking_id; 3034 #endif /* DEBUG */ 3035 /* 3036 * allocate new lmpl_t now so we don't sleep 3037 * later after grabbing mutexes 3038 */ 3039 ASSERT(len < MAX_SHR_OWNER_LEN); 3040 new = kmem_alloc(sizeof (*new), KM_SLEEP); 3041 new->lmpl_type = type; 3042 new->lmpl_pid = pid; 3043 new->lmpl_owner = kmem_alloc(len, KM_SLEEP); 3044 bcopy(id, new->lmpl_owner, len); 3045 new->lmpl_own_len = len; 3046 new->lmpl_next = (lmpl_t *)NULL; 3047 #ifdef DEBUG 3048 if (type == RLMPL_PID) { 3049 ASSERT(len == sizeof (pid_t)); 3050 ASSERT(pid == *(pid_t *)new->lmpl_owner); 3051 } else { 3052 ASSERT(type == RLMPL_OWNER); 3053 } 3054 #endif 3055 3056 rp = VTOR(vp); 3057 mutex_enter(&rp->r_statelock); 3058 3059 /* 3060 * Add this id to the list for this rnode only if the 3061 * rnode is active and the id is not already there. 3062 */ 3063 ASSERT(rp->r_flags & RHASHED); 3064 lmplp = &(rp->r_lmpl); 3065 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3066 if (cur->lmpl_pid == pid && 3067 cur->lmpl_type == type && 3068 cur->lmpl_own_len == len && 3069 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) { 3070 kmem_free(new->lmpl_owner, len); 3071 kmem_free(new, sizeof (*new)); 3072 break; 3073 } 3074 lmplp = &cur->lmpl_next; 3075 #ifdef DEBUG 3076 ++list_len; 3077 #endif /* DEBUG */ 3078 } 3079 if (cur == (lmpl_t *)NULL) { 3080 *lmplp = new; 3081 #ifdef DEBUG 3082 if (list_len > nfs_len_add_locking_id) { 3083 nfs_len_add_locking_id = list_len; 3084 } 3085 if (list_len > nfs_lmpl_high_water) { 3086 cmn_err(CE_WARN, "nfs_add_locking_id: long list " 3087 "vp=%p is %d", (void *)vp, list_len); 3088 } 3089 #endif /* DEBUG */ 3090 } 3091 3092 #ifdef DEBUG 3093 if (share_debug) { 3094 int nitems = 0; 3095 int npids = 0; 3096 int nowners = 0; 3097 3098 /* 3099 * Count the number of things left on r_lmpl after the remove. 3100 */ 3101 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3102 cur = cur->lmpl_next) { 3103 nitems++; 3104 if (cur->lmpl_type == RLMPL_PID) { 3105 npids++; 3106 } else if (cur->lmpl_type == RLMPL_OWNER) { 3107 nowners++; 3108 } else { 3109 cmn_err(CE_PANIC, "nfs_add_locking_id: " 3110 "unrecognized lmpl_type %d", 3111 cur->lmpl_type); 3112 } 3113 } 3114 3115 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d " 3116 "OWNs = %d items left on r_lmpl\n", 3117 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems); 3118 } 3119 #endif 3120 3121 mutex_exit(&rp->r_statelock); 3122 } 3123 3124 /* 3125 * Remove an id from the lock manager id list. 3126 * 3127 * If the id is not in the list return 0. If it was found and 3128 * removed, return 1. 3129 */ 3130 static int 3131 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen) 3132 { 3133 lmpl_t *cur; 3134 lmpl_t **lmplp; 3135 rnode_t *rp; 3136 int rv = 0; 3137 3138 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER); 3139 3140 rp = VTOR(vp); 3141 3142 mutex_enter(&rp->r_statelock); 3143 ASSERT(rp->r_flags & RHASHED); 3144 lmplp = &(rp->r_lmpl); 3145 3146 /* 3147 * Search through the list and remove the entry for this id 3148 * if it is there. 
The special case id == NULL allows removal 3149 * of the first share on the r_lmpl list belonging to the 3150 * current process (if any), without regard to further details 3151 * of its identity. 3152 */ 3153 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3154 if (cur->lmpl_type == type && 3155 cur->lmpl_pid == curproc->p_pid && 3156 (id == (char *)NULL || 3157 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) { 3158 *lmplp = cur->lmpl_next; 3159 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN); 3160 if (rid != NULL) { 3161 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len); 3162 *rlen = cur->lmpl_own_len; 3163 } 3164 kmem_free(cur->lmpl_owner, cur->lmpl_own_len); 3165 kmem_free(cur, sizeof (*cur)); 3166 rv = 1; 3167 break; 3168 } 3169 lmplp = &cur->lmpl_next; 3170 } 3171 3172 #ifdef DEBUG 3173 if (share_debug) { 3174 int nitems = 0; 3175 int npids = 0; 3176 int nowners = 0; 3177 3178 /* 3179 * Count the number of things left on r_lmpl after the remove. 3180 */ 3181 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3182 cur = cur->lmpl_next) { 3183 nitems++; 3184 if (cur->lmpl_type == RLMPL_PID) { 3185 npids++; 3186 } else if (cur->lmpl_type == RLMPL_OWNER) { 3187 nowners++; 3188 } else { 3189 cmn_err(CE_PANIC, 3190 "nrli: unrecognized lmpl_type %d", 3191 cur->lmpl_type); 3192 } 3193 } 3194 3195 cmn_err(CE_CONT, 3196 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n", 3197 (type == RLMPL_PID) ? "P" : "O", 3198 npids, 3199 nowners, 3200 nitems); 3201 } 3202 #endif 3203 3204 mutex_exit(&rp->r_statelock); 3205 return (rv); 3206 } 3207 3208 void 3209 nfs_free_mi(mntinfo_t *mi) 3210 { 3211 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP); 3212 ASSERT(mi->mi_manager_thread == NULL); 3213 ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 && 3214 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0); 3215 3216 /* 3217 * Remove the node from the global list before we start tearing it down. 3218 */ 3219 nfs_mi_zonelist_remove(mi); 3220 if (mi->mi_klmconfig) { 3221 lm_free_config(mi->mi_klmconfig); 3222 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig)); 3223 } 3224 mutex_destroy(&mi->mi_lock); 3225 mutex_destroy(&mi->mi_remap_lock); 3226 mutex_destroy(&mi->mi_async_lock); 3227 cv_destroy(&mi->mi_failover_cv); 3228 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]); 3229 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]); 3230 cv_destroy(&mi->mi_async_reqs_cv); 3231 cv_destroy(&mi->mi_async_cv); 3232 zone_rele(mi->mi_zone); 3233 kmem_free(mi, sizeof (*mi)); 3234 } 3235 3236 static int 3237 mnt_kstat_update(kstat_t *ksp, int rw) 3238 { 3239 mntinfo_t *mi; 3240 struct mntinfo_kstat *mik; 3241 vfs_t *vfsp; 3242 int i; 3243 3244 /* this is a read-only kstat. Bail out on a write */ 3245 if (rw == KSTAT_WRITE) 3246 return (EACCES); 3247 3248 /* 3249 * We don't want to wait here as kstat_chain_lock could be held by 3250 * dounmount(). dounmount() takes vfs_reflock before the chain lock 3251 * and thus could lead to a deadlock. 
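 * The update simply copies fields from the mntinfo into the kstat's
 * ks_data buffer and takes no locks of its own.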
3252 */ 3253 vfsp = (struct vfs *)ksp->ks_private; 3254 3255 3256 mi = VFTOMI(vfsp); 3257 3258 mik = (struct mntinfo_kstat *)ksp->ks_data; 3259 3260 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 3261 mik->mik_vers = (uint32_t)mi->mi_vers; 3262 mik->mik_flags = mi->mi_flags; 3263 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod; 3264 mik->mik_curread = (uint32_t)mi->mi_curread; 3265 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 3266 mik->mik_retrans = mi->mi_retrans; 3267 mik->mik_timeo = mi->mi_timeo; 3268 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 3269 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 3270 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 3271 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 3272 for (i = 0; i < NFS_CALLTYPES + 1; i++) { 3273 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt; 3274 mik->mik_timers[i].deviate = 3275 (uint32_t)mi->mi_timers[i].rt_deviate; 3276 mik->mik_timers[i].rtxcur = 3277 (uint32_t)mi->mi_timers[i].rt_rtxcur; 3278 } 3279 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 3280 mik->mik_failover = (uint32_t)mi->mi_failover; 3281 mik->mik_remap = (uint32_t)mi->mi_remap; 3282 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 3283 3284 return (0); 3285 } 3286 3287 void 3288 nfs_mnt_kstat_init(struct vfs *vfsp) 3289 { 3290 mntinfo_t *mi = VFTOMI(vfsp); 3291 3292 /* 3293 * Create the version specific kstats. 3294 * 3295 * PSARC 2001/697 Contract Private Interface 3296 * All nfs kstats are under SunMC contract 3297 * Please refer to the PSARC listed above and contact 3298 * SunMC before making any changes! 3299 * 3300 * Changes must be reviewed by Solaris File Sharing 3301 * Changes must be communicated to contract-2001-697@sun.com 3302 * 3303 */ 3304 3305 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 3306 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 3307 if (mi->mi_io_kstats) { 3308 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3309 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 3310 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 3311 kstat_install(mi->mi_io_kstats); 3312 } 3313 3314 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 3315 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 3316 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 3317 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3318 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 3319 mi->mi_ro_kstats->ks_update = mnt_kstat_update; 3320 mi->mi_ro_kstats->ks_private = (void *)vfsp; 3321 kstat_install(mi->mi_ro_kstats); 3322 } 3323 } 3324 3325 nfs_delmapcall_t * 3326 nfs_init_delmapcall() 3327 { 3328 nfs_delmapcall_t *delmap_call; 3329 3330 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP); 3331 delmap_call->call_id = curthread; 3332 delmap_call->error = 0; 3333 3334 return (delmap_call); 3335 } 3336 3337 void 3338 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call) 3339 { 3340 kmem_free(delmap_call, sizeof (nfs_delmapcall_t)); 3341 } 3342 3343 /* 3344 * Searches for the current delmap caller (based on curthread) in the list of 3345 * callers. If it is found, we remove it and free the delmap caller. 3346 * Returns: 3347 * 0 if the caller wasn't found 3348 * 1 if the caller was found, removed and freed. *errp is set to what 3349 * the result of the delmap was. 
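 * The r_indelmap list is created lazily; the RDELMAPLIST flag in
 * r_flags indicates that it exists.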
3350 */ 3351 int 3352 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp) 3353 { 3354 nfs_delmapcall_t *delmap_call; 3355 3356 /* 3357 * If the list doesn't exist yet, we create it and return 3358 * that the caller wasn't found. No list = no callers. 3359 */ 3360 mutex_enter(&rp->r_statelock); 3361 if (!(rp->r_flags & RDELMAPLIST)) { 3362 /* The list does not exist */ 3363 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t), 3364 offsetof(nfs_delmapcall_t, call_node)); 3365 rp->r_flags |= RDELMAPLIST; 3366 mutex_exit(&rp->r_statelock); 3367 return (0); 3368 } else { 3369 /* The list exists so search it */ 3370 for (delmap_call = list_head(&rp->r_indelmap); 3371 delmap_call != NULL; 3372 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 3373 if (delmap_call->call_id == curthread) { 3374 /* current caller is in the list */ 3375 *errp = delmap_call->error; 3376 list_remove(&rp->r_indelmap, delmap_call); 3377 mutex_exit(&rp->r_statelock); 3378 nfs_free_delmapcall(delmap_call); 3379 return (1); 3380 } 3381 } 3382 } 3383 mutex_exit(&rp->r_statelock); 3384 return (0); 3385 } 3386