1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 25 * All rights reserved. 26 */ 27 28 #include <sys/param.h> 29 #include <sys/types.h> 30 #include <sys/systm.h> 31 #include <sys/thread.h> 32 #include <sys/t_lock.h> 33 #include <sys/time.h> 34 #include <sys/vnode.h> 35 #include <sys/vfs.h> 36 #include <sys/errno.h> 37 #include <sys/buf.h> 38 #include <sys/stat.h> 39 #include <sys/cred.h> 40 #include <sys/kmem.h> 41 #include <sys/debug.h> 42 #include <sys/dnlc.h> 43 #include <sys/vmsystm.h> 44 #include <sys/flock.h> 45 #include <sys/share.h> 46 #include <sys/cmn_err.h> 47 #include <sys/tiuser.h> 48 #include <sys/sysmacros.h> 49 #include <sys/callb.h> 50 #include <sys/acl.h> 51 #include <sys/kstat.h> 52 #include <sys/signal.h> 53 #include <sys/list.h> 54 #include <sys/zone.h> 55 56 #include <rpc/types.h> 57 #include <rpc/xdr.h> 58 #include <rpc/auth.h> 59 #include <rpc/clnt.h> 60 61 #include <nfs/nfs.h> 62 #include <nfs/nfs_clnt.h> 63 64 #include <nfs/rnode.h> 65 #include <nfs/nfs_acl.h> 66 #include <nfs/lm.h> 67 68 #include <vm/hat.h> 69 #include <vm/as.h> 70 #include <vm/page.h> 71 #include <vm/pvn.h> 72 #include <vm/seg.h> 73 #include <vm/seg_map.h> 74 #include <vm/seg_vn.h> 75 76 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t, 77 cred_t *); 78 static int nfs_getattr_cache(vnode_t *, struct vattr *); 79 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *); 80 81 struct mi_globals { 82 kmutex_t mig_lock; /* lock protecting mig_list */ 83 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */ 84 boolean_t mig_destructor_called; 85 }; 86 87 static zone_key_t mi_list_key; 88 89 /* Debugging flag for PC file shares. */ 90 extern int share_debug; 91 92 /* 93 * Attributes caching: 94 * 95 * Attributes are cached in the rnode in struct vattr form. 96 * There is a time associated with the cached attributes (r_attrtime) 97 * which tells whether the attributes are valid. The time is initialized 98 * to the difference between current time and the modify time of the vnode 99 * when new attributes are cached. This allows the attributes for 100 * files that have changed recently to be timed out sooner than for files 101 * that have not changed for a long time. There are minimum and maximum 102 * timeout values that can be set per mount point. 
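 *
 * Illustrative sketch (not part of the original comment; the authoritative
 * computation lives in nfs_attrcache_va() below):
 *
 *	delta = now - r_mtime;			time since last detected change
 *	clamp delta to [acregmin, acregmax]	(acdirmin/acdirmax for dirs)
 *	r_attrtime = now + delta;		attributes valid until then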
 */

int
nfs_waitfor_purge_complete(vnode_t *vp)
{
	rnode_t *rp;
	k_sigset_t smask;

	rp = VTOR(vp);
	if (rp->r_serial != NULL && rp->r_serial != curthread) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				sigunintr(&smask);
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		sigunintr(&smask);
		mutex_exit(&rp->r_statelock);
	}
	return (0);
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	struct vattr va;

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	va.va_mask = AT_ALL;
	return (nfs_getattr_otw(vp, &va, cr));
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs3_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	struct vattr va;

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	va.va_mask = AT_ALL;
	return (nfs3_getattr_otw(vp, &va, cr));
}

/*
 * Purge all of the various NFS `data' caches.
 */
void
nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
{
	rnode_t *rp;
	char *contents;
	int size;
	int error;

	/*
	 * Purge the DNLC for any entries which refer to this file.
	 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
	 */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
	    !(rp->r_flags & RINDNLCPURGE)) {
		/*
		 * Set the RINDNLCPURGE flag to prevent recursive entry
		 * into dnlc_purge_vp()
		 */
		if (vp->v_type == VDIR)
			rp->r_flags |= RINDNLCPURGE;
		mutex_exit(&rp->r_statelock);
		dnlc_purge_vp(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & RINDNLCPURGE)
			rp->r_flags &= ~RINDNLCPURGE;
	}

	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	/*
	 * Flush the page cache.
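	 *
	 * (Illustrative note, not from the original source: the VOP_PUTPAGE()
	 * call below, i.e.
	 *
	 *	(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
	 *
	 * writes back any dirty pages and then invalidates them; an ENOSPC
	 * or EDQUOT failure from that writeback is latched in r_error so it
	 * is not silently lost.)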
230 */ 231 if (vn_has_cached_data(vp)) { 232 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 233 if (error && (error == ENOSPC || error == EDQUOT)) { 234 mutex_enter(&rp->r_statelock); 235 if (!rp->r_error) 236 rp->r_error = error; 237 mutex_exit(&rp->r_statelock); 238 } 239 } 240 241 /* 242 * Flush the readdir response cache. 243 */ 244 if (HAVE_RDDIR_CACHE(rp)) 245 nfs_purge_rddir_cache(vp); 246 } 247 248 /* 249 * Purge the readdir cache of all entries 250 */ 251 void 252 nfs_purge_rddir_cache(vnode_t *vp) 253 { 254 rnode_t *rp; 255 rddir_cache *rdc; 256 rddir_cache *nrdc; 257 258 rp = VTOR(vp); 259 top: 260 mutex_enter(&rp->r_statelock); 261 rp->r_direof = NULL; 262 rp->r_flags &= ~RLOOKUP; 263 rp->r_flags |= RREADDIRPLUS; 264 rdc = avl_first(&rp->r_dir); 265 while (rdc != NULL) { 266 nrdc = AVL_NEXT(&rp->r_dir, rdc); 267 avl_remove(&rp->r_dir, rdc); 268 rddir_cache_rele(rdc); 269 rdc = nrdc; 270 } 271 mutex_exit(&rp->r_statelock); 272 } 273 274 /* 275 * Do a cache check based on the post-operation attributes. 276 * Then make them the new cached attributes. If no attributes 277 * were returned, then mark the attributes as timed out. 278 */ 279 void 280 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr) 281 { 282 vattr_t attr; 283 284 if (!poap->attributes) { 285 PURGE_ATTRCACHE(vp); 286 return; 287 } 288 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr); 289 } 290 291 /* 292 * Same as above, but using a vattr 293 */ 294 void 295 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t, 296 cred_t *cr) 297 { 298 if (!poap->attributes) { 299 PURGE_ATTRCACHE(vp); 300 return; 301 } 302 nfs_attr_cache(vp, poap->fres.vap, t, cr); 303 } 304 305 /* 306 * Do a cache check based on the weak cache consistency attributes. 307 * These consist of a small set of pre-operation attributes and the 308 * full set of post-operation attributes. 309 * 310 * If we are given the pre-operation attributes, then use them to 311 * check the validity of the various caches. Then, if we got the 312 * post-operation attributes, make them the new cached attributes. 313 * If we didn't get the post-operation attributes, then mark the 314 * attribute cache as timed out so that the next reference will 315 * cause a GETATTR to the server to refresh with the current 316 * attributes. 317 * 318 * Otherwise, if we didn't get the pre-operation attributes, but 319 * we did get the post-operation attributes, then use these 320 * attributes to check the validity of the various caches. This 321 * will probably cause a flush of the caches because if the 322 * operation succeeded, the attributes of the object were changed 323 * in some way from the old post-operation attributes. This 324 * should be okay because it is the safe thing to do. After 325 * checking the data caches, then we make these the new cached 326 * attributes. 327 * 328 * Otherwise, we didn't get either the pre- or post-operation 329 * attributes. Simply mark the attribute cache as timed out so 330 * the next reference will cause a GETATTR to the server to 331 * refresh with the current attributes. 332 * 333 * If an error occurred trying to convert the over the wire 334 * attributes to a vattr, then simply mark the attribute cache as 335 * timed out. 
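 *
 * In outline (an illustrative restatement of the code that follows, not
 * part of the original comment):
 *
 *	if (post-op attributes present) {
 *		if (pre-op attributes present)
 *			nfs3_attr_cache(vp, before, after, t, cr);
 *		else
 *			nfs_attr_cache(vp, after, t, cr);
 *	} else
 *		PURGE_ATTRCACHE(vp);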
 */
void
nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
{
	vattr_t bva;
	vattr_t ava;

	if (wccp->after.attributes) {
		if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
			PURGE_ATTRCACHE(vp);
			return;
		}
		if (wccp->before.attributes) {
			bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
			bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
			bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
			bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
			bva.va_size = wccp->before.attr.size;
			nfs3_attr_cache(vp, &bva, &ava, t, cr);
		} else
			nfs_attr_cache(vp, &ava, t, cr);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Set attributes cache for given vnode using nfsattr.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
{
	rnode_t *rp;
	struct vattr va;

	if (!nattr_to_vattr(vp, na, &va)) {
		rp = VTOR(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, &va);
		mutex_exit(&rp->r_statelock);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Set attributes cache for given vnode using fattr3.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
{
	rnode_t *rp;
	struct vattr va;

	if (!fattr3_to_vattr(vp, na, &va)) {
		rp = VTOR(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, &va);
		mutex_exit(&rp->r_statelock);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Do a cache check based on attributes returned over the wire. The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
    cred_t *cr)
{
	int error;

	error = nattr_to_vattr(vp, na, vap);
	if (error)
		return (error);
	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Do a cache check based on attributes returned over the wire. The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
{
	int error;

	error = fattr3_to_vattr(vp, na, vap);
	if (error)
		return (error);
	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
463 * 464 * The cache validation and caching of the new attributes is done 465 * atomically via the use of the mutex, r_statelock. If required, 466 * the cache invalidation is done atomically w.r.t. the cache 467 * validation and caching of the attributes via the pseudo lock, 468 * r_serial. 469 * 470 * This routine is used to do cache validation and attributes caching 471 * for operations with a single set of post operation attributes. 472 */ 473 void 474 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr) 475 { 476 rnode_t *rp; 477 int mtime_changed = 0; 478 int ctime_changed = 0; 479 vsecattr_t *vsp; 480 int was_serial; 481 len_t preattr_rsize; 482 boolean_t writeattr_set = B_FALSE; 483 boolean_t cachepurge_set = B_FALSE; 484 485 rp = VTOR(vp); 486 487 mutex_enter(&rp->r_statelock); 488 489 if (rp->r_serial != curthread) { 490 klwp_t *lwp = ttolwp(curthread); 491 492 was_serial = 0; 493 if (lwp != NULL) 494 lwp->lwp_nostop++; 495 while (rp->r_serial != NULL) { 496 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 497 mutex_exit(&rp->r_statelock); 498 if (lwp != NULL) 499 lwp->lwp_nostop--; 500 return; 501 } 502 } 503 if (lwp != NULL) 504 lwp->lwp_nostop--; 505 } else 506 was_serial = 1; 507 508 if (rp->r_mtime > t) { 509 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 510 PURGE_ATTRCACHE_LOCKED(rp); 511 mutex_exit(&rp->r_statelock); 512 return; 513 } 514 515 /* 516 * Write thread after writing data to file on remote server, 517 * will always set RWRITEATTR to indicate that file on remote 518 * server was modified with a WRITE operation and would have 519 * marked attribute cache as timed out. If RWRITEATTR 520 * is set, then do not check for mtime and ctime change. 521 */ 522 if (!(rp->r_flags & RWRITEATTR)) { 523 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 524 mtime_changed = 1; 525 526 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec || 527 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec) 528 ctime_changed = 1; 529 } else { 530 writeattr_set = B_TRUE; 531 } 532 533 preattr_rsize = rp->r_size; 534 535 nfs_attrcache_va(vp, vap); 536 537 /* 538 * If we have updated filesize in nfs_attrcache_va, as soon as we 539 * drop statelock we will be in transition of purging all 540 * our caches and updating them. It is possible for another 541 * thread to pick this new file size and read in zeroed data. 542 * stall other threads till cache purge is complete. 543 */ 544 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) { 545 /* 546 * If RWRITEATTR was set and we have updated the file 547 * size, Server's returned file size need not necessarily 548 * be because of this Client's WRITE. We need to purge 549 * all caches. 
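		 *
		 * (Sketch of the handshake used below, for orientation only:
		 *
		 *	r_flags |= RINCACHEPURGE;	stall concurrent readers
		 *	...purge the data caches...
		 *	r_flags &= ~RINCACHEPURGE;
		 *	cv_broadcast(&r_cv);		let readers continue
		 * )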
550 */ 551 if (writeattr_set) 552 mtime_changed = 1; 553 554 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) { 555 rp->r_flags |= RINCACHEPURGE; 556 cachepurge_set = B_TRUE; 557 } 558 } 559 560 if (!mtime_changed && !ctime_changed) { 561 mutex_exit(&rp->r_statelock); 562 return; 563 } 564 565 rp->r_serial = curthread; 566 567 mutex_exit(&rp->r_statelock); 568 569 if (mtime_changed) 570 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 571 572 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) { 573 mutex_enter(&rp->r_statelock); 574 rp->r_flags &= ~RINCACHEPURGE; 575 cv_broadcast(&rp->r_cv); 576 mutex_exit(&rp->r_statelock); 577 cachepurge_set = B_FALSE; 578 } 579 580 if (ctime_changed) { 581 (void) nfs_access_purge_rp(rp); 582 if (rp->r_secattr != NULL) { 583 mutex_enter(&rp->r_statelock); 584 vsp = rp->r_secattr; 585 rp->r_secattr = NULL; 586 mutex_exit(&rp->r_statelock); 587 if (vsp != NULL) 588 nfs_acl_free(vsp); 589 } 590 } 591 592 if (!was_serial) { 593 mutex_enter(&rp->r_statelock); 594 rp->r_serial = NULL; 595 cv_broadcast(&rp->r_cv); 596 mutex_exit(&rp->r_statelock); 597 } 598 } 599 600 /* 601 * Use the passed in "before" virtual attributes to check to see 602 * whether the data and metadata caches are valid, cache the "after" 603 * new attributes, and then do the cache invalidation if required. 604 * 605 * The cache validation and caching of the new attributes is done 606 * atomically via the use of the mutex, r_statelock. If required, 607 * the cache invalidation is done atomically w.r.t. the cache 608 * validation and caching of the attributes via the pseudo lock, 609 * r_serial. 610 * 611 * This routine is used to do cache validation and attributes caching 612 * for operations with both pre operation attributes and post operation 613 * attributes. 614 */ 615 static void 616 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t, 617 cred_t *cr) 618 { 619 rnode_t *rp; 620 int mtime_changed = 0; 621 int ctime_changed = 0; 622 vsecattr_t *vsp; 623 int was_serial; 624 len_t preattr_rsize; 625 boolean_t writeattr_set = B_FALSE; 626 boolean_t cachepurge_set = B_FALSE; 627 628 rp = VTOR(vp); 629 630 mutex_enter(&rp->r_statelock); 631 632 if (rp->r_serial != curthread) { 633 klwp_t *lwp = ttolwp(curthread); 634 635 was_serial = 0; 636 if (lwp != NULL) 637 lwp->lwp_nostop++; 638 while (rp->r_serial != NULL) { 639 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 640 mutex_exit(&rp->r_statelock); 641 if (lwp != NULL) 642 lwp->lwp_nostop--; 643 return; 644 } 645 } 646 if (lwp != NULL) 647 lwp->lwp_nostop--; 648 } else 649 was_serial = 1; 650 651 if (rp->r_mtime > t) { 652 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size)) 653 PURGE_ATTRCACHE_LOCKED(rp); 654 mutex_exit(&rp->r_statelock); 655 return; 656 } 657 658 /* 659 * Write thread after writing data to file on remote server, 660 * will always set RWRITEATTR to indicate that file on remote 661 * server was modified with a WRITE operation and would have 662 * marked attribute cache as timed out. If RWRITEATTR 663 * is set, then do not check for mtime and ctime change. 
664 */ 665 if (!(rp->r_flags & RWRITEATTR)) { 666 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size)) 667 mtime_changed = 1; 668 669 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec || 670 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec) 671 ctime_changed = 1; 672 } else { 673 writeattr_set = B_TRUE; 674 } 675 676 preattr_rsize = rp->r_size; 677 678 nfs_attrcache_va(vp, avap); 679 680 /* 681 * If we have updated filesize in nfs_attrcache_va, as soon as we 682 * drop statelock we will be in transition of purging all 683 * our caches and updating them. It is possible for another 684 * thread to pick this new file size and read in zeroed data. 685 * stall other threads till cache purge is complete. 686 */ 687 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) { 688 /* 689 * If RWRITEATTR was set and we have updated the file 690 * size, Server's returned file size need not necessarily 691 * be because of this Client's WRITE. We need to purge 692 * all caches. 693 */ 694 if (writeattr_set) 695 mtime_changed = 1; 696 697 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) { 698 rp->r_flags |= RINCACHEPURGE; 699 cachepurge_set = B_TRUE; 700 } 701 } 702 703 if (!mtime_changed && !ctime_changed) { 704 mutex_exit(&rp->r_statelock); 705 return; 706 } 707 708 rp->r_serial = curthread; 709 710 mutex_exit(&rp->r_statelock); 711 712 if (mtime_changed) 713 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 714 715 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) { 716 mutex_enter(&rp->r_statelock); 717 rp->r_flags &= ~RINCACHEPURGE; 718 cv_broadcast(&rp->r_cv); 719 mutex_exit(&rp->r_statelock); 720 cachepurge_set = B_FALSE; 721 } 722 723 if (ctime_changed) { 724 (void) nfs_access_purge_rp(rp); 725 if (rp->r_secattr != NULL) { 726 mutex_enter(&rp->r_statelock); 727 vsp = rp->r_secattr; 728 rp->r_secattr = NULL; 729 mutex_exit(&rp->r_statelock); 730 if (vsp != NULL) 731 nfs_acl_free(vsp); 732 } 733 } 734 735 if (!was_serial) { 736 mutex_enter(&rp->r_statelock); 737 rp->r_serial = NULL; 738 cv_broadcast(&rp->r_cv); 739 mutex_exit(&rp->r_statelock); 740 } 741 } 742 743 /* 744 * Set attributes cache for given vnode using virtual attributes. 745 * 746 * Set the timeout value on the attribute cache and fill it 747 * with the passed in attributes. 748 * 749 * The caller must be holding r_statelock. 750 */ 751 void 752 nfs_attrcache_va(vnode_t *vp, struct vattr *va) 753 { 754 rnode_t *rp; 755 mntinfo_t *mi; 756 hrtime_t delta; 757 hrtime_t now; 758 759 rp = VTOR(vp); 760 761 ASSERT(MUTEX_HELD(&rp->r_statelock)); 762 763 now = gethrtime(); 764 765 mi = VTOMI(vp); 766 767 /* 768 * Delta is the number of nanoseconds that we will 769 * cache the attributes of the file. It is based on 770 * the number of nanoseconds since the last time that 771 * we detected a change. The assumption is that files 772 * that changed recently are likely to change again. 773 * There is a minimum and a maximum for regular files 774 * and for directories which is enforced though. 775 * 776 * Using the time since last change was detected 777 * eliminates direct comparison or calculation 778 * using mixed client and server times. NFS does 779 * not make any assumptions regarding the client 780 * and server clocks being synchronized. 
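	 *
	 * (Worked example, illustrative only: with acregmin = 3 sec and
	 * acregmax = 60 sec, a file whose last detected change was 1 sec ago
	 * gets a 3 sec timeout, one that changed 30 sec ago gets 30 sec, and
	 * one that has been idle for an hour gets 60 sec.)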
 */
	if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    va->va_size != rp->r_attr.va_size)
		rp->r_mtime = now;

	if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_mtime;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_attrtime = now + delta;
	rp->r_attr = *va;
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != va->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
		rp->r_size = va->va_size;
	nfs_setswaplike(vp, va);
	rp->r_flags &= ~RWRITEATTR;
}

/*
 * Fill in attribute from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
{
	rnode_t *rp;
	uint_t mask = vap->va_mask;

	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (ATTRCACHE_VALID(vp)) {
		/*
		 * Cached attributes are valid
		 */
		*vap = rp->r_attr;
		/*
		 * Set the caller's va_mask to the set of attributes
		 * that were requested ANDed with the attributes that
		 * are available.  If attributes were requested that
		 * are not available, those bits must be turned off
		 * in the callers va_mask.
		 */
		vap->va_mask &= mask;
		mutex_exit(&rp->r_statelock);
		return (0);
	}
	mutex_exit(&rp->r_statelock);
	return (1);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	struct nfsattrstat ns;
	int douprintf;
	mntinfo_t *mi;
	failinfo_t fi;
	hrtime_t t;

	mi = VTOMI(vp);
	fi.vp = vp;
	fi.fhp = NULL;	/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	if (mi->mi_flags & MI_ACL) {
		error = acl_getattr2_otw(vp, vap, cr);
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(mi, RFS_GETATTR,
	    xdr_fhandle, (caddr_t)VTOFH(vp),
	    xdr_attrstat, (caddr_t)&ns, cr,
	    &douprintf, &ns.ns_status, 0, &fi);

	if (!error) {
		error = geterrno(ns.ns_status);
		if (!error)
			error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
		else {
			PURGE_STALE_FH(error, vp, cr);
		}
	}

	return (error);
}

/*
 * Return either cached or remote attributes. If we get remote
 * attributes, use them to check and invalidate the caches, then cache
 * the new attributes.
 */
int
nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	rnode_t *rp;

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
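	 *
	 * (Illustrative summary, not from the original source: a call such as
	 *
	 *	va.va_mask = AT_ALL;
	 *	error = nfsgetattr(vp, &va, cr);
	 *
	 * is satisfied from r_attr while ATTRCACHE_VALID() holds, and
	 * otherwise goes over the wire via nfs_getattr_otw().)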
	 */
	error = nfs_getattr_cache(vp, vap);
	if (error)
		error = nfs_getattr_otw(vp, vap, cr);

	/* Return the client's view of file size */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	vap->va_size = rp->r_size;
	mutex_exit(&rp->r_statelock);

	return (error);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	GETATTR3args args;
	GETATTR3vres res;
	int douprintf;
	failinfo_t fi;
	hrtime_t t;

	args.object = *VTOFH3(vp);
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.object;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;
	res.fres.vp = vp;
	res.fres.vap = vap;

	douprintf = 1;

	t = gethrtime();

	error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
	    xdr_nfs_fh3, (caddr_t)&args,
	    xdr_GETATTR3vres, (caddr_t)&res, cr,
	    &douprintf, &res.status, 0, &fi);

	if (error)
		return (error);

	error = geterrno3(res.status);
	if (error) {
		PURGE_STALE_FH(error, vp, cr);
		return (error);
	}

	/*
	 * Catch status codes that indicate fattr3 to vattr translation failure
	 */
	if (res.fres.status)
		return (res.fres.status);

	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Return either cached or remote attributes. If we get remote
 * attributes, use them to check and invalidate the caches, then cache
 * the new attributes.
 */
int
nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	rnode_t *rp;

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
	 */
	error = nfs_getattr_cache(vp, vap);
	if (error)
		error = nfs3_getattr_otw(vp, vap, cr);

	/* Return the client's view of file size */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	vap->va_size = rp->r_size;
	mutex_exit(&rp->r_statelock);

	return (error);
}

vtype_t nf_to_vt[] = {
	VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
};
/*
 * Convert NFS Version 2 over the network attributes to the local
 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 * Returns 0 for success, error if failed due to overflow.
 */
int
nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
{
	/* overflow in time attributes? */
#ifndef _LP64
	if (!NFS2_FATTR_TIME_OK(na))
		return (EOVERFLOW);
#endif

	vap->va_mask = AT_ALL;

	if (na->na_type < NFNON || na->na_type > NFSOC)
		vap->va_type = VBAD;
	else
		vap->va_type = nf_to_vt[na->na_type];
	vap->va_mode = na->na_mode;
	vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
	vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_nodeid = na->na_nodeid;
	vap->va_nlink = na->na_nlink;
	vap->va_size = na->na_size;	/* keep for cache validation */
	/*
	 * nfs protocol defines times as unsigned so don't extend sign,
	 * unless sysadmin set nfs_allow_preepoch_time.
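	 *
	 * (Illustrative note: the v2 protocol carries microseconds, hence the
	 * "* 1000" scaling below, e.g. na_atime = {800000000, 250000} becomes
	 * va_atime = {800000000, 250000000} in nanoseconds.)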
	 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
	vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
	vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
	vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
	/*
	 * Shannon's law - uncompress the received dev_t
	 * if the top half of it is zero, indicating a response
	 * from an `older style' OS.  Except for when it is a
	 * `new style' OS sending the maj device of zero,
	 * in which case the algorithm still works because the
	 * fact that it is a new style server
	 * is hidden by the minor device not being greater
	 * than 255 (a requirement in this case).
	 */
	if ((na->na_rdev & 0xffff0000) == 0)
		vap->va_rdev = nfsv2_expdev(na->na_rdev);
	else
		vap->va_rdev = expldev(na->na_rdev);

	vap->va_nblocks = na->na_blocks;
	switch (na->na_type) {
	case NFBLK:
		vap->va_blksize = DEV_BSIZE;
		break;

	case NFCHR:
		vap->va_blksize = MAXBSIZE;
		break;

	case NFSOC:
	default:
		vap->va_blksize = na->na_blocksize;
		break;
	}
	/*
	 * This bit of ugliness is a hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.
	 * It remaps the special over-the-wire type to the
	 * VFIFO type. (see note in nfs.h)
	 */
	if (NA_ISFIFO(na)) {
		vap->va_type = VFIFO;
		vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
		vap->va_rdev = 0;
		vap->va_blksize = na->na_blocksize;
	}
	vap->va_seq = 0;
	return (0);
}

/*
 * Convert NFS Version 3 over the network attributes to the local
 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 */
vtype_t nf3_to_vt[] = {
	VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
};

int
fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
{

#ifndef _LP64
	/* overflow in time attributes? */
	if (!NFS3_FATTR_TIME_OK(na))
		return (EOVERFLOW);
#endif
	if (!NFS3_SIZE_OK(na->size))
		/* file too big */
		return (EFBIG);

	vap->va_mask = AT_ALL;

	if (na->type < NF3REG || na->type > NF3FIFO)
		vap->va_type = VBAD;
	else
		vap->va_type = nf3_to_vt[na->type];
	vap->va_mode = na->mode;
	vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
	vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_nodeid = na->fileid;
	vap->va_nlink = na->nlink;
	vap->va_size = na->size;

	/*
	 * nfs protocol defines times as unsigned so don't extend sign,
	 * unless sysadmin set nfs_allow_preepoch_time.
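	 *
	 * (Illustrative note: unlike v2, the v3 nfstime3 fields already carry
	 * nanoseconds, so the values below are used unscaled.  The block count
	 * computed in the switch further down is a round-up division, e.g.
	 * used = 1000 bytes yields (1000 + 511) / 512 = 2 blocks.)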
	 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
	vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
	vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
	vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;

	switch (na->type) {
	case NF3BLK:
		vap->va_rdev = makedevice(na->rdev.specdata1,
		    na->rdev.specdata2);
		vap->va_blksize = DEV_BSIZE;
		vap->va_nblocks = 0;
		break;
	case NF3CHR:
		vap->va_rdev = makedevice(na->rdev.specdata1,
		    na->rdev.specdata2);
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = 0;
		break;
	case NF3REG:
	case NF3DIR:
	case NF3LNK:
		vap->va_rdev = 0;
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = (u_longlong_t)
		    ((na->used + (size3)DEV_BSIZE - (size3)1) /
		    (size3)DEV_BSIZE);
		break;
	case NF3SOCK:
	case NF3FIFO:
	default:
		vap->va_rdev = 0;
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = 0;
		break;
	}
	vap->va_seq = 0;
	return (0);
}

/*
 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount.  The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs_async_putpage and nfs_async_start.
 */

int nfs_async_timeout = -1;	/* uninitialized */

static void	nfs_async_start(struct vfs *);
static void	nfs_async_pgops_start(struct vfs *);
static void	nfs_async_common_start(struct vfs *, int);

static void
free_async_args(struct nfs_async_reqs *args)
{
	rnode_t *rp;

	if (args->a_io != NFS_INACTIVE) {
		rp = VTOR(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS_PUTAPAGE ||
		    args->a_io == NFS_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done.  It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
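 *
 * In outline (an illustrative sketch, not part of the original comment),
 * the manager loop in nfs_async_manager() below is:
 *
 *	for (;;) {
 *		while (mi_async_req_count > 0) {
 *			if (fewer workers than allowed)
 *				zthread_create(... nfs_async_start ...);
 *			wake an idle worker;
 *			mi_async_req_count--;
 *		}
 *		if (MI_ASYNC_MGR_STOP)
 *			break;
 *		wait on mi_async_reqs_cv;
 *	}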
1228 */ 1229 void 1230 nfs_async_manager(vfs_t *vfsp) 1231 { 1232 callb_cpr_t cprinfo; 1233 mntinfo_t *mi; 1234 uint_t max_threads; 1235 1236 mi = VFTOMI(vfsp); 1237 1238 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1239 "nfs_async_manager"); 1240 1241 mutex_enter(&mi->mi_async_lock); 1242 /* 1243 * We want to stash the max number of threads that this mount was 1244 * allowed so we can use it later when the variable is set to zero as 1245 * part of the zone/mount going away. 1246 * 1247 * We want to be able to create at least one thread to handle 1248 * asynchronous inactive calls. 1249 */ 1250 max_threads = MAX(mi->mi_max_threads, 1); 1251 /* 1252 * We don't want to wait for mi_max_threads to go to zero, since that 1253 * happens as part of a failed unmount, but this thread should only 1254 * exit when the mount/zone is really going away. 1255 * 1256 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be 1257 * attempted: the various _async_*() functions know to do things 1258 * inline if mi_max_threads == 0. Henceforth we just drain out the 1259 * outstanding requests. 1260 * 1261 * Note that we still create zthreads even if we notice the zone is 1262 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone 1263 * shutdown sequence to take slightly longer in some cases, but 1264 * doesn't violate the protocol, as all threads will exit as soon as 1265 * they're done processing the remaining requests. 1266 */ 1267 for (;;) { 1268 while (mi->mi_async_req_count > 0) { 1269 /* 1270 * Paranoia: If the mount started out having 1271 * (mi->mi_max_threads == 0), and the value was 1272 * later changed (via a debugger or somesuch), 1273 * we could be confused since we will think we 1274 * can't create any threads, and the calling 1275 * code (which looks at the current value of 1276 * mi->mi_max_threads, now non-zero) thinks we 1277 * can. 1278 * 1279 * So, because we're paranoid, we create threads 1280 * up to the maximum of the original and the 1281 * current value. This means that future 1282 * (debugger-induced) lowerings of 1283 * mi->mi_max_threads are ignored for our 1284 * purposes, but who told them they could change 1285 * random values on a live kernel anyhow? 1286 */ 1287 if (mi->mi_threads[NFS_ASYNC_QUEUE] < 1288 MAX(mi->mi_max_threads, max_threads)) { 1289 mi->mi_threads[NFS_ASYNC_QUEUE]++; 1290 mutex_exit(&mi->mi_async_lock); 1291 VFS_HOLD(vfsp); /* hold for new thread */ 1292 (void) zthread_create(NULL, 0, nfs_async_start, 1293 vfsp, 0, minclsyspri); 1294 mutex_enter(&mi->mi_async_lock); 1295 } else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] < 1296 NUM_ASYNC_PGOPS_THREADS) { 1297 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++; 1298 mutex_exit(&mi->mi_async_lock); 1299 VFS_HOLD(vfsp); /* hold for new thread */ 1300 (void) zthread_create(NULL, 0, 1301 nfs_async_pgops_start, vfsp, 0, 1302 minclsyspri); 1303 mutex_enter(&mi->mi_async_lock); 1304 } 1305 NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv); 1306 ASSERT(mi->mi_async_req_count != 0); 1307 mi->mi_async_req_count--; 1308 } 1309 1310 mutex_enter(&mi->mi_lock); 1311 if (mi->mi_flags & MI_ASYNC_MGR_STOP) { 1312 mutex_exit(&mi->mi_lock); 1313 break; 1314 } 1315 mutex_exit(&mi->mi_lock); 1316 1317 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1318 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock); 1319 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1320 } 1321 /* 1322 * Let everyone know we're done. 
1323 */ 1324 mi->mi_manager_thread = NULL; 1325 cv_broadcast(&mi->mi_async_cv); 1326 1327 /* 1328 * There is no explicit call to mutex_exit(&mi->mi_async_lock) 1329 * since CALLB_CPR_EXIT is actually responsible for releasing 1330 * 'mi_async_lock'. 1331 */ 1332 CALLB_CPR_EXIT(&cprinfo); 1333 VFS_RELE(vfsp); /* release thread's hold */ 1334 zthread_exit(); 1335 } 1336 1337 /* 1338 * Signal (and wait for) the async manager thread to clean up and go away. 1339 */ 1340 void 1341 nfs_async_manager_stop(vfs_t *vfsp) 1342 { 1343 mntinfo_t *mi = VFTOMI(vfsp); 1344 1345 mutex_enter(&mi->mi_async_lock); 1346 mutex_enter(&mi->mi_lock); 1347 mi->mi_flags |= MI_ASYNC_MGR_STOP; 1348 mutex_exit(&mi->mi_lock); 1349 cv_broadcast(&mi->mi_async_reqs_cv); 1350 while (mi->mi_manager_thread != NULL) 1351 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1352 mutex_exit(&mi->mi_async_lock); 1353 } 1354 1355 int 1356 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1357 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1358 u_offset_t, caddr_t, struct seg *, cred_t *)) 1359 { 1360 rnode_t *rp; 1361 mntinfo_t *mi; 1362 struct nfs_async_reqs *args; 1363 1364 rp = VTOR(vp); 1365 ASSERT(rp->r_freef == NULL); 1366 1367 mi = VTOMI(vp); 1368 1369 /* 1370 * If addr falls in a different segment, don't bother doing readahead. 1371 */ 1372 if (addr >= seg->s_base + seg->s_size) 1373 return (-1); 1374 1375 /* 1376 * If we can't allocate a request structure, punt on the readahead. 1377 */ 1378 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1379 return (-1); 1380 1381 /* 1382 * If a lock operation is pending, don't initiate any new 1383 * readaheads. Otherwise, bump r_count to indicate the new 1384 * asynchronous I/O. 1385 */ 1386 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1387 kmem_free(args, sizeof (*args)); 1388 return (-1); 1389 } 1390 mutex_enter(&rp->r_statelock); 1391 rp->r_count++; 1392 mutex_exit(&rp->r_statelock); 1393 nfs_rw_exit(&rp->r_lkserlock); 1394 1395 args->a_next = NULL; 1396 #ifdef DEBUG 1397 args->a_queuer = curthread; 1398 #endif 1399 VN_HOLD(vp); 1400 args->a_vp = vp; 1401 ASSERT(cr != NULL); 1402 crhold(cr); 1403 args->a_cred = cr; 1404 args->a_io = NFS_READ_AHEAD; 1405 args->a_nfs_readahead = readahead; 1406 args->a_nfs_blkoff = blkoff; 1407 args->a_nfs_seg = seg; 1408 args->a_nfs_addr = addr; 1409 1410 mutex_enter(&mi->mi_async_lock); 1411 1412 /* 1413 * If asyncio has been disabled, don't bother readahead. 1414 */ 1415 if (mi->mi_max_threads == 0) { 1416 mutex_exit(&mi->mi_async_lock); 1417 goto noasync; 1418 } 1419 1420 /* 1421 * Link request structure into the async list and 1422 * wakeup async thread to do the i/o. 
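	 *
	 * (The same tail-insert pattern, sketched here once for reference,
	 * is used for every async request type:
	 *
	 *	if (head == NULL)
	 *		head = tail = args;
	 *	else {
	 *		tail->a_next = args;
	 *		tail = args;
	 *	}
	 * )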
	 */
	if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS_READ_AHEAD] = args;
		mi->mi_async_tail[NFS_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	VN_RELE(vp);
	crfree(cr);
	kmem_free(args, sizeof (*args));
	return (-1);
}

int
nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_PUTAPAGE;
	args->a_nfs_putapage = putapage;
	args->a_nfs_pp = pp;
	args->a_nfs_off = off;
	args->a_nfs_len = (uint_t)len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS_PUTAPAGE] = args;
		mi->mi_async_tail[NFS_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout (and the machine).  In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}
	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync putpage.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*putapage)(vp, pp, off, len, flags, cr));
}

int
nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_PAGEIO;
	args->a_nfs_pageio = pageio;
	args->a_nfs_pp = pp;
	args->a_nfs_off = io_off;
	args->a_nfs_len = (uint_t)io_len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
1633 */ 1634 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) { 1635 mi->mi_async_reqs[NFS_PAGEIO] = args; 1636 mi->mi_async_tail[NFS_PAGEIO] = args; 1637 } else { 1638 mi->mi_async_tail[NFS_PAGEIO]->a_next = args; 1639 mi->mi_async_tail[NFS_PAGEIO] = args; 1640 } 1641 1642 mutex_enter(&rp->r_statelock); 1643 rp->r_count++; 1644 rp->r_awcount++; 1645 mutex_exit(&rp->r_statelock); 1646 1647 if (mi->mi_io_kstats) { 1648 mutex_enter(&mi->mi_lock); 1649 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1650 mutex_exit(&mi->mi_lock); 1651 } 1652 1653 mi->mi_async_req_count++; 1654 ASSERT(mi->mi_async_req_count != 0); 1655 cv_signal(&mi->mi_async_reqs_cv); 1656 mutex_exit(&mi->mi_async_lock); 1657 return (0); 1658 1659 noasync: 1660 if (args != NULL) { 1661 VN_RELE(vp); 1662 crfree(cr); 1663 kmem_free(args, sizeof (*args)); 1664 } 1665 1666 /* 1667 * If we can't do it ASYNC, for reads we do nothing (but cleanup 1668 * the page list), for writes we do it synchronously, except for 1669 * proc_pageout/proc_fsflush as described below. 1670 */ 1671 if (flags & B_READ) { 1672 pvn_read_done(pp, flags | B_ERROR); 1673 return (0); 1674 } 1675 1676 if (curproc == proc_pageout || curproc == proc_fsflush) { 1677 /* 1678 * If we get here in the context of the pageout/fsflush, 1679 * we refuse to do a sync write, because this may hang 1680 * pageout/fsflush (and the machine). In this case, we just 1681 * re-mark the page as dirty and punt on the page. 1682 * 1683 * Make sure B_FORCE isn't set. We can re-mark the 1684 * pages as dirty and unlock the pages in one swoop by 1685 * passing in B_ERROR to pvn_write_done(). However, 1686 * we should make sure B_FORCE isn't set - we don't 1687 * want the page tossed before it gets written out. 1688 */ 1689 if (flags & B_FORCE) 1690 flags &= ~(B_INVAL | B_FORCE); 1691 pvn_write_done(pp, flags | B_ERROR); 1692 return (0); 1693 } 1694 1695 if (nfs_zone() != mi->mi_zone) { 1696 /* 1697 * So this was a cross-zone sync pageio. We pass in B_ERROR 1698 * to pvn_write_done() to re-mark the pages as dirty and unlock 1699 * them. 1700 * 1701 * We don't want to clear B_FORCE here as the caller presumably 1702 * knows what they're doing if they set it. 1703 */ 1704 pvn_write_done(pp, flags | B_ERROR); 1705 return (EPERM); 1706 } 1707 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1708 } 1709 1710 void 1711 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr, 1712 int (*readdir)(vnode_t *, rddir_cache *, cred_t *)) 1713 { 1714 rnode_t *rp; 1715 mntinfo_t *mi; 1716 struct nfs_async_reqs *args; 1717 1718 rp = VTOR(vp); 1719 ASSERT(rp->r_freef == NULL); 1720 1721 mi = VTOMI(vp); 1722 1723 /* 1724 * If we can't allocate a request structure, do the readdir 1725 * operation synchronously in this thread's context. 1726 */ 1727 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1728 goto noasync; 1729 1730 args->a_next = NULL; 1731 #ifdef DEBUG 1732 args->a_queuer = curthread; 1733 #endif 1734 VN_HOLD(vp); 1735 args->a_vp = vp; 1736 ASSERT(cr != NULL); 1737 crhold(cr); 1738 args->a_cred = cr; 1739 args->a_io = NFS_READDIR; 1740 args->a_nfs_readdir = readdir; 1741 args->a_nfs_rdc = rdc; 1742 1743 mutex_enter(&mi->mi_async_lock); 1744 1745 /* 1746 * If asyncio has been disabled, then make a synchronous request. 1747 */ 1748 if (mi->mi_max_threads == 0) { 1749 mutex_exit(&mi->mi_async_lock); 1750 goto noasync; 1751 } 1752 1753 /* 1754 * Link request structure into the async list and 1755 * wakeup async thread to do the i/o. 
	 */
	if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
		mi->mi_async_reqs[NFS_READDIR] = args;
		mi->mi_async_tail[NFS_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS_READDIR]->a_next = args;
		mi->mi_async_tail[NFS_READDIR] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	rdc->entries = NULL;
	mutex_enter(&rp->r_statelock);
	ASSERT(rdc->flags & RDDIR);
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	/*
	 * Check the flag to see if RDDIRWAIT is set.  If RDDIRWAIT
	 * is set, wakeup the thread sleeping in cv_wait_sig().
	 * The woken up thread will reset the flag to RDDIR and will
	 * continue with the readdir operation.
	 */
	if (rdc->flags & RDDIRWAIT) {
		rdc->flags &= ~RDDIRWAIT;
		cv_broadcast(&rdc->cv);
	}
	mutex_exit(&rp->r_statelock);
	rddir_cache_rele(rdc);
}

void
nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;
	page_t *pp;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_COMMIT;
	args->a_nfs_commit = commit;
	args->a_nfs_plist = plist;
	args->a_nfs_offset = offset;
	args->a_nfs_count = count;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
1857 */ 1858 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) { 1859 mi->mi_async_reqs[NFS_COMMIT] = args; 1860 mi->mi_async_tail[NFS_COMMIT] = args; 1861 } else { 1862 mi->mi_async_tail[NFS_COMMIT]->a_next = args; 1863 mi->mi_async_tail[NFS_COMMIT] = args; 1864 } 1865 1866 mutex_enter(&rp->r_statelock); 1867 rp->r_count++; 1868 mutex_exit(&rp->r_statelock); 1869 1870 if (mi->mi_io_kstats) { 1871 mutex_enter(&mi->mi_lock); 1872 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1873 mutex_exit(&mi->mi_lock); 1874 } 1875 1876 mi->mi_async_req_count++; 1877 ASSERT(mi->mi_async_req_count != 0); 1878 cv_signal(&mi->mi_async_reqs_cv); 1879 mutex_exit(&mi->mi_async_lock); 1880 return; 1881 1882 noasync: 1883 if (args != NULL) { 1884 VN_RELE(vp); 1885 crfree(cr); 1886 kmem_free(args, sizeof (*args)); 1887 } 1888 1889 if (curproc == proc_pageout || curproc == proc_fsflush || 1890 nfs_zone() != mi->mi_zone) { 1891 while (plist != NULL) { 1892 pp = plist; 1893 page_sub(&plist, pp); 1894 pp->p_fsdata = C_COMMIT; 1895 page_unlock(pp); 1896 } 1897 return; 1898 } 1899 (*commit)(vp, plist, offset, count, cr); 1900 } 1901 1902 void 1903 nfs_async_inactive(vnode_t *vp, cred_t *cr, 1904 void (*inactive)(vnode_t *, cred_t *, caller_context_t *)) 1905 { 1906 mntinfo_t *mi; 1907 struct nfs_async_reqs *args; 1908 1909 mi = VTOMI(vp); 1910 1911 args = kmem_alloc(sizeof (*args), KM_SLEEP); 1912 args->a_next = NULL; 1913 #ifdef DEBUG 1914 args->a_queuer = curthread; 1915 #endif 1916 args->a_vp = vp; 1917 ASSERT(cr != NULL); 1918 crhold(cr); 1919 args->a_cred = cr; 1920 args->a_io = NFS_INACTIVE; 1921 args->a_nfs_inactive = inactive; 1922 1923 /* 1924 * Note that we don't check mi->mi_max_threads here, since we 1925 * *need* to get rid of this vnode regardless of whether someone 1926 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system. 1927 * 1928 * The manager thread knows about this and is willing to create 1929 * at least one thread to accommodate us. 1930 */ 1931 mutex_enter(&mi->mi_async_lock); 1932 if (mi->mi_manager_thread == NULL) { 1933 rnode_t *rp = VTOR(vp); 1934 1935 mutex_exit(&mi->mi_async_lock); 1936 crfree(cr); /* drop our reference */ 1937 kmem_free(args, sizeof (*args)); 1938 /* 1939 * We can't do an over-the-wire call since we're in the wrong 1940 * zone, so we need to clean up state as best we can and then 1941 * throw away the vnode. 1942 */ 1943 mutex_enter(&rp->r_statelock); 1944 if (rp->r_unldvp != NULL) { 1945 vnode_t *unldvp; 1946 char *unlname; 1947 cred_t *unlcred; 1948 1949 unldvp = rp->r_unldvp; 1950 rp->r_unldvp = NULL; 1951 unlname = rp->r_unlname; 1952 rp->r_unlname = NULL; 1953 unlcred = rp->r_unlcred; 1954 rp->r_unlcred = NULL; 1955 mutex_exit(&rp->r_statelock); 1956 1957 VN_RELE(unldvp); 1958 kmem_free(unlname, MAXNAMELEN); 1959 crfree(unlcred); 1960 } else { 1961 mutex_exit(&rp->r_statelock); 1962 } 1963 /* 1964 * No need to explicitly throw away any cached pages. The 1965 * eventual rinactive() will attempt a synchronous 1966 * VOP_PUTPAGE() which will immediately fail since the request 1967 * is coming from the wrong zone, and then will proceed to call 1968 * nfs_invalidate_pages() which will clean things up for us. 1969 */ 1970 rp_addfree(VTOR(vp), cr); 1971 return; 1972 } 1973 1974 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) { 1975 mi->mi_async_reqs[NFS_INACTIVE] = args; 1976 } else { 1977 mi->mi_async_tail[NFS_INACTIVE]->a_next = args; 1978 } 1979 mi->mi_async_tail[NFS_INACTIVE] = args; 1980 /* 1981 * Don't increment r_count, since we're trying to get rid of the vnode. 
1982 */ 1983 1984 mi->mi_async_req_count++; 1985 ASSERT(mi->mi_async_req_count != 0); 1986 cv_signal(&mi->mi_async_reqs_cv); 1987 mutex_exit(&mi->mi_async_lock); 1988 } 1989 1990 static void 1991 nfs_async_start(struct vfs *vfsp) 1992 { 1993 nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE); 1994 } 1995 1996 static void 1997 nfs_async_pgops_start(struct vfs *vfsp) 1998 { 1999 nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE); 2000 } 2001 2002 /* 2003 * The async queues for each mounted file system are arranged as a 2004 * set of queues, one for each async i/o type. Requests are taken 2005 * from the queues in a round-robin fashion. A number of consecutive 2006 * requests are taken from each queue before moving on to the next 2007 * queue. This functionality may allow the NFS Version 2 server to do 2008 * write clustering, even if the client is mixing writes and reads 2009 * because it will take multiple write requests from the queue 2010 * before processing any of the other async i/o types. 2011 * 2012 * XXX The nfs_async_common_start thread is unsafe in the light of the present 2013 * model defined by cpr to suspend the system. Specifically over the 2014 * wire calls are cpr-unsafe. The thread should be reevaluated in 2015 * case of future updates to the cpr model. 2016 */ 2017 static void 2018 nfs_async_common_start(struct vfs *vfsp, int async_queue) 2019 { 2020 struct nfs_async_reqs *args; 2021 mntinfo_t *mi = VFTOMI(vfsp); 2022 clock_t time_left = 1; 2023 callb_cpr_t cprinfo; 2024 int i; 2025 int async_types; 2026 kcondvar_t *async_work_cv; 2027 2028 if (async_queue == NFS_ASYNC_QUEUE) { 2029 async_types = NFS_ASYNC_TYPES; 2030 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE]; 2031 } else { 2032 async_types = NFS_ASYNC_PGOPS_TYPES; 2033 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]; 2034 } 2035 2036 /* 2037 * Dynamic initialization of nfs_async_timeout to allow nfs to be 2038 * built in an implementation independent manner. 2039 */ 2040 if (nfs_async_timeout == -1) 2041 nfs_async_timeout = NFS_ASYNC_TIMEOUT; 2042 2043 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas"); 2044 2045 mutex_enter(&mi->mi_async_lock); 2046 for (;;) { 2047 /* 2048 * Find the next queue containing an entry. We start 2049 * at the current queue pointer and then round robin 2050 * through all of them until we either find a non-empty 2051 * queue or have looked through all of them. 2052 */ 2053 for (i = 0; i < async_types; i++) { 2054 args = *mi->mi_async_curr[async_queue]; 2055 if (args != NULL) 2056 break; 2057 mi->mi_async_curr[async_queue]++; 2058 if (mi->mi_async_curr[async_queue] == 2059 &mi->mi_async_reqs[async_types]) { 2060 mi->mi_async_curr[async_queue] = 2061 &mi->mi_async_reqs[0]; 2062 } 2063 } 2064 /* 2065 * If we didn't find a entry, then block until woken up 2066 * again and then look through the queues again. 2067 */ 2068 if (args == NULL) { 2069 /* 2070 * Exiting is considered to be safe for CPR as well 2071 */ 2072 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2073 2074 /* 2075 * Wakeup thread waiting to unmount the file 2076 * system only if all async threads are inactive. 2077 * 2078 * If we've timed-out and there's nothing to do, 2079 * then get rid of this thread. 
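* (An idle worker waits at most nfs_async_timeout ticks in the cv_reltimedwait() below before taking this exit path.)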
2080 */ 2081 if (mi->mi_max_threads == 0 || time_left <= 0) { 2082 --mi->mi_threads[async_queue]; 2083 2084 if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 && 2085 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0) 2086 cv_signal(&mi->mi_async_cv); 2087 CALLB_CPR_EXIT(&cprinfo); 2088 VFS_RELE(vfsp); /* release thread's hold */ 2089 zthread_exit(); 2090 /* NOTREACHED */ 2091 } 2092 time_left = cv_reltimedwait(async_work_cv, 2093 &mi->mi_async_lock, nfs_async_timeout, 2094 TR_CLOCK_TICK); 2095 2096 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 2097 2098 continue; 2099 } 2100 time_left = 1; 2101 2102 /* 2103 * Remove the request from the async queue and then 2104 * update the current async request queue pointer. If 2105 * the current queue is empty or we have removed enough 2106 * consecutive entries from it, then reset the counter 2107 * for this queue and then move the current pointer to 2108 * the next queue. 2109 */ 2110 *mi->mi_async_curr[async_queue] = args->a_next; 2111 if (*mi->mi_async_curr[async_queue] == NULL || 2112 --mi->mi_async_clusters[args->a_io] == 0) { 2113 mi->mi_async_clusters[args->a_io] = 2114 mi->mi_async_init_clusters; 2115 mi->mi_async_curr[async_queue]++; 2116 if (mi->mi_async_curr[async_queue] == 2117 &mi->mi_async_reqs[async_types]) { 2118 mi->mi_async_curr[async_queue] = 2119 &mi->mi_async_reqs[0]; 2120 } 2121 } 2122 2123 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) { 2124 mutex_enter(&mi->mi_lock); 2125 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 2126 mutex_exit(&mi->mi_lock); 2127 } 2128 2129 mutex_exit(&mi->mi_async_lock); 2130 2131 /* 2132 * Obtain arguments from the async request structure. 2133 */ 2134 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) { 2135 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff, 2136 args->a_nfs_addr, args->a_nfs_seg, 2137 args->a_cred); 2138 } else if (args->a_io == NFS_PUTAPAGE) { 2139 (void) (*args->a_nfs_putapage)(args->a_vp, 2140 args->a_nfs_pp, args->a_nfs_off, 2141 args->a_nfs_len, args->a_nfs_flags, 2142 args->a_cred); 2143 } else if (args->a_io == NFS_PAGEIO) { 2144 (void) (*args->a_nfs_pageio)(args->a_vp, 2145 args->a_nfs_pp, args->a_nfs_off, 2146 args->a_nfs_len, args->a_nfs_flags, 2147 args->a_cred); 2148 } else if (args->a_io == NFS_READDIR) { 2149 (void) ((*args->a_nfs_readdir)(args->a_vp, 2150 args->a_nfs_rdc, args->a_cred)); 2151 } else if (args->a_io == NFS_COMMIT) { 2152 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist, 2153 args->a_nfs_offset, args->a_nfs_count, 2154 args->a_cred); 2155 } else if (args->a_io == NFS_INACTIVE) { 2156 (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL); 2157 } 2158 2159 /* 2160 * Now, release the vnode and free the credentials 2161 * structure. 2162 */ 2163 free_async_args(args); 2164 /* 2165 * Reacquire the mutex because it will be needed at the top of the loop. 2166 */ 2167 mutex_enter(&mi->mi_async_lock); 2168 } 2169 } 2170 2171 void 2172 nfs_async_stop(struct vfs *vfsp) 2173 { 2174 mntinfo_t *mi = VFTOMI(vfsp); 2175 2176 /* 2177 * Wait for all outstanding async operations to complete and for the 2178 * worker threads to exit. 2179 */ 2180 mutex_enter(&mi->mi_async_lock); 2181 mi->mi_max_threads = 0; 2182 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2183 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2184 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) 2185 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2186 mutex_exit(&mi->mi_async_lock); 2187 } 2188 2189 /* 2190 * nfs_async_stop_sig: 2191 * Wait for all outstanding putpage operations to complete.
If a signal 2192 * is delivered we will abort and return non-zero. If we can put all the 2193 * pages we will return 0. This routine is called from nfs_unmount and 2194 * nfs3_unmount to make these operations interruptible. 2195 */ 2196 int 2197 nfs_async_stop_sig(struct vfs *vfsp) 2198 { 2199 mntinfo_t *mi = VFTOMI(vfsp); 2200 ushort_t omax; 2201 int rval; 2202 2203 /* 2204 * Wait for all outstanding async operations to complete and for the 2205 * worker threads to exit. 2206 */ 2207 mutex_enter(&mi->mi_async_lock); 2208 omax = mi->mi_max_threads; 2209 mi->mi_max_threads = 0; 2210 /* 2211 * Tell all the worker threads to exit. 2212 */ 2213 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2214 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2215 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) { 2216 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) 2217 break; 2218 } 2219 rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2220 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */ 2221 if (rval) 2222 mi->mi_max_threads = omax; 2223 mutex_exit(&mi->mi_async_lock); 2224 2225 return (rval); 2226 } 2227 2228 int 2229 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2230 { 2231 int pagecreate; 2232 int n; 2233 int saved_n; 2234 caddr_t saved_base; 2235 u_offset_t offset; 2236 int error; 2237 int sm_error; 2238 vnode_t *vp = RTOV(rp); 2239 2240 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2241 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2242 if (!vpm_enable) { 2243 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2244 } 2245 2246 /* 2247 * Move bytes in at most PAGESIZE chunks. We must avoid 2248 * spanning pages in uiomove() because page faults may cause 2249 * the cache to be invalidated out from under us. The r_size is not 2250 * updated until after the uiomove. If we push the last page of a 2251 * file before r_size is correct, we will lose the data written past 2252 * the current (and invalid) r_size. 2253 */ 2254 do { 2255 offset = uio->uio_loffset; 2256 pagecreate = 0; 2257 2258 /* 2259 * n is the number of bytes required to satisfy the request 2260 * or the number of bytes to fill out the page. 2261 */ 2262 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2263 2264 /* 2265 * Check to see if we can skip reading in the page 2266 * and just allocate the memory. We can do this 2267 * if we are going to rewrite the entire mapping 2268 * or if we are going to write to or beyond the current 2269 * end of file from the beginning of the mapping. 2270 * 2271 * The read of r_size is now protected by r_statelock. 2272 */ 2273 mutex_enter(&rp->r_statelock); 2274 /* 2275 * When pgcreated is nonzero the caller has already done 2276 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2277 * segkpm this means we already have at least one page 2278 * created and mapped at base. 2279 */ 2280 pagecreate = pgcreated || 2281 ((offset & PAGEOFFSET) == 0 && 2282 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2283 2284 mutex_exit(&rp->r_statelock); 2285 if (!vpm_enable && pagecreate) { 2286 /* 2287 * The last argument tells segmap_pagecreate() to 2288 * always lock the page, as opposed to sometimes 2289 * returning with the page locked. This way we avoid a 2290 * fault on the ensuing uiomove(), but also 2291 * more importantly (to fix bug 1094402) we can 2292 * call segmap_fault() to unlock the page in all 2293 * cases.
An alternative would be to modify 2294 * segmap_pagecreate() to tell us when it is 2295 * locking a page, but that's a fairly major 2296 * interface change. 2297 */ 2298 if (pgcreated == 0) 2299 (void) segmap_pagecreate(segkmap, base, 2300 (uint_t)n, 1); 2301 saved_base = base; 2302 saved_n = n; 2303 } 2304 2305 /* 2306 * The number of bytes of data in the last page cannot 2307 * be accurately determined while the page is being 2308 * uiomove'd to and the size of the file is being updated. 2309 * Thus, inform threads which need to know accurately 2310 * how much data is in the last page of the file. They 2311 * will not do the i/o immediately, but will arrange for 2312 * the i/o to happen later when this modify operation 2313 * will have finished. 2314 */ 2315 ASSERT(!(rp->r_flags & RMODINPROGRESS)); 2316 mutex_enter(&rp->r_statelock); 2317 rp->r_flags |= RMODINPROGRESS; 2318 rp->r_modaddr = (offset & MAXBMASK); 2319 mutex_exit(&rp->r_statelock); 2320 2321 if (vpm_enable) { 2322 /* 2323 * Copy data. If new pages are created, part of 2324 * the page that is not written will be initialized 2325 * with zeros. 2326 */ 2327 error = vpm_data_copy(vp, offset, n, uio, 2328 !pagecreate, NULL, 0, S_WRITE); 2329 } else { 2330 error = uiomove(base, n, UIO_WRITE, uio); 2331 } 2332 2333 /* 2334 * r_size is the maximum number of 2335 * bytes known to be in the file. 2336 * Make sure it is at least as high as the 2337 * first unwritten byte pointed to by uio_loffset. 2338 */ 2339 mutex_enter(&rp->r_statelock); 2340 if (rp->r_size < uio->uio_loffset) 2341 rp->r_size = uio->uio_loffset; 2342 rp->r_flags &= ~RMODINPROGRESS; 2343 rp->r_flags |= RDIRTY; 2344 mutex_exit(&rp->r_statelock); 2345 2346 /* n = # of bytes written */ 2347 n = (int)(uio->uio_loffset - offset); 2348 2349 if (!vpm_enable) { 2350 base += n; 2351 } 2352 tcount -= n; 2353 /* 2354 * If we created pages w/o initializing them completely, 2355 * we need to zero the part that wasn't set up. 2356 * This happens in most EOF write cases and if 2357 * we had some sort of error during the uiomove. 2358 */ 2359 if (!vpm_enable && pagecreate) { 2360 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2361 (void) kzero(base, PAGESIZE - n); 2362 2363 if (pgcreated) { 2364 /* 2365 * Caller is responsible for this page, 2366 * it was not created in this loop. 2367 */ 2368 pgcreated = 0; 2369 } else { 2370 /* 2371 * For bug 1094402: segmap_pagecreate locks 2372 * page. Unlock it. This also unlocks the 2373 * pages allocated by page_create_va() in 2374 * segmap_pagecreate(). 2375 */ 2376 sm_error = segmap_fault(kas.a_hat, segkmap, 2377 saved_base, saved_n, 2378 F_SOFTUNLOCK, S_WRITE); 2379 if (error == 0) 2380 error = sm_error; 2381 } 2382 } 2383 } while (tcount > 0 && error == 0); 2384 2385 return (error); 2386 } 2387 2388 int 2389 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2390 { 2391 rnode_t *rp; 2392 page_t *pp; 2393 u_offset_t eoff; 2394 u_offset_t io_off; 2395 size_t io_len; 2396 int error; 2397 int rdirty; 2398 int err; 2399 2400 rp = VTOR(vp); 2401 ASSERT(rp->r_count > 0); 2402 2403 if (!vn_has_cached_data(vp)) 2404 return (0); 2405 2406 ASSERT(vp->v_type != VCHR); 2407 2408 /* 2409 * If ROUTOFSPACE is set, then all writes turn into B_INVAL 2410 * writes. B_FORCE is set to force the VM system to actually 2411 * invalidate the pages, even if the i/o failed.
The pages 2412 * need to get invalidated because they can't be written out 2413 * because there isn't any space left on either the server's 2414 * file system or in the user's disk quota. The B_FREE bit 2415 * is cleared to avoid confusion as to whether this is a 2416 * request to place the page on the freelist or to destroy 2417 * it. 2418 */ 2419 if ((rp->r_flags & ROUTOFSPACE) || 2420 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2421 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2422 2423 if (len == 0) { 2424 /* 2425 * If doing a full file synchronous operation, then clear 2426 * the RDIRTY bit. If a page gets dirtied while the flush 2427 * is happening, then RDIRTY will get set again. The 2428 * RDIRTY bit must get cleared before the flush so that 2429 * we don't lose this information. 2430 * 2431 * If there are no full file async write operations 2432 * pending and RDIRTY bit is set, clear it. 2433 */ 2434 if (off == (u_offset_t)0 && 2435 !(flags & B_ASYNC) && 2436 (rp->r_flags & RDIRTY)) { 2437 mutex_enter(&rp->r_statelock); 2438 rdirty = (rp->r_flags & RDIRTY); 2439 rp->r_flags &= ~RDIRTY; 2440 mutex_exit(&rp->r_statelock); 2441 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2442 mutex_enter(&rp->r_statelock); 2443 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) { 2444 rdirty = (rp->r_flags & RDIRTY); 2445 rp->r_flags &= ~RDIRTY; 2446 } 2447 mutex_exit(&rp->r_statelock); 2448 } else 2449 rdirty = 0; 2450 2451 /* 2452 * Search the entire vp list for pages >= off, and flush 2453 * the dirty pages. 2454 */ 2455 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2456 flags, cr); 2457 2458 /* 2459 * If an error occurred and the file was marked as dirty 2460 * before and we aren't forcibly invalidating pages, then 2461 * reset the RDIRTY flag. 2462 */ 2463 if (error && rdirty && 2464 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2465 mutex_enter(&rp->r_statelock); 2466 rp->r_flags |= RDIRTY; 2467 mutex_exit(&rp->r_statelock); 2468 } 2469 } else { 2470 /* 2471 * Do a range from [off...off + len) looking for pages 2472 * to deal with. 2473 */ 2474 error = 0; 2475 #ifdef lint 2476 io_len = 0; 2477 #endif 2478 eoff = off + len; 2479 mutex_enter(&rp->r_statelock); 2480 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2481 io_off += io_len) { 2482 mutex_exit(&rp->r_statelock); 2483 /* 2484 * If we are not invalidating, synchronously 2485 * freeing or writing pages use the routine 2486 * page_lookup_nowait() to prevent reclaiming 2487 * them from the free list. 2488 */ 2489 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2490 pp = page_lookup(vp, io_off, 2491 (flags & (B_INVAL | B_FREE)) ? 2492 SE_EXCL : SE_SHARED); 2493 } else { 2494 pp = page_lookup_nowait(vp, io_off, 2495 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2496 } 2497 2498 if (pp == NULL || !pvn_getdirty(pp, flags)) 2499 io_len = PAGESIZE; 2500 else { 2501 err = (*rp->r_putapage)(vp, pp, &io_off, 2502 &io_len, flags, cr); 2503 if (!error) 2504 error = err; 2505 /* 2506 * "io_off" and "io_len" are returned as 2507 * the range of pages we actually wrote. 2508 * This allows us to skip ahead more quickly 2509 * since several pages may've been dealt 2510 * with by this iteration of the loop. 
2511 */ 2512 } 2513 mutex_enter(&rp->r_statelock); 2514 } 2515 mutex_exit(&rp->r_statelock); 2516 } 2517 2518 return (error); 2519 } 2520 2521 void 2522 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2523 { 2524 rnode_t *rp; 2525 2526 rp = VTOR(vp); 2527 mutex_enter(&rp->r_statelock); 2528 while (rp->r_flags & RTRUNCATE) 2529 cv_wait(&rp->r_cv, &rp->r_statelock); 2530 rp->r_flags |= RTRUNCATE; 2531 if (off == (u_offset_t)0) { 2532 rp->r_flags &= ~RDIRTY; 2533 if (!(rp->r_flags & RSTALE)) 2534 rp->r_error = 0; 2535 } 2536 rp->r_truncaddr = off; 2537 mutex_exit(&rp->r_statelock); 2538 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2539 B_INVAL | B_TRUNC, cr); 2540 mutex_enter(&rp->r_statelock); 2541 rp->r_flags &= ~RTRUNCATE; 2542 cv_broadcast(&rp->r_cv); 2543 mutex_exit(&rp->r_statelock); 2544 } 2545 2546 static int nfs_write_error_to_cons_only = 0; 2547 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1) 2548 2549 /* 2550 * Print a file handle 2551 */ 2552 void 2553 nfs_printfhandle(nfs_fhandle *fhp) 2554 { 2555 int *ip; 2556 char *buf; 2557 size_t bufsize; 2558 char *cp; 2559 2560 /* 2561 * 13 == "(file handle:" 2562 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times 2563 * 1 == ' ' 2564 * 8 == maximum strlen of "%x" 2565 * 3 == ")\n\0" 2566 */ 2567 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3; 2568 buf = kmem_alloc(bufsize, KM_NOSLEEP); 2569 if (buf == NULL) 2570 return; 2571 2572 cp = buf; 2573 (void) strcpy(cp, "(file handle:"); 2574 while (*cp != '\0') 2575 cp++; 2576 for (ip = (int *)fhp->fh_buf; 2577 ip < (int *)&fhp->fh_buf[fhp->fh_len]; 2578 ip++) { 2579 (void) sprintf(cp, " %x", *ip); 2580 while (*cp != '\0') 2581 cp++; 2582 } 2583 (void) strcpy(cp, ")\n"); 2584 2585 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf); 2586 2587 kmem_free(buf, bufsize); 2588 } 2589 2590 /* 2591 * Notify the system administrator that an NFS write error has 2592 * occurred. 2593 */ 2594 2595 /* seconds between ENOSPC/EDQUOT messages */ 2596 clock_t nfs_write_error_interval = 5; 2597 2598 void 2599 nfs_write_error(vnode_t *vp, int error, cred_t *cr) 2600 { 2601 mntinfo_t *mi; 2602 clock_t now; 2603 2604 mi = VTOMI(vp); 2605 /* 2606 * In case of forced unmount or zone shutdown, do not print any 2607 * messages since it can flood the console with error messages. 2608 */ 2609 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) 2610 return; 2611 2612 /* 2613 * No use in flooding the console with ENOSPC 2614 * messages from the same file system. 
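* The check below limits ENOSPC/EDQUOT reporting to at most one message per nfs_write_error_interval seconds for this mount: each time a message is printed, mi_printftime is advanced by nfs_write_error_interval * hz ticks, and further messages are suppressed until lbolt passes that point.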
2615 */ 2616 now = ddi_get_lbolt(); 2617 if ((error != ENOSPC && error != EDQUOT) || 2618 now - mi->mi_printftime > 0) { 2619 zoneid_t zoneid = mi->mi_zone->zone_id; 2620 2621 #ifdef DEBUG 2622 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2623 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL); 2624 #else 2625 nfs_perror(error, "NFS write error on host %s: %m.\n", 2626 VTOR(vp)->r_server->sv_hostname, NULL); 2627 #endif 2628 if (error == ENOSPC || error == EDQUOT) { 2629 zcmn_err(zoneid, CE_CONT, 2630 MSG("^File: userid=%d, groupid=%d\n"), 2631 crgetuid(cr), crgetgid(cr)); 2632 if (crgetuid(CRED()) != crgetuid(cr) || 2633 crgetgid(CRED()) != crgetgid(cr)) { 2634 zcmn_err(zoneid, CE_CONT, 2635 MSG("^User: userid=%d, groupid=%d\n"), 2636 crgetuid(CRED()), crgetgid(CRED())); 2637 } 2638 mi->mi_printftime = now + 2639 nfs_write_error_interval * hz; 2640 } 2641 nfs_printfhandle(&VTOR(vp)->r_fh); 2642 #ifdef DEBUG 2643 if (error == EACCES) { 2644 zcmn_err(zoneid, CE_CONT, 2645 MSG("^nfs_bio: cred is%s kcred\n"), 2646 cr == kcred ? "" : " not"); 2647 } 2648 #endif 2649 } 2650 } 2651 2652 /* ARGSUSED */ 2653 static void * 2654 nfs_mi_init(zoneid_t zoneid) 2655 { 2656 struct mi_globals *mig; 2657 2658 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2659 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2660 list_create(&mig->mig_list, sizeof (mntinfo_t), 2661 offsetof(mntinfo_t, mi_zone_node)); 2662 mig->mig_destructor_called = B_FALSE; 2663 return (mig); 2664 } 2665 2666 /* 2667 * Callback routine to tell all NFS mounts in the zone to stop creating new 2668 * threads. Existing threads should exit. 2669 */ 2670 /* ARGSUSED */ 2671 static void 2672 nfs_mi_shutdown(zoneid_t zoneid, void *data) 2673 { 2674 struct mi_globals *mig = data; 2675 mntinfo_t *mi; 2676 2677 ASSERT(mig != NULL); 2678 again: 2679 mutex_enter(&mig->mig_lock); 2680 for (mi = list_head(&mig->mig_list); mi != NULL; 2681 mi = list_next(&mig->mig_list, mi)) { 2682 2683 /* 2684 * If we've done the shutdown work for this FS, skip. 2685 * Once we go off the end of the list, we're done. 2686 */ 2687 if (mi->mi_flags & MI_DEAD) 2688 continue; 2689 2690 /* 2691 * We will do work, so not done. Get a hold on the FS. 2692 */ 2693 VFS_HOLD(mi->mi_vfsp); 2694 2695 /* 2696 * purge the DNLC for this filesystem 2697 */ 2698 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2699 2700 mutex_enter(&mi->mi_async_lock); 2701 /* 2702 * Tell existing async worker threads to exit. 2703 */ 2704 mi->mi_max_threads = 0; 2705 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2706 /* 2707 * Set MI_ASYNC_MGR_STOP so the async manager thread starts 2708 * getting ready to exit when it's done with its current work. 2709 * Also set MI_DEAD to note we've acted on this FS. 2710 */ 2711 mutex_enter(&mi->mi_lock); 2712 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD); 2713 mutex_exit(&mi->mi_lock); 2714 /* 2715 * Wake up the async manager thread. 2716 */ 2717 cv_broadcast(&mi->mi_async_reqs_cv); 2718 mutex_exit(&mi->mi_async_lock); 2719 2720 /* 2721 * Drop lock and release FS, which may change list, then repeat. 2722 * We're done when every mi has been done or the list is empty. 
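* (The MI_DEAD flag set above is what guarantees forward progress: each pass either finds a mount not yet marked MI_DEAD, processes it and restarts, or walks off the end of the list.)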
2723 */ 2724 mutex_exit(&mig->mig_lock); 2725 VFS_RELE(mi->mi_vfsp); 2726 goto again; 2727 } 2728 mutex_exit(&mig->mig_lock); 2729 } 2730 2731 static void 2732 nfs_mi_free_globals(struct mi_globals *mig) 2733 { 2734 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2735 mutex_destroy(&mig->mig_lock); 2736 kmem_free(mig, sizeof (*mig)); 2737 2738 } 2739 2740 /* ARGSUSED */ 2741 static void 2742 nfs_mi_destroy(zoneid_t zoneid, void *data) 2743 { 2744 struct mi_globals *mig = data; 2745 2746 ASSERT(mig != NULL); 2747 mutex_enter(&mig->mig_lock); 2748 if (list_head(&mig->mig_list) != NULL) { 2749 /* Still waiting for VFS_FREEVFS() */ 2750 mig->mig_destructor_called = B_TRUE; 2751 mutex_exit(&mig->mig_lock); 2752 return; 2753 } 2754 nfs_mi_free_globals(mig); 2755 } 2756 2757 /* 2758 * Add an NFS mount to the per-zone list of NFS mounts. 2759 */ 2760 void 2761 nfs_mi_zonelist_add(mntinfo_t *mi) 2762 { 2763 struct mi_globals *mig; 2764 2765 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2766 mutex_enter(&mig->mig_lock); 2767 list_insert_head(&mig->mig_list, mi); 2768 mutex_exit(&mig->mig_lock); 2769 } 2770 2771 /* 2772 * Remove an NFS mount from the per-zone list of NFS mounts. 2773 */ 2774 static void 2775 nfs_mi_zonelist_remove(mntinfo_t *mi) 2776 { 2777 struct mi_globals *mig; 2778 2779 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2780 mutex_enter(&mig->mig_lock); 2781 list_remove(&mig->mig_list, mi); 2782 /* 2783 * We can be called asynchronously by VFS_FREEVFS() after the zone 2784 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2785 * mi globals. 2786 */ 2787 if (list_head(&mig->mig_list) == NULL && 2788 mig->mig_destructor_called == B_TRUE) { 2789 nfs_mi_free_globals(mig); 2790 return; 2791 } 2792 mutex_exit(&mig->mig_lock); 2793 } 2794 2795 /* 2796 * NFS Client initialization routine. This routine should only be called 2797 * once. It performs the following tasks: 2798 * - Initialize all global locks 2799 * - Call sub-initialization routines (localize access to variables) 2800 */ 2801 int 2802 nfs_clntinit(void) 2803 { 2804 #ifdef DEBUG 2805 static boolean_t nfs_clntup = B_FALSE; 2806 #endif 2807 int error; 2808 2809 #ifdef DEBUG 2810 ASSERT(nfs_clntup == B_FALSE); 2811 #endif 2812 2813 error = nfs_subrinit(); 2814 if (error) 2815 return (error); 2816 2817 error = nfs_vfsinit(); 2818 if (error) { 2819 /* 2820 * Clean up nfs_subrinit() work 2821 */ 2822 nfs_subrfini(); 2823 return (error); 2824 } 2825 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown, 2826 nfs_mi_destroy); 2827 2828 nfs4_clnt_init(); 2829 2830 #ifdef DEBUG 2831 nfs_clntup = B_TRUE; 2832 #endif 2833 2834 return (0); 2835 } 2836 2837 /* 2838 * This routine is only called if the NFS Client has been initialized but 2839 * the module failed to be installed. This routine will clean up the previously 2840 * allocated/initialized work. 2841 */ 2842 void 2843 nfs_clntfini(void) 2844 { 2845 (void) zone_key_delete(mi_list_key); 2846 nfs_subrfini(); 2847 nfs_vfsfini(); 2848 nfs4_clnt_fini(); 2849 } 2850 2851 /* 2852 * nfs_lockrelease: 2853 * 2854 * Release any locks on the given vnode that are held by the current 2855 * process. 2856 */ 2857 void 2858 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 2859 { 2860 flock64_t ld; 2861 struct shrlock shr; 2862 char *buf; 2863 int remote_lock_possible; 2864 int ret; 2865 2866 ASSERT((uintptr_t)vp > KERNELBASE); 2867 2868 /* 2869 * Generate an explicit unlock operation for the entire file.
As a 2870 * partial optimization, only generate the unlock if there is a 2871 * lock registered for the file. We could check whether this 2872 * particular process has any locks on the file, but that would 2873 * require the local locking code to provide yet another query 2874 * routine. Note that no explicit synchronization is needed here. 2875 * At worst, flk_has_remote_locks() will return a false positive, 2876 * in which case the unlock call wastes time but doesn't harm 2877 * correctness. 2878 * 2879 * In addition, an unlock request is generated if the process 2880 * is listed as possibly having a lock on the file because the 2881 * server and client lock managers may have gotten out of sync. 2882 * N.B. It is important to make sure nfs_remove_locking_id() is 2883 * called here even if flk_has_remote_locks(vp) reports true. 2884 * If it is not called and there is an entry on the process id 2885 * list, that entry will never get removed. 2886 */ 2887 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID, 2888 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2889 if (remote_lock_possible || flk_has_remote_locks(vp)) { 2890 ld.l_type = F_UNLCK; /* set to unlock entire file */ 2891 ld.l_whence = 0; /* unlock from start of file */ 2892 ld.l_start = 0; 2893 ld.l_len = 0; /* do entire file */ 2894 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr, 2895 NULL); 2896 2897 if (ret != 0) { 2898 /* 2899 * If VOP_FRLOCK fails, make sure we unregister 2900 * local locks before we continue. 2901 */ 2902 ld.l_pid = ttoproc(curthread)->p_pid; 2903 lm_register_lock_locally(vp, NULL, &ld, flag, offset); 2904 #ifdef DEBUG 2905 nfs_perror(ret, 2906 "NFS lock release error on vp %p: %m.\n", 2907 (void *)vp, NULL); 2908 #endif 2909 } 2910 2911 /* 2912 * The call to VOP_FRLOCK may put the pid back on the 2913 * list. We need to remove it. 2914 */ 2915 (void) nfs_remove_locking_id(vp, RLMPL_PID, 2916 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2917 } 2918 2919 /* 2920 * As long as the vp has a share matching our pid, 2921 * pluck it off and unshare it. There are circumstances in 2922 * which the call to nfs_remove_locking_id() may put the 2923 * owner back on the list, in which case we simply do a 2924 * redundant and harmless unshare. 2925 */ 2926 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP); 2927 while (nfs_remove_locking_id(vp, RLMPL_OWNER, 2928 (char *)NULL, buf, &shr.s_own_len)) { 2929 shr.s_owner = buf; 2930 shr.s_access = 0; 2931 shr.s_deny = 0; 2932 shr.s_sysid = 0; 2933 shr.s_pid = curproc->p_pid; 2934 2935 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL); 2936 #ifdef DEBUG 2937 if (ret != 0) { 2938 nfs_perror(ret, 2939 "NFS share release error on vp %p: %m.\n", 2940 (void *)vp, NULL); 2941 } 2942 #endif 2943 } 2944 kmem_free(buf, MAX_SHR_OWNER_LEN); 2945 } 2946 2947 /* 2948 * nfs_lockcompletion: 2949 * 2950 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2951 * as non cachable (set VNOCACHE bit). 
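* Whether caching remains safe is determined by lm_safemap() below; if the mapping is still considered safe, the VNOCACHE bit is cleared instead.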
2952 */ 2953 2954 void 2955 nfs_lockcompletion(vnode_t *vp, int cmd) 2956 { 2957 #ifdef DEBUG 2958 rnode_t *rp = VTOR(vp); 2959 2960 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2961 #endif 2962 2963 if (cmd == F_SETLK || cmd == F_SETLKW) { 2964 if (!lm_safemap(vp)) { 2965 mutex_enter(&vp->v_lock); 2966 vp->v_flag |= VNOCACHE; 2967 mutex_exit(&vp->v_lock); 2968 } else { 2969 mutex_enter(&vp->v_lock); 2970 vp->v_flag &= ~VNOCACHE; 2971 mutex_exit(&vp->v_lock); 2972 } 2973 } 2974 /* 2975 * The cached attributes of the file are stale after acquiring 2976 * the lock on the file. They were updated when the file was 2977 * opened, but not updated when the lock was acquired. Therefore the 2978 * cached attributes are invalidated after the lock is obtained. 2979 */ 2980 PURGE_ATTRCACHE(vp); 2981 } 2982 2983 /* 2984 * The lock manager holds state making it possible for the client 2985 * and server to be out of sync. For example, if the response from 2986 * the server granting a lock request is lost, the server will think 2987 * the lock is granted and the client will think the lock is lost. 2988 * The client can tell when it is not certain whether it is in sync with 2989 * the server. 2990 * 2991 * To deal with this, a list of processes for which the client is 2992 * not sure if the server holds a lock is attached to the rnode. 2993 * When such a process closes the rnode, an unlock request is sent 2994 * to the server to unlock the entire file. 2995 * 2996 * The list is kept as a singly linked, NULL-terminated list. 2997 * Because it is only added to under extreme error conditions, the 2998 * list shouldn't get very big. DEBUG kernels print a message if 2999 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily 3000 * chosen to be 8, but can be tuned at runtime. 3001 */ 3002 #ifdef DEBUG 3003 /* int nfs_lmpl_high_water = 8; */ 3004 int nfs_lmpl_high_water = 128; 3005 int nfs_cnt_add_locking_id = 0; 3006 int nfs_len_add_locking_id = 0; 3007 #endif /* DEBUG */ 3008 3009 /* 3010 * Record that the nfs lock manager server may be holding a lock on 3011 * a vnode for a process. 3012 * 3013 * Because the nfs lock manager server holds state, it is possible 3014 * for the server to get out of sync with the client. This routine is called 3015 * from the client when it is no longer sure if the server is in sync 3016 * with the client.
nfs_lockrelease() will then notice this and send 3017 * an unlock request when the file is closed. 3018 */ 3019 void 3020 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len) 3021 { 3022 rnode_t *rp; 3023 lmpl_t *new; 3024 lmpl_t *cur; 3025 lmpl_t **lmplp; 3026 #ifdef DEBUG 3027 int list_len = 1; 3028 #endif /* DEBUG */ 3029 3030 #ifdef DEBUG 3031 ++nfs_cnt_add_locking_id; 3032 #endif /* DEBUG */ 3033 /* 3034 * Allocate the new lmpl_t now so we don't sleep 3035 * later after grabbing mutexes. 3036 */ 3037 ASSERT(len < MAX_SHR_OWNER_LEN); 3038 new = kmem_alloc(sizeof (*new), KM_SLEEP); 3039 new->lmpl_type = type; 3040 new->lmpl_pid = pid; 3041 new->lmpl_owner = kmem_alloc(len, KM_SLEEP); 3042 bcopy(id, new->lmpl_owner, len); 3043 new->lmpl_own_len = len; 3044 new->lmpl_next = (lmpl_t *)NULL; 3045 #ifdef DEBUG 3046 if (type == RLMPL_PID) { 3047 ASSERT(len == sizeof (pid_t)); 3048 ASSERT(pid == *(pid_t *)new->lmpl_owner); 3049 } else { 3050 ASSERT(type == RLMPL_OWNER); 3051 } 3052 #endif 3053 3054 rp = VTOR(vp); 3055 mutex_enter(&rp->r_statelock); 3056 3057 /* 3058 * Add this id to the list for this rnode only if the 3059 * rnode is active and the id is not already there. 3060 */ 3061 ASSERT(rp->r_flags & RHASHED); 3062 lmplp = &(rp->r_lmpl); 3063 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3064 if (cur->lmpl_pid == pid && 3065 cur->lmpl_type == type && 3066 cur->lmpl_own_len == len && 3067 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) { 3068 kmem_free(new->lmpl_owner, len); 3069 kmem_free(new, sizeof (*new)); 3070 break; 3071 } 3072 lmplp = &cur->lmpl_next; 3073 #ifdef DEBUG 3074 ++list_len; 3075 #endif /* DEBUG */ 3076 } 3077 if (cur == (lmpl_t *)NULL) { 3078 *lmplp = new; 3079 #ifdef DEBUG 3080 if (list_len > nfs_len_add_locking_id) { 3081 nfs_len_add_locking_id = list_len; 3082 } 3083 if (list_len > nfs_lmpl_high_water) { 3084 cmn_err(CE_WARN, "nfs_add_locking_id: long list " 3085 "vp=%p is %d", (void *)vp, list_len); 3086 } 3087 #endif /* DEBUG */ 3088 } 3089 3090 #ifdef DEBUG 3091 if (share_debug) { 3092 int nitems = 0; 3093 int npids = 0; 3094 int nowners = 0; 3095 3096 /* 3097 * Count the number of things on r_lmpl after the add. 3098 */ 3099 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3100 cur = cur->lmpl_next) { 3101 nitems++; 3102 if (cur->lmpl_type == RLMPL_PID) { 3103 npids++; 3104 } else if (cur->lmpl_type == RLMPL_OWNER) { 3105 nowners++; 3106 } else { 3107 cmn_err(CE_PANIC, "nfs_add_locking_id: " 3108 "unrecognized lmpl_type %d", 3109 cur->lmpl_type); 3110 } 3111 } 3112 3113 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d " 3114 "OWNs = %d items left on r_lmpl\n", 3115 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems); 3116 } 3117 #endif 3118 3119 mutex_exit(&rp->r_statelock); 3120 } 3121 3122 /* 3123 * Remove an id from the lock manager id list. 3124 * 3125 * If the id is not in the list return 0. If it was found and 3126 * removed, return 1. 3127 */ 3128 static int 3129 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen) 3130 { 3131 lmpl_t *cur; 3132 lmpl_t **lmplp; 3133 rnode_t *rp; 3134 int rv = 0; 3135 3136 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER); 3137 3138 rp = VTOR(vp); 3139 3140 mutex_enter(&rp->r_statelock); 3141 ASSERT(rp->r_flags & RHASHED); 3142 lmplp = &(rp->r_lmpl); 3143 3144 /* 3145 * Search through the list and remove the entry for this id 3146 * if it is there.
The special case id == NULL allows removal 3147 * of the first share on the r_lmpl list belonging to the 3148 * current process (if any), without regard to further details 3149 * of its identity. 3150 */ 3151 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3152 if (cur->lmpl_type == type && 3153 cur->lmpl_pid == curproc->p_pid && 3154 (id == (char *)NULL || 3155 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) { 3156 *lmplp = cur->lmpl_next; 3157 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN); 3158 if (rid != NULL) { 3159 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len); 3160 *rlen = cur->lmpl_own_len; 3161 } 3162 kmem_free(cur->lmpl_owner, cur->lmpl_own_len); 3163 kmem_free(cur, sizeof (*cur)); 3164 rv = 1; 3165 break; 3166 } 3167 lmplp = &cur->lmpl_next; 3168 } 3169 3170 #ifdef DEBUG 3171 if (share_debug) { 3172 int nitems = 0; 3173 int npids = 0; 3174 int nowners = 0; 3175 3176 /* 3177 * Count the number of things left on r_lmpl after the remove. 3178 */ 3179 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3180 cur = cur->lmpl_next) { 3181 nitems++; 3182 if (cur->lmpl_type == RLMPL_PID) { 3183 npids++; 3184 } else if (cur->lmpl_type == RLMPL_OWNER) { 3185 nowners++; 3186 } else { 3187 cmn_err(CE_PANIC, 3188 "nrli: unrecognized lmpl_type %d", 3189 cur->lmpl_type); 3190 } 3191 } 3192 3193 cmn_err(CE_CONT, 3194 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n", 3195 (type == RLMPL_PID) ? "P" : "O", 3196 npids, 3197 nowners, 3198 nitems); 3199 } 3200 #endif 3201 3202 mutex_exit(&rp->r_statelock); 3203 return (rv); 3204 } 3205 3206 void 3207 nfs_free_mi(mntinfo_t *mi) 3208 { 3209 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP); 3210 ASSERT(mi->mi_manager_thread == NULL); 3211 ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 && 3212 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0); 3213 3214 /* 3215 * Remove the node from the global list before we start tearing it down. 3216 */ 3217 nfs_mi_zonelist_remove(mi); 3218 if (mi->mi_klmconfig) { 3219 lm_free_config(mi->mi_klmconfig); 3220 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig)); 3221 } 3222 mutex_destroy(&mi->mi_lock); 3223 mutex_destroy(&mi->mi_remap_lock); 3224 mutex_destroy(&mi->mi_async_lock); 3225 mutex_destroy(&mi->mi_rnodes_lock); 3226 cv_destroy(&mi->mi_failover_cv); 3227 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]); 3228 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]); 3229 cv_destroy(&mi->mi_async_reqs_cv); 3230 cv_destroy(&mi->mi_async_cv); 3231 list_destroy(&mi->mi_rnodes); 3232 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS); 3233 kmem_free(mi, sizeof (*mi)); 3234 } 3235 3236 static int 3237 mnt_kstat_update(kstat_t *ksp, int rw) 3238 { 3239 mntinfo_t *mi; 3240 struct mntinfo_kstat *mik; 3241 vfs_t *vfsp; 3242 int i; 3243 3244 /* this is a read-only kstat. Bail out on a write */ 3245 if (rw == KSTAT_WRITE) 3246 return (EACCES); 3247 3248 /* 3249 * We don't want to wait here as kstat_chain_lock could be held by 3250 * dounmount(). dounmount() takes vfs_reflock before the chain lock 3251 * and thus could lead to a deadlock. 
3252 */ 3253 vfsp = (struct vfs *)ksp->ks_private; 3254 3255 3256 mi = VFTOMI(vfsp); 3257 3258 mik = (struct mntinfo_kstat *)ksp->ks_data; 3259 3260 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 3261 mik->mik_vers = (uint32_t)mi->mi_vers; 3262 mik->mik_flags = mi->mi_flags; 3263 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod; 3264 mik->mik_curread = (uint32_t)mi->mi_curread; 3265 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 3266 mik->mik_retrans = mi->mi_retrans; 3267 mik->mik_timeo = mi->mi_timeo; 3268 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 3269 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 3270 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 3271 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 3272 for (i = 0; i < NFS_CALLTYPES + 1; i++) { 3273 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt; 3274 mik->mik_timers[i].deviate = 3275 (uint32_t)mi->mi_timers[i].rt_deviate; 3276 mik->mik_timers[i].rtxcur = 3277 (uint32_t)mi->mi_timers[i].rt_rtxcur; 3278 } 3279 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 3280 mik->mik_failover = (uint32_t)mi->mi_failover; 3281 mik->mik_remap = (uint32_t)mi->mi_remap; 3282 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 3283 3284 return (0); 3285 } 3286 3287 void 3288 nfs_mnt_kstat_init(struct vfs *vfsp) 3289 { 3290 mntinfo_t *mi = VFTOMI(vfsp); 3291 3292 /* 3293 * Create the version specific kstats. 3294 * 3295 * PSARC 2001/697 Contract Private Interface 3296 * All nfs kstats are under SunMC contract 3297 * Please refer to the PSARC listed above and contact 3298 * SunMC before making any changes! 3299 * 3300 * Changes must be reviewed by Solaris File Sharing 3301 * Changes must be communicated to contract-2001-697@sun.com 3302 * 3303 */ 3304 3305 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 3306 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 3307 if (mi->mi_io_kstats) { 3308 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3309 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 3310 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 3311 kstat_install(mi->mi_io_kstats); 3312 } 3313 3314 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 3315 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 3316 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 3317 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3318 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 3319 mi->mi_ro_kstats->ks_update = mnt_kstat_update; 3320 mi->mi_ro_kstats->ks_private = (void *)vfsp; 3321 kstat_install(mi->mi_ro_kstats); 3322 } 3323 } 3324 3325 nfs_delmapcall_t * 3326 nfs_init_delmapcall() 3327 { 3328 nfs_delmapcall_t *delmap_call; 3329 3330 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP); 3331 delmap_call->call_id = curthread; 3332 delmap_call->error = 0; 3333 3334 return (delmap_call); 3335 } 3336 3337 void 3338 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call) 3339 { 3340 kmem_free(delmap_call, sizeof (nfs_delmapcall_t)); 3341 } 3342 3343 /* 3344 * Searches for the current delmap caller (based on curthread) in the list of 3345 * callers. If it is found, we remove it and free the delmap caller. 3346 * Returns: 3347 * 0 if the caller wasn't found 3348 * 1 if the caller was found, removed and freed. *errp is set to what 3349 * the result of the delmap was. 
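* A typical caller (sketch only; the actual delmap code lives in the vnode ops layer and may differ in detail) does: if (nfs_find_and_delete_delmapcall(rp, &error)) return (error); so that a repeated delmap request simply returns the result recorded by the earlier asynchronous call.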
3350 */ 3351 int 3352 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp) 3353 { 3354 nfs_delmapcall_t *delmap_call; 3355 3356 /* 3357 * If the list doesn't exist yet, we create it and return 3358 * that the caller wasn't found. No list = no callers. 3359 */ 3360 mutex_enter(&rp->r_statelock); 3361 if (!(rp->r_flags & RDELMAPLIST)) { 3362 /* The list does not exist */ 3363 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t), 3364 offsetof(nfs_delmapcall_t, call_node)); 3365 rp->r_flags |= RDELMAPLIST; 3366 mutex_exit(&rp->r_statelock); 3367 return (0); 3368 } else { 3369 /* The list exists so search it */ 3370 for (delmap_call = list_head(&rp->r_indelmap); 3371 delmap_call != NULL; 3372 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 3373 if (delmap_call->call_id == curthread) { 3374 /* current caller is in the list */ 3375 *errp = delmap_call->error; 3376 list_remove(&rp->r_indelmap, delmap_call); 3377 mutex_exit(&rp->r_statelock); 3378 nfs_free_delmapcall(delmap_call); 3379 return (1); 3380 } 3381 } 3382 } 3383 mutex_exit(&rp->r_statelock); 3384 return (0); 3385 } 3386