1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 28 * All rights reserved. 29 */ 30 31 /* 32 * Copyright 2018 Nexenta Systems, Inc. 33 */ 34 35 #include <sys/param.h> 36 #include <sys/types.h> 37 #include <sys/systm.h> 38 #include <sys/thread.h> 39 #include <sys/t_lock.h> 40 #include <sys/time.h> 41 #include <sys/vnode.h> 42 #include <sys/vfs.h> 43 #include <sys/errno.h> 44 #include <sys/buf.h> 45 #include <sys/stat.h> 46 #include <sys/cred.h> 47 #include <sys/kmem.h> 48 #include <sys/debug.h> 49 #include <sys/dnlc.h> 50 #include <sys/vmsystm.h> 51 #include <sys/flock.h> 52 #include <sys/share.h> 53 #include <sys/cmn_err.h> 54 #include <sys/tiuser.h> 55 #include <sys/sysmacros.h> 56 #include <sys/callb.h> 57 #include <sys/acl.h> 58 #include <sys/kstat.h> 59 #include <sys/signal.h> 60 #include <sys/list.h> 61 #include <sys/zone.h> 62 63 #include <rpc/types.h> 64 #include <rpc/xdr.h> 65 #include <rpc/auth.h> 66 #include <rpc/clnt.h> 67 68 #include <nfs/nfs.h> 69 #include <nfs/nfs_clnt.h> 70 #include <nfs/nfs_cmd.h> 71 72 #include <nfs/rnode.h> 73 #include <nfs/nfs_acl.h> 74 #include <nfs/lm.h> 75 76 #include <vm/hat.h> 77 #include <vm/as.h> 78 #include <vm/page.h> 79 #include <vm/pvn.h> 80 #include <vm/seg.h> 81 #include <vm/seg_map.h> 82 #include <vm/seg_vn.h> 83 84 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t, 85 cred_t *); 86 static int nfs_getattr_cache(vnode_t *, struct vattr *); 87 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *); 88 89 struct mi_globals { 90 kmutex_t mig_lock; /* lock protecting mig_list */ 91 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */ 92 boolean_t mig_destructor_called; 93 }; 94 95 static zone_key_t mi_list_key; 96 97 /* Debugging flag for PC file shares. */ 98 extern int share_debug; 99 100 /* 101 * Attributes caching: 102 * 103 * Attributes are cached in the rnode in struct vattr form. 104 * There is a time associated with the cached attributes (r_attrtime) 105 * which tells whether the attributes are valid. The time is initialized 106 * to the difference between current time and the modify time of the vnode 107 * when new attributes are cached. This allows the attributes for 108 * files that have changed recently to be timed out sooner than for files 109 * that have not changed for a long time. There are minimum and maximum 110 * timeout values that can be set per mount point. 
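 *
 * As a rough sketch for orientation (the authoritative logic is in
 * nfs_attrcache_va() below; this summarizes it rather than adding any
 * behavior), the lifetime of the cached attributes works out to:
 *
 *	delta = now - r_mtime		time since a change was last seen
 *	clamp delta to [acregmin, acregmax], or [acdirmin, acdirmax] for
 *	    directories (the per-mount mi_acregmin/mi_acregmax and
 *	    mi_acdirmin/mi_acdirmax values)
 *	r_attrtime = now + delta	attributes are valid until then
 *
 * with delta forced to zero for "noac" (MI_NOAC) mounts and VNOCACHE
 * vnodes.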
111 */ 112 113 int 114 nfs_waitfor_purge_complete(vnode_t *vp) 115 { 116 rnode_t *rp; 117 k_sigset_t smask; 118 119 rp = VTOR(vp); 120 if (rp->r_serial != NULL && rp->r_serial != curthread) { 121 mutex_enter(&rp->r_statelock); 122 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT); 123 while (rp->r_serial != NULL) { 124 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 125 sigunintr(&smask); 126 mutex_exit(&rp->r_statelock); 127 return (EINTR); 128 } 129 } 130 sigunintr(&smask); 131 mutex_exit(&rp->r_statelock); 132 } 133 return (0); 134 } 135 136 /* 137 * Validate caches by checking cached attributes. If the cached 138 * attributes have timed out, then get new attributes from the server. 139 * As a side effect, this will do cache invalidation if the attributes 140 * have changed. 141 * 142 * If the attributes have not timed out and if there is a cache 143 * invalidation being done by some other thread, then wait until that 144 * thread has completed the cache invalidation. 145 */ 146 int 147 nfs_validate_caches(vnode_t *vp, cred_t *cr) 148 { 149 int error; 150 struct vattr va; 151 152 if (ATTRCACHE_VALID(vp)) { 153 error = nfs_waitfor_purge_complete(vp); 154 if (error) 155 return (error); 156 return (0); 157 } 158 159 va.va_mask = AT_ALL; 160 return (nfs_getattr_otw(vp, &va, cr)); 161 } 162 163 /* 164 * Validate caches by checking cached attributes. If the cached 165 * attributes have timed out, then get new attributes from the server. 166 * As a side effect, this will do cache invalidation if the attributes 167 * have changed. 168 * 169 * If the attributes have not timed out and if there is a cache 170 * invalidation being done by some other thread, then wait until that 171 * thread has completed the cache invalidation. 172 */ 173 int 174 nfs3_validate_caches(vnode_t *vp, cred_t *cr) 175 { 176 int error; 177 struct vattr va; 178 179 if (ATTRCACHE_VALID(vp)) { 180 error = nfs_waitfor_purge_complete(vp); 181 if (error) 182 return (error); 183 return (0); 184 } 185 186 va.va_mask = AT_ALL; 187 return (nfs3_getattr_otw(vp, &va, cr)); 188 } 189 190 /* 191 * Purge all of the various NFS `data' caches. 192 */ 193 void 194 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr) 195 { 196 rnode_t *rp; 197 char *contents; 198 int size; 199 int error; 200 201 /* 202 * Purge the DNLC for any entries which refer to this file. 203 * Avoid recursive entry into dnlc_purge_vp() in case of a directory. 204 */ 205 rp = VTOR(vp); 206 mutex_enter(&rp->r_statelock); 207 if (vp->v_count > 1 && 208 (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) && 209 !(rp->r_flags & RINDNLCPURGE)) { 210 /* 211 * Set the RINDNLCPURGE flag to prevent recursive entry 212 * into dnlc_purge_vp() 213 */ 214 if (vp->v_type == VDIR) 215 rp->r_flags |= RINDNLCPURGE; 216 mutex_exit(&rp->r_statelock); 217 dnlc_purge_vp(vp); 218 mutex_enter(&rp->r_statelock); 219 if (rp->r_flags & RINDNLCPURGE) 220 rp->r_flags &= ~RINDNLCPURGE; 221 } 222 223 /* 224 * Clear any readdir state bits and purge the readlink response cache. 225 */ 226 contents = rp->r_symlink.contents; 227 size = rp->r_symlink.size; 228 rp->r_symlink.contents = NULL; 229 mutex_exit(&rp->r_statelock); 230 231 if (contents != NULL) { 232 233 kmem_free((void *)contents, size); 234 } 235 236 /* 237 * Flush the page cache.
238 */ 239 if (vn_has_cached_data(vp)) { 240 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 241 if (error && (error == ENOSPC || error == EDQUOT)) { 242 mutex_enter(&rp->r_statelock); 243 if (!rp->r_error) 244 rp->r_error = error; 245 mutex_exit(&rp->r_statelock); 246 } 247 } 248 249 /* 250 * Flush the readdir response cache. 251 */ 252 if (HAVE_RDDIR_CACHE(rp)) 253 nfs_purge_rddir_cache(vp); 254 } 255 256 /* 257 * Purge the readdir cache of all entries 258 */ 259 void 260 nfs_purge_rddir_cache(vnode_t *vp) 261 { 262 rnode_t *rp; 263 rddir_cache *rdc; 264 rddir_cache *nrdc; 265 266 rp = VTOR(vp); 267 mutex_enter(&rp->r_statelock); 268 rp->r_direof = NULL; 269 rp->r_flags &= ~RLOOKUP; 270 rp->r_flags |= RREADDIRPLUS; 271 rdc = avl_first(&rp->r_dir); 272 while (rdc != NULL) { 273 nrdc = AVL_NEXT(&rp->r_dir, rdc); 274 avl_remove(&rp->r_dir, rdc); 275 rddir_cache_rele(rdc); 276 rdc = nrdc; 277 } 278 mutex_exit(&rp->r_statelock); 279 } 280 281 /* 282 * Do a cache check based on the post-operation attributes. 283 * Then make them the new cached attributes. If no attributes 284 * were returned, then mark the attributes as timed out. 285 */ 286 void 287 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr) 288 { 289 vattr_t attr; 290 291 if (!poap->attributes) { 292 PURGE_ATTRCACHE(vp); 293 return; 294 } 295 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr); 296 } 297 298 /* 299 * Same as above, but using a vattr 300 */ 301 void 302 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t, 303 cred_t *cr) 304 { 305 if (!poap->attributes) { 306 PURGE_ATTRCACHE(vp); 307 return; 308 } 309 nfs_attr_cache(vp, poap->fres.vap, t, cr); 310 } 311 312 /* 313 * Do a cache check based on the weak cache consistency attributes. 314 * These consist of a small set of pre-operation attributes and the 315 * full set of post-operation attributes. 316 * 317 * If we are given the pre-operation attributes, then use them to 318 * check the validity of the various caches. Then, if we got the 319 * post-operation attributes, make them the new cached attributes. 320 * If we didn't get the post-operation attributes, then mark the 321 * attribute cache as timed out so that the next reference will 322 * cause a GETATTR to the server to refresh with the current 323 * attributes. 324 * 325 * Otherwise, if we didn't get the pre-operation attributes, but 326 * we did get the post-operation attributes, then use these 327 * attributes to check the validity of the various caches. This 328 * will probably cause a flush of the caches because if the 329 * operation succeeded, the attributes of the object were changed 330 * in some way from the old post-operation attributes. This 331 * should be okay because it is the safe thing to do. After 332 * checking the data caches, then we make these the new cached 333 * attributes. 334 * 335 * Otherwise, we didn't get either the pre- or post-operation 336 * attributes. Simply mark the attribute cache as timed out so 337 * the next reference will cause a GETATTR to the server to 338 * refresh with the current attributes. 339 * 340 * If an error occurred trying to convert the over the wire 341 * attributes to a vattr, then simply mark the attribute cache as 342 * timed out. 
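 *
 * In outline (a summary of the cases described above, not additional
 * behavior):
 *
 *	pre-op and post-op attrs	validate the caches against the
 *					pre-op attributes, then cache the
 *					post-op attributes
 *	post-op attrs only		validate the caches against the
 *					post-op attributes (which will
 *					probably flush them), then cache them
 *	no post-op attrs		mark the attribute cache as timed out
 *	conversion error		mark the attribute cache as timed out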
343 */ 344 void 345 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr) 346 { 347 vattr_t bva; 348 vattr_t ava; 349 350 if (wccp->after.attributes) { 351 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) { 352 PURGE_ATTRCACHE(vp); 353 return; 354 } 355 if (wccp->before.attributes) { 356 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds; 357 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds; 358 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds; 359 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds; 360 bva.va_size = wccp->before.attr.size; 361 nfs3_attr_cache(vp, &bva, &ava, t, cr); 362 } else 363 nfs_attr_cache(vp, &ava, t, cr); 364 } else { 365 PURGE_ATTRCACHE(vp); 366 } 367 } 368 369 /* 370 * Set attributes cache for given vnode using nfsattr. 371 * 372 * This routine does not do cache validation with the attributes. 373 * 374 * If an error occurred trying to convert the over the wire 375 * attributes to a vattr, then simply mark the attribute cache as 376 * timed out. 377 */ 378 void 379 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t) 380 { 381 rnode_t *rp; 382 struct vattr va; 383 384 if (!nattr_to_vattr(vp, na, &va)) { 385 rp = VTOR(vp); 386 mutex_enter(&rp->r_statelock); 387 if (rp->r_mtime <= t) 388 nfs_attrcache_va(vp, &va); 389 mutex_exit(&rp->r_statelock); 390 } else { 391 PURGE_ATTRCACHE(vp); 392 } 393 } 394 395 /* 396 * Set attributes cache for given vnode using fattr3. 397 * 398 * This routine does not do cache validation with the attributes. 399 * 400 * If an error occurred trying to convert the over the wire 401 * attributes to a vattr, then simply mark the attribute cache as 402 * timed out. 403 */ 404 void 405 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t) 406 { 407 rnode_t *rp; 408 struct vattr va; 409 410 if (!fattr3_to_vattr(vp, na, &va)) { 411 rp = VTOR(vp); 412 mutex_enter(&rp->r_statelock); 413 if (rp->r_mtime <= t) 414 nfs_attrcache_va(vp, &va); 415 mutex_exit(&rp->r_statelock); 416 } else { 417 PURGE_ATTRCACHE(vp); 418 } 419 } 420 421 /* 422 * Do a cache check based on attributes returned over the wire. The 423 * new attributes are cached. 424 * 425 * If an error occurred trying to convert the over the wire attributes 426 * to a vattr, then just return that error. 427 * 428 * As a side effect, the vattr argument is filled in with the converted 429 * attributes. 430 */ 431 int 432 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t, 433 cred_t *cr) 434 { 435 int error; 436 437 error = nattr_to_vattr(vp, na, vap); 438 if (error) 439 return (error); 440 nfs_attr_cache(vp, vap, t, cr); 441 return (0); 442 } 443 444 /* 445 * Do a cache check based on attributes returned over the wire. The 446 * new attributes are cached. 447 * 448 * If an error occurred trying to convert the over the wire attributes 449 * to a vattr, then just return that error. 450 * 451 * As a side effect, the vattr argument is filled in with the converted 452 * attributes. 453 */ 454 int 455 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr) 456 { 457 int error; 458 459 error = fattr3_to_vattr(vp, na, vap); 460 if (error) 461 return (error); 462 nfs_attr_cache(vp, vap, t, cr); 463 return (0); 464 } 465 466 /* 467 * Use the passed in virtual attributes to check to see whether the 468 * data and metadata caches are valid, cache the new attributes, and 469 * then do the cache invalidation if required.
470 * 471 * The cache validation and caching of the new attributes is done 472 * atomically via the use of the mutex, r_statelock. If required, 473 * the cache invalidation is done atomically w.r.t. the cache 474 * validation and caching of the attributes via the pseudo lock, 475 * r_serial. 476 * 477 * This routine is used to do cache validation and attributes caching 478 * for operations with a single set of post operation attributes. 479 */ 480 void 481 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr) 482 { 483 rnode_t *rp; 484 int mtime_changed = 0; 485 int ctime_changed = 0; 486 vsecattr_t *vsp; 487 int was_serial; 488 len_t preattr_rsize; 489 boolean_t writeattr_set = B_FALSE; 490 boolean_t cachepurge_set = B_FALSE; 491 492 rp = VTOR(vp); 493 494 mutex_enter(&rp->r_statelock); 495 496 if (rp->r_serial != curthread) { 497 klwp_t *lwp = ttolwp(curthread); 498 499 was_serial = 0; 500 if (lwp != NULL) 501 lwp->lwp_nostop++; 502 while (rp->r_serial != NULL) { 503 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 504 mutex_exit(&rp->r_statelock); 505 if (lwp != NULL) 506 lwp->lwp_nostop--; 507 return; 508 } 509 } 510 if (lwp != NULL) 511 lwp->lwp_nostop--; 512 } else 513 was_serial = 1; 514 515 if (rp->r_mtime > t) { 516 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 517 PURGE_ATTRCACHE_LOCKED(rp); 518 mutex_exit(&rp->r_statelock); 519 return; 520 } 521 522 /* 523 * Write thread after writing data to file on remote server, 524 * will always set RWRITEATTR to indicate that file on remote 525 * server was modified with a WRITE operation and would have 526 * marked attribute cache as timed out. If RWRITEATTR 527 * is set, then do not check for mtime and ctime change. 528 */ 529 if (!(rp->r_flags & RWRITEATTR)) { 530 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 531 mtime_changed = 1; 532 533 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec || 534 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec) 535 ctime_changed = 1; 536 } else { 537 writeattr_set = B_TRUE; 538 } 539 540 preattr_rsize = rp->r_size; 541 542 nfs_attrcache_va(vp, vap); 543 544 /* 545 * If we have updated filesize in nfs_attrcache_va, as soon as we 546 * drop statelock we will be in transition of purging all 547 * our caches and updating them. It is possible for another 548 * thread to pick this new file size and read in zeroed data. 549 * stall other threads till cache purge is complete. 550 */ 551 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) { 552 /* 553 * If RWRITEATTR was set and we have updated the file 554 * size, Server's returned file size need not necessarily 555 * be because of this Client's WRITE. We need to purge 556 * all caches. 
557 */ 558 if (writeattr_set) 559 mtime_changed = 1; 560 561 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) { 562 rp->r_flags |= RINCACHEPURGE; 563 cachepurge_set = B_TRUE; 564 } 565 } 566 567 if (!mtime_changed && !ctime_changed) { 568 mutex_exit(&rp->r_statelock); 569 return; 570 } 571 572 rp->r_serial = curthread; 573 574 mutex_exit(&rp->r_statelock); 575 576 if (mtime_changed) 577 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 578 579 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) { 580 mutex_enter(&rp->r_statelock); 581 rp->r_flags &= ~RINCACHEPURGE; 582 cv_broadcast(&rp->r_cv); 583 mutex_exit(&rp->r_statelock); 584 cachepurge_set = B_FALSE; 585 } 586 587 if (ctime_changed) { 588 (void) nfs_access_purge_rp(rp); 589 if (rp->r_secattr != NULL) { 590 mutex_enter(&rp->r_statelock); 591 vsp = rp->r_secattr; 592 rp->r_secattr = NULL; 593 mutex_exit(&rp->r_statelock); 594 if (vsp != NULL) 595 nfs_acl_free(vsp); 596 } 597 } 598 599 if (!was_serial) { 600 mutex_enter(&rp->r_statelock); 601 rp->r_serial = NULL; 602 cv_broadcast(&rp->r_cv); 603 mutex_exit(&rp->r_statelock); 604 } 605 } 606 607 /* 608 * Use the passed in "before" virtual attributes to check to see 609 * whether the data and metadata caches are valid, cache the "after" 610 * new attributes, and then do the cache invalidation if required. 611 * 612 * The cache validation and caching of the new attributes is done 613 * atomically via the use of the mutex, r_statelock. If required, 614 * the cache invalidation is done atomically w.r.t. the cache 615 * validation and caching of the attributes via the pseudo lock, 616 * r_serial. 617 * 618 * This routine is used to do cache validation and attributes caching 619 * for operations with both pre operation attributes and post operation 620 * attributes. 621 */ 622 static void 623 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t, 624 cred_t *cr) 625 { 626 rnode_t *rp; 627 int mtime_changed = 0; 628 int ctime_changed = 0; 629 vsecattr_t *vsp; 630 int was_serial; 631 len_t preattr_rsize; 632 boolean_t writeattr_set = B_FALSE; 633 boolean_t cachepurge_set = B_FALSE; 634 635 rp = VTOR(vp); 636 637 mutex_enter(&rp->r_statelock); 638 639 if (rp->r_serial != curthread) { 640 klwp_t *lwp = ttolwp(curthread); 641 642 was_serial = 0; 643 if (lwp != NULL) 644 lwp->lwp_nostop++; 645 while (rp->r_serial != NULL) { 646 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 647 mutex_exit(&rp->r_statelock); 648 if (lwp != NULL) 649 lwp->lwp_nostop--; 650 return; 651 } 652 } 653 if (lwp != NULL) 654 lwp->lwp_nostop--; 655 } else 656 was_serial = 1; 657 658 if (rp->r_mtime > t) { 659 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size)) 660 PURGE_ATTRCACHE_LOCKED(rp); 661 mutex_exit(&rp->r_statelock); 662 return; 663 } 664 665 /* 666 * Write thread after writing data to file on remote server, 667 * will always set RWRITEATTR to indicate that file on remote 668 * server was modified with a WRITE operation and would have 669 * marked attribute cache as timed out. If RWRITEATTR 670 * is set, then do not check for mtime and ctime change. 
671 */ 672 if (!(rp->r_flags & RWRITEATTR)) { 673 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size)) 674 mtime_changed = 1; 675 676 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec || 677 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec) 678 ctime_changed = 1; 679 } else { 680 writeattr_set = B_TRUE; 681 } 682 683 preattr_rsize = rp->r_size; 684 685 nfs_attrcache_va(vp, avap); 686 687 /* 688 * If we have updated filesize in nfs_attrcache_va, as soon as we 689 * drop statelock we will be in transition of purging all 690 * our caches and updating them. It is possible for another 691 * thread to pick this new file size and read in zeroed data. 692 * stall other threads till cache purge is complete. 693 */ 694 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) { 695 /* 696 * If RWRITEATTR was set and we have updated the file 697 * size, Server's returned file size need not necessarily 698 * be because of this Client's WRITE. We need to purge 699 * all caches. 700 */ 701 if (writeattr_set) 702 mtime_changed = 1; 703 704 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) { 705 rp->r_flags |= RINCACHEPURGE; 706 cachepurge_set = B_TRUE; 707 } 708 } 709 710 if (!mtime_changed && !ctime_changed) { 711 mutex_exit(&rp->r_statelock); 712 return; 713 } 714 715 rp->r_serial = curthread; 716 717 mutex_exit(&rp->r_statelock); 718 719 if (mtime_changed) 720 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 721 722 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) { 723 mutex_enter(&rp->r_statelock); 724 rp->r_flags &= ~RINCACHEPURGE; 725 cv_broadcast(&rp->r_cv); 726 mutex_exit(&rp->r_statelock); 727 cachepurge_set = B_FALSE; 728 } 729 730 if (ctime_changed) { 731 (void) nfs_access_purge_rp(rp); 732 if (rp->r_secattr != NULL) { 733 mutex_enter(&rp->r_statelock); 734 vsp = rp->r_secattr; 735 rp->r_secattr = NULL; 736 mutex_exit(&rp->r_statelock); 737 if (vsp != NULL) 738 nfs_acl_free(vsp); 739 } 740 } 741 742 if (!was_serial) { 743 mutex_enter(&rp->r_statelock); 744 rp->r_serial = NULL; 745 cv_broadcast(&rp->r_cv); 746 mutex_exit(&rp->r_statelock); 747 } 748 } 749 750 /* 751 * Set attributes cache for given vnode using virtual attributes. 752 * 753 * Set the timeout value on the attribute cache and fill it 754 * with the passed in attributes. 755 * 756 * The caller must be holding r_statelock. 757 */ 758 void 759 nfs_attrcache_va(vnode_t *vp, struct vattr *va) 760 { 761 rnode_t *rp; 762 mntinfo_t *mi; 763 hrtime_t delta; 764 hrtime_t now; 765 766 rp = VTOR(vp); 767 768 ASSERT(MUTEX_HELD(&rp->r_statelock)); 769 770 now = gethrtime(); 771 772 mi = VTOMI(vp); 773 774 /* 775 * Delta is the number of nanoseconds that we will 776 * cache the attributes of the file. It is based on 777 * the number of nanoseconds since the last time that 778 * we detected a change. The assumption is that files 779 * that changed recently are likely to change again. 780 * There is a minimum and a maximum for regular files 781 * and for directories which is enforced though. 782 * 783 * Using the time since last change was detected 784 * eliminates direct comparison or calculation 785 * using mixed client and server times. NFS does 786 * not make any assumptions regarding the client 787 * and server clocks being synchronized. 
788 */ 789 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec || 790 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec || 791 va->va_size != rp->r_attr.va_size) 792 rp->r_mtime = now; 793 794 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE)) 795 delta = 0; 796 else { 797 delta = now - rp->r_mtime; 798 if (vp->v_type == VDIR) { 799 if (delta < mi->mi_acdirmin) 800 delta = mi->mi_acdirmin; 801 else if (delta > mi->mi_acdirmax) 802 delta = mi->mi_acdirmax; 803 } else { 804 if (delta < mi->mi_acregmin) 805 delta = mi->mi_acregmin; 806 else if (delta > mi->mi_acregmax) 807 delta = mi->mi_acregmax; 808 } 809 } 810 rp->r_attrtime = now + delta; 811 rp->r_attr = *va; 812 /* 813 * Update the size of the file if there is no cached data or if 814 * the cached data is clean and there is no data being written 815 * out. 816 */ 817 if (rp->r_size != va->va_size && 818 (!vn_has_cached_data(vp) || 819 (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) 820 rp->r_size = va->va_size; 821 nfs_setswaplike(vp, va); 822 rp->r_flags &= ~RWRITEATTR; 823 } 824 825 /* 826 * Fill in attribute from the cache. 827 * If valid, then return 0 to indicate that no error occurred, 828 * otherwise return 1 to indicate that an error occurred. 829 */ 830 static int 831 nfs_getattr_cache(vnode_t *vp, struct vattr *vap) 832 { 833 rnode_t *rp; 834 uint_t mask = vap->va_mask; 835 836 rp = VTOR(vp); 837 mutex_enter(&rp->r_statelock); 838 if (ATTRCACHE_VALID(vp)) { 839 /* 840 * Cached attributes are valid 841 */ 842 *vap = rp->r_attr; 843 /* 844 * Set the caller's va_mask to the set of attributes 845 * that were requested ANDed with the attributes that 846 * are available. If attributes were requested that 847 * are not available, those bits must be turned off 848 * in the caller's va_mask. 849 */ 850 vap->va_mask &= mask; 851 mutex_exit(&rp->r_statelock); 852 return (0); 853 } 854 mutex_exit(&rp->r_statelock); 855 return (1); 856 } 857 858 /* 859 * Get attributes over-the-wire and update attributes cache 860 * if no error occurred in the over-the-wire operation. 861 * Return 0 if successful, otherwise error. 862 */ 863 int 864 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 865 { 866 int error; 867 struct nfsattrstat ns; 868 int douprintf; 869 mntinfo_t *mi; 870 failinfo_t fi; 871 hrtime_t t; 872 873 mi = VTOMI(vp); 874 fi.vp = vp; 875 fi.fhp = NULL; /* no need to update, filehandle not copied */ 876 fi.copyproc = nfscopyfh; 877 fi.lookupproc = nfslookup; 878 fi.xattrdirproc = acl_getxattrdir2; 879 880 if (mi->mi_flags & MI_ACL) { 881 error = acl_getattr2_otw(vp, vap, cr); 882 if (mi->mi_flags & MI_ACL) 883 return (error); 884 } 885 886 douprintf = 1; 887 888 t = gethrtime(); 889 890 error = rfs2call(mi, RFS_GETATTR, 891 xdr_fhandle, (caddr_t)VTOFH(vp), 892 xdr_attrstat, (caddr_t)&ns, cr, 893 &douprintf, &ns.ns_status, 0, &fi); 894 895 if (!error) { 896 error = geterrno(ns.ns_status); 897 if (!error) 898 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr); 899 else { 900 PURGE_STALE_FH(error, vp, cr); 901 } 902 } 903 904 return (error); 905 } 906 907 /* 908 * Return either cached or remote attributes. If get remote attr 909 * use them to check and invalidate caches, then cache the new attributes. 910 */ 911 int 912 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 913 { 914 int error; 915 rnode_t *rp; 916 917 /* 918 * If we've got cached attributes, we're done, otherwise go 919 * to the server to get attributes, which will update the cache 920 * in the process.
921 */ 922 error = nfs_getattr_cache(vp, vap); 923 if (error) 924 error = nfs_getattr_otw(vp, vap, cr); 925 926 /* Return the client's view of file size */ 927 rp = VTOR(vp); 928 mutex_enter(&rp->r_statelock); 929 vap->va_size = rp->r_size; 930 mutex_exit(&rp->r_statelock); 931 932 return (error); 933 } 934 935 /* 936 * Get attributes over-the-wire and update attributes cache 937 * if no error occurred in the over-the-wire operation. 938 * Return 0 if successful, otherwise error. 939 */ 940 int 941 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 942 { 943 int error; 944 GETATTR3args args; 945 GETATTR3vres res; 946 int douprintf; 947 failinfo_t fi; 948 hrtime_t t; 949 950 args.object = *VTOFH3(vp); 951 fi.vp = vp; 952 fi.fhp = (caddr_t)&args.object; 953 fi.copyproc = nfs3copyfh; 954 fi.lookupproc = nfs3lookup; 955 fi.xattrdirproc = acl_getxattrdir3; 956 res.fres.vp = vp; 957 res.fres.vap = vap; 958 959 douprintf = 1; 960 961 t = gethrtime(); 962 963 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR, 964 xdr_nfs_fh3, (caddr_t)&args, 965 xdr_GETATTR3vres, (caddr_t)&res, cr, 966 &douprintf, &res.status, 0, &fi); 967 968 if (error) 969 return (error); 970 971 error = geterrno3(res.status); 972 if (error) { 973 PURGE_STALE_FH(error, vp, cr); 974 return (error); 975 } 976 977 /* 978 * Catch status codes that indicate fattr3 to vattr translation failure 979 */ 980 if (res.fres.status) 981 return (res.fres.status); 982 983 nfs_attr_cache(vp, vap, t, cr); 984 return (0); 985 } 986 987 /* 988 * Return either cached or remote attributes. If get remote attr 989 * use them to check and invalidate caches, then cache the new attributes. 990 */ 991 int 992 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 993 { 994 int error; 995 rnode_t *rp; 996 997 /* 998 * If we've got cached attributes, we're done, otherwise go 999 * to the server to get attributes, which will update the cache 1000 * in the process. 1001 */ 1002 error = nfs_getattr_cache(vp, vap); 1003 if (error) 1004 error = nfs3_getattr_otw(vp, vap, cr); 1005 1006 /* Return the client's view of file size */ 1007 rp = VTOR(vp); 1008 mutex_enter(&rp->r_statelock); 1009 vap->va_size = rp->r_size; 1010 mutex_exit(&rp->r_statelock); 1011 1012 return (error); 1013 } 1014 1015 vtype_t nf_to_vt[] = { 1016 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK 1017 }; 1018 /* 1019 * Convert NFS Version 2 over the network attributes to the local 1020 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY 1021 * network representation and the local representation is done here. 1022 * Returns 0 for success, error if failed due to overflow. 1023 */ 1024 int 1025 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap) 1026 { 1027 /* overflow in time attributes? */ 1028 #ifndef _LP64 1029 if (!NFS2_FATTR_TIME_OK(na)) 1030 return (EOVERFLOW); 1031 #endif 1032 1033 vap->va_mask = AT_ALL; 1034 1035 if (na->na_type < NFNON || na->na_type > NFSOC) 1036 vap->va_type = VBAD; 1037 else 1038 vap->va_type = nf_to_vt[na->na_type]; 1039 vap->va_mode = na->na_mode; 1040 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid; 1041 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid; 1042 vap->va_fsid = vp->v_vfsp->vfs_dev; 1043 vap->va_nodeid = na->na_nodeid; 1044 vap->va_nlink = na->na_nlink; 1045 vap->va_size = na->na_size; /* keep for cache validation */ 1046 /* 1047 * nfs protocol defines times as unsigned so don't extend sign, 1048 * unless sysadmin set nfs_allow_preepoch_time. 
1049 */ 1050 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec); 1051 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000); 1052 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec); 1053 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000); 1054 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec); 1055 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000); 1056 /* 1057 * Shannon's law - uncompress the received dev_t 1058 * if the top half of it is zero indicating a response 1059 * from an `older style' OS. Except for when it is a 1060 * `new style' OS sending the maj device of zero, 1061 * in which case the algorithm still works because the 1062 * fact that it is a new style server 1063 * is hidden by the minor device not being greater 1064 * than 255 (a requirement in this case). 1065 */ 1066 if ((na->na_rdev & 0xffff0000) == 0) 1067 vap->va_rdev = nfsv2_expdev(na->na_rdev); 1068 else 1069 vap->va_rdev = expldev(na->na_rdev); 1070 1071 vap->va_nblocks = na->na_blocks; 1072 switch (na->na_type) { 1073 case NFBLK: 1074 vap->va_blksize = DEV_BSIZE; 1075 break; 1076 1077 case NFCHR: 1078 vap->va_blksize = MAXBSIZE; 1079 break; 1080 1081 case NFSOC: 1082 default: 1083 vap->va_blksize = na->na_blocksize; 1084 break; 1085 } 1086 /* 1087 * This bit of ugliness is a hack to preserve the 1088 * over-the-wire protocols for named-pipe vnodes. 1089 * It remaps the special over-the-wire type to the 1090 * VFIFO type. (see note in nfs.h) 1091 */ 1092 if (NA_ISFIFO(na)) { 1093 vap->va_type = VFIFO; 1094 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO; 1095 vap->va_rdev = 0; 1096 vap->va_blksize = na->na_blocksize; 1097 } 1098 vap->va_seq = 0; 1099 return (0); 1100 } 1101 1102 /* 1103 * Convert NFS Version 3 over the network attributes to the local 1104 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY 1105 * network representation and the local representation is done here. 1106 */ 1107 vtype_t nf3_to_vt[] = { 1108 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO 1109 }; 1110 1111 int 1112 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap) 1113 { 1114 1115 #ifndef _LP64 1116 /* overflow in time attributes? */ 1117 if (!NFS3_FATTR_TIME_OK(na)) 1118 return (EOVERFLOW); 1119 #endif 1120 if (!NFS3_SIZE_OK(na->size)) 1121 /* file too big */ 1122 return (EFBIG); 1123 1124 vap->va_mask = AT_ALL; 1125 1126 if (na->type < NF3REG || na->type > NF3FIFO) 1127 vap->va_type = VBAD; 1128 else 1129 vap->va_type = nf3_to_vt[na->type]; 1130 vap->va_mode = na->mode; 1131 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid; 1132 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid; 1133 vap->va_fsid = vp->v_vfsp->vfs_dev; 1134 vap->va_nodeid = na->fileid; 1135 vap->va_nlink = na->nlink; 1136 vap->va_size = na->size; 1137 1138 /* 1139 * nfs protocol defines times as unsigned so don't extend sign, 1140 * unless sysadmin set nfs_allow_preepoch_time.
1141 */ 1142 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds); 1143 vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds; 1144 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds); 1145 vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds; 1146 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds); 1147 vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds; 1148 1149 switch (na->type) { 1150 case NF3BLK: 1151 vap->va_rdev = makedevice(na->rdev.specdata1, 1152 na->rdev.specdata2); 1153 vap->va_blksize = DEV_BSIZE; 1154 vap->va_nblocks = 0; 1155 break; 1156 case NF3CHR: 1157 vap->va_rdev = makedevice(na->rdev.specdata1, 1158 na->rdev.specdata2); 1159 vap->va_blksize = MAXBSIZE; 1160 vap->va_nblocks = 0; 1161 break; 1162 case NF3REG: 1163 case NF3DIR: 1164 case NF3LNK: 1165 vap->va_rdev = 0; 1166 vap->va_blksize = MAXBSIZE; 1167 vap->va_nblocks = (u_longlong_t) 1168 ((na->used + (size3)DEV_BSIZE - (size3)1) / 1169 (size3)DEV_BSIZE); 1170 break; 1171 case NF3SOCK: 1172 case NF3FIFO: 1173 default: 1174 vap->va_rdev = 0; 1175 vap->va_blksize = MAXBSIZE; 1176 vap->va_nblocks = 0; 1177 break; 1178 } 1179 vap->va_seq = 0; 1180 return (0); 1181 } 1182 1183 /* 1184 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1185 * for the demand-based allocation of async threads per-mount. The 1186 * nfs_async_timeout is the amount of time a thread will live after it 1187 * becomes idle, unless new I/O requests are received before the thread 1188 * dies. See nfs_async_putpage and nfs_async_start. 1189 */ 1190 1191 int nfs_async_timeout = -1; /* uninitialized */ 1192 1193 static void nfs_async_start(struct vfs *); 1194 static void nfs_async_pgops_start(struct vfs *); 1195 static void nfs_async_common_start(struct vfs *, int); 1196 1197 static void 1198 free_async_args(struct nfs_async_reqs *args) 1199 { 1200 rnode_t *rp; 1201 1202 if (args->a_io != NFS_INACTIVE) { 1203 rp = VTOR(args->a_vp); 1204 mutex_enter(&rp->r_statelock); 1205 rp->r_count--; 1206 if (args->a_io == NFS_PUTAPAGE || 1207 args->a_io == NFS_PAGEIO) 1208 rp->r_awcount--; 1209 cv_broadcast(&rp->r_cv); 1210 mutex_exit(&rp->r_statelock); 1211 VN_RELE(args->a_vp); 1212 } 1213 crfree(args->a_cred); 1214 kmem_free(args, sizeof (*args)); 1215 } 1216 1217 /* 1218 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and 1219 * pageout(), running in the global zone, have legitimate reasons to do 1220 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by 1221 * use of a per-mount "asynchronous requests manager thread" which is 1222 * signaled by the various asynchronous work routines when there is 1223 * asynchronous work to be done. It is responsible for creating new 1224 * worker threads if necessary, and notifying existing worker threads 1225 * that there is work to be done. 1226 * 1227 * In other words, it will "take the specifications from the customers and 1228 * give them to the engineers." 1229 * 1230 * Worker threads die off of their own accord if they are no longer 1231 * needed. 1232 * 1233 * This thread is killed when the zone is going away or the filesystem 1234 * is being unmounted.
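 *
 * In outline (a summary of the code below, not additional behavior):
 *
 *	- the nfs_async_*() request routines (readahead, putpage, pageio,
 *	  readdir, commit, inactive) queue an nfs_async_reqs entry on
 *	  mi_async_reqs[] and signal mi_async_reqs_cv;
 *	- this manager thread creates worker threads as needed (up to
 *	  mi_max_threads, plus a small fixed pool for the page-op queue)
 *	  and wakes them via mi_async_work_cv;
 *	- worker threads drain the queues round-robin and exit on their
 *	  own after nfs_async_timeout of idle time.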
1235 */ 1236 void 1237 nfs_async_manager(vfs_t *vfsp) 1238 { 1239 callb_cpr_t cprinfo; 1240 mntinfo_t *mi; 1241 uint_t max_threads; 1242 1243 mi = VFTOMI(vfsp); 1244 1245 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1246 "nfs_async_manager"); 1247 1248 mutex_enter(&mi->mi_async_lock); 1249 /* 1250 * We want to stash the max number of threads that this mount was 1251 * allowed so we can use it later when the variable is set to zero as 1252 * part of the zone/mount going away. 1253 * 1254 * We want to be able to create at least one thread to handle 1255 * asynchronous inactive calls. 1256 */ 1257 max_threads = MAX(mi->mi_max_threads, 1); 1258 /* 1259 * We don't want to wait for mi_max_threads to go to zero, since that 1260 * happens as part of a failed unmount, but this thread should only 1261 * exit when the mount/zone is really going away. 1262 * 1263 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be 1264 * attempted: the various _async_*() functions know to do things 1265 * inline if mi_max_threads == 0. Henceforth we just drain out the 1266 * outstanding requests. 1267 * 1268 * Note that we still create zthreads even if we notice the zone is 1269 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone 1270 * shutdown sequence to take slightly longer in some cases, but 1271 * doesn't violate the protocol, as all threads will exit as soon as 1272 * they're done processing the remaining requests. 1273 */ 1274 for (;;) { 1275 while (mi->mi_async_req_count > 0) { 1276 /* 1277 * Paranoia: If the mount started out having 1278 * (mi->mi_max_threads == 0), and the value was 1279 * later changed (via a debugger or somesuch), 1280 * we could be confused since we will think we 1281 * can't create any threads, and the calling 1282 * code (which looks at the current value of 1283 * mi->mi_max_threads, now non-zero) thinks we 1284 * can. 1285 * 1286 * So, because we're paranoid, we create threads 1287 * up to the maximum of the original and the 1288 * current value. This means that future 1289 * (debugger-induced) lowerings of 1290 * mi->mi_max_threads are ignored for our 1291 * purposes, but who told them they could change 1292 * random values on a live kernel anyhow? 1293 */ 1294 if (mi->mi_threads[NFS_ASYNC_QUEUE] < 1295 MAX(mi->mi_max_threads, max_threads)) { 1296 mi->mi_threads[NFS_ASYNC_QUEUE]++; 1297 mutex_exit(&mi->mi_async_lock); 1298 VFS_HOLD(vfsp); /* hold for new thread */ 1299 (void) zthread_create(NULL, 0, nfs_async_start, 1300 vfsp, 0, minclsyspri); 1301 mutex_enter(&mi->mi_async_lock); 1302 } else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] < 1303 NUM_ASYNC_PGOPS_THREADS) { 1304 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++; 1305 mutex_exit(&mi->mi_async_lock); 1306 VFS_HOLD(vfsp); /* hold for new thread */ 1307 (void) zthread_create(NULL, 0, 1308 nfs_async_pgops_start, vfsp, 0, 1309 minclsyspri); 1310 mutex_enter(&mi->mi_async_lock); 1311 } 1312 NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv); 1313 ASSERT(mi->mi_async_req_count != 0); 1314 mi->mi_async_req_count--; 1315 } 1316 1317 mutex_enter(&mi->mi_lock); 1318 if (mi->mi_flags & MI_ASYNC_MGR_STOP) { 1319 mutex_exit(&mi->mi_lock); 1320 break; 1321 } 1322 mutex_exit(&mi->mi_lock); 1323 1324 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1325 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock); 1326 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1327 } 1328 /* 1329 * Let everyone know we're done. 
1330 */ 1331 mi->mi_manager_thread = NULL; 1332 cv_broadcast(&mi->mi_async_cv); 1333 1334 /* 1335 * There is no explicit call to mutex_exit(&mi->mi_async_lock) 1336 * since CALLB_CPR_EXIT is actually responsible for releasing 1337 * 'mi_async_lock'. 1338 */ 1339 CALLB_CPR_EXIT(&cprinfo); 1340 VFS_RELE(vfsp); /* release thread's hold */ 1341 zthread_exit(); 1342 } 1343 1344 /* 1345 * Signal (and wait for) the async manager thread to clean up and go away. 1346 */ 1347 void 1348 nfs_async_manager_stop(vfs_t *vfsp) 1349 { 1350 mntinfo_t *mi = VFTOMI(vfsp); 1351 1352 mutex_enter(&mi->mi_async_lock); 1353 mutex_enter(&mi->mi_lock); 1354 mi->mi_flags |= MI_ASYNC_MGR_STOP; 1355 mutex_exit(&mi->mi_lock); 1356 cv_broadcast(&mi->mi_async_reqs_cv); 1357 while (mi->mi_manager_thread != NULL) 1358 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1359 mutex_exit(&mi->mi_async_lock); 1360 } 1361 1362 int 1363 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1364 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1365 u_offset_t, caddr_t, struct seg *, cred_t *)) 1366 { 1367 rnode_t *rp; 1368 mntinfo_t *mi; 1369 struct nfs_async_reqs *args; 1370 1371 rp = VTOR(vp); 1372 ASSERT(rp->r_freef == NULL); 1373 1374 mi = VTOMI(vp); 1375 1376 /* 1377 * If addr falls in a different segment, don't bother doing readahead. 1378 */ 1379 if (addr >= seg->s_base + seg->s_size) 1380 return (-1); 1381 1382 /* 1383 * If we can't allocate a request structure, punt on the readahead. 1384 */ 1385 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1386 return (-1); 1387 1388 /* 1389 * If a lock operation is pending, don't initiate any new 1390 * readaheads. Otherwise, bump r_count to indicate the new 1391 * asynchronous I/O. 1392 */ 1393 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1394 kmem_free(args, sizeof (*args)); 1395 return (-1); 1396 } 1397 mutex_enter(&rp->r_statelock); 1398 rp->r_count++; 1399 mutex_exit(&rp->r_statelock); 1400 nfs_rw_exit(&rp->r_lkserlock); 1401 1402 args->a_next = NULL; 1403 #ifdef DEBUG 1404 args->a_queuer = curthread; 1405 #endif 1406 VN_HOLD(vp); 1407 args->a_vp = vp; 1408 ASSERT(cr != NULL); 1409 crhold(cr); 1410 args->a_cred = cr; 1411 args->a_io = NFS_READ_AHEAD; 1412 args->a_nfs_readahead = readahead; 1413 args->a_nfs_blkoff = blkoff; 1414 args->a_nfs_seg = seg; 1415 args->a_nfs_addr = addr; 1416 1417 mutex_enter(&mi->mi_async_lock); 1418 1419 /* 1420 * If asyncio has been disabled, don't bother readahead. 1421 */ 1422 if (mi->mi_max_threads == 0) { 1423 mutex_exit(&mi->mi_async_lock); 1424 goto noasync; 1425 } 1426 1427 /* 1428 * Link request structure into the async list and 1429 * wakeup async thread to do the i/o. 
1430 */ 1431 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) { 1432 mi->mi_async_reqs[NFS_READ_AHEAD] = args; 1433 mi->mi_async_tail[NFS_READ_AHEAD] = args; 1434 } else { 1435 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args; 1436 mi->mi_async_tail[NFS_READ_AHEAD] = args; 1437 } 1438 1439 if (mi->mi_io_kstats) { 1440 mutex_enter(&mi->mi_lock); 1441 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1442 mutex_exit(&mi->mi_lock); 1443 } 1444 1445 mi->mi_async_req_count++; 1446 ASSERT(mi->mi_async_req_count != 0); 1447 cv_signal(&mi->mi_async_reqs_cv); 1448 mutex_exit(&mi->mi_async_lock); 1449 return (0); 1450 1451 noasync: 1452 mutex_enter(&rp->r_statelock); 1453 rp->r_count--; 1454 cv_broadcast(&rp->r_cv); 1455 mutex_exit(&rp->r_statelock); 1456 VN_RELE(vp); 1457 crfree(cr); 1458 kmem_free(args, sizeof (*args)); 1459 return (-1); 1460 } 1461 1462 int 1463 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 1464 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *, 1465 u_offset_t, size_t, int, cred_t *)) 1466 { 1467 rnode_t *rp; 1468 mntinfo_t *mi; 1469 struct nfs_async_reqs *args; 1470 1471 ASSERT(flags & B_ASYNC); 1472 ASSERT(vp->v_vfsp != NULL); 1473 1474 rp = VTOR(vp); 1475 ASSERT(rp->r_count > 0); 1476 1477 mi = VTOMI(vp); 1478 1479 /* 1480 * If we can't allocate a request structure, do the putpage 1481 * operation synchronously in this thread's context. 1482 */ 1483 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1484 goto noasync; 1485 1486 args->a_next = NULL; 1487 #ifdef DEBUG 1488 args->a_queuer = curthread; 1489 #endif 1490 VN_HOLD(vp); 1491 args->a_vp = vp; 1492 ASSERT(cr != NULL); 1493 crhold(cr); 1494 args->a_cred = cr; 1495 args->a_io = NFS_PUTAPAGE; 1496 args->a_nfs_putapage = putapage; 1497 args->a_nfs_pp = pp; 1498 args->a_nfs_off = off; 1499 args->a_nfs_len = (uint_t)len; 1500 args->a_nfs_flags = flags; 1501 1502 mutex_enter(&mi->mi_async_lock); 1503 1504 /* 1505 * If asyncio has been disabled, then make a synchronous request. 1506 * This check is done a second time in case async io was disabled 1507 * while this thread was blocked waiting for memory pressure to 1508 * reduce or for the queue to drain. 1509 */ 1510 if (mi->mi_max_threads == 0) { 1511 mutex_exit(&mi->mi_async_lock); 1512 goto noasync; 1513 } 1514 1515 /* 1516 * Link request structure into the async list and 1517 * wakeup async thread to do the i/o. 1518 */ 1519 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) { 1520 mi->mi_async_reqs[NFS_PUTAPAGE] = args; 1521 mi->mi_async_tail[NFS_PUTAPAGE] = args; 1522 } else { 1523 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args; 1524 mi->mi_async_tail[NFS_PUTAPAGE] = args; 1525 } 1526 1527 mutex_enter(&rp->r_statelock); 1528 rp->r_count++; 1529 rp->r_awcount++; 1530 mutex_exit(&rp->r_statelock); 1531 1532 if (mi->mi_io_kstats) { 1533 mutex_enter(&mi->mi_lock); 1534 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1535 mutex_exit(&mi->mi_lock); 1536 } 1537 1538 mi->mi_async_req_count++; 1539 ASSERT(mi->mi_async_req_count != 0); 1540 cv_signal(&mi->mi_async_reqs_cv); 1541 mutex_exit(&mi->mi_async_lock); 1542 return (0); 1543 1544 noasync: 1545 if (args != NULL) { 1546 VN_RELE(vp); 1547 crfree(cr); 1548 kmem_free(args, sizeof (*args)); 1549 } 1550 1551 if (curproc == proc_pageout || curproc == proc_fsflush) { 1552 /* 1553 * If we get here in the context of the pageout/fsflush, 1554 * we refuse to do a sync write, because this may hang 1555 * pageout (and the machine).
In this case, we just 1556 * re-mark the page as dirty and punt on the page. 1557 * 1558 * Make sure B_FORCE isn't set. We can re-mark the 1559 * pages as dirty and unlock the pages in one swoop by 1560 * passing in B_ERROR to pvn_write_done(). However, 1561 * we should make sure B_FORCE isn't set - we don't 1562 * want the page tossed before it gets written out. 1563 */ 1564 if (flags & B_FORCE) 1565 flags &= ~(B_INVAL | B_FORCE); 1566 pvn_write_done(pp, flags | B_ERROR); 1567 return (0); 1568 } 1569 if (nfs_zone() != mi->mi_zone) { 1570 /* 1571 * So this was a cross-zone sync putpage. We pass in B_ERROR 1572 * to pvn_write_done() to re-mark the pages as dirty and unlock 1573 * them. 1574 * 1575 * We don't want to clear B_FORCE here as the caller presumably 1576 * knows what they're doing if they set it. 1577 */ 1578 pvn_write_done(pp, flags | B_ERROR); 1579 return (EPERM); 1580 } 1581 return ((*putapage)(vp, pp, off, len, flags, cr)); 1582 } 1583 1584 int 1585 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 1586 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t, 1587 size_t, int, cred_t *)) 1588 { 1589 rnode_t *rp; 1590 mntinfo_t *mi; 1591 struct nfs_async_reqs *args; 1592 1593 ASSERT(flags & B_ASYNC); 1594 ASSERT(vp->v_vfsp != NULL); 1595 1596 rp = VTOR(vp); 1597 ASSERT(rp->r_count > 0); 1598 1599 mi = VTOMI(vp); 1600 1601 /* 1602 * If we can't allocate a request structure, do the pageio 1603 * request synchronously in this thread's context. 1604 */ 1605 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1606 goto noasync; 1607 1608 args->a_next = NULL; 1609 #ifdef DEBUG 1610 args->a_queuer = curthread; 1611 #endif 1612 VN_HOLD(vp); 1613 args->a_vp = vp; 1614 ASSERT(cr != NULL); 1615 crhold(cr); 1616 args->a_cred = cr; 1617 args->a_io = NFS_PAGEIO; 1618 args->a_nfs_pageio = pageio; 1619 args->a_nfs_pp = pp; 1620 args->a_nfs_off = io_off; 1621 args->a_nfs_len = (uint_t)io_len; 1622 args->a_nfs_flags = flags; 1623 1624 mutex_enter(&mi->mi_async_lock); 1625 1626 /* 1627 * If asyncio has been disabled, then make a synchronous request. 1628 * This check is done a second time in case async io was disabled 1629 * while this thread was blocked waiting for memory pressure to 1630 * reduce or for the queue to drain. 1631 */ 1632 if (mi->mi_max_threads == 0) { 1633 mutex_exit(&mi->mi_async_lock); 1634 goto noasync; 1635 } 1636 1637 /* 1638 * Link request structure into the async list and 1639 * wakeup async thread to do the i/o.
1640 */ 1641 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) { 1642 mi->mi_async_reqs[NFS_PAGEIO] = args; 1643 mi->mi_async_tail[NFS_PAGEIO] = args; 1644 } else { 1645 mi->mi_async_tail[NFS_PAGEIO]->a_next = args; 1646 mi->mi_async_tail[NFS_PAGEIO] = args; 1647 } 1648 1649 mutex_enter(&rp->r_statelock); 1650 rp->r_count++; 1651 rp->r_awcount++; 1652 mutex_exit(&rp->r_statelock); 1653 1654 if (mi->mi_io_kstats) { 1655 mutex_enter(&mi->mi_lock); 1656 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1657 mutex_exit(&mi->mi_lock); 1658 } 1659 1660 mi->mi_async_req_count++; 1661 ASSERT(mi->mi_async_req_count != 0); 1662 cv_signal(&mi->mi_async_reqs_cv); 1663 mutex_exit(&mi->mi_async_lock); 1664 return (0); 1665 1666 noasync: 1667 if (args != NULL) { 1668 VN_RELE(vp); 1669 crfree(cr); 1670 kmem_free(args, sizeof (*args)); 1671 } 1672 1673 /* 1674 * If we can't do it ASYNC, for reads we do nothing (but cleanup 1675 * the page list), for writes we do it synchronously, except for 1676 * proc_pageout/proc_fsflush as described below. 1677 */ 1678 if (flags & B_READ) { 1679 pvn_read_done(pp, flags | B_ERROR); 1680 return (0); 1681 } 1682 1683 if (curproc == proc_pageout || curproc == proc_fsflush) { 1684 /* 1685 * If we get here in the context of the pageout/fsflush, 1686 * we refuse to do a sync write, because this may hang 1687 * pageout/fsflush (and the machine). In this case, we just 1688 * re-mark the page as dirty and punt on the page. 1689 * 1690 * Make sure B_FORCE isn't set. We can re-mark the 1691 * pages as dirty and unlock the pages in one swoop by 1692 * passing in B_ERROR to pvn_write_done(). However, 1693 * we should make sure B_FORCE isn't set - we don't 1694 * want the page tossed before it gets written out. 1695 */ 1696 if (flags & B_FORCE) 1697 flags &= ~(B_INVAL | B_FORCE); 1698 pvn_write_done(pp, flags | B_ERROR); 1699 return (0); 1700 } 1701 1702 if (nfs_zone() != mi->mi_zone) { 1703 /* 1704 * So this was a cross-zone sync pageio. We pass in B_ERROR 1705 * to pvn_write_done() to re-mark the pages as dirty and unlock 1706 * them. 1707 * 1708 * We don't want to clear B_FORCE here as the caller presumably 1709 * knows what they're doing if they set it. 1710 */ 1711 pvn_write_done(pp, flags | B_ERROR); 1712 return (EPERM); 1713 } 1714 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1715 } 1716 1717 void 1718 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr, 1719 int (*readdir)(vnode_t *, rddir_cache *, cred_t *)) 1720 { 1721 rnode_t *rp; 1722 mntinfo_t *mi; 1723 struct nfs_async_reqs *args; 1724 1725 rp = VTOR(vp); 1726 ASSERT(rp->r_freef == NULL); 1727 1728 mi = VTOMI(vp); 1729 1730 /* 1731 * If we can't allocate a request structure, do the readdir 1732 * operation synchronously in this thread's context. 1733 */ 1734 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1735 goto noasync; 1736 1737 args->a_next = NULL; 1738 #ifdef DEBUG 1739 args->a_queuer = curthread; 1740 #endif 1741 VN_HOLD(vp); 1742 args->a_vp = vp; 1743 ASSERT(cr != NULL); 1744 crhold(cr); 1745 args->a_cred = cr; 1746 args->a_io = NFS_READDIR; 1747 args->a_nfs_readdir = readdir; 1748 args->a_nfs_rdc = rdc; 1749 1750 mutex_enter(&mi->mi_async_lock); 1751 1752 /* 1753 * If asyncio has been disabled, then make a synchronous request. 1754 */ 1755 if (mi->mi_max_threads == 0) { 1756 mutex_exit(&mi->mi_async_lock); 1757 goto noasync; 1758 } 1759 1760 /* 1761 * Link request structure into the async list and 1762 * wakeup async thread to do the i/o. 
1763 */ 1764 if (mi->mi_async_reqs[NFS_READDIR] == NULL) { 1765 mi->mi_async_reqs[NFS_READDIR] = args; 1766 mi->mi_async_tail[NFS_READDIR] = args; 1767 } else { 1768 mi->mi_async_tail[NFS_READDIR]->a_next = args; 1769 mi->mi_async_tail[NFS_READDIR] = args; 1770 } 1771 1772 mutex_enter(&rp->r_statelock); 1773 rp->r_count++; 1774 mutex_exit(&rp->r_statelock); 1775 1776 if (mi->mi_io_kstats) { 1777 mutex_enter(&mi->mi_lock); 1778 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1779 mutex_exit(&mi->mi_lock); 1780 } 1781 1782 mi->mi_async_req_count++; 1783 ASSERT(mi->mi_async_req_count != 0); 1784 cv_signal(&mi->mi_async_reqs_cv); 1785 mutex_exit(&mi->mi_async_lock); 1786 return; 1787 1788 noasync: 1789 if (args != NULL) { 1790 VN_RELE(vp); 1791 crfree(cr); 1792 kmem_free(args, sizeof (*args)); 1793 } 1794 1795 rdc->entries = NULL; 1796 mutex_enter(&rp->r_statelock); 1797 ASSERT(rdc->flags & RDDIR); 1798 rdc->flags &= ~RDDIR; 1799 rdc->flags |= RDDIRREQ; 1800 /* 1801 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT 1802 * is set, wakeup the thread sleeping in cv_wait_sig(). 1803 * The woken up thread will reset the flag to RDDIR and will 1804 * continue with the readdir operation. 1805 */ 1806 if (rdc->flags & RDDIRWAIT) { 1807 rdc->flags &= ~RDDIRWAIT; 1808 cv_broadcast(&rdc->cv); 1809 } 1810 mutex_exit(&rp->r_statelock); 1811 rddir_cache_rele(rdc); 1812 } 1813 1814 void 1815 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 1816 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, cred_t *)) 1817 { 1818 rnode_t *rp; 1819 mntinfo_t *mi; 1820 struct nfs_async_reqs *args; 1821 page_t *pp; 1822 1823 rp = VTOR(vp); 1824 mi = VTOMI(vp); 1825 1826 /* 1827 * If we can't allocate a request structure, do the commit 1828 * operation synchronously in this thread's context. 1829 */ 1830 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1831 goto noasync; 1832 1833 args->a_next = NULL; 1834 #ifdef DEBUG 1835 args->a_queuer = curthread; 1836 #endif 1837 VN_HOLD(vp); 1838 args->a_vp = vp; 1839 ASSERT(cr != NULL); 1840 crhold(cr); 1841 args->a_cred = cr; 1842 args->a_io = NFS_COMMIT; 1843 args->a_nfs_commit = commit; 1844 args->a_nfs_plist = plist; 1845 args->a_nfs_offset = offset; 1846 args->a_nfs_count = count; 1847 1848 mutex_enter(&mi->mi_async_lock); 1849 1850 /* 1851 * If asyncio has been disabled, then make a synchronous request. 1852 * This check is done a second time in case async io was disabled 1853 * while this thread was blocked waiting for memory pressure to 1854 * reduce or for the queue to drain. 1855 */ 1856 if (mi->mi_max_threads == 0) { 1857 mutex_exit(&mi->mi_async_lock); 1858 goto noasync; 1859 } 1860 1861 /* 1862 * Link request structure into the async list and 1863 * wakeup async thread to do the i/o.
1864 */ 1865 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) { 1866 mi->mi_async_reqs[NFS_COMMIT] = args; 1867 mi->mi_async_tail[NFS_COMMIT] = args; 1868 } else { 1869 mi->mi_async_tail[NFS_COMMIT]->a_next = args; 1870 mi->mi_async_tail[NFS_COMMIT] = args; 1871 } 1872 1873 mutex_enter(&rp->r_statelock); 1874 rp->r_count++; 1875 mutex_exit(&rp->r_statelock); 1876 1877 if (mi->mi_io_kstats) { 1878 mutex_enter(&mi->mi_lock); 1879 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1880 mutex_exit(&mi->mi_lock); 1881 } 1882 1883 mi->mi_async_req_count++; 1884 ASSERT(mi->mi_async_req_count != 0); 1885 cv_signal(&mi->mi_async_reqs_cv); 1886 mutex_exit(&mi->mi_async_lock); 1887 return; 1888 1889 noasync: 1890 if (args != NULL) { 1891 VN_RELE(vp); 1892 crfree(cr); 1893 kmem_free(args, sizeof (*args)); 1894 } 1895 1896 if (curproc == proc_pageout || curproc == proc_fsflush || 1897 nfs_zone() != mi->mi_zone) { 1898 while (plist != NULL) { 1899 pp = plist; 1900 page_sub(&plist, pp); 1901 pp->p_fsdata = C_COMMIT; 1902 page_unlock(pp); 1903 } 1904 return; 1905 } 1906 (*commit)(vp, plist, offset, count, cr); 1907 } 1908 1909 void 1910 nfs_async_inactive(vnode_t *vp, cred_t *cr, 1911 void (*inactive)(vnode_t *, cred_t *, caller_context_t *)) 1912 { 1913 mntinfo_t *mi; 1914 struct nfs_async_reqs *args; 1915 1916 mi = VTOMI(vp); 1917 1918 args = kmem_alloc(sizeof (*args), KM_SLEEP); 1919 args->a_next = NULL; 1920 #ifdef DEBUG 1921 args->a_queuer = curthread; 1922 #endif 1923 args->a_vp = vp; 1924 ASSERT(cr != NULL); 1925 crhold(cr); 1926 args->a_cred = cr; 1927 args->a_io = NFS_INACTIVE; 1928 args->a_nfs_inactive = inactive; 1929 1930 /* 1931 * Note that we don't check mi->mi_max_threads here, since we 1932 * *need* to get rid of this vnode regardless of whether someone 1933 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system. 1934 * 1935 * The manager thread knows about this and is willing to create 1936 * at least one thread to accommodate us. 1937 */ 1938 mutex_enter(&mi->mi_async_lock); 1939 if (mi->mi_manager_thread == NULL) { 1940 rnode_t *rp = VTOR(vp); 1941 1942 mutex_exit(&mi->mi_async_lock); 1943 crfree(cr); /* drop our reference */ 1944 kmem_free(args, sizeof (*args)); 1945 /* 1946 * We can't do an over-the-wire call since we're in the wrong 1947 * zone, so we need to clean up state as best we can and then 1948 * throw away the vnode. 1949 */ 1950 mutex_enter(&rp->r_statelock); 1951 if (rp->r_unldvp != NULL) { 1952 vnode_t *unldvp; 1953 char *unlname; 1954 cred_t *unlcred; 1955 1956 unldvp = rp->r_unldvp; 1957 rp->r_unldvp = NULL; 1958 unlname = rp->r_unlname; 1959 rp->r_unlname = NULL; 1960 unlcred = rp->r_unlcred; 1961 rp->r_unlcred = NULL; 1962 mutex_exit(&rp->r_statelock); 1963 1964 VN_RELE(unldvp); 1965 kmem_free(unlname, MAXNAMELEN); 1966 crfree(unlcred); 1967 } else { 1968 mutex_exit(&rp->r_statelock); 1969 } 1970 /* 1971 * No need to explicitly throw away any cached pages. The 1972 * eventual rinactive() will attempt a synchronous 1973 * VOP_PUTPAGE() which will immediately fail since the request 1974 * is coming from the wrong zone, and then will proceed to call 1975 * nfs_invalidate_pages() which will clean things up for us. 1976 */ 1977 rp_addfree(VTOR(vp), cr); 1978 return; 1979 } 1980 1981 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) { 1982 mi->mi_async_reqs[NFS_INACTIVE] = args; 1983 } else { 1984 mi->mi_async_tail[NFS_INACTIVE]->a_next = args; 1985 } 1986 mi->mi_async_tail[NFS_INACTIVE] = args; 1987 /* 1988 * Don't increment r_count, since we're trying to get rid of the vnode. 
1989 */ 1990 1991 mi->mi_async_req_count++; 1992 ASSERT(mi->mi_async_req_count != 0); 1993 cv_signal(&mi->mi_async_reqs_cv); 1994 mutex_exit(&mi->mi_async_lock); 1995 } 1996 1997 static void 1998 nfs_async_start(struct vfs *vfsp) 1999 { 2000 nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE); 2001 } 2002 2003 static void 2004 nfs_async_pgops_start(struct vfs *vfsp) 2005 { 2006 nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE); 2007 } 2008 2009 /* 2010 * The async queues for each mounted file system are arranged as a 2011 * set of queues, one for each async i/o type. Requests are taken 2012 * from the queues in a round-robin fashion. A number of consecutive 2013 * requests are taken from each queue before moving on to the next 2014 * queue. This functionality may allow the NFS Version 2 server to do 2015 * write clustering, even if the client is mixing writes and reads 2016 * because it will take multiple write requests from the queue 2017 * before processing any of the other async i/o types. 2018 * 2019 * XXX The nfs_async_common_start thread is unsafe in the light of the present 2020 * model defined by cpr to suspend the system. Specifically over the 2021 * wire calls are cpr-unsafe. The thread should be reevaluated in 2022 * case of future updates to the cpr model. 2023 */ 2024 static void 2025 nfs_async_common_start(struct vfs *vfsp, int async_queue) 2026 { 2027 struct nfs_async_reqs *args; 2028 mntinfo_t *mi = VFTOMI(vfsp); 2029 clock_t time_left = 1; 2030 callb_cpr_t cprinfo; 2031 int i; 2032 int async_types; 2033 kcondvar_t *async_work_cv; 2034 2035 if (async_queue == NFS_ASYNC_QUEUE) { 2036 async_types = NFS_ASYNC_TYPES; 2037 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE]; 2038 } else { 2039 async_types = NFS_ASYNC_PGOPS_TYPES; 2040 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]; 2041 } 2042 2043 /* 2044 * Dynamic initialization of nfs_async_timeout to allow nfs to be 2045 * built in an implementation independent manner. 2046 */ 2047 if (nfs_async_timeout == -1) 2048 nfs_async_timeout = NFS_ASYNC_TIMEOUT; 2049 2050 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas"); 2051 2052 mutex_enter(&mi->mi_async_lock); 2053 for (;;) { 2054 /* 2055 * Find the next queue containing an entry. We start 2056 * at the current queue pointer and then round robin 2057 * through all of them until we either find a non-empty 2058 * queue or have looked through all of them. 2059 */ 2060 for (i = 0; i < async_types; i++) { 2061 args = *mi->mi_async_curr[async_queue]; 2062 if (args != NULL) 2063 break; 2064 mi->mi_async_curr[async_queue]++; 2065 if (mi->mi_async_curr[async_queue] == 2066 &mi->mi_async_reqs[async_types]) { 2067 mi->mi_async_curr[async_queue] = 2068 &mi->mi_async_reqs[0]; 2069 } 2070 } 2071 /* 2072 * If we didn't find a entry, then block until woken up 2073 * again and then look through the queues again. 2074 */ 2075 if (args == NULL) { 2076 /* 2077 * Exiting is considered to be safe for CPR as well 2078 */ 2079 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2080 2081 /* 2082 * Wakeup thread waiting to unmount the file 2083 * system only if all async threads are inactive. 2084 * 2085 * If we've timed-out and there's nothing to do, 2086 * then get rid of this thread. 
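 *
 * Two things get us to the exit path below: mi_max_threads was set
 * to zero (an unmount or zone shutdown wants the workers gone), or
 * time_left is no longer positive because the cv_reltimedwait()
 * below slept for nfs_async_timeout ticks without any new work
 * arriving.  The last worker to leave signals mi_async_cv so that a
 * thread blocked in nfs_async_stop() or nfs_async_stop_sig() can
 * continue.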
2087 */ 2088 if (mi->mi_max_threads == 0 || time_left <= 0) { 2089 --mi->mi_threads[async_queue]; 2090 2091 if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 && 2092 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0) 2093 cv_signal(&mi->mi_async_cv); 2094 CALLB_CPR_EXIT(&cprinfo); 2095 VFS_RELE(vfsp); /* release thread's hold */ 2096 zthread_exit(); 2097 /* NOTREACHED */ 2098 } 2099 time_left = cv_reltimedwait(async_work_cv, 2100 &mi->mi_async_lock, nfs_async_timeout, 2101 TR_CLOCK_TICK); 2102 2103 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 2104 2105 continue; 2106 } 2107 time_left = 1; 2108 2109 /* 2110 * Remove the request from the async queue and then 2111 * update the current async request queue pointer. If 2112 * the current queue is empty or we have removed enough 2113 * consecutive entries from it, then reset the counter 2114 * for this queue and then move the current pointer to 2115 * the next queue. 2116 */ 2117 *mi->mi_async_curr[async_queue] = args->a_next; 2118 if (*mi->mi_async_curr[async_queue] == NULL || 2119 --mi->mi_async_clusters[args->a_io] == 0) { 2120 mi->mi_async_clusters[args->a_io] = 2121 mi->mi_async_init_clusters; 2122 mi->mi_async_curr[async_queue]++; 2123 if (mi->mi_async_curr[async_queue] == 2124 &mi->mi_async_reqs[async_types]) { 2125 mi->mi_async_curr[async_queue] = 2126 &mi->mi_async_reqs[0]; 2127 } 2128 } 2129 2130 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) { 2131 mutex_enter(&mi->mi_lock); 2132 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 2133 mutex_exit(&mi->mi_lock); 2134 } 2135 2136 mutex_exit(&mi->mi_async_lock); 2137 2138 /* 2139 * Obtain arguments from the async request structure. 2140 */ 2141 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) { 2142 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff, 2143 args->a_nfs_addr, args->a_nfs_seg, 2144 args->a_cred); 2145 } else if (args->a_io == NFS_PUTAPAGE) { 2146 (void) (*args->a_nfs_putapage)(args->a_vp, 2147 args->a_nfs_pp, args->a_nfs_off, 2148 args->a_nfs_len, args->a_nfs_flags, 2149 args->a_cred); 2150 } else if (args->a_io == NFS_PAGEIO) { 2151 (void) (*args->a_nfs_pageio)(args->a_vp, 2152 args->a_nfs_pp, args->a_nfs_off, 2153 args->a_nfs_len, args->a_nfs_flags, 2154 args->a_cred); 2155 } else if (args->a_io == NFS_READDIR) { 2156 (void) ((*args->a_nfs_readdir)(args->a_vp, 2157 args->a_nfs_rdc, args->a_cred)); 2158 } else if (args->a_io == NFS_COMMIT) { 2159 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist, 2160 args->a_nfs_offset, args->a_nfs_count, 2161 args->a_cred); 2162 } else if (args->a_io == NFS_INACTIVE) { 2163 (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL); 2164 } 2165 2166 /* 2167 * Now, release the vnode and free the credentials 2168 * structure. 2169 */ 2170 free_async_args(args); 2171 /* 2172 * Reacquire the mutex because it will be needed above. 2173 */ 2174 mutex_enter(&mi->mi_async_lock); 2175 } 2176 } 2177 2178 void 2179 nfs_async_stop(struct vfs *vfsp) 2180 { 2181 mntinfo_t *mi = VFTOMI(vfsp); 2182 2183 /* 2184 * Wait for all outstanding async operations to complete and for the 2185 * worker threads to exit. 2186 */ 2187 mutex_enter(&mi->mi_async_lock); 2188 mi->mi_max_threads = 0; 2189 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2190 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2191 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) 2192 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2193 mutex_exit(&mi->mi_async_lock); 2194 } 2195 2196 /* 2197 * nfs_async_stop_sig: 2198 * Wait for all outstanding putpage operation to complete. 
If a signal 2199 * is deliver we will abort and return non-zero. If we can put all the 2200 * pages we will return 0. This routine is called from nfs_unmount and 2201 * nfs3_unmount to make these operations interruptible. 2202 */ 2203 int 2204 nfs_async_stop_sig(struct vfs *vfsp) 2205 { 2206 mntinfo_t *mi = VFTOMI(vfsp); 2207 ushort_t omax; 2208 int rval; 2209 2210 /* 2211 * Wait for all outstanding async operations to complete and for the 2212 * worker threads to exit. 2213 */ 2214 mutex_enter(&mi->mi_async_lock); 2215 omax = mi->mi_max_threads; 2216 mi->mi_max_threads = 0; 2217 /* 2218 * Tell all the worker threads to exit. 2219 */ 2220 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2221 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2222 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) { 2223 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) 2224 break; 2225 } 2226 rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 || 2227 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */ 2228 if (rval) 2229 mi->mi_max_threads = omax; 2230 mutex_exit(&mi->mi_async_lock); 2231 2232 return (rval); 2233 } 2234 2235 int 2236 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2237 { 2238 int pagecreate; 2239 int n; 2240 int saved_n; 2241 caddr_t saved_base; 2242 u_offset_t offset; 2243 int error; 2244 int sm_error; 2245 vnode_t *vp = RTOV(rp); 2246 2247 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2248 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2249 if (!vpm_enable) { 2250 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2251 } 2252 2253 /* 2254 * Move bytes in at most PAGESIZE chunks. We must avoid 2255 * spanning pages in uiomove() because page faults may cause 2256 * the cache to be invalidated out from under us. The r_size is not 2257 * updated until after the uiomove. If we push the last page of a 2258 * file before r_size is correct, we will lose the data written past 2259 * the current (and invalid) r_size. 2260 */ 2261 do { 2262 offset = uio->uio_loffset; 2263 pagecreate = 0; 2264 2265 /* 2266 * n is the number of bytes required to satisfy the request 2267 * or the number of bytes to fill out the page. 2268 */ 2269 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2270 2271 /* 2272 * Check to see if we can skip reading in the page 2273 * and just allocate the memory. We can do this 2274 * if we are going to rewrite the entire mapping 2275 * or if we are going to write to or beyond the current 2276 * end of file from the beginning of the mapping. 2277 * 2278 * The read of r_size is now protected by r_statelock. 2279 */ 2280 mutex_enter(&rp->r_statelock); 2281 /* 2282 * When pgcreated is nonzero the caller has already done 2283 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2284 * segkpm this means we already have at least one page 2285 * created and mapped at base. 2286 */ 2287 pagecreate = pgcreated || 2288 ((offset & PAGEOFFSET) == 0 && 2289 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2290 2291 mutex_exit(&rp->r_statelock); 2292 if (!vpm_enable && pagecreate) { 2293 /* 2294 * The last argument tells segmap_pagecreate() to 2295 * always lock the page, as opposed to sometimes 2296 * returning with the page locked. This way we avoid a 2297 * fault on the ensuing uiomove(), but also 2298 * more importantly (to fix bug 1094402) we can 2299 * call segmap_fault() to unlock the page in all 2300 * cases. 
An alternative would be to modify 2301 * segmap_pagecreate() to tell us when it is 2302 * locking a page, but that's a fairly major 2303 * interface change. 2304 */ 2305 if (pgcreated == 0) 2306 (void) segmap_pagecreate(segkmap, base, 2307 (uint_t)n, 1); 2308 saved_base = base; 2309 saved_n = n; 2310 } 2311 2312 /* 2313 * The number of bytes of data in the last page cannot 2314 * be accurately determined while the page is being 2315 * uiomove'd to and the size of the file is being updated. 2316 * Thus, inform threads which need to know accurately 2317 * how much data is in the last page of the file. They 2318 * will not do the i/o immediately, but will arrange for 2319 * the i/o to happen later when this modify operation 2320 * has finished. 2321 */ 2322 ASSERT(!(rp->r_flags & RMODINPROGRESS)); 2323 mutex_enter(&rp->r_statelock); 2324 rp->r_flags |= RMODINPROGRESS; 2325 rp->r_modaddr = (offset & MAXBMASK); 2326 mutex_exit(&rp->r_statelock); 2327 2328 if (vpm_enable) { 2329 /* 2330 * Copy data. If new pages are created, part of 2331 * the page that is not written will be initialized 2332 * with zeros. 2333 */ 2334 error = vpm_data_copy(vp, offset, n, uio, 2335 !pagecreate, NULL, 0, S_WRITE); 2336 } else { 2337 error = uiomove(base, n, UIO_WRITE, uio); 2338 } 2339 2340 /* 2341 * r_size is the maximum number of 2342 * bytes known to be in the file. 2343 * Make sure it is at least as high as the 2344 * first unwritten byte pointed to by uio_loffset. 2345 */ 2346 mutex_enter(&rp->r_statelock); 2347 if (rp->r_size < uio->uio_loffset) 2348 rp->r_size = uio->uio_loffset; 2349 rp->r_flags &= ~RMODINPROGRESS; 2350 rp->r_flags |= RDIRTY; 2351 mutex_exit(&rp->r_statelock); 2352 2353 /* n = # of bytes written */ 2354 n = (int)(uio->uio_loffset - offset); 2355 2356 if (!vpm_enable) { 2357 base += n; 2358 } 2359 tcount -= n; 2360 /* 2361 * If we created pages w/o initializing them completely, 2362 * we need to zero the part that wasn't set up. 2363 * This happens in most EOF write cases and if 2364 * we had some sort of error during the uiomove. 2365 */ 2366 if (!vpm_enable && pagecreate) { 2367 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2368 (void) kzero(base, PAGESIZE - n); 2369 2370 if (pgcreated) { 2371 /* 2372 * Caller is responsible for this page, 2373 * it was not created in this loop. 2374 */ 2375 pgcreated = 0; 2376 } else { 2377 /* 2378 * For bug 1094402: segmap_pagecreate locks 2379 * page. Unlock it. This also unlocks the 2380 * pages allocated by page_create_va() in 2381 * segmap_pagecreate(). 2382 */ 2383 sm_error = segmap_fault(kas.a_hat, segkmap, 2384 saved_base, saved_n, 2385 F_SOFTUNLOCK, S_WRITE); 2386 if (error == 0) 2387 error = sm_error; 2388 } 2389 } 2390 } while (tcount > 0 && error == 0); 2391 2392 return (error); 2393 } 2394 2395 int 2396 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2397 { 2398 rnode_t *rp; 2399 page_t *pp; 2400 u_offset_t eoff; 2401 u_offset_t io_off; 2402 size_t io_len; 2403 int error; 2404 int rdirty; 2405 int err; 2406 2407 rp = VTOR(vp); 2408 ASSERT(rp->r_count > 0); 2409 2410 if (!vn_has_cached_data(vp)) 2411 return (0); 2412 2413 ASSERT(vp->v_type != VCHR); 2414 2415 /* 2416 * If ROUTOFSPACE is set, then all writes turn into B_INVAL 2417 * writes. B_FORCE is set to force the VM system to actually 2418 * invalidate the pages, even if the i/o failed.
The pages 2419 * need to get invalidated because they can't be written out 2420 * because there isn't any space left on either the server's 2421 * file system or in the user's disk quota. The B_FREE bit 2422 * is cleared to avoid confusion as to whether this is a 2423 * request to place the page on the freelist or to destroy 2424 * it. 2425 */ 2426 if ((rp->r_flags & ROUTOFSPACE) || 2427 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2428 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2429 2430 if (len == 0) { 2431 /* 2432 * If doing a full file synchronous operation, then clear 2433 * the RDIRTY bit. If a page gets dirtied while the flush 2434 * is happening, then RDIRTY will get set again. The 2435 * RDIRTY bit must get cleared before the flush so that 2436 * we don't lose this information. 2437 * 2438 * If there are no full file async write operations 2439 * pending and RDIRTY bit is set, clear it. 2440 */ 2441 if (off == (u_offset_t)0 && 2442 !(flags & B_ASYNC) && 2443 (rp->r_flags & RDIRTY)) { 2444 mutex_enter(&rp->r_statelock); 2445 rdirty = (rp->r_flags & RDIRTY); 2446 rp->r_flags &= ~RDIRTY; 2447 mutex_exit(&rp->r_statelock); 2448 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2449 mutex_enter(&rp->r_statelock); 2450 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) { 2451 rdirty = (rp->r_flags & RDIRTY); 2452 rp->r_flags &= ~RDIRTY; 2453 } 2454 mutex_exit(&rp->r_statelock); 2455 } else 2456 rdirty = 0; 2457 2458 /* 2459 * Search the entire vp list for pages >= off, and flush 2460 * the dirty pages. 2461 */ 2462 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2463 flags, cr); 2464 2465 /* 2466 * If an error occurred and the file was marked as dirty 2467 * before and we aren't forcibly invalidating pages, then 2468 * reset the RDIRTY flag. 2469 */ 2470 if (error && rdirty && 2471 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2472 mutex_enter(&rp->r_statelock); 2473 rp->r_flags |= RDIRTY; 2474 mutex_exit(&rp->r_statelock); 2475 } 2476 } else { 2477 /* 2478 * Do a range from [off...off + len) looking for pages 2479 * to deal with. 2480 */ 2481 error = 0; 2482 #ifdef lint 2483 io_len = 0; 2484 #endif 2485 eoff = off + len; 2486 mutex_enter(&rp->r_statelock); 2487 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2488 io_off += io_len) { 2489 mutex_exit(&rp->r_statelock); 2490 /* 2491 * If we are not invalidating, synchronously 2492 * freeing or writing pages use the routine 2493 * page_lookup_nowait() to prevent reclaiming 2494 * them from the free list. 2495 */ 2496 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2497 pp = page_lookup(vp, io_off, 2498 (flags & (B_INVAL | B_FREE)) ? 2499 SE_EXCL : SE_SHARED); 2500 } else { 2501 pp = page_lookup_nowait(vp, io_off, 2502 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2503 } 2504 2505 if (pp == NULL || !pvn_getdirty(pp, flags)) 2506 io_len = PAGESIZE; 2507 else { 2508 err = (*rp->r_putapage)(vp, pp, &io_off, 2509 &io_len, flags, cr); 2510 if (!error) 2511 error = err; 2512 /* 2513 * "io_off" and "io_len" are returned as 2514 * the range of pages we actually wrote. 2515 * This allows us to skip ahead more quickly 2516 * since several pages may've been dealt 2517 * with by this iteration of the loop. 
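 *
 * For example, if the putapage routine clustered, say, eight
 * dirty pages together starting at io_off, it reports back
 * io_len covering all eight, and the next pass of the loop
 * resumes at io_off + io_len instead of at the very next page.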
2518 */ 2519 } 2520 mutex_enter(&rp->r_statelock); 2521 } 2522 mutex_exit(&rp->r_statelock); 2523 } 2524 2525 return (error); 2526 } 2527 2528 void 2529 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2530 { 2531 rnode_t *rp; 2532 2533 rp = VTOR(vp); 2534 mutex_enter(&rp->r_statelock); 2535 while (rp->r_flags & RTRUNCATE) 2536 cv_wait(&rp->r_cv, &rp->r_statelock); 2537 rp->r_flags |= RTRUNCATE; 2538 if (off == (u_offset_t)0) { 2539 rp->r_flags &= ~RDIRTY; 2540 if (!(rp->r_flags & RSTALE)) 2541 rp->r_error = 0; 2542 } 2543 rp->r_truncaddr = off; 2544 mutex_exit(&rp->r_statelock); 2545 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2546 B_INVAL | B_TRUNC, cr); 2547 mutex_enter(&rp->r_statelock); 2548 rp->r_flags &= ~RTRUNCATE; 2549 cv_broadcast(&rp->r_cv); 2550 mutex_exit(&rp->r_statelock); 2551 } 2552 2553 static int nfs_write_error_to_cons_only = 0; 2554 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1) 2555 2556 /* 2557 * Print a file handle 2558 */ 2559 void 2560 nfs_printfhandle(nfs_fhandle *fhp) 2561 { 2562 int *ip; 2563 char *buf; 2564 size_t bufsize; 2565 char *cp; 2566 2567 /* 2568 * 13 == "(file handle:" 2569 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times 2570 * 1 == ' ' 2571 * 8 == maximum strlen of "%x" 2572 * 3 == ")\n\0" 2573 */ 2574 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3; 2575 buf = kmem_alloc(bufsize, KM_NOSLEEP); 2576 if (buf == NULL) 2577 return; 2578 2579 cp = buf; 2580 (void) strcpy(cp, "(file handle:"); 2581 while (*cp != '\0') 2582 cp++; 2583 for (ip = (int *)fhp->fh_buf; 2584 ip < (int *)&fhp->fh_buf[fhp->fh_len]; 2585 ip++) { 2586 (void) sprintf(cp, " %x", *ip); 2587 while (*cp != '\0') 2588 cp++; 2589 } 2590 (void) strcpy(cp, ")\n"); 2591 2592 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf); 2593 2594 kmem_free(buf, bufsize); 2595 } 2596 2597 /* 2598 * Notify the system administrator that an NFS write error has 2599 * occurred. 2600 */ 2601 2602 /* seconds between ENOSPC/EDQUOT messages */ 2603 clock_t nfs_write_error_interval = 5; 2604 2605 void 2606 nfs_write_error(vnode_t *vp, int error, cred_t *cr) 2607 { 2608 mntinfo_t *mi; 2609 clock_t now; 2610 2611 mi = VTOMI(vp); 2612 /* 2613 * In case of forced unmount or zone shutdown, do not print any 2614 * messages since it can flood the console with error messages. 2615 */ 2616 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) 2617 return; 2618 2619 /* 2620 * No use in flooding the console with ENOSPC 2621 * messages from the same file system. 
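 *
 * The throttle works by stamping mi_printftime with the current
 * lbolt value plus nfs_write_error_interval * hz ticks after a
 * message is printed; further ENOSPC/EDQUOT errors on this mount
 * are swallowed until lbolt passes that stamp (roughly five seconds
 * with the default interval).  Errors other than ENOSPC/EDQUOT are
 * always reported.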
2622 */ 2623 now = ddi_get_lbolt(); 2624 if ((error != ENOSPC && error != EDQUOT) || 2625 now - mi->mi_printftime > 0) { 2626 zoneid_t zoneid = mi->mi_zone->zone_id; 2627 2628 #ifdef DEBUG 2629 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2630 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL); 2631 #else 2632 nfs_perror(error, "NFS write error on host %s: %m.\n", 2633 VTOR(vp)->r_server->sv_hostname, NULL); 2634 #endif 2635 if (error == ENOSPC || error == EDQUOT) { 2636 zcmn_err(zoneid, CE_CONT, 2637 MSG("^File: userid=%d, groupid=%d\n"), 2638 crgetuid(cr), crgetgid(cr)); 2639 if (crgetuid(CRED()) != crgetuid(cr) || 2640 crgetgid(CRED()) != crgetgid(cr)) { 2641 zcmn_err(zoneid, CE_CONT, 2642 MSG("^User: userid=%d, groupid=%d\n"), 2643 crgetuid(CRED()), crgetgid(CRED())); 2644 } 2645 mi->mi_printftime = now + 2646 nfs_write_error_interval * hz; 2647 } 2648 nfs_printfhandle(&VTOR(vp)->r_fh); 2649 #ifdef DEBUG 2650 if (error == EACCES) { 2651 zcmn_err(zoneid, CE_CONT, 2652 MSG("^nfs_bio: cred is%s kcred\n"), 2653 cr == kcred ? "" : " not"); 2654 } 2655 #endif 2656 } 2657 } 2658 2659 /* ARGSUSED */ 2660 static void * 2661 nfs_mi_init(zoneid_t zoneid) 2662 { 2663 struct mi_globals *mig; 2664 2665 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2666 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2667 list_create(&mig->mig_list, sizeof (mntinfo_t), 2668 offsetof(mntinfo_t, mi_zone_node)); 2669 mig->mig_destructor_called = B_FALSE; 2670 return (mig); 2671 } 2672 2673 /* 2674 * Callback routine to tell all NFS mounts in the zone to stop creating new 2675 * threads. Existing threads should exit. 2676 */ 2677 /* ARGSUSED */ 2678 static void 2679 nfs_mi_shutdown(zoneid_t zoneid, void *data) 2680 { 2681 struct mi_globals *mig = data; 2682 mntinfo_t *mi; 2683 2684 ASSERT(mig != NULL); 2685 again: 2686 mutex_enter(&mig->mig_lock); 2687 for (mi = list_head(&mig->mig_list); mi != NULL; 2688 mi = list_next(&mig->mig_list, mi)) { 2689 2690 /* 2691 * If we've done the shutdown work for this FS, skip. 2692 * Once we go off the end of the list, we're done. 2693 */ 2694 if (mi->mi_flags & MI_DEAD) 2695 continue; 2696 2697 /* 2698 * We will do work, so not done. Get a hold on the FS. 2699 */ 2700 VFS_HOLD(mi->mi_vfsp); 2701 2702 /* 2703 * purge the DNLC for this filesystem 2704 */ 2705 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2706 2707 mutex_enter(&mi->mi_async_lock); 2708 /* 2709 * Tell existing async worker threads to exit. 2710 */ 2711 mi->mi_max_threads = 0; 2712 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2713 /* 2714 * Set MI_ASYNC_MGR_STOP so the async manager thread starts 2715 * getting ready to exit when it's done with its current work. 2716 * Also set MI_DEAD to note we've acted on this FS. 2717 */ 2718 mutex_enter(&mi->mi_lock); 2719 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD); 2720 mutex_exit(&mi->mi_lock); 2721 /* 2722 * Wake up the async manager thread. 2723 */ 2724 cv_broadcast(&mi->mi_async_reqs_cv); 2725 mutex_exit(&mi->mi_async_lock); 2726 2727 /* 2728 * Drop lock and release FS, which may change list, then repeat. 2729 * We're done when every mi has been done or the list is empty. 
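 *
 * Restarting the scan from the head is what makes this safe: once
 * mig_lock is dropped, the VFS_RELE() below may drop the last hold
 * and lead to the mount being pulled off mig_list behind our back,
 * so a saved list cursor could not be trusted.  The MI_DEAD flag set
 * above guarantees forward progress, because every rescan skips the
 * mounts that have already been handled.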
2730 */ 2731 mutex_exit(&mig->mig_lock); 2732 VFS_RELE(mi->mi_vfsp); 2733 goto again; 2734 } 2735 mutex_exit(&mig->mig_lock); 2736 } 2737 2738 static void 2739 nfs_mi_free_globals(struct mi_globals *mig) 2740 { 2741 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2742 mutex_destroy(&mig->mig_lock); 2743 kmem_free(mig, sizeof (*mig)); 2744 2745 } 2746 2747 /* ARGSUSED */ 2748 static void 2749 nfs_mi_destroy(zoneid_t zoneid, void *data) 2750 { 2751 struct mi_globals *mig = data; 2752 2753 ASSERT(mig != NULL); 2754 mutex_enter(&mig->mig_lock); 2755 if (list_head(&mig->mig_list) != NULL) { 2756 /* Still waiting for VFS_FREEVFS() */ 2757 mig->mig_destructor_called = B_TRUE; 2758 mutex_exit(&mig->mig_lock); 2759 return; 2760 } 2761 nfs_mi_free_globals(mig); 2762 } 2763 2764 /* 2765 * Add an NFS mount to the per-zone list of NFS mounts. 2766 */ 2767 void 2768 nfs_mi_zonelist_add(mntinfo_t *mi) 2769 { 2770 struct mi_globals *mig; 2771 2772 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2773 mutex_enter(&mig->mig_lock); 2774 list_insert_head(&mig->mig_list, mi); 2775 mutex_exit(&mig->mig_lock); 2776 } 2777 2778 /* 2779 * Remove an NFS mount from the per-zone list of NFS mounts. 2780 */ 2781 static void 2782 nfs_mi_zonelist_remove(mntinfo_t *mi) 2783 { 2784 struct mi_globals *mig; 2785 2786 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2787 mutex_enter(&mig->mig_lock); 2788 list_remove(&mig->mig_list, mi); 2789 /* 2790 * We can be called asynchronously by VFS_FREEVFS() after the zone 2791 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2792 * mi globals. 2793 */ 2794 if (list_head(&mig->mig_list) == NULL && 2795 mig->mig_destructor_called == B_TRUE) { 2796 nfs_mi_free_globals(mig); 2797 return; 2798 } 2799 mutex_exit(&mig->mig_lock); 2800 } 2801 2802 /* 2803 * NFS Client initialization routine. This routine should only be called 2804 * once. It performs the following tasks: 2805 * - Initalize all global locks 2806 * - Call sub-initialization routines (localize access to variables) 2807 */ 2808 int 2809 nfs_clntinit(void) 2810 { 2811 #ifdef DEBUG 2812 static boolean_t nfs_clntup = B_FALSE; 2813 #endif 2814 int error; 2815 2816 #ifdef DEBUG 2817 ASSERT(nfs_clntup == B_FALSE); 2818 #endif 2819 2820 error = nfs_subrinit(); 2821 if (error) 2822 return (error); 2823 2824 error = nfs_vfsinit(); 2825 if (error) { 2826 /* 2827 * Cleanup nfs_subrinit() work 2828 */ 2829 nfs_subrfini(); 2830 return (error); 2831 } 2832 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown, 2833 nfs_mi_destroy); 2834 2835 nfs4_clnt_init(); 2836 2837 nfscmd_init(); 2838 2839 #ifdef DEBUG 2840 nfs_clntup = B_TRUE; 2841 #endif 2842 2843 return (0); 2844 } 2845 2846 /* 2847 * This routine is only called if the NFS Client has been initialized but 2848 * the module failed to be installed. This routine will cleanup the previously 2849 * allocated/initialized work. 2850 */ 2851 void 2852 nfs_clntfini(void) 2853 { 2854 (void) zone_key_delete(mi_list_key); 2855 nfs_subrfini(); 2856 nfs_vfsfini(); 2857 nfs4_clnt_fini(); 2858 nfscmd_fini(); 2859 } 2860 2861 /* 2862 * nfs_lockrelease: 2863 * 2864 * Release any locks on the given vnode that are held by the current 2865 * process. 
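 *
 * This is used on the close path: when a process that may still be
 * holding record locks or shares closes the file, the entire file is
 * unlocked on the server and any shares recorded for the process are
 * released, so that client and server lock state do not stay out of
 * sync after the close.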
2866 */ 2867 void 2868 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 2869 { 2870 flock64_t ld; 2871 struct shrlock shr; 2872 char *buf; 2873 int remote_lock_possible; 2874 int ret; 2875 2876 ASSERT((uintptr_t)vp > KERNELBASE); 2877 2878 /* 2879 * Generate an explicit unlock operation for the entire file. As a 2880 * partial optimization, only generate the unlock if there is a 2881 * lock registered for the file. We could check whether this 2882 * particular process has any locks on the file, but that would 2883 * require the local locking code to provide yet another query 2884 * routine. Note that no explicit synchronization is needed here. 2885 * At worst, flk_has_remote_locks() will return a false positive, 2886 * in which case the unlock call wastes time but doesn't harm 2887 * correctness. 2888 * 2889 * In addition, an unlock request is generated if the process 2890 * is listed as possibly having a lock on the file because the 2891 * server and client lock managers may have gotten out of sync. 2892 * N.B. It is important to make sure nfs_remove_locking_id() is 2893 * called here even if flk_has_remote_locks(vp) reports true. 2894 * If it is not called and there is an entry on the process id 2895 * list, that entry will never get removed. 2896 */ 2897 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID, 2898 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2899 if (remote_lock_possible || flk_has_remote_locks(vp)) { 2900 ld.l_type = F_UNLCK; /* set to unlock entire file */ 2901 ld.l_whence = 0; /* unlock from start of file */ 2902 ld.l_start = 0; 2903 ld.l_len = 0; /* do entire file */ 2904 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr, 2905 NULL); 2906 2907 if (ret != 0) { 2908 /* 2909 * If VOP_FRLOCK fails, make sure we unregister 2910 * local locks before we continue. 2911 */ 2912 ld.l_pid = ttoproc(curthread)->p_pid; 2913 lm_register_lock_locally(vp, NULL, &ld, flag, offset); 2914 #ifdef DEBUG 2915 nfs_perror(ret, 2916 "NFS lock release error on vp %p: %m.\n", 2917 (void *)vp, NULL); 2918 #endif 2919 } 2920 2921 /* 2922 * The call to VOP_FRLOCK may put the pid back on the 2923 * list. We need to remove it. 2924 */ 2925 (void) nfs_remove_locking_id(vp, RLMPL_PID, 2926 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2927 } 2928 2929 /* 2930 * As long as the vp has a share matching our pid, 2931 * pluck it off and unshare it. There are circumstances in 2932 * which the call to nfs_remove_locking_id() may put the 2933 * owner back on the list, in which case we simply do a 2934 * redundant and harmless unshare. 2935 */ 2936 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP); 2937 while (nfs_remove_locking_id(vp, RLMPL_OWNER, 2938 (char *)NULL, buf, &shr.s_own_len)) { 2939 shr.s_owner = buf; 2940 shr.s_access = 0; 2941 shr.s_deny = 0; 2942 shr.s_sysid = 0; 2943 shr.s_pid = curproc->p_pid; 2944 2945 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL); 2946 #ifdef DEBUG 2947 if (ret != 0) { 2948 nfs_perror(ret, 2949 "NFS share release error on vp %p: %m.\n", 2950 (void *)vp, NULL); 2951 } 2952 #endif 2953 } 2954 kmem_free(buf, MAX_SHR_OWNER_LEN); 2955 } 2956 2957 /* 2958 * nfs_lockcompletion: 2959 * 2960 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2961 * as non cachable (set VNOCACHE bit). 
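 *
 * The caller must hold r_lkserlock as a writer (see the ASSERT
 * below) and calls this once an F_SETLK or F_SETLKW request has been
 * processed; lm_safemap() decides whether cached pages and mappings
 * are still trustworthy for the locked file.  Note that the
 * attribute cache is purged unconditionally, even for lock commands
 * other than F_SETLK/F_SETLKW.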
2962 */ 2963 2964 void 2965 nfs_lockcompletion(vnode_t *vp, int cmd) 2966 { 2967 #ifdef DEBUG 2968 rnode_t *rp = VTOR(vp); 2969 2970 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2971 #endif 2972 2973 if (cmd == F_SETLK || cmd == F_SETLKW) { 2974 if (!lm_safemap(vp)) { 2975 mutex_enter(&vp->v_lock); 2976 vp->v_flag |= VNOCACHE; 2977 mutex_exit(&vp->v_lock); 2978 } else { 2979 mutex_enter(&vp->v_lock); 2980 vp->v_flag &= ~VNOCACHE; 2981 mutex_exit(&vp->v_lock); 2982 } 2983 } 2984 /* 2985 * The cached attributes of the file are stale after acquiring 2986 * the lock on the file. They were updated when the file was 2987 * opened, but not updated when the lock was acquired. Therefore the 2988 * cached attributes are invalidated after the lock is obtained. 2989 */ 2990 PURGE_ATTRCACHE(vp); 2991 } 2992 2993 /* 2994 * The lock manager holds state making it possible for the client 2995 * and server to be out of sync. For example, if the response from 2996 * the server granting a lock request is lost, the server will think 2997 * the lock is granted and the client will think the lock is lost. 2998 * The client can tell when it is not certain whether it is in sync 2999 * with the server. 3000 * 3001 * To deal with this, a list of processes for which the client is 3002 * not sure if the server holds a lock is attached to the rnode. 3003 * When such a process closes the rnode, an unlock request is sent 3004 * to the server to unlock the entire file. 3005 * 3006 * The list is kept as a singly linked, NULL-terminated list. 3007 * Because it is only added to under extreme error conditions, the 3008 * list shouldn't get very big. DEBUG kernels print a message if 3009 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily 3010 * chosen to be 8, but can be tuned at runtime. 3011 */ 3012 #ifdef DEBUG 3013 /* int nfs_lmpl_high_water = 8; */ 3014 int nfs_lmpl_high_water = 128; 3015 int nfs_cnt_add_locking_id = 0; 3016 int nfs_len_add_locking_id = 0; 3017 #endif /* DEBUG */ 3018 3019 /* 3020 * Record that the nfs lock manager server may be holding a lock on 3021 * a vnode for a process. 3022 * 3023 * Because the nfs lock manager server holds state, it is possible 3024 * for the server to get out of sync with the client. This routine is called 3025 * from the client when it is no longer sure if the server is in sync 3026 * with the client.
nfs_lockrelease() will then notice this and send 3027 * an unlock request when the file is closed 3028 */ 3029 void 3030 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len) 3031 { 3032 rnode_t *rp; 3033 lmpl_t *new; 3034 lmpl_t *cur; 3035 lmpl_t **lmplp; 3036 #ifdef DEBUG 3037 int list_len = 1; 3038 #endif /* DEBUG */ 3039 3040 #ifdef DEBUG 3041 ++nfs_cnt_add_locking_id; 3042 #endif /* DEBUG */ 3043 /* 3044 * allocate new lmpl_t now so we don't sleep 3045 * later after grabbing mutexes 3046 */ 3047 ASSERT(len < MAX_SHR_OWNER_LEN); 3048 new = kmem_alloc(sizeof (*new), KM_SLEEP); 3049 new->lmpl_type = type; 3050 new->lmpl_pid = pid; 3051 new->lmpl_owner = kmem_alloc(len, KM_SLEEP); 3052 bcopy(id, new->lmpl_owner, len); 3053 new->lmpl_own_len = len; 3054 new->lmpl_next = (lmpl_t *)NULL; 3055 #ifdef DEBUG 3056 if (type == RLMPL_PID) { 3057 ASSERT(len == sizeof (pid_t)); 3058 ASSERT(pid == *(pid_t *)new->lmpl_owner); 3059 } else { 3060 ASSERT(type == RLMPL_OWNER); 3061 } 3062 #endif 3063 3064 rp = VTOR(vp); 3065 mutex_enter(&rp->r_statelock); 3066 3067 /* 3068 * Add this id to the list for this rnode only if the 3069 * rnode is active and the id is not already there. 3070 */ 3071 ASSERT(rp->r_flags & RHASHED); 3072 lmplp = &(rp->r_lmpl); 3073 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3074 if (cur->lmpl_pid == pid && 3075 cur->lmpl_type == type && 3076 cur->lmpl_own_len == len && 3077 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) { 3078 kmem_free(new->lmpl_owner, len); 3079 kmem_free(new, sizeof (*new)); 3080 break; 3081 } 3082 lmplp = &cur->lmpl_next; 3083 #ifdef DEBUG 3084 ++list_len; 3085 #endif /* DEBUG */ 3086 } 3087 if (cur == (lmpl_t *)NULL) { 3088 *lmplp = new; 3089 #ifdef DEBUG 3090 if (list_len > nfs_len_add_locking_id) { 3091 nfs_len_add_locking_id = list_len; 3092 } 3093 if (list_len > nfs_lmpl_high_water) { 3094 cmn_err(CE_WARN, "nfs_add_locking_id: long list " 3095 "vp=%p is %d", (void *)vp, list_len); 3096 } 3097 #endif /* DEBUG */ 3098 } 3099 3100 #ifdef DEBUG 3101 if (share_debug) { 3102 int nitems = 0; 3103 int npids = 0; 3104 int nowners = 0; 3105 3106 /* 3107 * Count the number of things left on r_lmpl after the remove. 3108 */ 3109 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3110 cur = cur->lmpl_next) { 3111 nitems++; 3112 if (cur->lmpl_type == RLMPL_PID) { 3113 npids++; 3114 } else if (cur->lmpl_type == RLMPL_OWNER) { 3115 nowners++; 3116 } else { 3117 cmn_err(CE_PANIC, "nfs_add_locking_id: " 3118 "unrecognized lmpl_type %d", 3119 cur->lmpl_type); 3120 } 3121 } 3122 3123 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d " 3124 "OWNs = %d items left on r_lmpl\n", 3125 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems); 3126 } 3127 #endif 3128 3129 mutex_exit(&rp->r_statelock); 3130 } 3131 3132 /* 3133 * Remove an id from the lock manager id list. 3134 * 3135 * If the id is not in the list return 0. If it was found and 3136 * removed, return 1. 3137 */ 3138 static int 3139 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen) 3140 { 3141 lmpl_t *cur; 3142 lmpl_t **lmplp; 3143 rnode_t *rp; 3144 int rv = 0; 3145 3146 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER); 3147 3148 rp = VTOR(vp); 3149 3150 mutex_enter(&rp->r_statelock); 3151 ASSERT(rp->r_flags & RHASHED); 3152 lmplp = &(rp->r_lmpl); 3153 3154 /* 3155 * Search through the list and remove the entry for this id 3156 * if it is there. 
The special case id == NULL allows removal 3157 * of the first share on the r_lmpl list belonging to the 3158 * current process (if any), without regard to further details 3159 * of its identity. 3160 */ 3161 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3162 if (cur->lmpl_type == type && 3163 cur->lmpl_pid == curproc->p_pid && 3164 (id == (char *)NULL || 3165 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) { 3166 *lmplp = cur->lmpl_next; 3167 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN); 3168 if (rid != NULL) { 3169 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len); 3170 *rlen = cur->lmpl_own_len; 3171 } 3172 kmem_free(cur->lmpl_owner, cur->lmpl_own_len); 3173 kmem_free(cur, sizeof (*cur)); 3174 rv = 1; 3175 break; 3176 } 3177 lmplp = &cur->lmpl_next; 3178 } 3179 3180 #ifdef DEBUG 3181 if (share_debug) { 3182 int nitems = 0; 3183 int npids = 0; 3184 int nowners = 0; 3185 3186 /* 3187 * Count the number of things left on r_lmpl after the remove. 3188 */ 3189 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3190 cur = cur->lmpl_next) { 3191 nitems++; 3192 if (cur->lmpl_type == RLMPL_PID) { 3193 npids++; 3194 } else if (cur->lmpl_type == RLMPL_OWNER) { 3195 nowners++; 3196 } else { 3197 cmn_err(CE_PANIC, 3198 "nrli: unrecognized lmpl_type %d", 3199 cur->lmpl_type); 3200 } 3201 } 3202 3203 cmn_err(CE_CONT, 3204 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n", 3205 (type == RLMPL_PID) ? "P" : "O", 3206 npids, 3207 nowners, 3208 nitems); 3209 } 3210 #endif 3211 3212 mutex_exit(&rp->r_statelock); 3213 return (rv); 3214 } 3215 3216 void 3217 nfs_free_mi(mntinfo_t *mi) 3218 { 3219 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP); 3220 ASSERT(mi->mi_manager_thread == NULL); 3221 ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 && 3222 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0); 3223 3224 /* 3225 * Remove the node from the global list before we start tearing it down. 3226 */ 3227 nfs_mi_zonelist_remove(mi); 3228 if (mi->mi_klmconfig) { 3229 lm_free_config(mi->mi_klmconfig); 3230 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig)); 3231 } 3232 mutex_destroy(&mi->mi_lock); 3233 mutex_destroy(&mi->mi_remap_lock); 3234 mutex_destroy(&mi->mi_async_lock); 3235 mutex_destroy(&mi->mi_rnodes_lock); 3236 cv_destroy(&mi->mi_failover_cv); 3237 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]); 3238 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]); 3239 cv_destroy(&mi->mi_async_reqs_cv); 3240 cv_destroy(&mi->mi_async_cv); 3241 list_destroy(&mi->mi_rnodes); 3242 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS); 3243 kmem_free(mi, sizeof (*mi)); 3244 } 3245 3246 static int 3247 mnt_kstat_update(kstat_t *ksp, int rw) 3248 { 3249 mntinfo_t *mi; 3250 struct mntinfo_kstat *mik; 3251 vfs_t *vfsp; 3252 int i; 3253 3254 /* this is a read-only kstat. Bail out on a write */ 3255 if (rw == KSTAT_WRITE) 3256 return (EACCES); 3257 3258 /* 3259 * We don't want to wait here as kstat_chain_lock could be held by 3260 * dounmount(). dounmount() takes vfs_reflock before the chain lock 3261 * and thus could lead to a deadlock. 
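 *
 * Spelled out, the hazard is a lock-ordering cycle: dounmount()
 * holds vfs_reflock and then goes for the kstat chain lock, while
 * the kstat read that brought us here sits on the other side of that
 * chain lock.  If this update were to block waiting for the vfs, the
 * two paths could end up waiting on each other, so the update simply
 * snapshots fields out of the mntinfo and returns.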
3262 */ 3263 vfsp = (struct vfs *)ksp->ks_private; 3264 3265 3266 mi = VFTOMI(vfsp); 3267 3268 mik = (struct mntinfo_kstat *)ksp->ks_data; 3269 3270 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 3271 mik->mik_vers = (uint32_t)mi->mi_vers; 3272 mik->mik_flags = mi->mi_flags; 3273 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod; 3274 mik->mik_curread = (uint32_t)mi->mi_curread; 3275 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 3276 mik->mik_retrans = mi->mi_retrans; 3277 mik->mik_timeo = mi->mi_timeo; 3278 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 3279 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 3280 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 3281 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 3282 for (i = 0; i < NFS_CALLTYPES + 1; i++) { 3283 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt; 3284 mik->mik_timers[i].deviate = 3285 (uint32_t)mi->mi_timers[i].rt_deviate; 3286 mik->mik_timers[i].rtxcur = 3287 (uint32_t)mi->mi_timers[i].rt_rtxcur; 3288 } 3289 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 3290 mik->mik_failover = (uint32_t)mi->mi_failover; 3291 mik->mik_remap = (uint32_t)mi->mi_remap; 3292 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 3293 3294 return (0); 3295 } 3296 3297 void 3298 nfs_mnt_kstat_init(struct vfs *vfsp) 3299 { 3300 mntinfo_t *mi = VFTOMI(vfsp); 3301 3302 /* 3303 * Create the version specific kstats. 3304 * 3305 * PSARC 2001/697 Contract Private Interface 3306 * All nfs kstats are under SunMC contract 3307 * Please refer to the PSARC listed above and contact 3308 * SunMC before making any changes! 3309 * 3310 * Changes must be reviewed by Solaris File Sharing 3311 * Changes must be communicated to contract-2001-697@sun.com 3312 * 3313 */ 3314 3315 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 3316 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 3317 if (mi->mi_io_kstats) { 3318 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3319 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 3320 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 3321 kstat_install(mi->mi_io_kstats); 3322 } 3323 3324 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 3325 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 3326 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 3327 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3328 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 3329 mi->mi_ro_kstats->ks_update = mnt_kstat_update; 3330 mi->mi_ro_kstats->ks_private = (void *)vfsp; 3331 kstat_install(mi->mi_ro_kstats); 3332 } 3333 } 3334 3335 nfs_delmapcall_t * 3336 nfs_init_delmapcall() 3337 { 3338 nfs_delmapcall_t *delmap_call; 3339 3340 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP); 3341 delmap_call->call_id = curthread; 3342 delmap_call->error = 0; 3343 3344 return (delmap_call); 3345 } 3346 3347 void 3348 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call) 3349 { 3350 kmem_free(delmap_call, sizeof (nfs_delmapcall_t)); 3351 } 3352 3353 /* 3354 * Searches for the current delmap caller (based on curthread) in the list of 3355 * callers. If it is found, we remove it and free the delmap caller. 3356 * Returns: 3357 * 0 if the caller wasn't found 3358 * 1 if the caller was found, removed and freed. *errp is set to what 3359 * the result of the delmap was. 
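 *
 * A delmap vnode op is expected to use this together with
 * nfs_init_delmapcall() roughly as follows (a minimal sketch;
 * xxx_delmap and the elided real work are placeholders, not the
 * actual client code):
 *
 *	static int
 *	xxx_delmap(vnode_t *vp, ...)
 *	{
 *		rnode_t *rp = VTOR(vp);
 *		nfs_delmapcall_t *dmc;
 *		int error;
 *
 *		(re-entered call: pick up the recorded result)
 *		if (nfs_find_and_delete_delmapcall(rp, &error))
 *			return (error);
 *
 *		(first call: record ourselves before doing the work)
 *		dmc = nfs_init_delmapcall();
 *		mutex_enter(&rp->r_statelock);
 *		list_insert_tail(&rp->r_indelmap, dmc);
 *		mutex_exit(&rp->r_statelock);
 *
 *		(... do the real delmap work and store its result in
 *		dmc->error for a later repeated call to return ...)
 *	}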
3360 */ 3361 int 3362 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp) 3363 { 3364 nfs_delmapcall_t *delmap_call; 3365 3366 /* 3367 * If the list doesn't exist yet, we create it and return 3368 * that the caller wasn't found. No list = no callers. 3369 */ 3370 mutex_enter(&rp->r_statelock); 3371 if (!(rp->r_flags & RDELMAPLIST)) { 3372 /* The list does not exist */ 3373 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t), 3374 offsetof(nfs_delmapcall_t, call_node)); 3375 rp->r_flags |= RDELMAPLIST; 3376 mutex_exit(&rp->r_statelock); 3377 return (0); 3378 } else { 3379 /* The list exists so search it */ 3380 for (delmap_call = list_head(&rp->r_indelmap); 3381 delmap_call != NULL; 3382 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 3383 if (delmap_call->call_id == curthread) { 3384 /* current caller is in the list */ 3385 *errp = delmap_call->error; 3386 list_remove(&rp->r_indelmap, delmap_call); 3387 mutex_exit(&rp->r_statelock); 3388 nfs_free_delmapcall(delmap_call); 3389 return (1); 3390 } 3391 } 3392 } 3393 mutex_exit(&rp->r_statelock); 3394 return (0); 3395 } 3396