/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/dnlc.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/acl.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <sys/list.h>
#include <sys/zone.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>

#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>

static void	nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
			cred_t *);
static int	nfs_getattr_cache(vnode_t *, struct vattr *);
static int	nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);

struct mi_globals {
	kmutex_t	mig_lock;	/* lock protecting mig_list */
	list_t		mig_list;	/* list of NFS v2 or v3 mounts in zone */
	boolean_t	mig_destructor_called;
};

static zone_key_t mi_list_key;

/* Debugging flag for PC file shares. */
extern int	share_debug;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_attrtime)
 * which tells whether the attributes are valid. The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached. This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time. There are minimum and maximum
 * timeout values that can be set per mount point.
 */
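
/*
 * If another thread is currently serializing cache updates on this
 * rnode (r_serial is set and is not us), wait until it is done.
 * Returns EINTR if the wait is interrupted by a signal on an
 * interruptible (MI_INT) mount, otherwise 0.
 */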
int
nfs_waitfor_purge_complete(vnode_t *vp)
{
	rnode_t *rp;
	k_sigset_t smask;

	rp = VTOR(vp);
	if (rp->r_serial != NULL && rp->r_serial != curthread) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				sigunintr(&smask);
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		sigunintr(&smask);
		mutex_exit(&rp->r_statelock);
	}
	return (0);
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	struct vattr va;

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	va.va_mask = AT_ALL;
	return (nfs_getattr_otw(vp, &va, cr));
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs3_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	struct vattr va;

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	va.va_mask = AT_ALL;
	return (nfs3_getattr_otw(vp, &va, cr));
}

/*
 * Purge all of the various NFS `data' caches.
 */
void
nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
{
	rnode_t *rp;
	char *contents;
	int size;
	int error;

	/*
	 * Purge the DNLC for any entries which refer to this file.
	 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
	 */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
	    !(rp->r_flags & RINDNLCPURGE)) {
		/*
		 * Set the RINDNLCPURGE flag to prevent recursive entry
		 * into dnlc_purge_vp()
		 */
		if (vp->v_type == VDIR)
			rp->r_flags |= RINDNLCPURGE;
		mutex_exit(&rp->r_statelock);
		dnlc_purge_vp(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & RINDNLCPURGE)
			rp->r_flags &= ~RINDNLCPURGE;
	}

	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}

	/*
	 * Flush the page cache.
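	 * If the putpage fails with ENOSPC or EDQUOT, record the error
	 * in r_error (if one is not already pending) so it is not lost.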
	 */
	if (vn_has_cached_data(vp)) {
		error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr);
		if (error && (error == ENOSPC || error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = error;
			mutex_exit(&rp->r_statelock);
		}
	}

	/*
	 * Flush the readdir response cache.
	 */
	if (HAVE_RDDIR_CACHE(rp))
		nfs_purge_rddir_cache(vp);
}

/*
 * Purge the readdir cache of all entries
 */
void
nfs_purge_rddir_cache(vnode_t *vp)
{
	rnode_t *rp;
	rddir_cache *rdc;
	rddir_cache *nrdc;

	rp = VTOR(vp);
top:
	mutex_enter(&rp->r_statelock);
	rp->r_direof = NULL;
	rp->r_flags &= ~RLOOKUP;
	rp->r_flags |= RREADDIRPLUS;
	rdc = avl_first(&rp->r_dir);
	while (rdc != NULL) {
		nrdc = AVL_NEXT(&rp->r_dir, rdc);
		avl_remove(&rp->r_dir, rdc);
		rddir_cache_rele(rdc);
		rdc = nrdc;
	}
	mutex_exit(&rp->r_statelock);
}

/*
 * Do a cache check based on the post-operation attributes.
 * Then make them the new cached attributes. If no attributes
 * were returned, then mark the attributes as timed out.
 */
void
nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
{
	vattr_t attr;

	if (!poap->attributes) {
		PURGE_ATTRCACHE(vp);
		return;
	}
	(void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
}

/*
 * Same as above, but using a vattr
 */
void
nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
	cred_t *cr)
{
	if (!poap->attributes) {
		PURGE_ATTRCACHE(vp);
		return;
	}
	nfs_attr_cache(vp, poap->fres.vap, t, cr);
}

/*
 * Do a cache check based on the weak cache consistency attributes.
 * These consist of a small set of pre-operation attributes and the
 * full set of post-operation attributes.
 *
 * If we are given the pre-operation attributes, then use them to
 * check the validity of the various caches. Then, if we got the
 * post-operation attributes, make them the new cached attributes.
 * If we didn't get the post-operation attributes, then mark the
 * attribute cache as timed out so that the next reference will
 * cause a GETATTR to the server to refresh with the current
 * attributes.
 *
 * Otherwise, if we didn't get the pre-operation attributes, but
 * we did get the post-operation attributes, then use these
 * attributes to check the validity of the various caches. This
 * will probably cause a flush of the caches because if the
 * operation succeeded, the attributes of the object were changed
 * in some way from the old post-operation attributes. This
 * should be okay because it is the safe thing to do. After
 * checking the data caches, then we make these the new cached
 * attributes.
 *
 * Otherwise, we didn't get either the pre- or post-operation
 * attributes. Simply mark the attribute cache as timed out so
 * the next reference will cause a GETATTR to the server to
 * refresh with the current attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
{
	vattr_t bva;
	vattr_t ava;

	if (wccp->after.attributes) {
		if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
			PURGE_ATTRCACHE(vp);
			return;
		}
		if (wccp->before.attributes) {
			bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
			bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
			bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
			bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
			bva.va_size = wccp->before.attr.size;
			nfs3_attr_cache(vp, &bva, &ava, t, cr);
		} else
			nfs_attr_cache(vp, &ava, t, cr);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Set attributes cache for given vnode using nfsattr.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
{
	rnode_t *rp;
	struct vattr va;

	if (!nattr_to_vattr(vp, na, &va)) {
		rp = VTOR(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, &va);
		mutex_exit(&rp->r_statelock);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Set attributes cache for given vnode using fattr3.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
{
	rnode_t *rp;
	struct vattr va;

	if (!fattr3_to_vattr(vp, na, &va)) {
		rp = VTOR(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, &va);
		mutex_exit(&rp->r_statelock);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Do a cache check based on attributes returned over the wire. The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
	cred_t *cr)
{
	int error;

	error = nattr_to_vattr(vp, na, vap);
	if (error)
		return (error);
	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Do a cache check based on attributes returned over the wire. The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
{
	int error;

	error = fattr3_to_vattr(vp, na, vap);
	if (error)
		return (error);
	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock. If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 */
void
nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
{
	rnode_t *rp;
	int mtime_changed;
	int ctime_changed;
	vsecattr_t *vsp;
	int was_serial;

	rp = VTOR(vp);

	mutex_enter(&rp->r_statelock);

	if (rp->r_serial != curthread) {
		klwp_t *lwp = ttolwp(curthread);

		was_serial = 0;
		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	} else
		was_serial = 1;

	if (rp->r_mtime > t) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (!(rp->r_flags & RWRITEATTR)) {
		if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
			mtime_changed = 1;
		else
			mtime_changed = 0;
		if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
		    rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
			ctime_changed = 1;
		else
			ctime_changed = 0;
	} else if (rp->r_size != vap->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) {
		mtime_changed = 1;
		ctime_changed = 0;
	} else {
		mtime_changed = 0;
		ctime_changed = 0;
	}

	nfs_attrcache_va(vp, vap);

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	if (mtime_changed)
		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);

	if (ctime_changed) {
		(void) nfs_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Use the passed in "before" virtual attributes to check to see
 * whether the data and metadata caches are valid, cache the "after"
 * new attributes, and then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock. If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with both pre operation attributes and post operation
 * attributes.
 */
static void
nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
	cred_t *cr)
{
	rnode_t *rp;
	int mtime_changed;
	int ctime_changed;
	vsecattr_t *vsp;
	int was_serial;

	rp = VTOR(vp);

	mutex_enter(&rp->r_statelock);

	if (rp->r_serial != curthread) {
		klwp_t *lwp = ttolwp(curthread);

		was_serial = 0;
		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	} else
		was_serial = 1;

	if (rp->r_mtime > t) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (!(rp->r_flags & RWRITEATTR)) {
		if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
			mtime_changed = 1;
		else
			mtime_changed = 0;
		if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
		    rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
			ctime_changed = 1;
		else
			ctime_changed = 0;
	} else {
		mtime_changed = 0;
		ctime_changed = 0;
	}

	nfs_attrcache_va(vp, avap);

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	if (mtime_changed)
		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);

	if (ctime_changed) {
		(void) nfs_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
void
nfs_attrcache_va(vnode_t *vp, struct vattr *va)
{
	rnode_t *rp;
	mntinfo_t *mi;
	hrtime_t delta;
	hrtime_t now;

	rp = VTOR(vp);

	ASSERT(MUTEX_HELD(&rp->r_statelock));

	now = gethrtime();

	mi = VTOMI(vp);

	/*
	 * Delta is the number of nanoseconds that we will
	 * cache the attributes of the file. It is based on
	 * the number of nanoseconds since the last time that
	 * we detected a change. The assumption is that files
	 * that changed recently are likely to change again.
	 * There are enforced minimum and maximum values for
	 * regular files and for directories.
	 *
	 * Using the time since last change was detected
	 * eliminates direct comparison or calculation
	 * using mixed client and server times. NFS does
	 * not make any assumptions regarding the client
	 * and server clocks being synchronized.
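	 *
	 * As an illustration (hypothetical values): with an acregmin of
	 * 3 seconds and an acregmax of 60 seconds, a file whose last
	 * detected change was 10 seconds ago is cached for 10 seconds,
	 * one that changed 1 second ago is cached for the 3 second
	 * minimum, and one that has been stable for 5 minutes is cached
	 * for the 60 second maximum.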
	 */
	if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    va->va_size != rp->r_attr.va_size)
		rp->r_mtime = now;

	if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_mtime;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_attrtime = now + delta;
	rp->r_attr = *va;
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != va->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
		rp->r_size = va->va_size;
	nfs_setswaplike(vp, va);
	rp->r_flags &= ~RWRITEATTR;
}

/*
 * Fill in attribute from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
{
	rnode_t *rp;

	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (ATTRCACHE_VALID(vp)) {
		/*
		 * Cached attributes are valid
		 */
		*vap = rp->r_attr;
		mutex_exit(&rp->r_statelock);
		return (0);
	}
	mutex_exit(&rp->r_statelock);
	return (1);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	struct nfsattrstat ns;
	int douprintf;
	mntinfo_t *mi;
	failinfo_t fi;
	hrtime_t t;

	mi = VTOMI(vp);
	fi.vp = vp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	if (mi->mi_flags & MI_ACL) {
		error = acl_getattr2_otw(vp, vap, cr);
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(mi, RFS_GETATTR,
	    xdr_fhandle, (caddr_t)VTOFH(vp),
	    xdr_attrstat, (caddr_t)&ns, cr,
	    &douprintf, &ns.ns_status, 0, &fi);

	if (!error) {
		error = geterrno(ns.ns_status);
		if (!error)
			error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
		else {
			PURGE_STALE_FH(error, vp, cr);
		}
	}

	return (error);
}

/*
 * Return either cached or remote attributes. If we get remote
 * attributes, use them to check and invalidate caches, then cache
 * the new attributes.
 */
int
nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	rnode_t *rp;

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
	 */
	error = nfs_getattr_cache(vp, vap);
	if (error)
		error = nfs_getattr_otw(vp, vap, cr);

	/* Return the client's view of file size */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	vap->va_size = rp->r_size;
	mutex_exit(&rp->r_statelock);

	return (error);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	GETATTR3args args;
	GETATTR3vres res;
	int douprintf;
	failinfo_t fi;
	hrtime_t t;

	args.object = *VTOFH3(vp);
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.object;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;
	res.fres.vp = vp;
	res.fres.vap = vap;

	douprintf = 1;

	t = gethrtime();

	error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
	    xdr_nfs_fh3, (caddr_t)&args,
	    xdr_GETATTR3vres, (caddr_t)&res, cr,
	    &douprintf, &res.status, 0, &fi);

	if (error)
		return (error);

	error = geterrno3(res.status);
	if (error) {
		PURGE_STALE_FH(error, vp, cr);
		return (error);
	}

	/*
	 * Catch status codes that indicate fattr3 to vattr translation failure
	 */
	if (res.fres.status)
		return (res.fres.status);

	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Return either cached or remote attributes. If we get remote
 * attributes, use them to check and invalidate caches, then cache
 * the new attributes.
 */
int
nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	rnode_t *rp;

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
	 */
	error = nfs_getattr_cache(vp, vap);
	if (error)
		error = nfs3_getattr_otw(vp, vap, cr);

	/* Return the client's view of file size */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	vap->va_size = rp->r_size;
	mutex_exit(&rp->r_statelock);

	return (error);
}

vtype_t nf_to_vt[] = {
	VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
};
/*
 * Convert NFS Version 2 over the network attributes to the local
 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 * Returns 0 for success, error if failed due to overflow.
 */
int
nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
{
	/* overflow in time attributes? */
#ifndef _LP64
	if (!NFS2_FATTR_TIME_OK(na))
		return (EOVERFLOW);
#endif

	if (na->na_type < NFNON || na->na_type > NFSOC)
		vap->va_type = VBAD;
	else
		vap->va_type = nf_to_vt[na->na_type];
	vap->va_mode = na->na_mode;
	vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
	vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_nodeid = na->na_nodeid;
	vap->va_nlink = na->na_nlink;
	vap->va_size = na->na_size;	/* keep for cache validation */
	/*
	 * nfs protocol defines times as unsigned so don't extend sign,
	 * unless sysadmin set nfs_allow_preepoch_time.
	 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
	vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
	vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
	vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
	/*
	 * Shannon's law - uncompress the received dev_t
	 * if the top half of it is zero, indicating a response
	 * from an `older style' OS. Except for when it is a
	 * `new style' OS sending the maj device of zero,
	 * in which case the algorithm still works because the
	 * fact that it is a new style server
	 * is hidden by the minor device not being greater
	 * than 255 (a requirement in this case).
	 */
	if ((na->na_rdev & 0xffff0000) == 0)
		vap->va_rdev = nfsv2_expdev(na->na_rdev);
	else
		vap->va_rdev = expldev(na->na_rdev);

	vap->va_nblocks = na->na_blocks;
	switch (na->na_type) {
	case NFBLK:
		vap->va_blksize = DEV_BSIZE;
		break;

	case NFCHR:
		vap->va_blksize = MAXBSIZE;
		break;

	case NFSOC:
	default:
		vap->va_blksize = na->na_blocksize;
		break;
	}
	/*
	 * This bit of ugliness is a hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.
	 * It remaps the special over-the-wire type to the
	 * VFIFO type. (see note in nfs.h)
	 */
	if (NA_ISFIFO(na)) {
		vap->va_type = VFIFO;
		vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
		vap->va_rdev = 0;
		vap->va_blksize = na->na_blocksize;
	}
	vap->va_seq = 0;
	return (0);
}

/*
 * Convert NFS Version 3 over the network attributes to the local
 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 */
vtype_t nf3_to_vt[] = {
	VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
};

int
fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
{

#ifndef _LP64
	/* overflow in time attributes? */
	if (!NFS3_FATTR_TIME_OK(na))
		return (EOVERFLOW);
#endif
	if (!NFS3_SIZE_OK(na->size))
		/* file too big */
		return (EFBIG);

	vap->va_mask = AT_ALL;

	if (na->type < NF3REG || na->type > NF3FIFO)
		vap->va_type = VBAD;
	else
		vap->va_type = nf3_to_vt[na->type];
	vap->va_mode = na->mode;
	vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
	vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_nodeid = na->fileid;
	vap->va_nlink = na->nlink;
	vap->va_size = na->size;

	/*
	 * nfs protocol defines times as unsigned so don't extend sign,
	 * unless sysadmin set nfs_allow_preepoch_time.
	 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
	vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
	vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
	vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;

	switch (na->type) {
	case NF3BLK:
		vap->va_rdev = makedevice(na->rdev.specdata1,
		    na->rdev.specdata2);
		vap->va_blksize = DEV_BSIZE;
		vap->va_nblocks = 0;
		break;
	case NF3CHR:
		vap->va_rdev = makedevice(na->rdev.specdata1,
		    na->rdev.specdata2);
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = 0;
		break;
	case NF3REG:
	case NF3DIR:
	case NF3LNK:
		vap->va_rdev = 0;
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = (u_longlong_t)
		    ((na->used + (size3)DEV_BSIZE - (size3)1) /
		    (size3)DEV_BSIZE);
		break;
	case NF3SOCK:
	case NF3FIFO:
	default:
		vap->va_rdev = 0;
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = 0;
		break;
	}
	vap->va_seq = 0;
	return (0);
}

/*
 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount. The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies. See nfs_async_putpage and nfs_async_start.
 */

int nfs_async_timeout = -1;	/* uninitialized */

static void	nfs_async_start(struct vfs *);

static void
free_async_args(struct nfs_async_reqs *args)
{
	rnode_t *rp;

	if (args->a_io != NFS_INACTIVE) {
		rp = VTOR(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS_PUTAPAGE ||
		    args->a_io == NFS_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done. It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 */
void
nfs_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo_t *mi;
	uint_t max_threads;

	mi = VFTOMI(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	mutex_enter(&mi->mi_lock);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount/zone is really going away.
	 *
	 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0. Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	while (!(mi->mi_flags & MI_ASYNC_MGR_STOP) ||
	    mi->mi_async_req_count > 0) {
		mutex_exit(&mi->mi_lock);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value. This means that future
			 * (debugger-induced) lowerings of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads <
			    MAX(mi->mi_max_threads, max_threads)) {
				mi->mi_threads++;
				mutex_exit(&mi->mi_async_lock);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0, nfs_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			cv_signal(&mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	cv_broadcast(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);	/* release thread's hold */
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
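 *
 * The manager notices MI_ASYNC_MGR_STOP, drains any remaining queued
 * requests, clears mi_manager_thread, and broadcasts mi_async_cv, which
 * is what we wait on here.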
 */
void
nfs_async_manager_stop(vfs_t *vfsp)
{
	mntinfo_t *mi = VFTOMI(vfsp);

	mutex_enter(&mi->mi_async_lock);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags |= MI_ASYNC_MGR_STOP;
	mutex_exit(&mi->mi_lock);
	cv_broadcast(&mi->mi_async_reqs_cv);
	while (mi->mi_manager_thread != NULL)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
}

int
nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
	struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
	u_offset_t, caddr_t, struct seg *, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	rp = VTOR(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI(vp);

	/*
	 * If addr falls in a different segment, don't bother doing readahead.
	 */
	if (addr >= seg->s_base + seg->s_size)
		return (-1);

	/*
	 * If we can't allocate a request structure, punt on the readahead.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		return (-1);

	/*
	 * If a lock operation is pending, don't initiate any new
	 * readaheads. Otherwise, bump r_count to indicate the new
	 * asynchronous I/O.
	 */
	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
		kmem_free(args, sizeof (*args));
		return (-1);
	}
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	nfs_rw_exit(&rp->r_lkserlock);

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_READ_AHEAD;
	args->a_nfs_readahead = readahead;
	args->a_nfs_blkoff = blkoff;
	args->a_nfs_seg = seg;
	args->a_nfs_addr = addr;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, don't bother readahead.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS_READ_AHEAD] = args;
		mi->mi_async_tail[NFS_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	VN_RELE(vp);
	crfree(cr);
	kmem_free(args, sizeof (*args));
	return (-1);
}

int
nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
	int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
	u_offset_t, size_t, int, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_PUTAPAGE;
	args->a_nfs_putapage = putapage;
	args->a_nfs_pp = pp;
	args->a_nfs_off = off;
	args->a_nfs_len = (uint_t)len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS_PUTAPAGE] = args;
		mi->mi_async_tail[NFS_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout (and the machine). In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set. We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done(). However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}
	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync putpage. We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*putapage)(vp, pp, off, len, flags, cr));
}

int
nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
	size_t, int, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_PAGEIO;
	args->a_nfs_pageio = pageio;
	args->a_nfs_pp = pp;
	args->a_nfs_off = io_off;
	args->a_nfs_len = (uint_t)io_len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS_PAGEIO] = args;
		mi->mi_async_tail[NFS_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS_PAGEIO] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine). In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set. We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done(). However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio. We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
}

void
nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
	int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	rp = VTOR(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the readdir
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_READDIR;
	args->a_nfs_readdir = readdir;
	args->a_nfs_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
		mi->mi_async_reqs[NFS_READDIR] = args;
		mi->mi_async_tail[NFS_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS_READDIR]->a_next = args;
		mi->mi_async_tail[NFS_READDIR] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	rdc->entries = NULL;
	mutex_enter(&rp->r_statelock);
	ASSERT(rdc->flags & RDDIR);
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	/*
	 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
	 * is set, wakeup the thread sleeping in cv_wait_sig().
	 * The woken up thread will reset the flag to RDDIR and will
	 * continue with the readdir operation.
	 */
	if (rdc->flags & RDDIRWAIT) {
		rdc->flags &= ~RDDIRWAIT;
		cv_broadcast(&rdc->cv);
	}
	mutex_exit(&rp->r_statelock);
	rddir_cache_rele(rdc);
}

void
nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
	cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
	cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;
	page_t *pp;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_COMMIT;
	args->a_nfs_commit = commit;
	args->a_nfs_plist = plist;
	args->a_nfs_offset = offset;
	args->a_nfs_count = count;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
		mi->mi_async_reqs[NFS_COMMIT] = args;
		mi->mi_async_tail[NFS_COMMIT] = args;
	} else {
		mi->mi_async_tail[NFS_COMMIT]->a_next = args;
		mi->mi_async_tail[NFS_COMMIT] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != mi->mi_zone) {
		while (plist != NULL) {
			pp = plist;
			page_sub(&plist, pp);
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
		}
		return;
	}
	(*commit)(vp, plist, offset, count, cr);
}

void
nfs_async_inactive(vnode_t *vp, cred_t *cr,
	void (*inactive)(vnode_t *, cred_t *))
{
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	mi = VTOMI(vp);

	args = kmem_alloc(sizeof (*args), KM_SLEEP);
	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_INACTIVE;
	args->a_nfs_inactive = inactive;

	/*
	 * Note that we don't check mi->mi_max_threads here, since we
	 * *need* to get rid of this vnode regardless of whether someone
	 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
	 *
	 * The manager thread knows about this and is willing to create
	 * at least one thread to accommodate us.
	 */
	mutex_enter(&mi->mi_async_lock);
	if (mi->mi_manager_thread == NULL) {
		rnode_t *rp = VTOR(vp);

		mutex_exit(&mi->mi_async_lock);
		crfree(cr);	/* drop our reference */
		kmem_free(args, sizeof (*args));
		/*
		 * We can't do an over-the-wire call since we're in the wrong
		 * zone, so we need to clean up state as best we can and then
		 * throw away the vnode.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			vnode_t *unldvp;
			char *unlname;
			cred_t *unlcred;

			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);

			VN_RELE(unldvp);
			kmem_free(unlname, MAXNAMELEN);
			crfree(unlcred);
		} else {
			mutex_exit(&rp->r_statelock);
		}
		/*
		 * No need to explicitly throw away any cached pages. The
		 * eventual rinactive() will attempt a synchronous
		 * VOP_PUTPAGE() which will immediately fail since the request
		 * is coming from the wrong zone, and then will proceed to call
		 * nfs_invalidate_pages() which will clean things up for us.
		 */
		rp_addfree(VTOR(vp), cr);
		return;
	}

	if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
		mi->mi_async_reqs[NFS_INACTIVE] = args;
	} else {
		mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
	}
	mi->mi_async_tail[NFS_INACTIVE] = args;
	/*
	 * Don't increment r_count, since we're trying to get rid of the vnode.
	 */

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
}

/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type. Requests are taken
 * from the queues in a round-robin fashion. A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue. This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs_async_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system. Specifically over the
 * wire calls are cpr-unsafe. The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
static void
nfs_async_start(struct vfs *vfsp)
{
	struct nfs_async_reqs *args;
	mntinfo_t *mi = VFTOMI(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry. We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < NFS_ASYNC_TYPES; i++) {
			args = *mi->mi_async_curr;
			if (args != NULL)
				break;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				if (--mi->mi_threads == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				zthread_exit();
				/* NOTREACHED */
			}
			time_left = cv_timedwait(&mi->mi_async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout + lbolt);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		}
		time_left = 1;

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer. If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
1983 */ 1984 *mi->mi_async_curr = args->a_next; 1985 if (*mi->mi_async_curr == NULL || 1986 --mi->mi_async_clusters[args->a_io] == 0) { 1987 mi->mi_async_clusters[args->a_io] = 1988 mi->mi_async_init_clusters; 1989 mi->mi_async_curr++; 1990 if (mi->mi_async_curr == 1991 &mi->mi_async_reqs[NFS_ASYNC_TYPES]) 1992 mi->mi_async_curr = &mi->mi_async_reqs[0]; 1993 } 1994 1995 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) { 1996 mutex_enter(&mi->mi_lock); 1997 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 1998 mutex_exit(&mi->mi_lock); 1999 } 2000 2001 mutex_exit(&mi->mi_async_lock); 2002 2003 /* 2004 * Obtain arguments from the async request structure. 2005 */ 2006 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) { 2007 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff, 2008 args->a_nfs_addr, args->a_nfs_seg, 2009 args->a_cred); 2010 } else if (args->a_io == NFS_PUTAPAGE) { 2011 (void) (*args->a_nfs_putapage)(args->a_vp, 2012 args->a_nfs_pp, args->a_nfs_off, 2013 args->a_nfs_len, args->a_nfs_flags, 2014 args->a_cred); 2015 } else if (args->a_io == NFS_PAGEIO) { 2016 (void) (*args->a_nfs_pageio)(args->a_vp, 2017 args->a_nfs_pp, args->a_nfs_off, 2018 args->a_nfs_len, args->a_nfs_flags, 2019 args->a_cred); 2020 } else if (args->a_io == NFS_READDIR) { 2021 (void) ((*args->a_nfs_readdir)(args->a_vp, 2022 args->a_nfs_rdc, args->a_cred)); 2023 } else if (args->a_io == NFS_COMMIT) { 2024 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist, 2025 args->a_nfs_offset, args->a_nfs_count, 2026 args->a_cred); 2027 } else if (args->a_io == NFS_INACTIVE) { 2028 (*args->a_nfs_inactive)(args->a_vp, args->a_cred); 2029 } 2030 2031 /* 2032 * Now, release the vnode and free the credentials 2033 * structure. 2034 */ 2035 free_async_args(args); 2036 /* 2037 * Reacquire the mutex because it will be needed above. 2038 */ 2039 mutex_enter(&mi->mi_async_lock); 2040 } 2041 } 2042 2043 void 2044 nfs_async_stop(struct vfs *vfsp) 2045 { 2046 mntinfo_t *mi = VFTOMI(vfsp); 2047 2048 /* 2049 * Wait for all outstanding async operations to complete and for the 2050 * worker threads to exit. 2051 */ 2052 mutex_enter(&mi->mi_async_lock); 2053 mi->mi_max_threads = 0; 2054 cv_broadcast(&mi->mi_async_work_cv); 2055 while (mi->mi_threads != 0) 2056 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2057 mutex_exit(&mi->mi_async_lock); 2058 } 2059 2060 /* 2061 * nfs_async_stop_sig: 2062 * Wait for all outstanding putpage operations to complete. If a signal 2063 * is delivered we will abort and return non-zero. If we can put all the 2064 * pages we will return 0. This routine is called from nfs_unmount and 2065 * nfs3_unmount to make these operations interruptible. 2066 */ 2067 int 2068 nfs_async_stop_sig(struct vfs *vfsp) 2069 { 2070 mntinfo_t *mi = VFTOMI(vfsp); 2071 ushort_t omax; 2072 int rval; 2073 2074 /* 2075 * Wait for all outstanding async operations to complete and for the 2076 * worker threads to exit. 2077 */ 2078 mutex_enter(&mi->mi_async_lock); 2079 omax = mi->mi_max_threads; 2080 mi->mi_max_threads = 0; 2081 /* 2082 * Tell all the worker threads to exit.
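 *
 * (Unlike nfs_async_stop() above, the wait that follows uses
 * cv_wait_sig(), so a signal can interrupt it.  If workers are still
 * running when that happens, mi_max_threads is restored from omax so
 * the mount keeps its async worker service when the interrupted
 * unmount is abandoned.)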
2083 */ 2084 cv_broadcast(&mi->mi_async_work_cv); 2085 while (mi->mi_threads != 0) { 2086 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) 2087 break; 2088 } 2089 rval = (mi->mi_threads != 0); /* Interrupted */ 2090 if (rval) 2091 mi->mi_max_threads = omax; 2092 mutex_exit(&mi->mi_async_lock); 2093 2094 return (rval); 2095 } 2096 2097 int 2098 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2099 { 2100 int pagecreate; 2101 int n; 2102 int saved_n; 2103 caddr_t saved_base; 2104 u_offset_t offset; 2105 int error; 2106 int sm_error; 2107 vnode_t *vp = RTOV(rp); 2108 2109 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2110 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2111 if (!vpm_enable) { 2112 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2113 } 2114 2115 /* 2116 * Move bytes in at most PAGESIZE chunks. We must avoid 2117 * spanning pages in uiomove() because page faults may cause 2118 * the cache to be invalidated out from under us. The r_size is not 2119 * updated until after the uiomove. If we push the last page of a 2120 * file before r_size is correct, we will lose the data written past 2121 * the current (and invalid) r_size. 2122 */ 2123 do { 2124 offset = uio->uio_loffset; 2125 pagecreate = 0; 2126 2127 /* 2128 * n is the number of bytes required to satisfy the request 2129 * or the number of bytes to fill out the page. 2130 */ 2131 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2132 2133 /* 2134 * Check to see if we can skip reading in the page 2135 * and just allocate the memory. We can do this 2136 * if we are going to rewrite the entire mapping 2137 * or if we are going to write to or beyond the current 2138 * end of file from the beginning of the mapping. 2139 * 2140 * The read of r_size is now protected by r_statelock. 2141 */ 2142 mutex_enter(&rp->r_statelock); 2143 /* 2144 * When pgcreated is nonzero the caller has already done 2145 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2146 * segkpm this means we already have at least one page 2147 * created and mapped at base. 2148 */ 2149 pagecreate = pgcreated || 2150 ((offset & PAGEOFFSET) == 0 && 2151 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2152 2153 mutex_exit(&rp->r_statelock); 2154 if (!vpm_enable && pagecreate) { 2155 /* 2156 * The last argument tells segmap_pagecreate() to 2157 * always lock the page, as opposed to sometimes 2158 * returning with the page locked. This way we avoid a 2159 * fault on the ensuing uiomove(), but also 2160 * more importantly (to fix bug 1094402) we can 2161 * call segmap_fault() to unlock the page in all 2162 * cases. An alternative would be to modify 2163 * segmap_pagecreate() to tell us when it is 2164 * locking a page, but that's a fairly major 2165 * interface change. 2166 */ 2167 if (pgcreated == 0) 2168 (void) segmap_pagecreate(segkmap, base, 2169 (uint_t)n, 1); 2170 saved_base = base; 2171 saved_n = n; 2172 } 2173 2174 /* 2175 * The number of bytes of data in the last page can not 2176 * be accurately be determined while page is being 2177 * uiomove'd to and the size of the file being updated. 2178 * Thus, inform threads which need to know accurately 2179 * how much data is in the last page of the file. They 2180 * will not do the i/o immediately, but will arrange for 2181 * the i/o to happen later when this modify operation 2182 * will have finished. 
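 *
 * Illustrative sketch of the consumer side (the actual check lives in
 * the putpage path, not in this file; the field names below follow the
 * fields set here, the rest is assumed):
 *
 *	mutex_enter(&rp->r_statelock);
 *	if ((rp->r_flags & RMODINPROGRESS) &&
 *	    rp->r_modaddr + MAXBSIZE > io_off &&
 *	    rp->r_modaddr < io_off + io_len) {
 *		defer this i/o until RMODINPROGRESS is clear
 *	}
 *	mutex_exit(&rp->r_statelock);
 *
 * r_modaddr (set below to offset & MAXBMASK) names the MAXBSIZE block
 * being filled in, and RMODINPROGRESS brackets the window in which the
 * page contents and r_size are not yet consistent with each other.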
2183 */ 2184 ASSERT(!(rp->r_flags & RMODINPROGRESS)); 2185 mutex_enter(&rp->r_statelock); 2186 rp->r_flags |= RMODINPROGRESS; 2187 rp->r_modaddr = (offset & MAXBMASK); 2188 mutex_exit(&rp->r_statelock); 2189 2190 if (vpm_enable) { 2191 /* 2192 * Copy data. If new pages are created, part of 2193 * the page that is not written will be initialized 2194 * with zeros. 2195 */ 2196 error = vpm_data_copy(vp, offset, n, uio, 2197 !pagecreate, NULL, 0, S_WRITE); 2198 } else { 2199 error = uiomove(base, n, UIO_WRITE, uio); 2200 } 2201 2202 /* 2203 * r_size is the maximum number of 2204 * bytes known to be in the file. 2205 * Make sure it is at least as high as the 2206 * first unwritten byte pointed to by uio_loffset. 2207 */ 2208 mutex_enter(&rp->r_statelock); 2209 if (rp->r_size < uio->uio_loffset) 2210 rp->r_size = uio->uio_loffset; 2211 rp->r_flags &= ~RMODINPROGRESS; 2212 rp->r_flags |= RDIRTY; 2213 mutex_exit(&rp->r_statelock); 2214 2215 /* n = # of bytes written */ 2216 n = (int)(uio->uio_loffset - offset); 2217 2218 if (!vpm_enable) { 2219 base += n; 2220 } 2221 tcount -= n; 2222 /* 2223 * If we created pages w/o initializing them completely, 2224 * we need to zero the part that wasn't set up. 2225 * This happens on most EOF write cases and if 2226 * we had some sort of error during the uiomove. 2227 */ 2228 if (!vpm_enable && pagecreate) { 2229 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2230 (void) kzero(base, PAGESIZE - n); 2231 2232 if (pgcreated) { 2233 /* 2234 * Caller is responsible for this page, 2235 * it was not created in this loop. 2236 */ 2237 pgcreated = 0; 2238 } else { 2239 /* 2240 * For bug 1094402: segmap_pagecreate locks 2241 * page. Unlock it. This also unlocks the 2242 * pages allocated by page_create_va() in 2243 * segmap_pagecreate(). 2244 */ 2245 sm_error = segmap_fault(kas.a_hat, segkmap, 2246 saved_base, saved_n, 2247 F_SOFTUNLOCK, S_WRITE); 2248 if (error == 0) 2249 error = sm_error; 2250 } 2251 } 2252 } while (tcount > 0 && error == 0); 2253 2254 return (error); 2255 } 2256 2257 int 2258 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2259 { 2260 rnode_t *rp; 2261 page_t *pp; 2262 u_offset_t eoff; 2263 u_offset_t io_off; 2264 size_t io_len; 2265 int error; 2266 int rdirty; 2267 int err; 2268 2269 rp = VTOR(vp); 2270 ASSERT(rp->r_count > 0); 2271 2272 if (!vn_has_cached_data(vp)) 2273 return (0); 2274 2275 ASSERT(vp->v_type != VCHR); 2276 2277 /* 2278 * If ROUTOFSPACE is set, then all writes turn into B_INVAL 2279 * writes. B_FORCE is set to force the VM system to actually 2280 * invalidate the pages, even if the i/o failed. The pages 2281 * need to get invalidated because they can't be written out 2282 * because there isn't any space left on either the server's 2283 * file system or in the user's disk quota. The B_FREE bit 2284 * is cleared to avoid confusion as to whether this is a 2285 * request to place the page on the freelist or to destroy 2286 * it. 2287 */ 2288 if ((rp->r_flags & ROUTOFSPACE) || 2289 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2290 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2291 2292 if (len == 0) { 2293 /* 2294 * If doing a full file synchronous operation, then clear 2295 * the RDIRTY bit. If a page gets dirtied while the flush 2296 * is happening, then RDIRTY will get set again. The 2297 * RDIRTY bit must get cleared before the flush so that 2298 * we don't lose this information. 2299 * 2300 * If there are no full file async write operations 2301 * pending and the RDIRTY bit is set, clear it.
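 *
 * Worked timeline (illustrative) of why the bit is cleared first:
 *	t0  clear RDIRTY, remembering the old value in rdirty
 *	t1  pvn_vplist_dirty() starts flushing
 *	t2  another thread dirties a page and sets RDIRTY again
 *	t3  the flush finishes
 * Clearing at t3 instead of t0 would wipe out what happened at t2.
 * The only information at risk from clearing early is the pre-flush
 * dirty state, which is why RDIRTY is restored further down when the
 * flush fails and the pages were not being forcibly invalidated.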
2302 */ 2303 if (off == (u_offset_t)0 && 2304 !(flags & B_ASYNC) && 2305 (rp->r_flags & RDIRTY)) { 2306 mutex_enter(&rp->r_statelock); 2307 rdirty = (rp->r_flags & RDIRTY); 2308 rp->r_flags &= ~RDIRTY; 2309 mutex_exit(&rp->r_statelock); 2310 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2311 mutex_enter(&rp->r_statelock); 2312 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) { 2313 rdirty = (rp->r_flags & RDIRTY); 2314 rp->r_flags &= ~RDIRTY; 2315 } 2316 mutex_exit(&rp->r_statelock); 2317 } else 2318 rdirty = 0; 2319 2320 /* 2321 * Search the entire vp list for pages >= off, and flush 2322 * the dirty pages. 2323 */ 2324 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2325 flags, cr); 2326 2327 /* 2328 * If an error occured and the file was marked as dirty 2329 * before and we aren't forcibly invalidating pages, then 2330 * reset the RDIRTY flag. 2331 */ 2332 if (error && rdirty && 2333 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2334 mutex_enter(&rp->r_statelock); 2335 rp->r_flags |= RDIRTY; 2336 mutex_exit(&rp->r_statelock); 2337 } 2338 } else { 2339 /* 2340 * Do a range from [off...off + len) looking for pages 2341 * to deal with. 2342 */ 2343 error = 0; 2344 #ifdef lint 2345 io_len = 0; 2346 #endif 2347 eoff = off + len; 2348 mutex_enter(&rp->r_statelock); 2349 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2350 io_off += io_len) { 2351 mutex_exit(&rp->r_statelock); 2352 /* 2353 * If we are not invalidating, synchronously 2354 * freeing or writing pages use the routine 2355 * page_lookup_nowait() to prevent reclaiming 2356 * them from the free list. 2357 */ 2358 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2359 pp = page_lookup(vp, io_off, 2360 (flags & (B_INVAL | B_FREE)) ? 2361 SE_EXCL : SE_SHARED); 2362 } else { 2363 pp = page_lookup_nowait(vp, io_off, 2364 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2365 } 2366 2367 if (pp == NULL || !pvn_getdirty(pp, flags)) 2368 io_len = PAGESIZE; 2369 else { 2370 err = (*rp->r_putapage)(vp, pp, &io_off, 2371 &io_len, flags, cr); 2372 if (!error) 2373 error = err; 2374 /* 2375 * "io_off" and "io_len" are returned as 2376 * the range of pages we actually wrote. 2377 * This allows us to skip ahead more quickly 2378 * since several pages may've been dealt 2379 * with by this iteration of the loop. 2380 */ 2381 } 2382 mutex_enter(&rp->r_statelock); 2383 } 2384 mutex_exit(&rp->r_statelock); 2385 } 2386 2387 return (error); 2388 } 2389 2390 void 2391 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2392 { 2393 rnode_t *rp; 2394 2395 rp = VTOR(vp); 2396 mutex_enter(&rp->r_statelock); 2397 while (rp->r_flags & RTRUNCATE) 2398 cv_wait(&rp->r_cv, &rp->r_statelock); 2399 rp->r_flags |= RTRUNCATE; 2400 if (off == (u_offset_t)0) { 2401 rp->r_flags &= ~RDIRTY; 2402 if (!(rp->r_flags & RSTALE)) 2403 rp->r_error = 0; 2404 } 2405 rp->r_truncaddr = off; 2406 mutex_exit(&rp->r_statelock); 2407 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2408 B_INVAL | B_TRUNC, cr); 2409 mutex_enter(&rp->r_statelock); 2410 rp->r_flags &= ~RTRUNCATE; 2411 cv_broadcast(&rp->r_cv); 2412 mutex_exit(&rp->r_statelock); 2413 } 2414 2415 static int nfs_write_error_to_cons_only = 0; 2416 #define MSG(x) (nfs_write_error_to_cons_only ? 
(x) : (x) + 1) 2417 2418 /* 2419 * Print a file handle 2420 */ 2421 void 2422 nfs_printfhandle(nfs_fhandle *fhp) 2423 { 2424 int *ip; 2425 char *buf; 2426 size_t bufsize; 2427 char *cp; 2428 2429 /* 2430 * 13 == "(file handle:" 2431 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times 2432 * 1 == ' ' 2433 * 8 == maximum strlen of "%x" 2434 * 3 == ")\n\0" 2435 */ 2436 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3; 2437 buf = kmem_alloc(bufsize, KM_NOSLEEP); 2438 if (buf == NULL) 2439 return; 2440 2441 cp = buf; 2442 (void) strcpy(cp, "(file handle:"); 2443 while (*cp != '\0') 2444 cp++; 2445 for (ip = (int *)fhp->fh_buf; 2446 ip < (int *)&fhp->fh_buf[fhp->fh_len]; 2447 ip++) { 2448 (void) sprintf(cp, " %x", *ip); 2449 while (*cp != '\0') 2450 cp++; 2451 } 2452 (void) strcpy(cp, ")\n"); 2453 2454 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf); 2455 2456 kmem_free(buf, bufsize); 2457 } 2458 2459 /* 2460 * Notify the system administrator that an NFS write error has 2461 * occurred. 2462 */ 2463 2464 /* seconds between ENOSPC/EDQUOT messages */ 2465 clock_t nfs_write_error_interval = 5; 2466 2467 void 2468 nfs_write_error(vnode_t *vp, int error, cred_t *cr) 2469 { 2470 mntinfo_t *mi; 2471 2472 mi = VTOMI(vp); 2473 /* 2474 * In case of forced unmount or zone shutdown, do not print any 2475 * messages since it can flood the console with error messages. 2476 */ 2477 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) 2478 return; 2479 2480 /* 2481 * No use in flooding the console with ENOSPC 2482 * messages from the same file system. 2483 */ 2484 if ((error != ENOSPC && error != EDQUOT) || 2485 lbolt - mi->mi_printftime > 0) { 2486 zoneid_t zoneid = mi->mi_zone->zone_id; 2487 2488 #ifdef DEBUG 2489 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2490 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL); 2491 #else 2492 nfs_perror(error, "NFS write error on host %s: %m.\n", 2493 VTOR(vp)->r_server->sv_hostname, NULL); 2494 #endif 2495 if (error == ENOSPC || error == EDQUOT) { 2496 zcmn_err(zoneid, CE_CONT, 2497 MSG("^File: userid=%d, groupid=%d\n"), 2498 crgetuid(cr), crgetgid(cr)); 2499 if (crgetuid(CRED()) != crgetuid(cr) || 2500 crgetgid(CRED()) != crgetgid(cr)) { 2501 zcmn_err(zoneid, CE_CONT, 2502 MSG("^User: userid=%d, groupid=%d\n"), 2503 crgetuid(CRED()), crgetgid(CRED())); 2504 } 2505 mi->mi_printftime = lbolt + 2506 nfs_write_error_interval * hz; 2507 } 2508 nfs_printfhandle(&VTOR(vp)->r_fh); 2509 #ifdef DEBUG 2510 if (error == EACCES) { 2511 zcmn_err(zoneid, CE_CONT, 2512 MSG("^nfs_bio: cred is%s kcred\n"), 2513 cr == kcred ? "" : " not"); 2514 } 2515 #endif 2516 } 2517 } 2518 2519 /* ARGSUSED */ 2520 static void * 2521 nfs_mi_init(zoneid_t zoneid) 2522 { 2523 struct mi_globals *mig; 2524 2525 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2526 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2527 list_create(&mig->mig_list, sizeof (mntinfo_t), 2528 offsetof(mntinfo_t, mi_zone_node)); 2529 mig->mig_destructor_called = B_FALSE; 2530 return (mig); 2531 } 2532 2533 /* 2534 * Callback routine to tell all NFS mounts in the zone to stop creating new 2535 * threads. Existing threads should exit. 
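 *
 * (These per-zone hooks are registered later in this file by
 * nfs_clntinit() via zone_key_create(&mi_list_key, nfs_mi_init,
 * nfs_mi_shutdown, nfs_mi_destroy): the zone framework calls
 * nfs_mi_init() when a zone is created, nfs_mi_shutdown() at zone
 * shutdown, and nfs_mi_destroy() when the zone is destroyed.)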
2536 */ 2537 /* ARGSUSED */ 2538 static void 2539 nfs_mi_shutdown(zoneid_t zoneid, void *data) 2540 { 2541 struct mi_globals *mig = data; 2542 mntinfo_t *mi; 2543 2544 ASSERT(mig != NULL); 2545 again: 2546 mutex_enter(&mig->mig_lock); 2547 for (mi = list_head(&mig->mig_list); mi != NULL; 2548 mi = list_next(&mig->mig_list, mi)) { 2549 2550 /* 2551 * If we've done the shutdown work for this FS, skip. 2552 * Once we go off the end of the list, we're done. 2553 */ 2554 if (mi->mi_flags & MI_DEAD) 2555 continue; 2556 2557 /* 2558 * We will do work, so not done. Get a hold on the FS. 2559 */ 2560 VFS_HOLD(mi->mi_vfsp); 2561 2562 /* 2563 * purge the DNLC for this filesystem 2564 */ 2565 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2566 2567 mutex_enter(&mi->mi_async_lock); 2568 /* 2569 * Tell existing async worker threads to exit. 2570 */ 2571 mi->mi_max_threads = 0; 2572 cv_broadcast(&mi->mi_async_work_cv); 2573 /* 2574 * Set MI_ASYNC_MGR_STOP so the async manager thread starts 2575 * getting ready to exit when it's done with its current work. 2576 * Also set MI_DEAD to note we've acted on this FS. 2577 */ 2578 mutex_enter(&mi->mi_lock); 2579 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD); 2580 mutex_exit(&mi->mi_lock); 2581 /* 2582 * Wake up the async manager thread. 2583 */ 2584 cv_broadcast(&mi->mi_async_reqs_cv); 2585 mutex_exit(&mi->mi_async_lock); 2586 2587 /* 2588 * Drop lock and release FS, which may change list, then repeat. 2589 * We're done when every mi has been done or the list is empty. 2590 */ 2591 mutex_exit(&mig->mig_lock); 2592 VFS_RELE(mi->mi_vfsp); 2593 goto again; 2594 } 2595 mutex_exit(&mig->mig_lock); 2596 } 2597 2598 static void 2599 nfs_mi_free_globals(struct mi_globals *mig) 2600 { 2601 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2602 mutex_destroy(&mig->mig_lock); 2603 kmem_free(mig, sizeof (*mig)); 2604 2605 } 2606 2607 /* ARGSUSED */ 2608 static void 2609 nfs_mi_destroy(zoneid_t zoneid, void *data) 2610 { 2611 struct mi_globals *mig = data; 2612 2613 ASSERT(mig != NULL); 2614 mutex_enter(&mig->mig_lock); 2615 if (list_head(&mig->mig_list) != NULL) { 2616 /* Still waiting for VFS_FREEVFS() */ 2617 mig->mig_destructor_called = B_TRUE; 2618 mutex_exit(&mig->mig_lock); 2619 return; 2620 } 2621 nfs_mi_free_globals(mig); 2622 } 2623 2624 /* 2625 * Add an NFS mount to the per-zone list of NFS mounts. 2626 */ 2627 void 2628 nfs_mi_zonelist_add(mntinfo_t *mi) 2629 { 2630 struct mi_globals *mig; 2631 2632 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2633 mutex_enter(&mig->mig_lock); 2634 list_insert_head(&mig->mig_list, mi); 2635 mutex_exit(&mig->mig_lock); 2636 } 2637 2638 /* 2639 * Remove an NFS mount from the per-zone list of NFS mounts. 2640 */ 2641 static void 2642 nfs_mi_zonelist_remove(mntinfo_t *mi) 2643 { 2644 struct mi_globals *mig; 2645 2646 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2647 mutex_enter(&mig->mig_lock); 2648 list_remove(&mig->mig_list, mi); 2649 /* 2650 * We can be called asynchronously by VFS_FREEVFS() after the zone 2651 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2652 * mi globals. 2653 */ 2654 if (list_head(&mig->mig_list) == NULL && 2655 mig->mig_destructor_called == B_TRUE) { 2656 nfs_mi_free_globals(mig); 2657 return; 2658 } 2659 mutex_exit(&mig->mig_lock); 2660 } 2661 2662 /* 2663 * NFS Client initialization routine. This routine should only be called 2664 * once. 
It performs the following tasks: 2665 * - Initalize all global locks 2666 * - Call sub-initialization routines (localize access to variables) 2667 */ 2668 int 2669 nfs_clntinit(void) 2670 { 2671 #ifdef DEBUG 2672 static boolean_t nfs_clntup = B_FALSE; 2673 #endif 2674 int error; 2675 2676 #ifdef DEBUG 2677 ASSERT(nfs_clntup == B_FALSE); 2678 #endif 2679 2680 error = nfs_subrinit(); 2681 if (error) 2682 return (error); 2683 2684 error = nfs_vfsinit(); 2685 if (error) { 2686 /* 2687 * Cleanup nfs_subrinit() work 2688 */ 2689 nfs_subrfini(); 2690 return (error); 2691 } 2692 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown, 2693 nfs_mi_destroy); 2694 2695 nfs4_clnt_init(); 2696 2697 #ifdef DEBUG 2698 nfs_clntup = B_TRUE; 2699 #endif 2700 2701 return (0); 2702 } 2703 2704 /* 2705 * This routine is only called if the NFS Client has been initialized but 2706 * the module failed to be installed. This routine will cleanup the previously 2707 * allocated/initialized work. 2708 */ 2709 void 2710 nfs_clntfini(void) 2711 { 2712 (void) zone_key_delete(mi_list_key); 2713 nfs_subrfini(); 2714 nfs_vfsfini(); 2715 nfs4_clnt_fini(); 2716 } 2717 2718 /* 2719 * nfs_lockrelease: 2720 * 2721 * Release any locks on the given vnode that are held by the current 2722 * process. 2723 */ 2724 void 2725 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 2726 { 2727 flock64_t ld; 2728 struct shrlock shr; 2729 char *buf; 2730 int remote_lock_possible; 2731 int ret; 2732 2733 ASSERT((uintptr_t)vp > KERNELBASE); 2734 2735 /* 2736 * Generate an explicit unlock operation for the entire file. As a 2737 * partial optimization, only generate the unlock if there is a 2738 * lock registered for the file. We could check whether this 2739 * particular process has any locks on the file, but that would 2740 * require the local locking code to provide yet another query 2741 * routine. Note that no explicit synchronization is needed here. 2742 * At worst, flk_has_remote_locks() will return a false positive, 2743 * in which case the unlock call wastes time but doesn't harm 2744 * correctness. 2745 * 2746 * In addition, an unlock request is generated if the process 2747 * is listed as possibly having a lock on the file because the 2748 * server and client lock managers may have gotten out of sync. 2749 * N.B. It is important to make sure nfs_remove_locking_id() is 2750 * called here even if flk_has_remote_locks(vp) reports true. 2751 * If it is not called and there is an entry on the process id 2752 * list, that entry will never get removed. 2753 */ 2754 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID, 2755 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2756 if (remote_lock_possible || flk_has_remote_locks(vp)) { 2757 ld.l_type = F_UNLCK; /* set to unlock entire file */ 2758 ld.l_whence = 0; /* unlock from start of file */ 2759 ld.l_start = 0; 2760 ld.l_len = 0; /* do entire file */ 2761 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr); 2762 2763 if (ret != 0) { 2764 /* 2765 * If VOP_FRLOCK fails, make sure we unregister 2766 * local locks before we continue. 2767 */ 2768 ld.l_pid = ttoproc(curthread)->p_pid; 2769 lm_register_lock_locally(vp, NULL, &ld, flag, offset); 2770 #ifdef DEBUG 2771 nfs_perror(ret, 2772 "NFS lock release error on vp %p: %m.\n", 2773 (void *)vp, NULL); 2774 #endif 2775 } 2776 2777 /* 2778 * The call to VOP_FRLOCK may put the pid back on the 2779 * list. We need to remove it. 
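 *
 * (The unlock issued above is itself an over-the-wire lock manager
 * request, so it can presumably take the same "not sure the server got
 * it" path that calls nfs_add_locking_id(); that is how the pid can
 * reappear on r_lmpl at this point, and why it is removed once more
 * below.)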
2780 */ 2781 (void) nfs_remove_locking_id(vp, RLMPL_PID, 2782 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2783 } 2784 2785 /* 2786 * As long as the vp has a share matching our pid, 2787 * pluck it off and unshare it. There are circumstances in 2788 * which the call to nfs_remove_locking_id() may put the 2789 * owner back on the list, in which case we simply do a 2790 * redundant and harmless unshare. 2791 */ 2792 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP); 2793 while (nfs_remove_locking_id(vp, RLMPL_OWNER, 2794 (char *)NULL, buf, &shr.s_own_len)) { 2795 shr.s_owner = buf; 2796 shr.s_access = 0; 2797 shr.s_deny = 0; 2798 shr.s_sysid = 0; 2799 shr.s_pid = curproc->p_pid; 2800 2801 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr); 2802 #ifdef DEBUG 2803 if (ret != 0) { 2804 nfs_perror(ret, 2805 "NFS share release error on vp %p: %m.\n", 2806 (void *)vp, NULL); 2807 } 2808 #endif 2809 } 2810 kmem_free(buf, MAX_SHR_OWNER_LEN); 2811 } 2812 2813 /* 2814 * nfs_lockcompletion: 2815 * 2816 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2817 * as non-cacheable (set VNOCACHE bit). 2818 */ 2819 2820 void 2821 nfs_lockcompletion(vnode_t *vp, int cmd) 2822 { 2823 #ifdef DEBUG 2824 rnode_t *rp = VTOR(vp); 2825 2826 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2827 #endif 2828 2829 if (cmd == F_SETLK || cmd == F_SETLKW) { 2830 if (!lm_safemap(vp)) { 2831 mutex_enter(&vp->v_lock); 2832 vp->v_flag |= VNOCACHE; 2833 mutex_exit(&vp->v_lock); 2834 } else { 2835 mutex_enter(&vp->v_lock); 2836 vp->v_flag &= ~VNOCACHE; 2837 mutex_exit(&vp->v_lock); 2838 } 2839 } 2840 /* 2841 * The cached attributes of the file are stale after acquiring 2842 * the lock on the file. They were updated when the file was 2843 * opened, but not updated when the lock was acquired. Therefore the 2844 * cached attributes are invalidated after the lock is obtained. 2845 */ 2846 PURGE_ATTRCACHE(vp); 2847 } 2848 2849 /* 2850 * The lock manager holds state making it possible for the client 2851 * and server to be out of sync. For example, if the response from 2852 * the server granting a lock request is lost, the server will think 2853 * the lock is granted and the client will think the lock is lost. 2854 * The client can tell when it is not positive that it is in sync with 2855 * the server. 2856 * 2857 * To deal with this, a list of processes for which the client is 2858 * not sure if the server holds a lock is attached to the rnode. 2859 * When such a process closes the rnode, an unlock request is sent 2860 * to the server to unlock the entire file. 2861 * 2862 * The list is kept as a singly linked, NULL-terminated list. 2863 * Because it is only added to under extreme error conditions, the 2864 * list shouldn't get very big. DEBUG kernels print a message if 2865 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily 2866 * chosen to be 8, but can be tuned at runtime. 2867 */ 2868 #ifdef DEBUG 2869 /* int nfs_lmpl_high_water = 8; */ 2870 int nfs_lmpl_high_water = 128; 2871 int nfs_cnt_add_locking_id = 0; 2872 int nfs_len_add_locking_id = 0; 2873 #endif /* DEBUG */ 2874 2875 /* 2876 * Record that the nfs lock manager server may be holding a lock on 2877 * a vnode for a process. 2878 * 2879 * Because the nfs lock manager server holds state, it is possible 2880 * for the server to get out of sync with the client. This routine is called 2881 * from the client when it is no longer sure if the server is in sync 2882 * with the client.
nfs_lockrelease() will then notice this and send 2883 * an unlock request when the file is closed 2884 */ 2885 void 2886 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len) 2887 { 2888 rnode_t *rp; 2889 lmpl_t *new; 2890 lmpl_t *cur; 2891 lmpl_t **lmplp; 2892 #ifdef DEBUG 2893 int list_len = 1; 2894 #endif /* DEBUG */ 2895 2896 #ifdef DEBUG 2897 ++nfs_cnt_add_locking_id; 2898 #endif /* DEBUG */ 2899 /* 2900 * allocate new lmpl_t now so we don't sleep 2901 * later after grabbing mutexes 2902 */ 2903 ASSERT(len < MAX_SHR_OWNER_LEN); 2904 new = kmem_alloc(sizeof (*new), KM_SLEEP); 2905 new->lmpl_type = type; 2906 new->lmpl_pid = pid; 2907 new->lmpl_owner = kmem_alloc(len, KM_SLEEP); 2908 bcopy(id, new->lmpl_owner, len); 2909 new->lmpl_own_len = len; 2910 new->lmpl_next = (lmpl_t *)NULL; 2911 #ifdef DEBUG 2912 if (type == RLMPL_PID) { 2913 ASSERT(len == sizeof (pid_t)); 2914 ASSERT(pid == *(pid_t *)new->lmpl_owner); 2915 } else { 2916 ASSERT(type == RLMPL_OWNER); 2917 } 2918 #endif 2919 2920 rp = VTOR(vp); 2921 mutex_enter(&rp->r_statelock); 2922 2923 /* 2924 * Add this id to the list for this rnode only if the 2925 * rnode is active and the id is not already there. 2926 */ 2927 ASSERT(rp->r_flags & RHASHED); 2928 lmplp = &(rp->r_lmpl); 2929 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 2930 if (cur->lmpl_pid == pid && 2931 cur->lmpl_type == type && 2932 cur->lmpl_own_len == len && 2933 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) { 2934 kmem_free(new->lmpl_owner, len); 2935 kmem_free(new, sizeof (*new)); 2936 break; 2937 } 2938 lmplp = &cur->lmpl_next; 2939 #ifdef DEBUG 2940 ++list_len; 2941 #endif /* DEBUG */ 2942 } 2943 if (cur == (lmpl_t *)NULL) { 2944 *lmplp = new; 2945 #ifdef DEBUG 2946 if (list_len > nfs_len_add_locking_id) { 2947 nfs_len_add_locking_id = list_len; 2948 } 2949 if (list_len > nfs_lmpl_high_water) { 2950 cmn_err(CE_WARN, "nfs_add_locking_id: long list " 2951 "vp=%p is %d", (void *)vp, list_len); 2952 } 2953 #endif /* DEBUG */ 2954 } 2955 2956 #ifdef DEBUG 2957 if (share_debug) { 2958 int nitems = 0; 2959 int npids = 0; 2960 int nowners = 0; 2961 2962 /* 2963 * Count the number of things left on r_lmpl after the remove. 2964 */ 2965 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 2966 cur = cur->lmpl_next) { 2967 nitems++; 2968 if (cur->lmpl_type == RLMPL_PID) { 2969 npids++; 2970 } else if (cur->lmpl_type == RLMPL_OWNER) { 2971 nowners++; 2972 } else { 2973 cmn_err(CE_PANIC, "nfs_add_locking_id: " 2974 "unrecognised lmpl_type %d", 2975 cur->lmpl_type); 2976 } 2977 } 2978 2979 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d " 2980 "OWNs = %d items left on r_lmpl\n", 2981 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems); 2982 } 2983 #endif 2984 2985 mutex_exit(&rp->r_statelock); 2986 } 2987 2988 /* 2989 * Remove an id from the lock manager id list. 2990 * 2991 * If the id is not in the list return 0. If it was found and 2992 * removed, return 1. 2993 */ 2994 static int 2995 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen) 2996 { 2997 lmpl_t *cur; 2998 lmpl_t **lmplp; 2999 rnode_t *rp; 3000 int rv = 0; 3001 3002 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER); 3003 3004 rp = VTOR(vp); 3005 3006 mutex_enter(&rp->r_statelock); 3007 ASSERT(rp->r_flags & RHASHED); 3008 lmplp = &(rp->r_lmpl); 3009 3010 /* 3011 * Search through the list and remove the entry for this id 3012 * if it is there. 
The special case id == NULL allows removal 3013 * of the first share on the r_lmpl list belonging to the 3014 * current process (if any), without regard to further details 3015 * of its identity. 3016 */ 3017 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3018 if (cur->lmpl_type == type && 3019 cur->lmpl_pid == curproc->p_pid && 3020 (id == (char *)NULL || 3021 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) { 3022 *lmplp = cur->lmpl_next; 3023 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN); 3024 if (rid != NULL) { 3025 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len); 3026 *rlen = cur->lmpl_own_len; 3027 } 3028 kmem_free(cur->lmpl_owner, cur->lmpl_own_len); 3029 kmem_free(cur, sizeof (*cur)); 3030 rv = 1; 3031 break; 3032 } 3033 lmplp = &cur->lmpl_next; 3034 } 3035 3036 #ifdef DEBUG 3037 if (share_debug) { 3038 int nitems = 0; 3039 int npids = 0; 3040 int nowners = 0; 3041 3042 /* 3043 * Count the number of things left on r_lmpl after the remove. 3044 */ 3045 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3046 cur = cur->lmpl_next) { 3047 nitems++; 3048 if (cur->lmpl_type == RLMPL_PID) { 3049 npids++; 3050 } else if (cur->lmpl_type == RLMPL_OWNER) { 3051 nowners++; 3052 } else { 3053 cmn_err(CE_PANIC, 3054 "nrli: unrecognised lmpl_type %d", 3055 cur->lmpl_type); 3056 } 3057 } 3058 3059 cmn_err(CE_CONT, 3060 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n", 3061 (type == RLMPL_PID) ? "P" : "O", 3062 npids, 3063 nowners, 3064 nitems); 3065 } 3066 #endif 3067 3068 mutex_exit(&rp->r_statelock); 3069 return (rv); 3070 } 3071 3072 void 3073 nfs_free_mi(mntinfo_t *mi) 3074 { 3075 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP); 3076 ASSERT(mi->mi_manager_thread == NULL); 3077 ASSERT(mi->mi_threads == 0); 3078 3079 /* 3080 * Remove the node from the global list before we start tearing it down. 3081 */ 3082 nfs_mi_zonelist_remove(mi); 3083 if (mi->mi_klmconfig) { 3084 lm_free_config(mi->mi_klmconfig); 3085 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig)); 3086 } 3087 mutex_destroy(&mi->mi_lock); 3088 mutex_destroy(&mi->mi_remap_lock); 3089 mutex_destroy(&mi->mi_async_lock); 3090 cv_destroy(&mi->mi_failover_cv); 3091 cv_destroy(&mi->mi_async_work_cv); 3092 cv_destroy(&mi->mi_async_reqs_cv); 3093 cv_destroy(&mi->mi_async_cv); 3094 zone_rele(mi->mi_zone); 3095 kmem_free(mi, sizeof (*mi)); 3096 } 3097 3098 static int 3099 mnt_kstat_update(kstat_t *ksp, int rw) 3100 { 3101 mntinfo_t *mi; 3102 struct mntinfo_kstat *mik; 3103 vfs_t *vfsp; 3104 int i; 3105 3106 /* this is a read-only kstat. Bail out on a write */ 3107 if (rw == KSTAT_WRITE) 3108 return (EACCES); 3109 3110 /* 3111 * We don't want to wait here as kstat_chain_lock could be held by 3112 * dounmount(). dounmount() takes vfs_reflock before the chain lock 3113 * and thus could lead to a deadlock. 
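 *
 * Descriptive illustration of the ordering being avoided:
 *	kstat read:   holds kstat_chain_lock, would then wait on the vfs
 *	dounmount():  holds vfs_reflock, waits on kstat_chain_lock
 * Blocking here while the kstat framework holds the chain lock would
 * close that cycle, so the update routine only copies fields it can
 * reach without sleeping on the vfs.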
3114 */ 3115 vfsp = (struct vfs *)ksp->ks_private; 3116 3117 3118 mi = VFTOMI(vfsp); 3119 3120 mik = (struct mntinfo_kstat *)ksp->ks_data; 3121 3122 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 3123 mik->mik_vers = (uint32_t)mi->mi_vers; 3124 mik->mik_flags = mi->mi_flags; 3125 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod; 3126 mik->mik_curread = (uint32_t)mi->mi_curread; 3127 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 3128 mik->mik_retrans = mi->mi_retrans; 3129 mik->mik_timeo = mi->mi_timeo; 3130 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 3131 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 3132 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 3133 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 3134 for (i = 0; i < NFS_CALLTYPES + 1; i++) { 3135 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt; 3136 mik->mik_timers[i].deviate = 3137 (uint32_t)mi->mi_timers[i].rt_deviate; 3138 mik->mik_timers[i].rtxcur = 3139 (uint32_t)mi->mi_timers[i].rt_rtxcur; 3140 } 3141 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 3142 mik->mik_failover = (uint32_t)mi->mi_failover; 3143 mik->mik_remap = (uint32_t)mi->mi_remap; 3144 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 3145 3146 return (0); 3147 } 3148 3149 void 3150 nfs_mnt_kstat_init(struct vfs *vfsp) 3151 { 3152 mntinfo_t *mi = VFTOMI(vfsp); 3153 3154 /* 3155 * Create the version specific kstats. 3156 * 3157 * PSARC 2001/697 Contract Private Interface 3158 * All nfs kstats are under SunMC contract 3159 * Please refer to the PSARC listed above and contact 3160 * SunMC before making any changes! 3161 * 3162 * Changes must be reviewed by Solaris File Sharing 3163 * Changes must be communicated to contract-2001-697@sun.com 3164 * 3165 */ 3166 3167 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 3168 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 3169 if (mi->mi_io_kstats) { 3170 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3171 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 3172 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 3173 kstat_install(mi->mi_io_kstats); 3174 } 3175 3176 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 3177 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 3178 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 3179 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3180 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 3181 mi->mi_ro_kstats->ks_update = mnt_kstat_update; 3182 mi->mi_ro_kstats->ks_private = (void *)vfsp; 3183 kstat_install(mi->mi_ro_kstats); 3184 } 3185 } 3186 3187 nfs_delmapcall_t * 3188 nfs_init_delmapcall() 3189 { 3190 nfs_delmapcall_t *delmap_call; 3191 3192 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP); 3193 delmap_call->call_id = curthread; 3194 delmap_call->error = 0; 3195 3196 return (delmap_call); 3197 } 3198 3199 void 3200 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call) 3201 { 3202 kmem_free(delmap_call, sizeof (nfs_delmapcall_t)); 3203 } 3204 3205 /* 3206 * Searches for the current delmap caller (based on curthread) in the list of 3207 * callers. If it is found, we remove it and free the delmap caller. 3208 * Returns: 3209 * 0 if the caller wasn't found 3210 * 1 if the caller was found, removed and freed. *errp is set to what 3211 * the result of the delmap was. 
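 *
 * Typical pairing with nfs_init_delmapcall() (illustrative sketch; the
 * real caller is the delmap vnode operation, which lives outside this
 * file):
 *
 *	if (nfs_find_and_delete_delmapcall(rp, &error))
 *		return (error);		repeat call, async work already done
 *
 *	delmap_call = nfs_init_delmapcall();
 *	mutex_enter(&rp->r_statelock);
 *	list_insert_tail(&rp->r_indelmap, delmap_call);
 *	mutex_exit(&rp->r_statelock);
 *	then dispatch the asynchronous delmap work, which records its
 *	result in delmap_call->error before the operation is retried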
3212 */ 3213 int 3214 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp) 3215 { 3216 nfs_delmapcall_t *delmap_call; 3217 3218 /* 3219 * If the list doesn't exist yet, we create it and return 3220 * that the caller wasn't found. No list = no callers. 3221 */ 3222 mutex_enter(&rp->r_statelock); 3223 if (!(rp->r_flags & RDELMAPLIST)) { 3224 /* The list does not exist */ 3225 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t), 3226 offsetof(nfs_delmapcall_t, call_node)); 3227 rp->r_flags |= RDELMAPLIST; 3228 mutex_exit(&rp->r_statelock); 3229 return (0); 3230 } else { 3231 /* The list exists so search it */ 3232 for (delmap_call = list_head(&rp->r_indelmap); 3233 delmap_call != NULL; 3234 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 3235 if (delmap_call->call_id == curthread) { 3236 /* current caller is in the list */ 3237 *errp = delmap_call->error; 3238 list_remove(&rp->r_indelmap, delmap_call); 3239 mutex_exit(&rp->r_statelock); 3240 nfs_free_delmapcall(delmap_call); 3241 return (1); 3242 } 3243 } 3244 } 3245 mutex_exit(&rp->r_statelock); 3246 return (0); 3247 } 3248