1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 26 * All rights reserved. 
27 */ 28 29 #pragma ident "%Z%%M% %I% %E% SMI" 30 31 #include <sys/param.h> 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/thread.h> 35 #include <sys/t_lock.h> 36 #include <sys/time.h> 37 #include <sys/vnode.h> 38 #include <sys/vfs.h> 39 #include <sys/errno.h> 40 #include <sys/buf.h> 41 #include <sys/stat.h> 42 #include <sys/cred.h> 43 #include <sys/kmem.h> 44 #include <sys/debug.h> 45 #include <sys/dnlc.h> 46 #include <sys/vmsystm.h> 47 #include <sys/flock.h> 48 #include <sys/share.h> 49 #include <sys/cmn_err.h> 50 #include <sys/tiuser.h> 51 #include <sys/sysmacros.h> 52 #include <sys/callb.h> 53 #include <sys/acl.h> 54 #include <sys/kstat.h> 55 #include <sys/signal.h> 56 #include <sys/list.h> 57 #include <sys/zone.h> 58 59 #include <rpc/types.h> 60 #include <rpc/xdr.h> 61 #include <rpc/auth.h> 62 #include <rpc/clnt.h> 63 64 #include <nfs/nfs.h> 65 #include <nfs/nfs_clnt.h> 66 67 #include <nfs/rnode.h> 68 #include <nfs/nfs_acl.h> 69 #include <nfs/lm.h> 70 71 #include <vm/hat.h> 72 #include <vm/as.h> 73 #include <vm/page.h> 74 #include <vm/pvn.h> 75 #include <vm/seg.h> 76 #include <vm/seg_map.h> 77 #include <vm/seg_vn.h> 78 79 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t, 80 cred_t *); 81 static int nfs_getattr_cache(vnode_t *, struct vattr *); 82 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *); 83 84 struct mi_globals { 85 kmutex_t mig_lock; /* lock protecting mig_list */ 86 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */ 87 boolean_t mig_destructor_called; 88 }; 89 90 static zone_key_t mi_list_key; 91 92 /* Debugging flag for PC file shares. */ 93 extern int share_debug; 94 95 /* 96 * Attributes caching: 97 * 98 * Attributes are cached in the rnode in struct vattr form. 99 * There is a time associated with the cached attributes (r_attrtime) 100 * which tells whether the attributes are valid. 
The time is initialized 101 * to the difference between current time and the modify time of the vnode 102 * when new attributes are cached. This allows the attributes for 103 * files that have changed recently to be timed out sooner than for files 104 * that have not changed for a long time. There are minimum and maximum 105 * timeout values that can be set per mount point. 106 */ 107 108 int 109 nfs_waitfor_purge_complete(vnode_t *vp) 110 { 111 rnode_t *rp; 112 k_sigset_t smask; 113 114 rp = VTOR(vp); 115 if (rp->r_serial != NULL && rp->r_serial != curthread) { 116 mutex_enter(&rp->r_statelock); 117 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT); 118 while (rp->r_serial != NULL) { 119 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 120 sigunintr(&smask); 121 mutex_exit(&rp->r_statelock); 122 return (EINTR); 123 } 124 } 125 sigunintr(&smask); 126 mutex_exit(&rp->r_statelock); 127 } 128 return (0); 129 } 130 131 /* 132 * Validate caches by checking cached attributes. If the cached 133 * attributes have timed out, then get new attributes from the server. 134 * As a side affect, this will do cache invalidation if the attributes 135 * have changed. 136 * 137 * If the attributes have not timed out and if there is a cache 138 * invalidation being done by some other thread, then wait until that 139 * thread has completed the cache invalidation. 140 */ 141 int 142 nfs_validate_caches(vnode_t *vp, cred_t *cr) 143 { 144 int error; 145 struct vattr va; 146 147 if (ATTRCACHE_VALID(vp)) { 148 error = nfs_waitfor_purge_complete(vp); 149 if (error) 150 return (error); 151 return (0); 152 } 153 154 va.va_mask = AT_ALL; 155 return (nfs_getattr_otw(vp, &va, cr)); 156 } 157 158 /* 159 * Validate caches by checking cached attributes. If the cached 160 * attributes have timed out, then get new attributes from the server. 161 * As a side affect, this will do cache invalidation if the attributes 162 * have changed. 
163 * 164 * If the attributes have not timed out and if there is a cache 165 * invalidation being done by some other thread, then wait until that 166 * thread has completed the cache invalidation. 167 */ 168 int 169 nfs3_validate_caches(vnode_t *vp, cred_t *cr) 170 { 171 int error; 172 struct vattr va; 173 174 if (ATTRCACHE_VALID(vp)) { 175 error = nfs_waitfor_purge_complete(vp); 176 if (error) 177 return (error); 178 return (0); 179 } 180 181 va.va_mask = AT_ALL; 182 return (nfs3_getattr_otw(vp, &va, cr)); 183 } 184 185 /* 186 * Purge all of the various NFS `data' caches. 187 */ 188 void 189 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr) 190 { 191 rnode_t *rp; 192 char *contents; 193 int size; 194 int error; 195 196 /* 197 * Purge the DNLC for any entries which refer to this file. 198 * Avoid recursive entry into dnlc_purge_vp() in case of a directory. 199 */ 200 rp = VTOR(vp); 201 mutex_enter(&rp->r_statelock); 202 if (vp->v_count > 1 && 203 (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) && 204 !(rp->r_flags & RINDNLCPURGE)) { 205 /* 206 * Set the RINDNLCPURGE flag to prevent recursive entry 207 * into dnlc_purge_vp() 208 */ 209 if (vp->v_type == VDIR) 210 rp->r_flags |= RINDNLCPURGE; 211 mutex_exit(&rp->r_statelock); 212 dnlc_purge_vp(vp); 213 mutex_enter(&rp->r_statelock); 214 if (rp->r_flags & RINDNLCPURGE) 215 rp->r_flags &= ~RINDNLCPURGE; 216 } 217 218 /* 219 * Clear any readdir state bits and purge the readlink response cache. 220 */ 221 contents = rp->r_symlink.contents; 222 size = rp->r_symlink.size; 223 rp->r_symlink.contents = NULL; 224 mutex_exit(&rp->r_statelock); 225 226 if (contents != NULL) { 227 228 kmem_free((void *)contents, size); 229 } 230 231 /* 232 * Flush the page cache. 
233 */ 234 if (vn_has_cached_data(vp)) { 235 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 236 if (error && (error == ENOSPC || error == EDQUOT)) { 237 mutex_enter(&rp->r_statelock); 238 if (!rp->r_error) 239 rp->r_error = error; 240 mutex_exit(&rp->r_statelock); 241 } 242 } 243 244 /* 245 * Flush the readdir response cache. 246 */ 247 if (HAVE_RDDIR_CACHE(rp)) 248 nfs_purge_rddir_cache(vp); 249 } 250 251 /* 252 * Purge the readdir cache of all entries 253 */ 254 void 255 nfs_purge_rddir_cache(vnode_t *vp) 256 { 257 rnode_t *rp; 258 rddir_cache *rdc; 259 rddir_cache *nrdc; 260 261 rp = VTOR(vp); 262 top: 263 mutex_enter(&rp->r_statelock); 264 rp->r_direof = NULL; 265 rp->r_flags &= ~RLOOKUP; 266 rp->r_flags |= RREADDIRPLUS; 267 rdc = avl_first(&rp->r_dir); 268 while (rdc != NULL) { 269 nrdc = AVL_NEXT(&rp->r_dir, rdc); 270 avl_remove(&rp->r_dir, rdc); 271 rddir_cache_rele(rdc); 272 rdc = nrdc; 273 } 274 mutex_exit(&rp->r_statelock); 275 } 276 277 /* 278 * Do a cache check based on the post-operation attributes. 279 * Then make them the new cached attributes. If no attributes 280 * were returned, then mark the attributes as timed out. 281 */ 282 void 283 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr) 284 { 285 vattr_t attr; 286 287 if (!poap->attributes) { 288 PURGE_ATTRCACHE(vp); 289 return; 290 } 291 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr); 292 } 293 294 /* 295 * Same as above, but using a vattr 296 */ 297 void 298 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t, 299 cred_t *cr) 300 { 301 if (!poap->attributes) { 302 PURGE_ATTRCACHE(vp); 303 return; 304 } 305 nfs_attr_cache(vp, poap->fres.vap, t, cr); 306 } 307 308 /* 309 * Do a cache check based on the weak cache consistency attributes. 310 * These consist of a small set of pre-operation attributes and the 311 * full set of post-operation attributes. 
312 * 313 * If we are given the pre-operation attributes, then use them to 314 * check the validity of the various caches. Then, if we got the 315 * post-operation attributes, make them the new cached attributes. 316 * If we didn't get the post-operation attributes, then mark the 317 * attribute cache as timed out so that the next reference will 318 * cause a GETATTR to the server to refresh with the current 319 * attributes. 320 * 321 * Otherwise, if we didn't get the pre-operation attributes, but 322 * we did get the post-operation attributes, then use these 323 * attributes to check the validity of the various caches. This 324 * will probably cause a flush of the caches because if the 325 * operation succeeded, the attributes of the object were changed 326 * in some way from the old post-operation attributes. This 327 * should be okay because it is the safe thing to do. After 328 * checking the data caches, then we make these the new cached 329 * attributes. 330 * 331 * Otherwise, we didn't get either the pre- or post-operation 332 * attributes. Simply mark the attribute cache as timed out so 333 * the next reference will cause a GETATTR to the server to 334 * refresh with the current attributes. 335 * 336 * If an error occurred trying to convert the over the wire 337 * attributes to a vattr, then simply mark the attribute cache as 338 * timed out. 
339 */ 340 void 341 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr) 342 { 343 vattr_t bva; 344 vattr_t ava; 345 346 if (wccp->after.attributes) { 347 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) { 348 PURGE_ATTRCACHE(vp); 349 return; 350 } 351 if (wccp->before.attributes) { 352 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds; 353 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds; 354 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds; 355 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds; 356 bva.va_size = wccp->before.attr.size; 357 nfs3_attr_cache(vp, &bva, &ava, t, cr); 358 } else 359 nfs_attr_cache(vp, &ava, t, cr); 360 } else { 361 PURGE_ATTRCACHE(vp); 362 } 363 } 364 365 /* 366 * Set attributes cache for given vnode using nfsattr. 367 * 368 * This routine does not do cache validation with the attributes. 369 * 370 * If an error occurred trying to convert the over the wire 371 * attributes to a vattr, then simply mark the attribute cache as 372 * timed out. 373 */ 374 void 375 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t) 376 { 377 rnode_t *rp; 378 struct vattr va; 379 380 if (!nattr_to_vattr(vp, na, &va)) { 381 rp = VTOR(vp); 382 mutex_enter(&rp->r_statelock); 383 if (rp->r_mtime <= t) 384 nfs_attrcache_va(vp, &va); 385 mutex_exit(&rp->r_statelock); 386 } else { 387 PURGE_ATTRCACHE(vp); 388 } 389 } 390 391 /* 392 * Set attributes cache for given vnode using fattr3. 393 * 394 * This routine does not do cache validation with the attributes. 395 * 396 * If an error occurred trying to convert the over the wire 397 * attributes to a vattr, then simply mark the attribute cache as 398 * timed out. 
399 */ 400 void 401 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t) 402 { 403 rnode_t *rp; 404 struct vattr va; 405 406 if (!fattr3_to_vattr(vp, na, &va)) { 407 rp = VTOR(vp); 408 mutex_enter(&rp->r_statelock); 409 if (rp->r_mtime <= t) 410 nfs_attrcache_va(vp, &va); 411 mutex_exit(&rp->r_statelock); 412 } else { 413 PURGE_ATTRCACHE(vp); 414 } 415 } 416 417 /* 418 * Do a cache check based on attributes returned over the wire. The 419 * new attributes are cached. 420 * 421 * If an error occurred trying to convert the over the wire attributes 422 * to a vattr, then just return that error. 423 * 424 * As a side affect, the vattr argument is filled in with the converted 425 * attributes. 426 */ 427 int 428 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t, 429 cred_t *cr) 430 { 431 int error; 432 433 error = nattr_to_vattr(vp, na, vap); 434 if (error) 435 return (error); 436 nfs_attr_cache(vp, vap, t, cr); 437 return (0); 438 } 439 440 /* 441 * Do a cache check based on attributes returned over the wire. The 442 * new attributes are cached. 443 * 444 * If an error occurred trying to convert the over the wire attributes 445 * to a vattr, then just return that error. 446 * 447 * As a side affect, the vattr argument is filled in with the converted 448 * attributes. 449 */ 450 int 451 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr) 452 { 453 int error; 454 455 error = fattr3_to_vattr(vp, na, vap); 456 if (error) 457 return (error); 458 nfs_attr_cache(vp, vap, t, cr); 459 return (0); 460 } 461 462 /* 463 * Use the passed in virtual attributes to check to see whether the 464 * data and metadata caches are valid, cache the new attributes, and 465 * then do the cache invalidation if required. 466 * 467 * The cache validation and caching of the new attributes is done 468 * atomically via the use of the mutex, r_statelock. If required, 469 * the cache invalidation is done atomically w.r.t. 
the cache 470 * validation and caching of the attributes via the pseudo lock, 471 * r_serial. 472 * 473 * This routine is used to do cache validation and attributes caching 474 * for operations with a single set of post operation attributes. 475 */ 476 void 477 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr) 478 { 479 rnode_t *rp; 480 int mtime_changed; 481 int ctime_changed; 482 vsecattr_t *vsp; 483 int was_serial; 484 485 rp = VTOR(vp); 486 487 mutex_enter(&rp->r_statelock); 488 489 if (rp->r_serial != curthread) { 490 klwp_t *lwp = ttolwp(curthread); 491 492 was_serial = 0; 493 if (lwp != NULL) 494 lwp->lwp_nostop++; 495 while (rp->r_serial != NULL) { 496 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 497 mutex_exit(&rp->r_statelock); 498 if (lwp != NULL) 499 lwp->lwp_nostop--; 500 return; 501 } 502 } 503 if (lwp != NULL) 504 lwp->lwp_nostop--; 505 } else 506 was_serial = 1; 507 508 if (rp->r_mtime > t) { 509 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 510 PURGE_ATTRCACHE_LOCKED(rp); 511 mutex_exit(&rp->r_statelock); 512 return; 513 } 514 515 if (!(rp->r_flags & RWRITEATTR)) { 516 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 517 mtime_changed = 1; 518 else 519 mtime_changed = 0; 520 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec || 521 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec) 522 ctime_changed = 1; 523 else 524 ctime_changed = 0; 525 } else if (rp->r_size != vap->va_size && 526 (!vn_has_cached_data(vp) || 527 (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) { 528 mtime_changed = 1; 529 ctime_changed = 0; 530 } else { 531 mtime_changed = 0; 532 ctime_changed = 0; 533 } 534 535 nfs_attrcache_va(vp, vap); 536 537 if (!mtime_changed && !ctime_changed) { 538 mutex_exit(&rp->r_statelock); 539 return; 540 } 541 542 rp->r_serial = curthread; 543 544 mutex_exit(&rp->r_statelock); 545 546 if (mtime_changed) 547 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 548 549 if (ctime_changed) { 550 (void) nfs_access_purge_rp(rp); 
551 if (rp->r_secattr != NULL) { 552 mutex_enter(&rp->r_statelock); 553 vsp = rp->r_secattr; 554 rp->r_secattr = NULL; 555 mutex_exit(&rp->r_statelock); 556 if (vsp != NULL) 557 nfs_acl_free(vsp); 558 } 559 } 560 561 if (!was_serial) { 562 mutex_enter(&rp->r_statelock); 563 rp->r_serial = NULL; 564 cv_broadcast(&rp->r_cv); 565 mutex_exit(&rp->r_statelock); 566 } 567 } 568 569 /* 570 * Use the passed in "before" virtual attributes to check to see 571 * whether the data and metadata caches are valid, cache the "after" 572 * new attributes, and then do the cache invalidation if required. 573 * 574 * The cache validation and caching of the new attributes is done 575 * atomically via the use of the mutex, r_statelock. If required, 576 * the cache invalidation is done atomically w.r.t. the cache 577 * validation and caching of the attributes via the pseudo lock, 578 * r_serial. 579 * 580 * This routine is used to do cache validation and attributes caching 581 * for operations with both pre operation attributes and post operation 582 * attributes. 
583 */ 584 static void 585 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t, 586 cred_t *cr) 587 { 588 rnode_t *rp; 589 int mtime_changed; 590 int ctime_changed; 591 vsecattr_t *vsp; 592 int was_serial; 593 594 rp = VTOR(vp); 595 596 mutex_enter(&rp->r_statelock); 597 598 if (rp->r_serial != curthread) { 599 klwp_t *lwp = ttolwp(curthread); 600 601 was_serial = 0; 602 if (lwp != NULL) 603 lwp->lwp_nostop++; 604 while (rp->r_serial != NULL) { 605 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 606 mutex_exit(&rp->r_statelock); 607 if (lwp != NULL) 608 lwp->lwp_nostop--; 609 return; 610 } 611 } 612 if (lwp != NULL) 613 lwp->lwp_nostop--; 614 } else 615 was_serial = 1; 616 617 if (rp->r_mtime > t) { 618 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size)) 619 PURGE_ATTRCACHE_LOCKED(rp); 620 mutex_exit(&rp->r_statelock); 621 return; 622 } 623 624 if (!(rp->r_flags & RWRITEATTR)) { 625 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size)) 626 mtime_changed = 1; 627 else 628 mtime_changed = 0; 629 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec || 630 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec) 631 ctime_changed = 1; 632 else 633 ctime_changed = 0; 634 } else { 635 mtime_changed = 0; 636 ctime_changed = 0; 637 } 638 639 nfs_attrcache_va(vp, avap); 640 641 if (!mtime_changed && !ctime_changed) { 642 mutex_exit(&rp->r_statelock); 643 return; 644 } 645 646 rp->r_serial = curthread; 647 648 mutex_exit(&rp->r_statelock); 649 650 if (mtime_changed) 651 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 652 653 if (ctime_changed) { 654 (void) nfs_access_purge_rp(rp); 655 if (rp->r_secattr != NULL) { 656 mutex_enter(&rp->r_statelock); 657 vsp = rp->r_secattr; 658 rp->r_secattr = NULL; 659 mutex_exit(&rp->r_statelock); 660 if (vsp != NULL) 661 nfs_acl_free(vsp); 662 } 663 } 664 665 if (!was_serial) { 666 mutex_enter(&rp->r_statelock); 667 rp->r_serial = NULL; 668 cv_broadcast(&rp->r_cv); 669 mutex_exit(&rp->r_statelock); 670 } 671 } 672 673 /* 674 
* Set attributes cache for given vnode using virtual attributes. 675 * 676 * Set the timeout value on the attribute cache and fill it 677 * with the passed in attributes. 678 * 679 * The caller must be holding r_statelock. 680 */ 681 void 682 nfs_attrcache_va(vnode_t *vp, struct vattr *va) 683 { 684 rnode_t *rp; 685 mntinfo_t *mi; 686 hrtime_t delta; 687 hrtime_t now; 688 689 rp = VTOR(vp); 690 691 ASSERT(MUTEX_HELD(&rp->r_statelock)); 692 693 now = gethrtime(); 694 695 mi = VTOMI(vp); 696 697 /* 698 * Delta is the number of nanoseconds that we will 699 * cache the attributes of the file. It is based on 700 * the number of nanoseconds since the last time that 701 * we detected a change. The assumption is that files 702 * that changed recently are likely to change again. 703 * There is a minimum and a maximum for regular files 704 * and for directories which is enforced though. 705 * 706 * Using the time since last change was detected 707 * eliminates direct comparison or calculation 708 * using mixed client and server times. NFS does 709 * not make any assumptions regarding the client 710 * and server clocks being synchronized. 711 */ 712 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec || 713 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec || 714 va->va_size != rp->r_attr.va_size) 715 rp->r_mtime = now; 716 717 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE)) 718 delta = 0; 719 else { 720 delta = now - rp->r_mtime; 721 if (vp->v_type == VDIR) { 722 if (delta < mi->mi_acdirmin) 723 delta = mi->mi_acdirmin; 724 else if (delta > mi->mi_acdirmax) 725 delta = mi->mi_acdirmax; 726 } else { 727 if (delta < mi->mi_acregmin) 728 delta = mi->mi_acregmin; 729 else if (delta > mi->mi_acregmax) 730 delta = mi->mi_acregmax; 731 } 732 } 733 rp->r_attrtime = now + delta; 734 rp->r_attr = *va; 735 /* 736 * Update the size of the file if there is no cached data or if 737 * the cached data is clean and there is no data being written 738 * out. 
739 */ 740 if (rp->r_size != va->va_size && 741 (!vn_has_cached_data(vp) || 742 (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) 743 rp->r_size = va->va_size; 744 nfs_setswaplike(vp, va); 745 rp->r_flags &= ~RWRITEATTR; 746 } 747 748 /* 749 * Fill in attribute from the cache. 750 * If valid, then return 0 to indicate that no error occurred, 751 * otherwise return 1 to indicate that an error occurred. 752 */ 753 static int 754 nfs_getattr_cache(vnode_t *vp, struct vattr *vap) 755 { 756 rnode_t *rp; 757 758 rp = VTOR(vp); 759 mutex_enter(&rp->r_statelock); 760 if (ATTRCACHE_VALID(vp)) { 761 /* 762 * Cached attributes are valid 763 */ 764 *vap = rp->r_attr; 765 mutex_exit(&rp->r_statelock); 766 return (0); 767 } 768 mutex_exit(&rp->r_statelock); 769 return (1); 770 } 771 772 /* 773 * Get attributes over-the-wire and update attributes cache 774 * if no error occurred in the over-the-wire operation. 775 * Return 0 if successful, otherwise error. 776 */ 777 int 778 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 779 { 780 int error; 781 struct nfsattrstat ns; 782 int douprintf; 783 mntinfo_t *mi; 784 failinfo_t fi; 785 hrtime_t t; 786 787 mi = VTOMI(vp); 788 fi.vp = vp; 789 fi.fhp = NULL; /* no need to update, filehandle not copied */ 790 fi.copyproc = nfscopyfh; 791 fi.lookupproc = nfslookup; 792 fi.xattrdirproc = acl_getxattrdir2; 793 794 if (mi->mi_flags & MI_ACL) { 795 error = acl_getattr2_otw(vp, vap, cr); 796 if (mi->mi_flags & MI_ACL) 797 return (error); 798 } 799 800 douprintf = 1; 801 802 t = gethrtime(); 803 804 error = rfs2call(mi, RFS_GETATTR, 805 xdr_fhandle, (caddr_t)VTOFH(vp), 806 xdr_attrstat, (caddr_t)&ns, cr, 807 &douprintf, &ns.ns_status, 0, &fi); 808 809 if (!error) { 810 error = geterrno(ns.ns_status); 811 if (!error) 812 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr); 813 else { 814 PURGE_STALE_FH(error, vp, cr); 815 } 816 } 817 818 return (error); 819 } 820 821 /* 822 * Return either cached ot remote attributes. 
If get remote attr 823 * use them to check and invalidate caches, then cache the new attributes. 824 */ 825 int 826 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 827 { 828 int error; 829 rnode_t *rp; 830 831 /* 832 * If we've got cached attributes, we're done, otherwise go 833 * to the server to get attributes, which will update the cache 834 * in the process. 835 */ 836 error = nfs_getattr_cache(vp, vap); 837 if (error) 838 error = nfs_getattr_otw(vp, vap, cr); 839 840 /* Return the client's view of file size */ 841 rp = VTOR(vp); 842 mutex_enter(&rp->r_statelock); 843 vap->va_size = rp->r_size; 844 mutex_exit(&rp->r_statelock); 845 846 return (error); 847 } 848 849 /* 850 * Get attributes over-the-wire and update attributes cache 851 * if no error occurred in the over-the-wire operation. 852 * Return 0 if successful, otherwise error. 853 */ 854 int 855 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 856 { 857 int error; 858 GETATTR3args args; 859 GETATTR3vres res; 860 int douprintf; 861 failinfo_t fi; 862 hrtime_t t; 863 864 args.object = *VTOFH3(vp); 865 fi.vp = vp; 866 fi.fhp = (caddr_t)&args.object; 867 fi.copyproc = nfs3copyfh; 868 fi.lookupproc = nfs3lookup; 869 fi.xattrdirproc = acl_getxattrdir3; 870 res.fres.vp = vp; 871 res.fres.vap = vap; 872 873 douprintf = 1; 874 875 t = gethrtime(); 876 877 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR, 878 xdr_nfs_fh3, (caddr_t)&args, 879 xdr_GETATTR3vres, (caddr_t)&res, cr, 880 &douprintf, &res.status, 0, &fi); 881 882 if (error) 883 return (error); 884 885 error = geterrno3(res.status); 886 if (error) { 887 PURGE_STALE_FH(error, vp, cr); 888 return (error); 889 } 890 891 /* 892 * Catch status codes that indicate fattr3 to vattr translation failure 893 */ 894 if (res.fres.status) 895 return (res.fres.status); 896 897 nfs_attr_cache(vp, vap, t, cr); 898 return (0); 899 } 900 901 /* 902 * Return either cached or remote attributes. 
If get remote attr 903 * use them to check and invalidate caches, then cache the new attributes. 904 */ 905 int 906 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 907 { 908 int error; 909 rnode_t *rp; 910 911 /* 912 * If we've got cached attributes, we're done, otherwise go 913 * to the server to get attributes, which will update the cache 914 * in the process. 915 */ 916 error = nfs_getattr_cache(vp, vap); 917 if (error) 918 error = nfs3_getattr_otw(vp, vap, cr); 919 920 /* Return the client's view of file size */ 921 rp = VTOR(vp); 922 mutex_enter(&rp->r_statelock); 923 vap->va_size = rp->r_size; 924 mutex_exit(&rp->r_statelock); 925 926 return (error); 927 } 928 929 vtype_t nf_to_vt[] = { 930 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK 931 }; 932 /* 933 * Convert NFS Version 2 over the network attributes to the local 934 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY 935 * network representation and the local representation is done here. 936 * Returns 0 for success, error if failed due to overflow. 937 */ 938 int 939 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap) 940 { 941 /* overflow in time attributes? */ 942 #ifndef _LP64 943 if (!NFS2_FATTR_TIME_OK(na)) 944 return (EOVERFLOW); 945 #endif 946 947 if (na->na_type < NFNON || na->na_type > NFSOC) 948 vap->va_type = VBAD; 949 else 950 vap->va_type = nf_to_vt[na->na_type]; 951 vap->va_mode = na->na_mode; 952 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid; 953 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid; 954 vap->va_fsid = vp->v_vfsp->vfs_dev; 955 vap->va_nodeid = na->na_nodeid; 956 vap->va_nlink = na->na_nlink; 957 vap->va_size = na->na_size; /* keep for cache validation */ 958 /* 959 * nfs protocol defines times as unsigned so don't extend sign, 960 * unless sysadmin set nfs_allow_preepoch_time. 
961 */ 962 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec); 963 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000); 964 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec); 965 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000); 966 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec); 967 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000); 968 /* 969 * Shannon's law - uncompress the received dev_t 970 * if the top half of is zero indicating a response 971 * from an `older style' OS. Except for when it is a 972 * `new style' OS sending the maj device of zero, 973 * in which case the algorithm still works because the 974 * fact that it is a new style server 975 * is hidden by the minor device not being greater 976 * than 255 (a requirement in this case). 977 */ 978 if ((na->na_rdev & 0xffff0000) == 0) 979 vap->va_rdev = nfsv2_expdev(na->na_rdev); 980 else 981 vap->va_rdev = expldev(na->na_rdev); 982 983 vap->va_nblocks = na->na_blocks; 984 switch (na->na_type) { 985 case NFBLK: 986 vap->va_blksize = DEV_BSIZE; 987 break; 988 989 case NFCHR: 990 vap->va_blksize = MAXBSIZE; 991 break; 992 993 case NFSOC: 994 default: 995 vap->va_blksize = na->na_blocksize; 996 break; 997 } 998 /* 999 * This bit of ugliness is a hack to preserve the 1000 * over-the-wire protocols for named-pipe vnodes. 1001 * It remaps the special over-the-wire type to the 1002 * VFIFO type. (see note in nfs.h) 1003 */ 1004 if (NA_ISFIFO(na)) { 1005 vap->va_type = VFIFO; 1006 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO; 1007 vap->va_rdev = 0; 1008 vap->va_blksize = na->na_blocksize; 1009 } 1010 vap->va_seq = 0; 1011 return (0); 1012 } 1013 1014 /* 1015 * Convert NFS Version 3 over the network attributes to the local 1016 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY 1017 * network representation and the local representation is done here. 
1018 */ 1019 vtype_t nf3_to_vt[] = { 1020 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO 1021 }; 1022 1023 int 1024 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap) 1025 { 1026 1027 #ifndef _LP64 1028 /* overflow in time attributes? */ 1029 if (!NFS3_FATTR_TIME_OK(na)) 1030 return (EOVERFLOW); 1031 #endif 1032 if (!NFS3_SIZE_OK(na->size)) 1033 /* file too big */ 1034 return (EFBIG); 1035 1036 vap->va_mask = AT_ALL; 1037 1038 if (na->type < NF3REG || na->type > NF3FIFO) 1039 vap->va_type = VBAD; 1040 else 1041 vap->va_type = nf3_to_vt[na->type]; 1042 vap->va_mode = na->mode; 1043 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid; 1044 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid; 1045 vap->va_fsid = vp->v_vfsp->vfs_dev; 1046 vap->va_nodeid = na->fileid; 1047 vap->va_nlink = na->nlink; 1048 vap->va_size = na->size; 1049 1050 /* 1051 * nfs protocol defines times as unsigned so don't extend sign, 1052 * unless sysadmin set nfs_allow_preepoch_time. 
1053 */ 1054 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds); 1055 vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds; 1056 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds); 1057 vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds; 1058 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds); 1059 vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds; 1060 1061 switch (na->type) { 1062 case NF3BLK: 1063 vap->va_rdev = makedevice(na->rdev.specdata1, 1064 na->rdev.specdata2); 1065 vap->va_blksize = DEV_BSIZE; 1066 vap->va_nblocks = 0; 1067 break; 1068 case NF3CHR: 1069 vap->va_rdev = makedevice(na->rdev.specdata1, 1070 na->rdev.specdata2); 1071 vap->va_blksize = MAXBSIZE; 1072 vap->va_nblocks = 0; 1073 break; 1074 case NF3REG: 1075 case NF3DIR: 1076 case NF3LNK: 1077 vap->va_rdev = 0; 1078 vap->va_blksize = MAXBSIZE; 1079 vap->va_nblocks = (u_longlong_t) 1080 ((na->used + (size3)DEV_BSIZE - (size3)1) / 1081 (size3)DEV_BSIZE); 1082 break; 1083 case NF3SOCK: 1084 case NF3FIFO: 1085 default: 1086 vap->va_rdev = 0; 1087 vap->va_blksize = MAXBSIZE; 1088 vap->va_nblocks = 0; 1089 break; 1090 } 1091 vap->va_seq = 0; 1092 return (0); 1093 } 1094 1095 /* 1096 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1097 * for the demand-based allocation of async threads per-mount. The 1098 * nfs_async_timeout is the amount of time a thread will live after it 1099 * becomes idle, unless new I/O requests are received before the thread 1100 * dies. See nfs_async_putpage and nfs_async_start. 
1101 */ 1102 1103 int nfs_async_timeout = -1; /* uninitialized */ 1104 1105 static void nfs_async_start(struct vfs *); 1106 1107 static void 1108 free_async_args(struct nfs_async_reqs *args) 1109 { 1110 rnode_t *rp; 1111 1112 if (args->a_io != NFS_INACTIVE) { 1113 rp = VTOR(args->a_vp); 1114 mutex_enter(&rp->r_statelock); 1115 rp->r_count--; 1116 if (args->a_io == NFS_PUTAPAGE || 1117 args->a_io == NFS_PAGEIO) 1118 rp->r_awcount--; 1119 cv_broadcast(&rp->r_cv); 1120 mutex_exit(&rp->r_statelock); 1121 VN_RELE(args->a_vp); 1122 } 1123 crfree(args->a_cred); 1124 kmem_free(args, sizeof (*args)); 1125 } 1126 1127 /* 1128 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and 1129 * pageout(), running in the global zone, have legitimate reasons to do 1130 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by 1131 * use of a a per-mount "asynchronous requests manager thread" which is 1132 * signaled by the various asynchronous work routines when there is 1133 * asynchronous work to be done. It is responsible for creating new 1134 * worker threads if necessary, and notifying existing worker threads 1135 * that there is work to be done. 1136 * 1137 * In other words, it will "take the specifications from the customers and 1138 * give them to the engineers." 1139 * 1140 * Worker threads die off of their own accord if they are no longer 1141 * needed. 1142 * 1143 * This thread is killed when the zone is going away or the filesystem 1144 * is being unmounted. 
1145 */ 1146 void 1147 nfs_async_manager(vfs_t *vfsp) 1148 { 1149 callb_cpr_t cprinfo; 1150 mntinfo_t *mi; 1151 uint_t max_threads; 1152 1153 mi = VFTOMI(vfsp); 1154 1155 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1156 "nfs_async_manager"); 1157 1158 mutex_enter(&mi->mi_async_lock); 1159 /* 1160 * We want to stash the max number of threads that this mount was 1161 * allowed so we can use it later when the variable is set to zero as 1162 * part of the zone/mount going away. 1163 * 1164 * We want to be able to create at least one thread to handle 1165 * asyncrhonous inactive calls. 1166 */ 1167 max_threads = MAX(mi->mi_max_threads, 1); 1168 mutex_enter(&mi->mi_lock); 1169 /* 1170 * We don't want to wait for mi_max_threads to go to zero, since that 1171 * happens as part of a failed unmount, but this thread should only 1172 * exit when the mount/zone is really going away. 1173 * 1174 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be 1175 * attempted: the various _async_*() functions know to do things 1176 * inline if mi_max_threads == 0. Henceforth we just drain out the 1177 * outstanding requests. 1178 * 1179 * Note that we still create zthreads even if we notice the zone is 1180 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone 1181 * shutdown sequence to take slightly longer in some cases, but 1182 * doesn't violate the protocol, as all threads will exit as soon as 1183 * they're done processing the remaining requests. 
1184 */ 1185 while (!(mi->mi_flags & MI_ASYNC_MGR_STOP) || 1186 mi->mi_async_req_count > 0) { 1187 mutex_exit(&mi->mi_lock); 1188 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1189 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock); 1190 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1191 while (mi->mi_async_req_count > 0) { 1192 /* 1193 * Paranoia: If the mount started out having 1194 * (mi->mi_max_threads == 0), and the value was 1195 * later changed (via a debugger or somesuch), 1196 * we could be confused since we will think we 1197 * can't create any threads, and the calling 1198 * code (which looks at the current value of 1199 * mi->mi_max_threads, now non-zero) thinks we 1200 * can. 1201 * 1202 * So, because we're paranoid, we create threads 1203 * up to the maximum of the original and the 1204 * current value. This means that future 1205 * (debugger-induced) lowerings of 1206 * mi->mi_max_threads are ignored for our 1207 * purposes, but who told them they could change 1208 * random values on a live kernel anyhow? 1209 */ 1210 if (mi->mi_threads < 1211 MAX(mi->mi_max_threads, max_threads)) { 1212 mi->mi_threads++; 1213 mutex_exit(&mi->mi_async_lock); 1214 VFS_HOLD(vfsp); /* hold for new thread */ 1215 (void) zthread_create(NULL, 0, nfs_async_start, 1216 vfsp, 0, minclsyspri); 1217 mutex_enter(&mi->mi_async_lock); 1218 } 1219 cv_signal(&mi->mi_async_work_cv); 1220 ASSERT(mi->mi_async_req_count != 0); 1221 mi->mi_async_req_count--; 1222 } 1223 mutex_enter(&mi->mi_lock); 1224 } 1225 mutex_exit(&mi->mi_lock); 1226 /* 1227 * Let everyone know we're done. 1228 */ 1229 mi->mi_manager_thread = NULL; 1230 cv_broadcast(&mi->mi_async_cv); 1231 1232 /* 1233 * There is no explicit call to mutex_exit(&mi->mi_async_lock) 1234 * since CALLB_CPR_EXIT is actually responsible for releasing 1235 * 'mi_async_lock'. 
1236 */ 1237 CALLB_CPR_EXIT(&cprinfo); 1238 VFS_RELE(vfsp); /* release thread's hold */ 1239 zthread_exit(); 1240 } 1241 1242 /* 1243 * Signal (and wait for) the async manager thread to clean up and go away. 1244 */ 1245 void 1246 nfs_async_manager_stop(vfs_t *vfsp) 1247 { 1248 mntinfo_t *mi = VFTOMI(vfsp); 1249 1250 mutex_enter(&mi->mi_async_lock); 1251 mutex_enter(&mi->mi_lock); 1252 mi->mi_flags |= MI_ASYNC_MGR_STOP; 1253 mutex_exit(&mi->mi_lock); 1254 cv_broadcast(&mi->mi_async_reqs_cv); 1255 while (mi->mi_manager_thread != NULL) 1256 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1257 mutex_exit(&mi->mi_async_lock); 1258 } 1259 1260 int 1261 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1262 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1263 u_offset_t, caddr_t, struct seg *, cred_t *)) 1264 { 1265 rnode_t *rp; 1266 mntinfo_t *mi; 1267 struct nfs_async_reqs *args; 1268 1269 rp = VTOR(vp); 1270 ASSERT(rp->r_freef == NULL); 1271 1272 mi = VTOMI(vp); 1273 1274 /* 1275 * If addr falls in a different segment, don't bother doing readahead. 1276 */ 1277 if (addr >= seg->s_base + seg->s_size) 1278 return (-1); 1279 1280 /* 1281 * If we can't allocate a request structure, punt on the readahead. 1282 */ 1283 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1284 return (-1); 1285 1286 /* 1287 * If a lock operation is pending, don't initiate any new 1288 * readaheads. Otherwise, bump r_count to indicate the new 1289 * asynchronous I/O. 
1290 */ 1291 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1292 kmem_free(args, sizeof (*args)); 1293 return (-1); 1294 } 1295 mutex_enter(&rp->r_statelock); 1296 rp->r_count++; 1297 mutex_exit(&rp->r_statelock); 1298 nfs_rw_exit(&rp->r_lkserlock); 1299 1300 args->a_next = NULL; 1301 #ifdef DEBUG 1302 args->a_queuer = curthread; 1303 #endif 1304 VN_HOLD(vp); 1305 args->a_vp = vp; 1306 ASSERT(cr != NULL); 1307 crhold(cr); 1308 args->a_cred = cr; 1309 args->a_io = NFS_READ_AHEAD; 1310 args->a_nfs_readahead = readahead; 1311 args->a_nfs_blkoff = blkoff; 1312 args->a_nfs_seg = seg; 1313 args->a_nfs_addr = addr; 1314 1315 mutex_enter(&mi->mi_async_lock); 1316 1317 /* 1318 * If asyncio has been disabled, don't bother readahead. 1319 */ 1320 if (mi->mi_max_threads == 0) { 1321 mutex_exit(&mi->mi_async_lock); 1322 goto noasync; 1323 } 1324 1325 /* 1326 * Link request structure into the async list and 1327 * wakeup async thread to do the i/o. 1328 */ 1329 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) { 1330 mi->mi_async_reqs[NFS_READ_AHEAD] = args; 1331 mi->mi_async_tail[NFS_READ_AHEAD] = args; 1332 } else { 1333 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args; 1334 mi->mi_async_tail[NFS_READ_AHEAD] = args; 1335 } 1336 1337 if (mi->mi_io_kstats) { 1338 mutex_enter(&mi->mi_lock); 1339 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1340 mutex_exit(&mi->mi_lock); 1341 } 1342 1343 mi->mi_async_req_count++; 1344 ASSERT(mi->mi_async_req_count != 0); 1345 cv_signal(&mi->mi_async_reqs_cv); 1346 mutex_exit(&mi->mi_async_lock); 1347 return (0); 1348 1349 noasync: 1350 mutex_enter(&rp->r_statelock); 1351 rp->r_count--; 1352 cv_broadcast(&rp->r_cv); 1353 mutex_exit(&rp->r_statelock); 1354 VN_RELE(vp); 1355 crfree(cr); 1356 kmem_free(args, sizeof (*args)); 1357 return (-1); 1358 } 1359 1360 int 1361 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 1362 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *, 1363 u_offset_t, size_t, int, cred_t *)) 
1364 { 1365 rnode_t *rp; 1366 mntinfo_t *mi; 1367 struct nfs_async_reqs *args; 1368 1369 ASSERT(flags & B_ASYNC); 1370 ASSERT(vp->v_vfsp != NULL); 1371 1372 rp = VTOR(vp); 1373 ASSERT(rp->r_count > 0); 1374 1375 mi = VTOMI(vp); 1376 1377 /* 1378 * If we can't allocate a request structure, do the putpage 1379 * operation synchronously in this thread's context. 1380 */ 1381 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1382 goto noasync; 1383 1384 args->a_next = NULL; 1385 #ifdef DEBUG 1386 args->a_queuer = curthread; 1387 #endif 1388 VN_HOLD(vp); 1389 args->a_vp = vp; 1390 ASSERT(cr != NULL); 1391 crhold(cr); 1392 args->a_cred = cr; 1393 args->a_io = NFS_PUTAPAGE; 1394 args->a_nfs_putapage = putapage; 1395 args->a_nfs_pp = pp; 1396 args->a_nfs_off = off; 1397 args->a_nfs_len = (uint_t)len; 1398 args->a_nfs_flags = flags; 1399 1400 mutex_enter(&mi->mi_async_lock); 1401 1402 /* 1403 * If asyncio has been disabled, then make a synchronous request. 1404 * This check is done a second time in case async io was diabled 1405 * while this thread was blocked waiting for memory pressure to 1406 * reduce or for the queue to drain. 1407 */ 1408 if (mi->mi_max_threads == 0) { 1409 mutex_exit(&mi->mi_async_lock); 1410 goto noasync; 1411 } 1412 1413 /* 1414 * Link request structure into the async list and 1415 * wakeup async thread to do the i/o. 
1416 */ 1417 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) { 1418 mi->mi_async_reqs[NFS_PUTAPAGE] = args; 1419 mi->mi_async_tail[NFS_PUTAPAGE] = args; 1420 } else { 1421 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args; 1422 mi->mi_async_tail[NFS_PUTAPAGE] = args; 1423 } 1424 1425 mutex_enter(&rp->r_statelock); 1426 rp->r_count++; 1427 rp->r_awcount++; 1428 mutex_exit(&rp->r_statelock); 1429 1430 if (mi->mi_io_kstats) { 1431 mutex_enter(&mi->mi_lock); 1432 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1433 mutex_exit(&mi->mi_lock); 1434 } 1435 1436 mi->mi_async_req_count++; 1437 ASSERT(mi->mi_async_req_count != 0); 1438 cv_signal(&mi->mi_async_reqs_cv); 1439 mutex_exit(&mi->mi_async_lock); 1440 return (0); 1441 1442 noasync: 1443 if (args != NULL) { 1444 VN_RELE(vp); 1445 crfree(cr); 1446 kmem_free(args, sizeof (*args)); 1447 } 1448 1449 if (curproc == proc_pageout || curproc == proc_fsflush) { 1450 /* 1451 * If we get here in the context of the pageout/fsflush, 1452 * we refuse to do a sync write, because this may hang 1453 * pageout (and the machine). In this case, we just 1454 * re-mark the page as dirty and punt on the page. 1455 * 1456 * Make sure B_FORCE isn't set. We can re-mark the 1457 * pages as dirty and unlock the pages in one swoop by 1458 * passing in B_ERROR to pvn_write_done(). However, 1459 * we should make sure B_FORCE isn't set - we don't 1460 * want the page tossed before it gets written out. 1461 */ 1462 if (flags & B_FORCE) 1463 flags &= ~(B_INVAL | B_FORCE); 1464 pvn_write_done(pp, flags | B_ERROR); 1465 return (0); 1466 } 1467 if (nfs_zone() != mi->mi_zone) { 1468 /* 1469 * So this was a cross-zone sync putpage. We pass in B_ERROR 1470 * to pvn_write_done() to re-mark the pages as dirty and unlock 1471 * them. 1472 * 1473 * We don't want to clear B_FORCE here as the caller presumably 1474 * knows what they're doing if they set it. 
1475 */ 1476 pvn_write_done(pp, flags | B_ERROR); 1477 return (EPERM); 1478 } 1479 return ((*putapage)(vp, pp, off, len, flags, cr)); 1480 } 1481 1482 int 1483 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 1484 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t, 1485 size_t, int, cred_t *)) 1486 { 1487 rnode_t *rp; 1488 mntinfo_t *mi; 1489 struct nfs_async_reqs *args; 1490 1491 ASSERT(flags & B_ASYNC); 1492 ASSERT(vp->v_vfsp != NULL); 1493 1494 rp = VTOR(vp); 1495 ASSERT(rp->r_count > 0); 1496 1497 mi = VTOMI(vp); 1498 1499 /* 1500 * If we can't allocate a request structure, do the pageio 1501 * request synchronously in this thread's context. 1502 */ 1503 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1504 goto noasync; 1505 1506 args->a_next = NULL; 1507 #ifdef DEBUG 1508 args->a_queuer = curthread; 1509 #endif 1510 VN_HOLD(vp); 1511 args->a_vp = vp; 1512 ASSERT(cr != NULL); 1513 crhold(cr); 1514 args->a_cred = cr; 1515 args->a_io = NFS_PAGEIO; 1516 args->a_nfs_pageio = pageio; 1517 args->a_nfs_pp = pp; 1518 args->a_nfs_off = io_off; 1519 args->a_nfs_len = (uint_t)io_len; 1520 args->a_nfs_flags = flags; 1521 1522 mutex_enter(&mi->mi_async_lock); 1523 1524 /* 1525 * If asyncio has been disabled, then make a synchronous request. 1526 * This check is done a second time in case async io was diabled 1527 * while this thread was blocked waiting for memory pressure to 1528 * reduce or for the queue to drain. 1529 */ 1530 if (mi->mi_max_threads == 0) { 1531 mutex_exit(&mi->mi_async_lock); 1532 goto noasync; 1533 } 1534 1535 /* 1536 * Link request structure into the async list and 1537 * wakeup async thread to do the i/o. 
1538 */ 1539 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) { 1540 mi->mi_async_reqs[NFS_PAGEIO] = args; 1541 mi->mi_async_tail[NFS_PAGEIO] = args; 1542 } else { 1543 mi->mi_async_tail[NFS_PAGEIO]->a_next = args; 1544 mi->mi_async_tail[NFS_PAGEIO] = args; 1545 } 1546 1547 mutex_enter(&rp->r_statelock); 1548 rp->r_count++; 1549 rp->r_awcount++; 1550 mutex_exit(&rp->r_statelock); 1551 1552 if (mi->mi_io_kstats) { 1553 mutex_enter(&mi->mi_lock); 1554 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1555 mutex_exit(&mi->mi_lock); 1556 } 1557 1558 mi->mi_async_req_count++; 1559 ASSERT(mi->mi_async_req_count != 0); 1560 cv_signal(&mi->mi_async_reqs_cv); 1561 mutex_exit(&mi->mi_async_lock); 1562 return (0); 1563 1564 noasync: 1565 if (args != NULL) { 1566 VN_RELE(vp); 1567 crfree(cr); 1568 kmem_free(args, sizeof (*args)); 1569 } 1570 1571 /* 1572 * If we can't do it ASYNC, for reads we do nothing (but cleanup 1573 * the page list), for writes we do it synchronously, except for 1574 * proc_pageout/proc_fsflush as described below. 1575 */ 1576 if (flags & B_READ) { 1577 pvn_read_done(pp, flags | B_ERROR); 1578 return (0); 1579 } 1580 1581 if (curproc == proc_pageout || curproc == proc_fsflush) { 1582 /* 1583 * If we get here in the context of the pageout/fsflush, 1584 * we refuse to do a sync write, because this may hang 1585 * pageout/fsflush (and the machine). In this case, we just 1586 * re-mark the page as dirty and punt on the page. 1587 * 1588 * Make sure B_FORCE isn't set. We can re-mark the 1589 * pages as dirty and unlock the pages in one swoop by 1590 * passing in B_ERROR to pvn_write_done(). However, 1591 * we should make sure B_FORCE isn't set - we don't 1592 * want the page tossed before it gets written out. 1593 */ 1594 if (flags & B_FORCE) 1595 flags &= ~(B_INVAL | B_FORCE); 1596 pvn_write_done(pp, flags | B_ERROR); 1597 return (0); 1598 } 1599 1600 if (nfs_zone() != mi->mi_zone) { 1601 /* 1602 * So this was a cross-zone sync pageio. 
We pass in B_ERROR 1603 * to pvn_write_done() to re-mark the pages as dirty and unlock 1604 * them. 1605 * 1606 * We don't want to clear B_FORCE here as the caller presumably 1607 * knows what they're doing if they set it. 1608 */ 1609 pvn_write_done(pp, flags | B_ERROR); 1610 return (EPERM); 1611 } 1612 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1613 } 1614 1615 void 1616 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr, 1617 int (*readdir)(vnode_t *, rddir_cache *, cred_t *)) 1618 { 1619 rnode_t *rp; 1620 mntinfo_t *mi; 1621 struct nfs_async_reqs *args; 1622 1623 rp = VTOR(vp); 1624 ASSERT(rp->r_freef == NULL); 1625 1626 mi = VTOMI(vp); 1627 1628 /* 1629 * If we can't allocate a request structure, do the readdir 1630 * operation synchronously in this thread's context. 1631 */ 1632 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1633 goto noasync; 1634 1635 args->a_next = NULL; 1636 #ifdef DEBUG 1637 args->a_queuer = curthread; 1638 #endif 1639 VN_HOLD(vp); 1640 args->a_vp = vp; 1641 ASSERT(cr != NULL); 1642 crhold(cr); 1643 args->a_cred = cr; 1644 args->a_io = NFS_READDIR; 1645 args->a_nfs_readdir = readdir; 1646 args->a_nfs_rdc = rdc; 1647 1648 mutex_enter(&mi->mi_async_lock); 1649 1650 /* 1651 * If asyncio has been disabled, then make a synchronous request. 1652 */ 1653 if (mi->mi_max_threads == 0) { 1654 mutex_exit(&mi->mi_async_lock); 1655 goto noasync; 1656 } 1657 1658 /* 1659 * Link request structure into the async list and 1660 * wakeup async thread to do the i/o. 
1661 */ 1662 if (mi->mi_async_reqs[NFS_READDIR] == NULL) { 1663 mi->mi_async_reqs[NFS_READDIR] = args; 1664 mi->mi_async_tail[NFS_READDIR] = args; 1665 } else { 1666 mi->mi_async_tail[NFS_READDIR]->a_next = args; 1667 mi->mi_async_tail[NFS_READDIR] = args; 1668 } 1669 1670 mutex_enter(&rp->r_statelock); 1671 rp->r_count++; 1672 mutex_exit(&rp->r_statelock); 1673 1674 if (mi->mi_io_kstats) { 1675 mutex_enter(&mi->mi_lock); 1676 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1677 mutex_exit(&mi->mi_lock); 1678 } 1679 1680 mi->mi_async_req_count++; 1681 ASSERT(mi->mi_async_req_count != 0); 1682 cv_signal(&mi->mi_async_reqs_cv); 1683 mutex_exit(&mi->mi_async_lock); 1684 return; 1685 1686 noasync: 1687 if (args != NULL) { 1688 VN_RELE(vp); 1689 crfree(cr); 1690 kmem_free(args, sizeof (*args)); 1691 } 1692 1693 rdc->entries = NULL; 1694 mutex_enter(&rp->r_statelock); 1695 ASSERT(rdc->flags & RDDIR); 1696 rdc->flags &= ~RDDIR; 1697 rdc->flags |= RDDIRREQ; 1698 /* 1699 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT 1700 * is set, wakeup the thread sleeping in cv_wait_sig(). 1701 * The woken up thread will reset the flag to RDDIR and will 1702 * continue with the readdir opeartion. 1703 */ 1704 if (rdc->flags & RDDIRWAIT) { 1705 rdc->flags &= ~RDDIRWAIT; 1706 cv_broadcast(&rdc->cv); 1707 } 1708 mutex_exit(&rp->r_statelock); 1709 rddir_cache_rele(rdc); 1710 } 1711 1712 void 1713 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 1714 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, 1715 cred_t *)) 1716 { 1717 rnode_t *rp; 1718 mntinfo_t *mi; 1719 struct nfs_async_reqs *args; 1720 page_t *pp; 1721 1722 rp = VTOR(vp); 1723 mi = VTOMI(vp); 1724 1725 /* 1726 * If we can't allocate a request structure, do the commit 1727 * operation synchronously in this thread's context. 
1728 */ 1729 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1730 goto noasync; 1731 1732 args->a_next = NULL; 1733 #ifdef DEBUG 1734 args->a_queuer = curthread; 1735 #endif 1736 VN_HOLD(vp); 1737 args->a_vp = vp; 1738 ASSERT(cr != NULL); 1739 crhold(cr); 1740 args->a_cred = cr; 1741 args->a_io = NFS_COMMIT; 1742 args->a_nfs_commit = commit; 1743 args->a_nfs_plist = plist; 1744 args->a_nfs_offset = offset; 1745 args->a_nfs_count = count; 1746 1747 mutex_enter(&mi->mi_async_lock); 1748 1749 /* 1750 * If asyncio has been disabled, then make a synchronous request. 1751 * This check is done a second time in case async io was diabled 1752 * while this thread was blocked waiting for memory pressure to 1753 * reduce or for the queue to drain. 1754 */ 1755 if (mi->mi_max_threads == 0) { 1756 mutex_exit(&mi->mi_async_lock); 1757 goto noasync; 1758 } 1759 1760 /* 1761 * Link request structure into the async list and 1762 * wakeup async thread to do the i/o. 1763 */ 1764 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) { 1765 mi->mi_async_reqs[NFS_COMMIT] = args; 1766 mi->mi_async_tail[NFS_COMMIT] = args; 1767 } else { 1768 mi->mi_async_tail[NFS_COMMIT]->a_next = args; 1769 mi->mi_async_tail[NFS_COMMIT] = args; 1770 } 1771 1772 mutex_enter(&rp->r_statelock); 1773 rp->r_count++; 1774 mutex_exit(&rp->r_statelock); 1775 1776 if (mi->mi_io_kstats) { 1777 mutex_enter(&mi->mi_lock); 1778 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1779 mutex_exit(&mi->mi_lock); 1780 } 1781 1782 mi->mi_async_req_count++; 1783 ASSERT(mi->mi_async_req_count != 0); 1784 cv_signal(&mi->mi_async_reqs_cv); 1785 mutex_exit(&mi->mi_async_lock); 1786 return; 1787 1788 noasync: 1789 if (args != NULL) { 1790 VN_RELE(vp); 1791 crfree(cr); 1792 kmem_free(args, sizeof (*args)); 1793 } 1794 1795 if (curproc == proc_pageout || curproc == proc_fsflush || 1796 nfs_zone() != mi->mi_zone) { 1797 while (plist != NULL) { 1798 pp = plist; 1799 page_sub(&plist, pp); 1800 pp->p_fsdata = C_COMMIT; 1801 
page_unlock(pp); 1802 } 1803 return; 1804 } 1805 (*commit)(vp, plist, offset, count, cr); 1806 } 1807 1808 void 1809 nfs_async_inactive(vnode_t *vp, cred_t *cr, 1810 void (*inactive)(vnode_t *, cred_t *, caller_context_t *)) 1811 { 1812 mntinfo_t *mi; 1813 struct nfs_async_reqs *args; 1814 1815 mi = VTOMI(vp); 1816 1817 args = kmem_alloc(sizeof (*args), KM_SLEEP); 1818 args->a_next = NULL; 1819 #ifdef DEBUG 1820 args->a_queuer = curthread; 1821 #endif 1822 args->a_vp = vp; 1823 ASSERT(cr != NULL); 1824 crhold(cr); 1825 args->a_cred = cr; 1826 args->a_io = NFS_INACTIVE; 1827 args->a_nfs_inactive = inactive; 1828 1829 /* 1830 * Note that we don't check mi->mi_max_threads here, since we 1831 * *need* to get rid of this vnode regardless of whether someone 1832 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system. 1833 * 1834 * The manager thread knows about this and is willing to create 1835 * at least one thread to accommodate us. 1836 */ 1837 mutex_enter(&mi->mi_async_lock); 1838 if (mi->mi_manager_thread == NULL) { 1839 rnode_t *rp = VTOR(vp); 1840 1841 mutex_exit(&mi->mi_async_lock); 1842 crfree(cr); /* drop our reference */ 1843 kmem_free(args, sizeof (*args)); 1844 /* 1845 * We can't do an over-the-wire call since we're in the wrong 1846 * zone, so we need to clean up state as best we can and then 1847 * throw away the vnode. 1848 */ 1849 mutex_enter(&rp->r_statelock); 1850 if (rp->r_unldvp != NULL) { 1851 vnode_t *unldvp; 1852 char *unlname; 1853 cred_t *unlcred; 1854 1855 unldvp = rp->r_unldvp; 1856 rp->r_unldvp = NULL; 1857 unlname = rp->r_unlname; 1858 rp->r_unlname = NULL; 1859 unlcred = rp->r_unlcred; 1860 rp->r_unlcred = NULL; 1861 mutex_exit(&rp->r_statelock); 1862 1863 VN_RELE(unldvp); 1864 kmem_free(unlname, MAXNAMELEN); 1865 crfree(unlcred); 1866 } else { 1867 mutex_exit(&rp->r_statelock); 1868 } 1869 /* 1870 * No need to explicitly throw away any cached pages. 
The 1871 * eventual rinactive() will attempt a synchronous 1872 * VOP_PUTPAGE() which will immediately fail since the request 1873 * is coming from the wrong zone, and then will proceed to call 1874 * nfs_invalidate_pages() which will clean things up for us. 1875 */ 1876 rp_addfree(VTOR(vp), cr); 1877 return; 1878 } 1879 1880 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) { 1881 mi->mi_async_reqs[NFS_INACTIVE] = args; 1882 } else { 1883 mi->mi_async_tail[NFS_INACTIVE]->a_next = args; 1884 } 1885 mi->mi_async_tail[NFS_INACTIVE] = args; 1886 /* 1887 * Don't increment r_count, since we're trying to get rid of the vnode. 1888 */ 1889 1890 mi->mi_async_req_count++; 1891 ASSERT(mi->mi_async_req_count != 0); 1892 cv_signal(&mi->mi_async_reqs_cv); 1893 mutex_exit(&mi->mi_async_lock); 1894 } 1895 1896 /* 1897 * The async queues for each mounted file system are arranged as a 1898 * set of queues, one for each async i/o type. Requests are taken 1899 * from the queues in a round-robin fashion. A number of consecutive 1900 * requests are taken from each queue before moving on to the next 1901 * queue. This functionality may allow the NFS Version 2 server to do 1902 * write clustering, even if the client is mixing writes and reads 1903 * because it will take multiple write requests from the queue 1904 * before processing any of the other async i/o types. 1905 * 1906 * XXX The nfs_async_start thread is unsafe in the light of the present 1907 * model defined by cpr to suspend the system. Specifically over the 1908 * wire calls are cpr-unsafe. The thread should be reevaluated in 1909 * case of future updates to the cpr model. 1910 */ 1911 static void 1912 nfs_async_start(struct vfs *vfsp) 1913 { 1914 struct nfs_async_reqs *args; 1915 mntinfo_t *mi = VFTOMI(vfsp); 1916 clock_t time_left = 1; 1917 callb_cpr_t cprinfo; 1918 int i; 1919 1920 /* 1921 * Dynamic initialization of nfs_async_timeout to allow nfs to be 1922 * built in an implementation independent manner. 
1923 */ 1924 if (nfs_async_timeout == -1) 1925 nfs_async_timeout = NFS_ASYNC_TIMEOUT; 1926 1927 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas"); 1928 1929 mutex_enter(&mi->mi_async_lock); 1930 for (;;) { 1931 /* 1932 * Find the next queue containing an entry. We start 1933 * at the current queue pointer and then round robin 1934 * through all of them until we either find a non-empty 1935 * queue or have looked through all of them. 1936 */ 1937 for (i = 0; i < NFS_ASYNC_TYPES; i++) { 1938 args = *mi->mi_async_curr; 1939 if (args != NULL) 1940 break; 1941 mi->mi_async_curr++; 1942 if (mi->mi_async_curr == 1943 &mi->mi_async_reqs[NFS_ASYNC_TYPES]) 1944 mi->mi_async_curr = &mi->mi_async_reqs[0]; 1945 } 1946 /* 1947 * If we didn't find a entry, then block until woken up 1948 * again and then look through the queues again. 1949 */ 1950 if (args == NULL) { 1951 /* 1952 * Exiting is considered to be safe for CPR as well 1953 */ 1954 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1955 1956 /* 1957 * Wakeup thread waiting to unmount the file 1958 * system only if all async threads are inactive. 1959 * 1960 * If we've timed-out and there's nothing to do, 1961 * then get rid of this thread. 1962 */ 1963 if (mi->mi_max_threads == 0 || time_left <= 0) { 1964 if (--mi->mi_threads == 0) 1965 cv_signal(&mi->mi_async_cv); 1966 CALLB_CPR_EXIT(&cprinfo); 1967 VFS_RELE(vfsp); /* release thread's hold */ 1968 zthread_exit(); 1969 /* NOTREACHED */ 1970 } 1971 time_left = cv_timedwait(&mi->mi_async_work_cv, 1972 &mi->mi_async_lock, nfs_async_timeout + lbolt); 1973 1974 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1975 1976 continue; 1977 } 1978 time_left = 1; 1979 1980 /* 1981 * Remove the request from the async queue and then 1982 * update the current async request queue pointer. 
If 1983 * the current queue is empty or we have removed enough 1984 * consecutive entries from it, then reset the counter 1985 * for this queue and then move the current pointer to 1986 * the next queue. 1987 */ 1988 *mi->mi_async_curr = args->a_next; 1989 if (*mi->mi_async_curr == NULL || 1990 --mi->mi_async_clusters[args->a_io] == 0) { 1991 mi->mi_async_clusters[args->a_io] = 1992 mi->mi_async_init_clusters; 1993 mi->mi_async_curr++; 1994 if (mi->mi_async_curr == 1995 &mi->mi_async_reqs[NFS_ASYNC_TYPES]) 1996 mi->mi_async_curr = &mi->mi_async_reqs[0]; 1997 } 1998 1999 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) { 2000 mutex_enter(&mi->mi_lock); 2001 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 2002 mutex_exit(&mi->mi_lock); 2003 } 2004 2005 mutex_exit(&mi->mi_async_lock); 2006 2007 /* 2008 * Obtain arguments from the async request structure. 2009 */ 2010 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) { 2011 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff, 2012 args->a_nfs_addr, args->a_nfs_seg, 2013 args->a_cred); 2014 } else if (args->a_io == NFS_PUTAPAGE) { 2015 (void) (*args->a_nfs_putapage)(args->a_vp, 2016 args->a_nfs_pp, args->a_nfs_off, 2017 args->a_nfs_len, args->a_nfs_flags, 2018 args->a_cred); 2019 } else if (args->a_io == NFS_PAGEIO) { 2020 (void) (*args->a_nfs_pageio)(args->a_vp, 2021 args->a_nfs_pp, args->a_nfs_off, 2022 args->a_nfs_len, args->a_nfs_flags, 2023 args->a_cred); 2024 } else if (args->a_io == NFS_READDIR) { 2025 (void) ((*args->a_nfs_readdir)(args->a_vp, 2026 args->a_nfs_rdc, args->a_cred)); 2027 } else if (args->a_io == NFS_COMMIT) { 2028 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist, 2029 args->a_nfs_offset, args->a_nfs_count, 2030 args->a_cred); 2031 } else if (args->a_io == NFS_INACTIVE) { 2032 (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL); 2033 } 2034 2035 /* 2036 * Now, release the vnode and free the credentials 2037 * structure. 
2038 */ 2039 free_async_args(args); 2040 /* 2041 * Reacquire the mutex because it will be needed above. 2042 */ 2043 mutex_enter(&mi->mi_async_lock); 2044 } 2045 } 2046 2047 void 2048 nfs_async_stop(struct vfs *vfsp) 2049 { 2050 mntinfo_t *mi = VFTOMI(vfsp); 2051 2052 /* 2053 * Wait for all outstanding async operations to complete and for the 2054 * worker threads to exit. 2055 */ 2056 mutex_enter(&mi->mi_async_lock); 2057 mi->mi_max_threads = 0; 2058 cv_broadcast(&mi->mi_async_work_cv); 2059 while (mi->mi_threads != 0) 2060 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2061 mutex_exit(&mi->mi_async_lock); 2062 } 2063 2064 /* 2065 * nfs_async_stop_sig: 2066 * Wait for all outstanding putpage operation to complete. If a signal 2067 * is deliver we will abort and return non-zero. If we can put all the 2068 * pages we will return 0. This routine is called from nfs_unmount and 2069 * nfs3_unmount to make these operations interruptible. 2070 */ 2071 int 2072 nfs_async_stop_sig(struct vfs *vfsp) 2073 { 2074 mntinfo_t *mi = VFTOMI(vfsp); 2075 ushort_t omax; 2076 int rval; 2077 2078 /* 2079 * Wait for all outstanding async operations to complete and for the 2080 * worker threads to exit. 2081 */ 2082 mutex_enter(&mi->mi_async_lock); 2083 omax = mi->mi_max_threads; 2084 mi->mi_max_threads = 0; 2085 /* 2086 * Tell all the worker threads to exit. 
2087 */ 2088 cv_broadcast(&mi->mi_async_work_cv); 2089 while (mi->mi_threads != 0) { 2090 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) 2091 break; 2092 } 2093 rval = (mi->mi_threads != 0); /* Interrupted */ 2094 if (rval) 2095 mi->mi_max_threads = omax; 2096 mutex_exit(&mi->mi_async_lock); 2097 2098 return (rval); 2099 } 2100 2101 int 2102 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2103 { 2104 int pagecreate; 2105 int n; 2106 int saved_n; 2107 caddr_t saved_base; 2108 u_offset_t offset; 2109 int error; 2110 int sm_error; 2111 vnode_t *vp = RTOV(rp); 2112 2113 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2114 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2115 if (!vpm_enable) { 2116 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2117 } 2118 2119 /* 2120 * Move bytes in at most PAGESIZE chunks. We must avoid 2121 * spanning pages in uiomove() because page faults may cause 2122 * the cache to be invalidated out from under us. The r_size is not 2123 * updated until after the uiomove. If we push the last page of a 2124 * file before r_size is correct, we will lose the data written past 2125 * the current (and invalid) r_size. 2126 */ 2127 do { 2128 offset = uio->uio_loffset; 2129 pagecreate = 0; 2130 2131 /* 2132 * n is the number of bytes required to satisfy the request 2133 * or the number of bytes to fill out the page. 2134 */ 2135 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2136 2137 /* 2138 * Check to see if we can skip reading in the page 2139 * and just allocate the memory. We can do this 2140 * if we are going to rewrite the entire mapping 2141 * or if we are going to write to or beyond the current 2142 * end of file from the beginning of the mapping. 2143 * 2144 * The read of r_size is now protected by r_statelock. 
2145 */ 2146 mutex_enter(&rp->r_statelock); 2147 /* 2148 * When pgcreated is nonzero the caller has already done 2149 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2150 * segkpm this means we already have at least one page 2151 * created and mapped at base. 2152 */ 2153 pagecreate = pgcreated || 2154 ((offset & PAGEOFFSET) == 0 && 2155 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2156 2157 mutex_exit(&rp->r_statelock); 2158 if (!vpm_enable && pagecreate) { 2159 /* 2160 * The last argument tells segmap_pagecreate() to 2161 * always lock the page, as opposed to sometimes 2162 * returning with the page locked. This way we avoid a 2163 * fault on the ensuing uiomove(), but also 2164 * more importantly (to fix bug 1094402) we can 2165 * call segmap_fault() to unlock the page in all 2166 * cases. An alternative would be to modify 2167 * segmap_pagecreate() to tell us when it is 2168 * locking a page, but that's a fairly major 2169 * interface change. 2170 */ 2171 if (pgcreated == 0) 2172 (void) segmap_pagecreate(segkmap, base, 2173 (uint_t)n, 1); 2174 saved_base = base; 2175 saved_n = n; 2176 } 2177 2178 /* 2179 * The number of bytes of data in the last page can not 2180 * be accurately be determined while page is being 2181 * uiomove'd to and the size of the file being updated. 2182 * Thus, inform threads which need to know accurately 2183 * how much data is in the last page of the file. They 2184 * will not do the i/o immediately, but will arrange for 2185 * the i/o to happen later when this modify operation 2186 * will have finished. 2187 */ 2188 ASSERT(!(rp->r_flags & RMODINPROGRESS)); 2189 mutex_enter(&rp->r_statelock); 2190 rp->r_flags |= RMODINPROGRESS; 2191 rp->r_modaddr = (offset & MAXBMASK); 2192 mutex_exit(&rp->r_statelock); 2193 2194 if (vpm_enable) { 2195 /* 2196 * Copy data. If new pages are created, part of 2197 * the page that is not written will be initizliazed 2198 * with zeros. 
2199 */ 2200 error = vpm_data_copy(vp, offset, n, uio, 2201 !pagecreate, NULL, 0, S_WRITE); 2202 } else { 2203 error = uiomove(base, n, UIO_WRITE, uio); 2204 } 2205 2206 /* 2207 * r_size is the maximum number of 2208 * bytes known to be in the file. 2209 * Make sure it is at least as high as the 2210 * first unwritten byte pointed to by uio_loffset. 2211 */ 2212 mutex_enter(&rp->r_statelock); 2213 if (rp->r_size < uio->uio_loffset) 2214 rp->r_size = uio->uio_loffset; 2215 rp->r_flags &= ~RMODINPROGRESS; 2216 rp->r_flags |= RDIRTY; 2217 mutex_exit(&rp->r_statelock); 2218 2219 /* n = # of bytes written */ 2220 n = (int)(uio->uio_loffset - offset); 2221 2222 if (!vpm_enable) { 2223 base += n; 2224 } 2225 tcount -= n; 2226 /* 2227 * If we created pages w/o initializing them completely, 2228 * we need to zero the part that wasn't set up. 2229 * This happens on a most EOF write cases and if 2230 * we had some sort of error during the uiomove. 2231 */ 2232 if (!vpm_enable && pagecreate) { 2233 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2234 (void) kzero(base, PAGESIZE - n); 2235 2236 if (pgcreated) { 2237 /* 2238 * Caller is responsible for this page, 2239 * it was not created in this loop. 2240 */ 2241 pgcreated = 0; 2242 } else { 2243 /* 2244 * For bug 1094402: segmap_pagecreate locks 2245 * page. Unlock it. This also unlocks the 2246 * pages allocated by page_create_va() in 2247 * segmap_pagecreate(). 
2248 */ 2249 sm_error = segmap_fault(kas.a_hat, segkmap, 2250 saved_base, saved_n, 2251 F_SOFTUNLOCK, S_WRITE); 2252 if (error == 0) 2253 error = sm_error; 2254 } 2255 } 2256 } while (tcount > 0 && error == 0); 2257 2258 return (error); 2259 } 2260 2261 int 2262 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2263 { 2264 rnode_t *rp; 2265 page_t *pp; 2266 u_offset_t eoff; 2267 u_offset_t io_off; 2268 size_t io_len; 2269 int error; 2270 int rdirty; 2271 int err; 2272 2273 rp = VTOR(vp); 2274 ASSERT(rp->r_count > 0); 2275 2276 if (!vn_has_cached_data(vp)) 2277 return (0); 2278 2279 ASSERT(vp->v_type != VCHR); 2280 2281 /* 2282 * If ROUTOFSPACE is set, then all writes turn into B_INVAL 2283 * writes. B_FORCE is set to force the VM system to actually 2284 * invalidate the pages, even if the i/o failed. The pages 2285 * need to get invalidated because they can't be written out 2286 * because there isn't any space left on either the server's 2287 * file system or in the user's disk quota. The B_FREE bit 2288 * is cleared to avoid confusion as to whether this is a 2289 * request to place the page on the freelist or to destroy 2290 * it. 2291 */ 2292 if ((rp->r_flags & ROUTOFSPACE) || 2293 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2294 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2295 2296 if (len == 0) { 2297 /* 2298 * If doing a full file synchronous operation, then clear 2299 * the RDIRTY bit. If a page gets dirtied while the flush 2300 * is happening, then RDIRTY will get set again. The 2301 * RDIRTY bit must get cleared before the flush so that 2302 * we don't lose this information. 2303 * 2304 * If there are no full file async write operations 2305 * pending and RDIRTY bit is set, clear it. 
2306 */ 2307 if (off == (u_offset_t)0 && 2308 !(flags & B_ASYNC) && 2309 (rp->r_flags & RDIRTY)) { 2310 mutex_enter(&rp->r_statelock); 2311 rdirty = (rp->r_flags & RDIRTY); 2312 rp->r_flags &= ~RDIRTY; 2313 mutex_exit(&rp->r_statelock); 2314 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2315 mutex_enter(&rp->r_statelock); 2316 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) { 2317 rdirty = (rp->r_flags & RDIRTY); 2318 rp->r_flags &= ~RDIRTY; 2319 } 2320 mutex_exit(&rp->r_statelock); 2321 } else 2322 rdirty = 0; 2323 2324 /* 2325 * Search the entire vp list for pages >= off, and flush 2326 * the dirty pages. 2327 */ 2328 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2329 flags, cr); 2330 2331 /* 2332 * If an error occurred and the file was marked as dirty 2333 * before and we aren't forcibly invalidating pages, then 2334 * reset the RDIRTY flag. 2335 */ 2336 if (error && rdirty && 2337 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2338 mutex_enter(&rp->r_statelock); 2339 rp->r_flags |= RDIRTY; 2340 mutex_exit(&rp->r_statelock); 2341 } 2342 } else { 2343 /* 2344 * Do a range from [off...off + len) looking for pages 2345 * to deal with. 2346 */ 2347 error = 0; 2348 #ifdef lint 2349 io_len = 0; 2350 #endif 2351 eoff = off + len; 2352 mutex_enter(&rp->r_statelock); 2353 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2354 io_off += io_len) { 2355 mutex_exit(&rp->r_statelock); 2356 /* 2357 * If we are not invalidating, synchronously 2358 * freeing or writing pages use the routine 2359 * page_lookup_nowait() to prevent reclaiming 2360 * them from the free list. 2361 */ 2362 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2363 pp = page_lookup(vp, io_off, 2364 (flags & (B_INVAL | B_FREE)) ? 2365 SE_EXCL : SE_SHARED); 2366 } else { 2367 pp = page_lookup_nowait(vp, io_off, 2368 (flags & B_FREE) ? 
SE_EXCL : SE_SHARED); 2369 } 2370 2371 if (pp == NULL || !pvn_getdirty(pp, flags)) 2372 io_len = PAGESIZE; 2373 else { 2374 err = (*rp->r_putapage)(vp, pp, &io_off, 2375 &io_len, flags, cr); 2376 if (!error) 2377 error = err; 2378 /* 2379 * "io_off" and "io_len" are returned as 2380 * the range of pages we actually wrote. 2381 * This allows us to skip ahead more quickly 2382 * since several pages may've been dealt 2383 * with by this iteration of the loop. 2384 */ 2385 } 2386 mutex_enter(&rp->r_statelock); 2387 } 2388 mutex_exit(&rp->r_statelock); 2389 } 2390 2391 return (error); 2392 } 2393 2394 void 2395 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2396 { 2397 rnode_t *rp; 2398 2399 rp = VTOR(vp); 2400 mutex_enter(&rp->r_statelock); 2401 while (rp->r_flags & RTRUNCATE) 2402 cv_wait(&rp->r_cv, &rp->r_statelock); 2403 rp->r_flags |= RTRUNCATE; 2404 if (off == (u_offset_t)0) { 2405 rp->r_flags &= ~RDIRTY; 2406 if (!(rp->r_flags & RSTALE)) 2407 rp->r_error = 0; 2408 } 2409 rp->r_truncaddr = off; 2410 mutex_exit(&rp->r_statelock); 2411 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2412 B_INVAL | B_TRUNC, cr); 2413 mutex_enter(&rp->r_statelock); 2414 rp->r_flags &= ~RTRUNCATE; 2415 cv_broadcast(&rp->r_cv); 2416 mutex_exit(&rp->r_statelock); 2417 } 2418 2419 static int nfs_write_error_to_cons_only = 0; 2420 #define MSG(x) (nfs_write_error_to_cons_only ? 
(x) : (x) + 1) 2421 2422 /* 2423 * Print a file handle 2424 */ 2425 void 2426 nfs_printfhandle(nfs_fhandle *fhp) 2427 { 2428 int *ip; 2429 char *buf; 2430 size_t bufsize; 2431 char *cp; 2432 2433 /* 2434 * 13 == "(file handle:" 2435 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times 2436 * 1 == ' ' 2437 * 8 == maximum strlen of "%x" 2438 * 3 == ")\n\0" 2439 */ 2440 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3; 2441 buf = kmem_alloc(bufsize, KM_NOSLEEP); 2442 if (buf == NULL) 2443 return; 2444 2445 cp = buf; 2446 (void) strcpy(cp, "(file handle:"); 2447 while (*cp != '\0') 2448 cp++; 2449 for (ip = (int *)fhp->fh_buf; 2450 ip < (int *)&fhp->fh_buf[fhp->fh_len]; 2451 ip++) { 2452 (void) sprintf(cp, " %x", *ip); 2453 while (*cp != '\0') 2454 cp++; 2455 } 2456 (void) strcpy(cp, ")\n"); 2457 2458 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf); 2459 2460 kmem_free(buf, bufsize); 2461 } 2462 2463 /* 2464 * Notify the system administrator that an NFS write error has 2465 * occurred. 2466 */ 2467 2468 /* seconds between ENOSPC/EDQUOT messages */ 2469 clock_t nfs_write_error_interval = 5; 2470 2471 void 2472 nfs_write_error(vnode_t *vp, int error, cred_t *cr) 2473 { 2474 mntinfo_t *mi; 2475 2476 mi = VTOMI(vp); 2477 /* 2478 * In case of forced unmount or zone shutdown, do not print any 2479 * messages since it can flood the console with error messages. 2480 */ 2481 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) 2482 return; 2483 2484 /* 2485 * No use in flooding the console with ENOSPC 2486 * messages from the same file system. 
2487 */ 2488 if ((error != ENOSPC && error != EDQUOT) || 2489 lbolt - mi->mi_printftime > 0) { 2490 zoneid_t zoneid = mi->mi_zone->zone_id; 2491 2492 #ifdef DEBUG 2493 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2494 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL); 2495 #else 2496 nfs_perror(error, "NFS write error on host %s: %m.\n", 2497 VTOR(vp)->r_server->sv_hostname, NULL); 2498 #endif 2499 if (error == ENOSPC || error == EDQUOT) { 2500 zcmn_err(zoneid, CE_CONT, 2501 MSG("^File: userid=%d, groupid=%d\n"), 2502 crgetuid(cr), crgetgid(cr)); 2503 if (crgetuid(CRED()) != crgetuid(cr) || 2504 crgetgid(CRED()) != crgetgid(cr)) { 2505 zcmn_err(zoneid, CE_CONT, 2506 MSG("^User: userid=%d, groupid=%d\n"), 2507 crgetuid(CRED()), crgetgid(CRED())); 2508 } 2509 mi->mi_printftime = lbolt + 2510 nfs_write_error_interval * hz; 2511 } 2512 nfs_printfhandle(&VTOR(vp)->r_fh); 2513 #ifdef DEBUG 2514 if (error == EACCES) { 2515 zcmn_err(zoneid, CE_CONT, 2516 MSG("^nfs_bio: cred is%s kcred\n"), 2517 cr == kcred ? "" : " not"); 2518 } 2519 #endif 2520 } 2521 } 2522 2523 /* ARGSUSED */ 2524 static void * 2525 nfs_mi_init(zoneid_t zoneid) 2526 { 2527 struct mi_globals *mig; 2528 2529 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2530 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2531 list_create(&mig->mig_list, sizeof (mntinfo_t), 2532 offsetof(mntinfo_t, mi_zone_node)); 2533 mig->mig_destructor_called = B_FALSE; 2534 return (mig); 2535 } 2536 2537 /* 2538 * Callback routine to tell all NFS mounts in the zone to stop creating new 2539 * threads. Existing threads should exit. 
2540 */ 2541 /* ARGSUSED */ 2542 static void 2543 nfs_mi_shutdown(zoneid_t zoneid, void *data) 2544 { 2545 struct mi_globals *mig = data; 2546 mntinfo_t *mi; 2547 2548 ASSERT(mig != NULL); 2549 again: 2550 mutex_enter(&mig->mig_lock); 2551 for (mi = list_head(&mig->mig_list); mi != NULL; 2552 mi = list_next(&mig->mig_list, mi)) { 2553 2554 /* 2555 * If we've done the shutdown work for this FS, skip. 2556 * Once we go off the end of the list, we're done. 2557 */ 2558 if (mi->mi_flags & MI_DEAD) 2559 continue; 2560 2561 /* 2562 * We will do work, so not done. Get a hold on the FS. 2563 */ 2564 VFS_HOLD(mi->mi_vfsp); 2565 2566 /* 2567 * purge the DNLC for this filesystem 2568 */ 2569 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2570 2571 mutex_enter(&mi->mi_async_lock); 2572 /* 2573 * Tell existing async worker threads to exit. 2574 */ 2575 mi->mi_max_threads = 0; 2576 cv_broadcast(&mi->mi_async_work_cv); 2577 /* 2578 * Set MI_ASYNC_MGR_STOP so the async manager thread starts 2579 * getting ready to exit when it's done with its current work. 2580 * Also set MI_DEAD to note we've acted on this FS. 2581 */ 2582 mutex_enter(&mi->mi_lock); 2583 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD); 2584 mutex_exit(&mi->mi_lock); 2585 /* 2586 * Wake up the async manager thread. 2587 */ 2588 cv_broadcast(&mi->mi_async_reqs_cv); 2589 mutex_exit(&mi->mi_async_lock); 2590 2591 /* 2592 * Drop lock and release FS, which may change list, then repeat. 2593 * We're done when every mi has been done or the list is empty. 
2594 */ 2595 mutex_exit(&mig->mig_lock); 2596 VFS_RELE(mi->mi_vfsp); 2597 goto again; 2598 } 2599 mutex_exit(&mig->mig_lock); 2600 } 2601 2602 static void 2603 nfs_mi_free_globals(struct mi_globals *mig) 2604 { 2605 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2606 mutex_destroy(&mig->mig_lock); 2607 kmem_free(mig, sizeof (*mig)); 2608 2609 } 2610 2611 /* ARGSUSED */ 2612 static void 2613 nfs_mi_destroy(zoneid_t zoneid, void *data) 2614 { 2615 struct mi_globals *mig = data; 2616 2617 ASSERT(mig != NULL); 2618 mutex_enter(&mig->mig_lock); 2619 if (list_head(&mig->mig_list) != NULL) { 2620 /* Still waiting for VFS_FREEVFS() */ 2621 mig->mig_destructor_called = B_TRUE; 2622 mutex_exit(&mig->mig_lock); 2623 return; 2624 } 2625 nfs_mi_free_globals(mig); 2626 } 2627 2628 /* 2629 * Add an NFS mount to the per-zone list of NFS mounts. 2630 */ 2631 void 2632 nfs_mi_zonelist_add(mntinfo_t *mi) 2633 { 2634 struct mi_globals *mig; 2635 2636 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2637 mutex_enter(&mig->mig_lock); 2638 list_insert_head(&mig->mig_list, mi); 2639 mutex_exit(&mig->mig_lock); 2640 } 2641 2642 /* 2643 * Remove an NFS mount from the per-zone list of NFS mounts. 2644 */ 2645 static void 2646 nfs_mi_zonelist_remove(mntinfo_t *mi) 2647 { 2648 struct mi_globals *mig; 2649 2650 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2651 mutex_enter(&mig->mig_lock); 2652 list_remove(&mig->mig_list, mi); 2653 /* 2654 * We can be called asynchronously by VFS_FREEVFS() after the zone 2655 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2656 * mi globals. 2657 */ 2658 if (list_head(&mig->mig_list) == NULL && 2659 mig->mig_destructor_called == B_TRUE) { 2660 nfs_mi_free_globals(mig); 2661 return; 2662 } 2663 mutex_exit(&mig->mig_lock); 2664 } 2665 2666 /* 2667 * NFS Client initialization routine. This routine should only be called 2668 * once. 
It performs the following tasks: 2669 * - Initalize all global locks 2670 * - Call sub-initialization routines (localize access to variables) 2671 */ 2672 int 2673 nfs_clntinit(void) 2674 { 2675 #ifdef DEBUG 2676 static boolean_t nfs_clntup = B_FALSE; 2677 #endif 2678 int error; 2679 2680 #ifdef DEBUG 2681 ASSERT(nfs_clntup == B_FALSE); 2682 #endif 2683 2684 error = nfs_subrinit(); 2685 if (error) 2686 return (error); 2687 2688 error = nfs_vfsinit(); 2689 if (error) { 2690 /* 2691 * Cleanup nfs_subrinit() work 2692 */ 2693 nfs_subrfini(); 2694 return (error); 2695 } 2696 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown, 2697 nfs_mi_destroy); 2698 2699 nfs4_clnt_init(); 2700 2701 #ifdef DEBUG 2702 nfs_clntup = B_TRUE; 2703 #endif 2704 2705 return (0); 2706 } 2707 2708 /* 2709 * This routine is only called if the NFS Client has been initialized but 2710 * the module failed to be installed. This routine will cleanup the previously 2711 * allocated/initialized work. 2712 */ 2713 void 2714 nfs_clntfini(void) 2715 { 2716 (void) zone_key_delete(mi_list_key); 2717 nfs_subrfini(); 2718 nfs_vfsfini(); 2719 nfs4_clnt_fini(); 2720 } 2721 2722 /* 2723 * nfs_lockrelease: 2724 * 2725 * Release any locks on the given vnode that are held by the current 2726 * process. 2727 */ 2728 void 2729 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 2730 { 2731 flock64_t ld; 2732 struct shrlock shr; 2733 char *buf; 2734 int remote_lock_possible; 2735 int ret; 2736 2737 ASSERT((uintptr_t)vp > KERNELBASE); 2738 2739 /* 2740 * Generate an explicit unlock operation for the entire file. As a 2741 * partial optimization, only generate the unlock if there is a 2742 * lock registered for the file. We could check whether this 2743 * particular process has any locks on the file, but that would 2744 * require the local locking code to provide yet another query 2745 * routine. Note that no explicit synchronization is needed here. 
2746 * At worst, flk_has_remote_locks() will return a false positive, 2747 * in which case the unlock call wastes time but doesn't harm 2748 * correctness. 2749 * 2750 * In addition, an unlock request is generated if the process 2751 * is listed as possibly having a lock on the file because the 2752 * server and client lock managers may have gotten out of sync. 2753 * N.B. It is important to make sure nfs_remove_locking_id() is 2754 * called here even if flk_has_remote_locks(vp) reports true. 2755 * If it is not called and there is an entry on the process id 2756 * list, that entry will never get removed. 2757 */ 2758 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID, 2759 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2760 if (remote_lock_possible || flk_has_remote_locks(vp)) { 2761 ld.l_type = F_UNLCK; /* set to unlock entire file */ 2762 ld.l_whence = 0; /* unlock from start of file */ 2763 ld.l_start = 0; 2764 ld.l_len = 0; /* do entire file */ 2765 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr, 2766 NULL); 2767 2768 if (ret != 0) { 2769 /* 2770 * If VOP_FRLOCK fails, make sure we unregister 2771 * local locks before we continue. 2772 */ 2773 ld.l_pid = ttoproc(curthread)->p_pid; 2774 lm_register_lock_locally(vp, NULL, &ld, flag, offset); 2775 #ifdef DEBUG 2776 nfs_perror(ret, 2777 "NFS lock release error on vp %p: %m.\n", 2778 (void *)vp, NULL); 2779 #endif 2780 } 2781 2782 /* 2783 * The call to VOP_FRLOCK may put the pid back on the 2784 * list. We need to remove it. 2785 */ 2786 (void) nfs_remove_locking_id(vp, RLMPL_PID, 2787 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2788 } 2789 2790 /* 2791 * As long as the vp has a share matching our pid, 2792 * pluck it off and unshare it. There are circumstances in 2793 * which the call to nfs_remove_locking_id() may put the 2794 * owner back on the list, in which case we simply do a 2795 * redundant and harmless unshare. 
2796 */ 2797 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP); 2798 while (nfs_remove_locking_id(vp, RLMPL_OWNER, 2799 (char *)NULL, buf, &shr.s_own_len)) { 2800 shr.s_owner = buf; 2801 shr.s_access = 0; 2802 shr.s_deny = 0; 2803 shr.s_sysid = 0; 2804 shr.s_pid = curproc->p_pid; 2805 2806 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL); 2807 #ifdef DEBUG 2808 if (ret != 0) { 2809 nfs_perror(ret, 2810 "NFS share release error on vp %p: %m.\n", 2811 (void *)vp, NULL); 2812 } 2813 #endif 2814 } 2815 kmem_free(buf, MAX_SHR_OWNER_LEN); 2816 } 2817 2818 /* 2819 * nfs_lockcompletion: 2820 * 2821 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2822 * as non cachable (set VNOCACHE bit). 2823 */ 2824 2825 void 2826 nfs_lockcompletion(vnode_t *vp, int cmd) 2827 { 2828 #ifdef DEBUG 2829 rnode_t *rp = VTOR(vp); 2830 2831 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2832 #endif 2833 2834 if (cmd == F_SETLK || cmd == F_SETLKW) { 2835 if (!lm_safemap(vp)) { 2836 mutex_enter(&vp->v_lock); 2837 vp->v_flag |= VNOCACHE; 2838 mutex_exit(&vp->v_lock); 2839 } else { 2840 mutex_enter(&vp->v_lock); 2841 vp->v_flag &= ~VNOCACHE; 2842 mutex_exit(&vp->v_lock); 2843 } 2844 } 2845 /* 2846 * The cached attributes of the file are stale after acquiring 2847 * the lock on the file. They were updated when the file was 2848 * opened, but not updated when the lock was acquired. Therefore the 2849 * cached attributes are invalidated after the lock is obtained. 2850 */ 2851 PURGE_ATTRCACHE(vp); 2852 } 2853 2854 /* 2855 * The lock manager holds state making it possible for the client 2856 * and server to be out of sync. For example, if the response from 2857 * the server granting a lock request is lost, the server will think 2858 * the lock is granted and the client will think the lock is lost. 2859 * The client can tell when it is not positive if it is in sync with 2860 * the server. 
*
 * To deal with this, a list of processes for which the client is
 * not sure if the server holds a lock is attached to the rnode.
 * When such a process closes the rnode, an unlock request is sent
 * to the server to unlock the entire file.
 *
 * The list is kept as a singly linked NULL terminated list.
 * Because it is only added to under extreme error conditions, the
 * list shouldn't get very big. DEBUG kernels print a message if
 * the list gets bigger than nfs_lmpl_high_water. The threshold is
 * arbitrary (the default below is 128; the earlier value of 8 is kept
 * in the commented-out line) and can be tuned at runtime.
 */
#ifdef DEBUG
/* int nfs_lmpl_high_water = 8; */
int nfs_lmpl_high_water = 128;
/* number of calls made to nfs_add_locking_id() */
int nfs_cnt_add_locking_id = 0;
/* longest r_lmpl list length seen by nfs_add_locking_id() */
int nfs_len_add_locking_id = 0;
#endif /* DEBUG */

/*
 * Record that the nfs lock manager server may be holding a lock on
 * a vnode for a process.
 *
 * Because the nfs lock manager server holds state, it is possible
 * for the server to get out of sync with the client. This routine is called
 * from the client when it is no longer sure if the server is in sync
 * with the client.
nfs_lockrelease() will then notice this and send 2888 * an unlock request when the file is closed 2889 */ 2890 void 2891 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len) 2892 { 2893 rnode_t *rp; 2894 lmpl_t *new; 2895 lmpl_t *cur; 2896 lmpl_t **lmplp; 2897 #ifdef DEBUG 2898 int list_len = 1; 2899 #endif /* DEBUG */ 2900 2901 #ifdef DEBUG 2902 ++nfs_cnt_add_locking_id; 2903 #endif /* DEBUG */ 2904 /* 2905 * allocate new lmpl_t now so we don't sleep 2906 * later after grabbing mutexes 2907 */ 2908 ASSERT(len < MAX_SHR_OWNER_LEN); 2909 new = kmem_alloc(sizeof (*new), KM_SLEEP); 2910 new->lmpl_type = type; 2911 new->lmpl_pid = pid; 2912 new->lmpl_owner = kmem_alloc(len, KM_SLEEP); 2913 bcopy(id, new->lmpl_owner, len); 2914 new->lmpl_own_len = len; 2915 new->lmpl_next = (lmpl_t *)NULL; 2916 #ifdef DEBUG 2917 if (type == RLMPL_PID) { 2918 ASSERT(len == sizeof (pid_t)); 2919 ASSERT(pid == *(pid_t *)new->lmpl_owner); 2920 } else { 2921 ASSERT(type == RLMPL_OWNER); 2922 } 2923 #endif 2924 2925 rp = VTOR(vp); 2926 mutex_enter(&rp->r_statelock); 2927 2928 /* 2929 * Add this id to the list for this rnode only if the 2930 * rnode is active and the id is not already there. 
2931 */ 2932 ASSERT(rp->r_flags & RHASHED); 2933 lmplp = &(rp->r_lmpl); 2934 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 2935 if (cur->lmpl_pid == pid && 2936 cur->lmpl_type == type && 2937 cur->lmpl_own_len == len && 2938 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) { 2939 kmem_free(new->lmpl_owner, len); 2940 kmem_free(new, sizeof (*new)); 2941 break; 2942 } 2943 lmplp = &cur->lmpl_next; 2944 #ifdef DEBUG 2945 ++list_len; 2946 #endif /* DEBUG */ 2947 } 2948 if (cur == (lmpl_t *)NULL) { 2949 *lmplp = new; 2950 #ifdef DEBUG 2951 if (list_len > nfs_len_add_locking_id) { 2952 nfs_len_add_locking_id = list_len; 2953 } 2954 if (list_len > nfs_lmpl_high_water) { 2955 cmn_err(CE_WARN, "nfs_add_locking_id: long list " 2956 "vp=%p is %d", (void *)vp, list_len); 2957 } 2958 #endif /* DEBUG */ 2959 } 2960 2961 #ifdef DEBUG 2962 if (share_debug) { 2963 int nitems = 0; 2964 int npids = 0; 2965 int nowners = 0; 2966 2967 /* 2968 * Count the number of things left on r_lmpl after the remove. 2969 */ 2970 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 2971 cur = cur->lmpl_next) { 2972 nitems++; 2973 if (cur->lmpl_type == RLMPL_PID) { 2974 npids++; 2975 } else if (cur->lmpl_type == RLMPL_OWNER) { 2976 nowners++; 2977 } else { 2978 cmn_err(CE_PANIC, "nfs_add_locking_id: " 2979 "unrecognized lmpl_type %d", 2980 cur->lmpl_type); 2981 } 2982 } 2983 2984 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d " 2985 "OWNs = %d items left on r_lmpl\n", 2986 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems); 2987 } 2988 #endif 2989 2990 mutex_exit(&rp->r_statelock); 2991 } 2992 2993 /* 2994 * Remove an id from the lock manager id list. 2995 * 2996 * If the id is not in the list return 0. If it was found and 2997 * removed, return 1. 
2998 */ 2999 static int 3000 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen) 3001 { 3002 lmpl_t *cur; 3003 lmpl_t **lmplp; 3004 rnode_t *rp; 3005 int rv = 0; 3006 3007 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER); 3008 3009 rp = VTOR(vp); 3010 3011 mutex_enter(&rp->r_statelock); 3012 ASSERT(rp->r_flags & RHASHED); 3013 lmplp = &(rp->r_lmpl); 3014 3015 /* 3016 * Search through the list and remove the entry for this id 3017 * if it is there. The special case id == NULL allows removal 3018 * of the first share on the r_lmpl list belonging to the 3019 * current process (if any), without regard to further details 3020 * of its identity. 3021 */ 3022 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3023 if (cur->lmpl_type == type && 3024 cur->lmpl_pid == curproc->p_pid && 3025 (id == (char *)NULL || 3026 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) { 3027 *lmplp = cur->lmpl_next; 3028 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN); 3029 if (rid != NULL) { 3030 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len); 3031 *rlen = cur->lmpl_own_len; 3032 } 3033 kmem_free(cur->lmpl_owner, cur->lmpl_own_len); 3034 kmem_free(cur, sizeof (*cur)); 3035 rv = 1; 3036 break; 3037 } 3038 lmplp = &cur->lmpl_next; 3039 } 3040 3041 #ifdef DEBUG 3042 if (share_debug) { 3043 int nitems = 0; 3044 int npids = 0; 3045 int nowners = 0; 3046 3047 /* 3048 * Count the number of things left on r_lmpl after the remove. 3049 */ 3050 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3051 cur = cur->lmpl_next) { 3052 nitems++; 3053 if (cur->lmpl_type == RLMPL_PID) { 3054 npids++; 3055 } else if (cur->lmpl_type == RLMPL_OWNER) { 3056 nowners++; 3057 } else { 3058 cmn_err(CE_PANIC, 3059 "nrli: unrecognized lmpl_type %d", 3060 cur->lmpl_type); 3061 } 3062 } 3063 3064 cmn_err(CE_CONT, 3065 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n", 3066 (type == RLMPL_PID) ? 
"P" : "O", 3067 npids, 3068 nowners, 3069 nitems); 3070 } 3071 #endif 3072 3073 mutex_exit(&rp->r_statelock); 3074 return (rv); 3075 } 3076 3077 void 3078 nfs_free_mi(mntinfo_t *mi) 3079 { 3080 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP); 3081 ASSERT(mi->mi_manager_thread == NULL); 3082 ASSERT(mi->mi_threads == 0); 3083 3084 /* 3085 * Remove the node from the global list before we start tearing it down. 3086 */ 3087 nfs_mi_zonelist_remove(mi); 3088 if (mi->mi_klmconfig) { 3089 lm_free_config(mi->mi_klmconfig); 3090 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig)); 3091 } 3092 mutex_destroy(&mi->mi_lock); 3093 mutex_destroy(&mi->mi_remap_lock); 3094 mutex_destroy(&mi->mi_async_lock); 3095 cv_destroy(&mi->mi_failover_cv); 3096 cv_destroy(&mi->mi_async_work_cv); 3097 cv_destroy(&mi->mi_async_reqs_cv); 3098 cv_destroy(&mi->mi_async_cv); 3099 zone_rele(mi->mi_zone); 3100 kmem_free(mi, sizeof (*mi)); 3101 } 3102 3103 static int 3104 mnt_kstat_update(kstat_t *ksp, int rw) 3105 { 3106 mntinfo_t *mi; 3107 struct mntinfo_kstat *mik; 3108 vfs_t *vfsp; 3109 int i; 3110 3111 /* this is a read-only kstat. Bail out on a write */ 3112 if (rw == KSTAT_WRITE) 3113 return (EACCES); 3114 3115 /* 3116 * We don't want to wait here as kstat_chain_lock could be held by 3117 * dounmount(). dounmount() takes vfs_reflock before the chain lock 3118 * and thus could lead to a deadlock. 
3119 */ 3120 vfsp = (struct vfs *)ksp->ks_private; 3121 3122 3123 mi = VFTOMI(vfsp); 3124 3125 mik = (struct mntinfo_kstat *)ksp->ks_data; 3126 3127 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 3128 mik->mik_vers = (uint32_t)mi->mi_vers; 3129 mik->mik_flags = mi->mi_flags; 3130 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod; 3131 mik->mik_curread = (uint32_t)mi->mi_curread; 3132 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 3133 mik->mik_retrans = mi->mi_retrans; 3134 mik->mik_timeo = mi->mi_timeo; 3135 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 3136 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 3137 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 3138 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 3139 for (i = 0; i < NFS_CALLTYPES + 1; i++) { 3140 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt; 3141 mik->mik_timers[i].deviate = 3142 (uint32_t)mi->mi_timers[i].rt_deviate; 3143 mik->mik_timers[i].rtxcur = 3144 (uint32_t)mi->mi_timers[i].rt_rtxcur; 3145 } 3146 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 3147 mik->mik_failover = (uint32_t)mi->mi_failover; 3148 mik->mik_remap = (uint32_t)mi->mi_remap; 3149 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 3150 3151 return (0); 3152 } 3153 3154 void 3155 nfs_mnt_kstat_init(struct vfs *vfsp) 3156 { 3157 mntinfo_t *mi = VFTOMI(vfsp); 3158 3159 /* 3160 * Create the version specific kstats. 3161 * 3162 * PSARC 2001/697 Contract Private Interface 3163 * All nfs kstats are under SunMC contract 3164 * Please refer to the PSARC listed above and contact 3165 * SunMC before making any changes! 
3166 * 3167 * Changes must be reviewed by Solaris File Sharing 3168 * Changes must be communicated to contract-2001-697@sun.com 3169 * 3170 */ 3171 3172 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 3173 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 3174 if (mi->mi_io_kstats) { 3175 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3176 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 3177 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 3178 kstat_install(mi->mi_io_kstats); 3179 } 3180 3181 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 3182 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 3183 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 3184 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3185 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 3186 mi->mi_ro_kstats->ks_update = mnt_kstat_update; 3187 mi->mi_ro_kstats->ks_private = (void *)vfsp; 3188 kstat_install(mi->mi_ro_kstats); 3189 } 3190 } 3191 3192 nfs_delmapcall_t * 3193 nfs_init_delmapcall() 3194 { 3195 nfs_delmapcall_t *delmap_call; 3196 3197 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP); 3198 delmap_call->call_id = curthread; 3199 delmap_call->error = 0; 3200 3201 return (delmap_call); 3202 } 3203 3204 void 3205 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call) 3206 { 3207 kmem_free(delmap_call, sizeof (nfs_delmapcall_t)); 3208 } 3209 3210 /* 3211 * Searches for the current delmap caller (based on curthread) in the list of 3212 * callers. If it is found, we remove it and free the delmap caller. 3213 * Returns: 3214 * 0 if the caller wasn't found 3215 * 1 if the caller was found, removed and freed. *errp is set to what 3216 * the result of the delmap was. 3217 */ 3218 int 3219 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp) 3220 { 3221 nfs_delmapcall_t *delmap_call; 3222 3223 /* 3224 * If the list doesn't exist yet, we create it and return 3225 * that the caller wasn't found. No list = no callers. 
3226 */ 3227 mutex_enter(&rp->r_statelock); 3228 if (!(rp->r_flags & RDELMAPLIST)) { 3229 /* The list does not exist */ 3230 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t), 3231 offsetof(nfs_delmapcall_t, call_node)); 3232 rp->r_flags |= RDELMAPLIST; 3233 mutex_exit(&rp->r_statelock); 3234 return (0); 3235 } else { 3236 /* The list exists so search it */ 3237 for (delmap_call = list_head(&rp->r_indelmap); 3238 delmap_call != NULL; 3239 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 3240 if (delmap_call->call_id == curthread) { 3241 /* current caller is in the list */ 3242 *errp = delmap_call->error; 3243 list_remove(&rp->r_indelmap, delmap_call); 3244 mutex_exit(&rp->r_statelock); 3245 nfs_free_delmapcall(delmap_call); 3246 return (1); 3247 } 3248 } 3249 } 3250 mutex_exit(&rp->r_statelock); 3251 return (0); 3252 } 3253