1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 27 * All rights reserved. 
28 */ 29 30 #pragma ident "%Z%%M% %I% %E% SMI" 31 32 #include <sys/param.h> 33 #include <sys/types.h> 34 #include <sys/systm.h> 35 #include <sys/thread.h> 36 #include <sys/t_lock.h> 37 #include <sys/time.h> 38 #include <sys/vnode.h> 39 #include <sys/vfs.h> 40 #include <sys/errno.h> 41 #include <sys/buf.h> 42 #include <sys/stat.h> 43 #include <sys/cred.h> 44 #include <sys/kmem.h> 45 #include <sys/debug.h> 46 #include <sys/dnlc.h> 47 #include <sys/vmsystm.h> 48 #include <sys/flock.h> 49 #include <sys/share.h> 50 #include <sys/cmn_err.h> 51 #include <sys/tiuser.h> 52 #include <sys/sysmacros.h> 53 #include <sys/callb.h> 54 #include <sys/acl.h> 55 #include <sys/kstat.h> 56 #include <sys/signal.h> 57 #include <sys/list.h> 58 #include <sys/zone.h> 59 60 #include <rpc/types.h> 61 #include <rpc/xdr.h> 62 #include <rpc/auth.h> 63 #include <rpc/clnt.h> 64 65 #include <nfs/nfs.h> 66 #include <nfs/nfs_clnt.h> 67 68 #include <nfs/rnode.h> 69 #include <nfs/nfs_acl.h> 70 #include <nfs/lm.h> 71 72 #include <vm/hat.h> 73 #include <vm/as.h> 74 #include <vm/page.h> 75 #include <vm/pvn.h> 76 #include <vm/seg.h> 77 #include <vm/seg_map.h> 78 #include <vm/seg_vn.h> 79 80 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t, 81 cred_t *); 82 static int nfs_getattr_cache(vnode_t *, struct vattr *); 83 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *); 84 85 struct mi_globals { 86 kmutex_t mig_lock; /* lock protecting mig_list */ 87 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */ 88 boolean_t mig_destructor_called; 89 }; 90 91 static zone_key_t mi_list_key; 92 93 /* Debugging flag for PC file shares. */ 94 extern int share_debug; 95 96 /* 97 * Attributes caching: 98 * 99 * Attributes are cached in the rnode in struct vattr form. 100 * There is a time associated with the cached attributes (r_attrtime) 101 * which tells whether the attributes are valid. 
The time is initialized 102 * to the difference between current time and the modify time of the vnode 103 * when new attributes are cached. This allows the attributes for 104 * files that have changed recently to be timed out sooner than for files 105 * that have not changed for a long time. There are minimum and maximum 106 * timeout values that can be set per mount point. 107 */ 108 109 int 110 nfs_waitfor_purge_complete(vnode_t *vp) 111 { 112 rnode_t *rp; 113 k_sigset_t smask; 114 115 rp = VTOR(vp); 116 if (rp->r_serial != NULL && rp->r_serial != curthread) { 117 mutex_enter(&rp->r_statelock); 118 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT); 119 while (rp->r_serial != NULL) { 120 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 121 sigunintr(&smask); 122 mutex_exit(&rp->r_statelock); 123 return (EINTR); 124 } 125 } 126 sigunintr(&smask); 127 mutex_exit(&rp->r_statelock); 128 } 129 return (0); 130 } 131 132 /* 133 * Validate caches by checking cached attributes. If the cached 134 * attributes have timed out, then get new attributes from the server. 135 * As a side affect, this will do cache invalidation if the attributes 136 * have changed. 137 * 138 * If the attributes have not timed out and if there is a cache 139 * invalidation being done by some other thread, then wait until that 140 * thread has completed the cache invalidation. 141 */ 142 int 143 nfs_validate_caches(vnode_t *vp, cred_t *cr) 144 { 145 int error; 146 struct vattr va; 147 148 if (ATTRCACHE_VALID(vp)) { 149 error = nfs_waitfor_purge_complete(vp); 150 if (error) 151 return (error); 152 return (0); 153 } 154 155 va.va_mask = AT_ALL; 156 return (nfs_getattr_otw(vp, &va, cr)); 157 } 158 159 /* 160 * Validate caches by checking cached attributes. If the cached 161 * attributes have timed out, then get new attributes from the server. 162 * As a side affect, this will do cache invalidation if the attributes 163 * have changed. 
164 * 165 * If the attributes have not timed out and if there is a cache 166 * invalidation being done by some other thread, then wait until that 167 * thread has completed the cache invalidation. 168 */ 169 int 170 nfs3_validate_caches(vnode_t *vp, cred_t *cr) 171 { 172 int error; 173 struct vattr va; 174 175 if (ATTRCACHE_VALID(vp)) { 176 error = nfs_waitfor_purge_complete(vp); 177 if (error) 178 return (error); 179 return (0); 180 } 181 182 va.va_mask = AT_ALL; 183 return (nfs3_getattr_otw(vp, &va, cr)); 184 } 185 186 /* 187 * Purge all of the various NFS `data' caches. 188 */ 189 void 190 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr) 191 { 192 rnode_t *rp; 193 char *contents; 194 int size; 195 int error; 196 197 /* 198 * Purge the DNLC for any entries which refer to this file. 199 * Avoid recursive entry into dnlc_purge_vp() in case of a directory. 200 */ 201 rp = VTOR(vp); 202 mutex_enter(&rp->r_statelock); 203 if (vp->v_count > 1 && 204 (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) && 205 !(rp->r_flags & RINDNLCPURGE)) { 206 /* 207 * Set the RINDNLCPURGE flag to prevent recursive entry 208 * into dnlc_purge_vp() 209 */ 210 if (vp->v_type == VDIR) 211 rp->r_flags |= RINDNLCPURGE; 212 mutex_exit(&rp->r_statelock); 213 dnlc_purge_vp(vp); 214 mutex_enter(&rp->r_statelock); 215 if (rp->r_flags & RINDNLCPURGE) 216 rp->r_flags &= ~RINDNLCPURGE; 217 } 218 219 /* 220 * Clear any readdir state bits and purge the readlink response cache. 221 */ 222 contents = rp->r_symlink.contents; 223 size = rp->r_symlink.size; 224 rp->r_symlink.contents = NULL; 225 mutex_exit(&rp->r_statelock); 226 227 if (contents != NULL) { 228 229 kmem_free((void *)contents, size); 230 } 231 232 /* 233 * Flush the page cache. 
234 */ 235 if (vn_has_cached_data(vp)) { 236 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr); 237 if (error && (error == ENOSPC || error == EDQUOT)) { 238 mutex_enter(&rp->r_statelock); 239 if (!rp->r_error) 240 rp->r_error = error; 241 mutex_exit(&rp->r_statelock); 242 } 243 } 244 245 /* 246 * Flush the readdir response cache. 247 */ 248 if (HAVE_RDDIR_CACHE(rp)) 249 nfs_purge_rddir_cache(vp); 250 } 251 252 /* 253 * Purge the readdir cache of all entries 254 */ 255 void 256 nfs_purge_rddir_cache(vnode_t *vp) 257 { 258 rnode_t *rp; 259 rddir_cache *rdc; 260 rddir_cache *nrdc; 261 262 rp = VTOR(vp); 263 top: 264 mutex_enter(&rp->r_statelock); 265 rp->r_direof = NULL; 266 rp->r_flags &= ~RLOOKUP; 267 rp->r_flags |= RREADDIRPLUS; 268 rdc = avl_first(&rp->r_dir); 269 while (rdc != NULL) { 270 nrdc = AVL_NEXT(&rp->r_dir, rdc); 271 avl_remove(&rp->r_dir, rdc); 272 rddir_cache_rele(rdc); 273 rdc = nrdc; 274 } 275 mutex_exit(&rp->r_statelock); 276 } 277 278 /* 279 * Do a cache check based on the post-operation attributes. 280 * Then make them the new cached attributes. If no attributes 281 * were returned, then mark the attributes as timed out. 282 */ 283 void 284 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr) 285 { 286 vattr_t attr; 287 288 if (!poap->attributes) { 289 PURGE_ATTRCACHE(vp); 290 return; 291 } 292 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr); 293 } 294 295 /* 296 * Same as above, but using a vattr 297 */ 298 void 299 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t, 300 cred_t *cr) 301 { 302 if (!poap->attributes) { 303 PURGE_ATTRCACHE(vp); 304 return; 305 } 306 nfs_attr_cache(vp, poap->fres.vap, t, cr); 307 } 308 309 /* 310 * Do a cache check based on the weak cache consistency attributes. 311 * These consist of a small set of pre-operation attributes and the 312 * full set of post-operation attributes. 
 *
 * If we are given the pre-operation attributes, then use them to
 * check the validity of the various caches.  Then, if we got the
 * post-operation attributes, make them the new cached attributes.
 * If we didn't get the post-operation attributes, then mark the
 * attribute cache as timed out so that the next reference will
 * cause a GETATTR to the server to refresh with the current
 * attributes.
 *
 * Otherwise, if we didn't get the pre-operation attributes, but
 * we did get the post-operation attributes, then use these
 * attributes to check the validity of the various caches.  This
 * will probably cause a flush of the caches because if the
 * operation succeeded, the attributes of the object were changed
 * in some way from the old post-operation attributes.  This
 * should be okay because it is the safe thing to do.  After
 * checking the data caches, then we make these the new cached
 * attributes.
 *
 * Otherwise, we didn't get either the pre- or post-operation
 * attributes.  Simply mark the attribute cache as timed out so
 * the next reference will cause a GETATTR to the server to
 * refresh with the current attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
340 */ 341 void 342 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr) 343 { 344 vattr_t bva; 345 vattr_t ava; 346 347 if (wccp->after.attributes) { 348 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) { 349 PURGE_ATTRCACHE(vp); 350 return; 351 } 352 if (wccp->before.attributes) { 353 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds; 354 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds; 355 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds; 356 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds; 357 bva.va_size = wccp->before.attr.size; 358 nfs3_attr_cache(vp, &bva, &ava, t, cr); 359 } else 360 nfs_attr_cache(vp, &ava, t, cr); 361 } else { 362 PURGE_ATTRCACHE(vp); 363 } 364 } 365 366 /* 367 * Set attributes cache for given vnode using nfsattr. 368 * 369 * This routine does not do cache validation with the attributes. 370 * 371 * If an error occurred trying to convert the over the wire 372 * attributes to a vattr, then simply mark the attribute cache as 373 * timed out. 374 */ 375 void 376 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t) 377 { 378 rnode_t *rp; 379 struct vattr va; 380 381 if (!nattr_to_vattr(vp, na, &va)) { 382 rp = VTOR(vp); 383 mutex_enter(&rp->r_statelock); 384 if (rp->r_mtime <= t) 385 nfs_attrcache_va(vp, &va); 386 mutex_exit(&rp->r_statelock); 387 } else { 388 PURGE_ATTRCACHE(vp); 389 } 390 } 391 392 /* 393 * Set attributes cache for given vnode using fattr3. 394 * 395 * This routine does not do cache validation with the attributes. 396 * 397 * If an error occurred trying to convert the over the wire 398 * attributes to a vattr, then simply mark the attribute cache as 399 * timed out. 
400 */ 401 void 402 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t) 403 { 404 rnode_t *rp; 405 struct vattr va; 406 407 if (!fattr3_to_vattr(vp, na, &va)) { 408 rp = VTOR(vp); 409 mutex_enter(&rp->r_statelock); 410 if (rp->r_mtime <= t) 411 nfs_attrcache_va(vp, &va); 412 mutex_exit(&rp->r_statelock); 413 } else { 414 PURGE_ATTRCACHE(vp); 415 } 416 } 417 418 /* 419 * Do a cache check based on attributes returned over the wire. The 420 * new attributes are cached. 421 * 422 * If an error occurred trying to convert the over the wire attributes 423 * to a vattr, then just return that error. 424 * 425 * As a side affect, the vattr argument is filled in with the converted 426 * attributes. 427 */ 428 int 429 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t, 430 cred_t *cr) 431 { 432 int error; 433 434 error = nattr_to_vattr(vp, na, vap); 435 if (error) 436 return (error); 437 nfs_attr_cache(vp, vap, t, cr); 438 return (0); 439 } 440 441 /* 442 * Do a cache check based on attributes returned over the wire. The 443 * new attributes are cached. 444 * 445 * If an error occurred trying to convert the over the wire attributes 446 * to a vattr, then just return that error. 447 * 448 * As a side affect, the vattr argument is filled in with the converted 449 * attributes. 450 */ 451 int 452 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr) 453 { 454 int error; 455 456 error = fattr3_to_vattr(vp, na, vap); 457 if (error) 458 return (error); 459 nfs_attr_cache(vp, vap, t, cr); 460 return (0); 461 } 462 463 /* 464 * Use the passed in virtual attributes to check to see whether the 465 * data and metadata caches are valid, cache the new attributes, and 466 * then do the cache invalidation if required. 467 * 468 * The cache validation and caching of the new attributes is done 469 * atomically via the use of the mutex, r_statelock. If required, 470 * the cache invalidation is done atomically w.r.t. 
the cache 471 * validation and caching of the attributes via the pseudo lock, 472 * r_serial. 473 * 474 * This routine is used to do cache validation and attributes caching 475 * for operations with a single set of post operation attributes. 476 */ 477 void 478 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr) 479 { 480 rnode_t *rp; 481 int mtime_changed; 482 int ctime_changed; 483 vsecattr_t *vsp; 484 int was_serial; 485 486 rp = VTOR(vp); 487 488 mutex_enter(&rp->r_statelock); 489 490 if (rp->r_serial != curthread) { 491 klwp_t *lwp = ttolwp(curthread); 492 493 was_serial = 0; 494 if (lwp != NULL) 495 lwp->lwp_nostop++; 496 while (rp->r_serial != NULL) { 497 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 498 mutex_exit(&rp->r_statelock); 499 if (lwp != NULL) 500 lwp->lwp_nostop--; 501 return; 502 } 503 } 504 if (lwp != NULL) 505 lwp->lwp_nostop--; 506 } else 507 was_serial = 1; 508 509 if (rp->r_mtime > t) { 510 mutex_exit(&rp->r_statelock); 511 return; 512 } 513 514 if (!(rp->r_flags & RWRITEATTR)) { 515 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size)) 516 mtime_changed = 1; 517 else 518 mtime_changed = 0; 519 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec || 520 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec) 521 ctime_changed = 1; 522 else 523 ctime_changed = 0; 524 } else if (rp->r_size != vap->va_size && 525 (!vn_has_cached_data(vp) || 526 (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) { 527 mtime_changed = 1; 528 ctime_changed = 0; 529 } else { 530 mtime_changed = 0; 531 ctime_changed = 0; 532 } 533 534 nfs_attrcache_va(vp, vap); 535 536 if (!mtime_changed && !ctime_changed) { 537 mutex_exit(&rp->r_statelock); 538 return; 539 } 540 541 rp->r_serial = curthread; 542 543 mutex_exit(&rp->r_statelock); 544 545 if (mtime_changed) 546 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 547 548 if (ctime_changed) { 549 (void) nfs_access_purge_rp(rp); 550 if (rp->r_secattr != NULL) { 551 mutex_enter(&rp->r_statelock); 552 vsp = 
rp->r_secattr; 553 rp->r_secattr = NULL; 554 mutex_exit(&rp->r_statelock); 555 if (vsp != NULL) 556 nfs_acl_free(vsp); 557 } 558 } 559 560 if (!was_serial) { 561 mutex_enter(&rp->r_statelock); 562 rp->r_serial = NULL; 563 cv_broadcast(&rp->r_cv); 564 mutex_exit(&rp->r_statelock); 565 } 566 } 567 568 /* 569 * Use the passed in "before" virtual attributes to check to see 570 * whether the data and metadata caches are valid, cache the "after" 571 * new attributes, and then do the cache invalidation if required. 572 * 573 * The cache validation and caching of the new attributes is done 574 * atomically via the use of the mutex, r_statelock. If required, 575 * the cache invalidation is done atomically w.r.t. the cache 576 * validation and caching of the attributes via the pseudo lock, 577 * r_serial. 578 * 579 * This routine is used to do cache validation and attributes caching 580 * for operations with both pre operation attributes and post operation 581 * attributes. 582 */ 583 static void 584 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t, 585 cred_t *cr) 586 { 587 rnode_t *rp; 588 int mtime_changed; 589 int ctime_changed; 590 vsecattr_t *vsp; 591 int was_serial; 592 593 rp = VTOR(vp); 594 595 mutex_enter(&rp->r_statelock); 596 597 if (rp->r_serial != curthread) { 598 klwp_t *lwp = ttolwp(curthread); 599 600 was_serial = 0; 601 if (lwp != NULL) 602 lwp->lwp_nostop++; 603 while (rp->r_serial != NULL) { 604 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 605 mutex_exit(&rp->r_statelock); 606 if (lwp != NULL) 607 lwp->lwp_nostop--; 608 return; 609 } 610 } 611 if (lwp != NULL) 612 lwp->lwp_nostop--; 613 } else 614 was_serial = 1; 615 616 if (rp->r_mtime > t) { 617 mutex_exit(&rp->r_statelock); 618 return; 619 } 620 621 if (!(rp->r_flags & RWRITEATTR)) { 622 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size)) 623 mtime_changed = 1; 624 else 625 mtime_changed = 0; 626 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec || 627 
rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec) 628 ctime_changed = 1; 629 else 630 ctime_changed = 0; 631 } else { 632 mtime_changed = 0; 633 ctime_changed = 0; 634 } 635 636 nfs_attrcache_va(vp, avap); 637 638 if (!mtime_changed && !ctime_changed) { 639 mutex_exit(&rp->r_statelock); 640 return; 641 } 642 643 rp->r_serial = curthread; 644 645 mutex_exit(&rp->r_statelock); 646 647 if (mtime_changed) 648 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr); 649 650 if (ctime_changed) { 651 (void) nfs_access_purge_rp(rp); 652 if (rp->r_secattr != NULL) { 653 mutex_enter(&rp->r_statelock); 654 vsp = rp->r_secattr; 655 rp->r_secattr = NULL; 656 mutex_exit(&rp->r_statelock); 657 if (vsp != NULL) 658 nfs_acl_free(vsp); 659 } 660 } 661 662 if (!was_serial) { 663 mutex_enter(&rp->r_statelock); 664 rp->r_serial = NULL; 665 cv_broadcast(&rp->r_cv); 666 mutex_exit(&rp->r_statelock); 667 } 668 } 669 670 /* 671 * Set attributes cache for given vnode using virtual attributes. 672 * 673 * Set the timeout value on the attribute cache and fill it 674 * with the passed in attributes. 675 * 676 * The caller must be holding r_statelock. 677 */ 678 void 679 nfs_attrcache_va(vnode_t *vp, struct vattr *va) 680 { 681 rnode_t *rp; 682 mntinfo_t *mi; 683 hrtime_t delta; 684 hrtime_t now; 685 686 rp = VTOR(vp); 687 688 ASSERT(MUTEX_HELD(&rp->r_statelock)); 689 690 now = gethrtime(); 691 692 mi = VTOMI(vp); 693 694 /* 695 * Delta is the number of nanoseconds that we will 696 * cache the attributes of the file. It is based on 697 * the number of nanoseconds since the last time that 698 * we detected a change. The assumption is that files 699 * that changed recently are likely to change again. 700 * There is a minimum and a maximum for regular files 701 * and for directories which is enforced though. 702 * 703 * Using the time since last change was detected 704 * eliminates direct comparison or calculation 705 * using mixed client and server times. 
NFS does 706 * not make any assumptions regarding the client 707 * and server clocks being synchronized. 708 */ 709 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec || 710 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec || 711 va->va_size != rp->r_attr.va_size) 712 rp->r_mtime = now; 713 714 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE)) 715 delta = 0; 716 else { 717 delta = now - rp->r_mtime; 718 if (vp->v_type == VDIR) { 719 if (delta < mi->mi_acdirmin) 720 delta = mi->mi_acdirmin; 721 else if (delta > mi->mi_acdirmax) 722 delta = mi->mi_acdirmax; 723 } else { 724 if (delta < mi->mi_acregmin) 725 delta = mi->mi_acregmin; 726 else if (delta > mi->mi_acregmax) 727 delta = mi->mi_acregmax; 728 } 729 } 730 rp->r_attrtime = now + delta; 731 rp->r_attr = *va; 732 /* 733 * Update the size of the file if there is no cached data or if 734 * the cached data is clean and there is no data being written 735 * out. 736 */ 737 if (rp->r_size != va->va_size && 738 (!vn_has_cached_data(vp) || 739 (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) 740 rp->r_size = va->va_size; 741 nfs_setswaplike(vp, va); 742 rp->r_flags &= ~RWRITEATTR; 743 } 744 745 /* 746 * Fill in attribute from the cache. 747 * If valid, then return 0 to indicate that no error occurred, 748 * otherwise return 1 to indicate that an error occurred. 749 */ 750 static int 751 nfs_getattr_cache(vnode_t *vp, struct vattr *vap) 752 { 753 rnode_t *rp; 754 755 rp = VTOR(vp); 756 mutex_enter(&rp->r_statelock); 757 if (ATTRCACHE_VALID(vp)) { 758 /* 759 * Cached attributes are valid 760 */ 761 *vap = rp->r_attr; 762 mutex_exit(&rp->r_statelock); 763 return (0); 764 } 765 mutex_exit(&rp->r_statelock); 766 return (1); 767 } 768 769 /* 770 * Get attributes over-the-wire and update attributes cache 771 * if no error occurred in the over-the-wire operation. 772 * Return 0 if successful, otherwise error. 
773 */ 774 int 775 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 776 { 777 int error; 778 struct nfsattrstat ns; 779 int douprintf; 780 mntinfo_t *mi; 781 failinfo_t fi; 782 hrtime_t t; 783 784 mi = VTOMI(vp); 785 fi.vp = vp; 786 fi.fhp = NULL; /* no need to update, filehandle not copied */ 787 fi.copyproc = nfscopyfh; 788 fi.lookupproc = nfslookup; 789 fi.xattrdirproc = acl_getxattrdir2; 790 791 if (mi->mi_flags & MI_ACL) { 792 error = acl_getattr2_otw(vp, vap, cr); 793 if (mi->mi_flags & MI_ACL) 794 return (error); 795 } 796 797 douprintf = 1; 798 799 t = gethrtime(); 800 801 error = rfs2call(mi, RFS_GETATTR, 802 xdr_fhandle, (caddr_t)VTOFH(vp), 803 xdr_attrstat, (caddr_t)&ns, cr, 804 &douprintf, &ns.ns_status, 0, &fi); 805 806 if (!error) { 807 error = geterrno(ns.ns_status); 808 if (!error) 809 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr); 810 else { 811 PURGE_STALE_FH(error, vp, cr); 812 } 813 } 814 815 return (error); 816 } 817 818 /* 819 * Return either cached ot remote attributes. If get remote attr 820 * use them to check and invalidate caches, then cache the new attributes. 821 */ 822 int 823 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 824 { 825 int error; 826 rnode_t *rp; 827 828 /* 829 * If we've got cached attributes, we're done, otherwise go 830 * to the server to get attributes, which will update the cache 831 * in the process. 832 */ 833 error = nfs_getattr_cache(vp, vap); 834 if (error) 835 error = nfs_getattr_otw(vp, vap, cr); 836 837 /* Return the client's view of file size */ 838 rp = VTOR(vp); 839 mutex_enter(&rp->r_statelock); 840 vap->va_size = rp->r_size; 841 mutex_exit(&rp->r_statelock); 842 843 return (error); 844 } 845 846 /* 847 * Get attributes over-the-wire and update attributes cache 848 * if no error occurred in the over-the-wire operation. 849 * Return 0 if successful, otherwise error. 
850 */ 851 int 852 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr) 853 { 854 int error; 855 GETATTR3args args; 856 GETATTR3vres res; 857 int douprintf; 858 failinfo_t fi; 859 hrtime_t t; 860 861 args.object = *VTOFH3(vp); 862 fi.vp = vp; 863 fi.fhp = (caddr_t)&args.object; 864 fi.copyproc = nfs3copyfh; 865 fi.lookupproc = nfs3lookup; 866 fi.xattrdirproc = acl_getxattrdir3; 867 res.fres.vp = vp; 868 res.fres.vap = vap; 869 870 douprintf = 1; 871 872 t = gethrtime(); 873 874 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR, 875 xdr_nfs_fh3, (caddr_t)&args, 876 xdr_GETATTR3vres, (caddr_t)&res, cr, 877 &douprintf, &res.status, 0, &fi); 878 879 if (error) 880 return (error); 881 882 error = geterrno3(res.status); 883 if (error) { 884 PURGE_STALE_FH(error, vp, cr); 885 return (error); 886 } 887 888 /* 889 * Catch status codes that indicate fattr3 to vattr translation failure 890 */ 891 if (res.fres.status) 892 return (res.fres.status); 893 894 nfs_attr_cache(vp, vap, t, cr); 895 return (0); 896 } 897 898 /* 899 * Return either cached or remote attributes. If get remote attr 900 * use them to check and invalidate caches, then cache the new attributes. 901 */ 902 int 903 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr) 904 { 905 int error; 906 rnode_t *rp; 907 908 /* 909 * If we've got cached attributes, we're done, otherwise go 910 * to the server to get attributes, which will update the cache 911 * in the process. 912 */ 913 error = nfs_getattr_cache(vp, vap); 914 if (error) 915 error = nfs3_getattr_otw(vp, vap, cr); 916 917 /* Return the client's view of file size */ 918 rp = VTOR(vp); 919 mutex_enter(&rp->r_statelock); 920 vap->va_size = rp->r_size; 921 mutex_exit(&rp->r_statelock); 922 923 return (error); 924 } 925 926 vtype_t nf_to_vt[] = { 927 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK 928 }; 929 /* 930 * Convert NFS Version 2 over the network attributes to the local 931 * virtual attributes. 
The mapping between the UID_NOBODY/GID_NOBODY 932 * network representation and the local representation is done here. 933 * Returns 0 for success, error if failed due to overflow. 934 */ 935 int 936 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap) 937 { 938 /* overflow in time attributes? */ 939 #ifndef _LP64 940 if (!NFS2_FATTR_TIME_OK(na)) 941 return (EOVERFLOW); 942 #endif 943 944 if (na->na_type < NFNON || na->na_type > NFSOC) 945 vap->va_type = VBAD; 946 else 947 vap->va_type = nf_to_vt[na->na_type]; 948 vap->va_mode = na->na_mode; 949 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid; 950 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid; 951 vap->va_fsid = vp->v_vfsp->vfs_dev; 952 vap->va_nodeid = na->na_nodeid; 953 vap->va_nlink = na->na_nlink; 954 vap->va_size = na->na_size; /* keep for cache validation */ 955 /* 956 * nfs protocol defines times as unsigned so don't extend sign, 957 * unless sysadmin set nfs_allow_preepoch_time. 958 */ 959 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec); 960 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000); 961 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec); 962 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000); 963 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec); 964 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000); 965 /* 966 * Shannon's law - uncompress the received dev_t 967 * if the top half of is zero indicating a response 968 * from an `older style' OS. Except for when it is a 969 * `new style' OS sending the maj device of zero, 970 * in which case the algorithm still works because the 971 * fact that it is a new style server 972 * is hidden by the minor device not being greater 973 * than 255 (a requirement in this case). 
974 */ 975 if ((na->na_rdev & 0xffff0000) == 0) 976 vap->va_rdev = nfsv2_expdev(na->na_rdev); 977 else 978 vap->va_rdev = expldev(na->na_rdev); 979 980 vap->va_nblocks = na->na_blocks; 981 switch (na->na_type) { 982 case NFBLK: 983 vap->va_blksize = DEV_BSIZE; 984 break; 985 986 case NFCHR: 987 vap->va_blksize = MAXBSIZE; 988 break; 989 990 case NFSOC: 991 default: 992 vap->va_blksize = na->na_blocksize; 993 break; 994 } 995 /* 996 * This bit of ugliness is a hack to preserve the 997 * over-the-wire protocols for named-pipe vnodes. 998 * It remaps the special over-the-wire type to the 999 * VFIFO type. (see note in nfs.h) 1000 */ 1001 if (NA_ISFIFO(na)) { 1002 vap->va_type = VFIFO; 1003 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO; 1004 vap->va_rdev = 0; 1005 vap->va_blksize = na->na_blocksize; 1006 } 1007 vap->va_seq = 0; 1008 return (0); 1009 } 1010 1011 /* 1012 * Convert NFS Version 3 over the network attributes to the local 1013 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY 1014 * network representation and the local representation is done here. 1015 */ 1016 vtype_t nf3_to_vt[] = { 1017 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO 1018 }; 1019 1020 int 1021 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap) 1022 { 1023 1024 #ifndef _LP64 1025 /* overflow in time attributes? */ 1026 if (!NFS3_FATTR_TIME_OK(na)) 1027 return (EOVERFLOW); 1028 #endif 1029 if (!NFS3_SIZE_OK(na->size)) 1030 /* file too big */ 1031 return (EFBIG); 1032 1033 vap->va_mask = AT_ALL; 1034 1035 if (na->type < NF3REG || na->type > NF3FIFO) 1036 vap->va_type = VBAD; 1037 else 1038 vap->va_type = nf3_to_vt[na->type]; 1039 vap->va_mode = na->mode; 1040 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid; 1041 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? 
GID_NOBODY : (gid_t)na->gid; 1042 vap->va_fsid = vp->v_vfsp->vfs_dev; 1043 vap->va_nodeid = na->fileid; 1044 vap->va_nlink = na->nlink; 1045 vap->va_size = na->size; 1046 1047 /* 1048 * nfs protocol defines times as unsigned so don't extend sign, 1049 * unless sysadmin set nfs_allow_preepoch_time. 1050 */ 1051 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds); 1052 vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds; 1053 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds); 1054 vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds; 1055 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds); 1056 vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds; 1057 1058 switch (na->type) { 1059 case NF3BLK: 1060 vap->va_rdev = makedevice(na->rdev.specdata1, 1061 na->rdev.specdata2); 1062 vap->va_blksize = DEV_BSIZE; 1063 vap->va_nblocks = 0; 1064 break; 1065 case NF3CHR: 1066 vap->va_rdev = makedevice(na->rdev.specdata1, 1067 na->rdev.specdata2); 1068 vap->va_blksize = MAXBSIZE; 1069 vap->va_nblocks = 0; 1070 break; 1071 case NF3REG: 1072 case NF3DIR: 1073 case NF3LNK: 1074 vap->va_rdev = 0; 1075 vap->va_blksize = MAXBSIZE; 1076 vap->va_nblocks = (u_longlong_t) 1077 ((na->used + (size3)DEV_BSIZE - (size3)1) / 1078 (size3)DEV_BSIZE); 1079 break; 1080 case NF3SOCK: 1081 case NF3FIFO: 1082 default: 1083 vap->va_rdev = 0; 1084 vap->va_blksize = MAXBSIZE; 1085 vap->va_nblocks = 0; 1086 break; 1087 } 1088 vap->va_seq = 0; 1089 return (0); 1090 } 1091 1092 /* 1093 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1094 * for the demand-based allocation of async threads per-mount. The 1095 * nfs_async_timeout is the amount of time a thread will live after it 1096 * becomes idle, unless new I/O requests are received before the thread 1097 * dies. See nfs_async_putpage and nfs_async_start. 
1098 */ 1099 1100 int nfs_async_timeout = -1; /* uninitialized */ 1101 1102 static void nfs_async_start(struct vfs *); 1103 1104 static void 1105 free_async_args(struct nfs_async_reqs *args) 1106 { 1107 rnode_t *rp; 1108 1109 if (args->a_io != NFS_INACTIVE) { 1110 rp = VTOR(args->a_vp); 1111 mutex_enter(&rp->r_statelock); 1112 rp->r_count--; 1113 if (args->a_io == NFS_PUTAPAGE || 1114 args->a_io == NFS_PAGEIO) 1115 rp->r_awcount--; 1116 cv_broadcast(&rp->r_cv); 1117 mutex_exit(&rp->r_statelock); 1118 VN_RELE(args->a_vp); 1119 } 1120 crfree(args->a_cred); 1121 kmem_free(args, sizeof (*args)); 1122 } 1123 1124 /* 1125 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and 1126 * pageout(), running in the global zone, have legitimate reasons to do 1127 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by 1128 * use of a a per-mount "asynchronous requests manager thread" which is 1129 * signaled by the various asynchronous work routines when there is 1130 * asynchronous work to be done. It is responsible for creating new 1131 * worker threads if necessary, and notifying existing worker threads 1132 * that there is work to be done. 1133 * 1134 * In other words, it will "take the specifications from the customers and 1135 * give them to the engineers." 1136 * 1137 * Worker threads die off of their own accord if they are no longer 1138 * needed. 1139 * 1140 * This thread is killed when the zone is going away or the filesystem 1141 * is being unmounted. 
 */
void
nfs_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo_t *mi;
	uint_t max_threads;

	mi = VFTOMI(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	mutex_enter(&mi->mi_lock);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount/zone is really going away.
	 *
	 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	while (!(mi->mi_flags & MI_ASYNC_MGR_STOP) ||
	    mi->mi_async_req_count > 0) {
		/*
		 * mi_lock is only held while the loop condition (mi_flags)
		 * is evaluated; it is dropped before sleeping and retaken
		 * at the bottom of the loop.
		 */
		mutex_exit(&mi->mi_lock);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value.  This means that future
			 * (debugger-induced) lowerings of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads <
			    MAX(mi->mi_max_threads, max_threads)) {
				mi->mi_threads++;
				mutex_exit(&mi->mi_async_lock);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0, nfs_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			/* one worker wakeup per counted request */
			cv_signal(&mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	cv_broadcast(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);	/* release thread's hold */
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 *
 * Sets MI_ASYNC_MGR_STOP in mi_flags, wakes the manager through
 * mi_async_reqs_cv and then sleeps on mi_async_cv until the manager has
 * cleared mi_manager_thread.
 */
void
nfs_async_manager_stop(vfs_t *vfsp)
{
	mntinfo_t *mi = VFTOMI(vfsp);

	mutex_enter(&mi->mi_async_lock);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags |= MI_ASYNC_MGR_STOP;
	mutex_exit(&mi->mi_lock);
	cv_broadcast(&mi->mi_async_reqs_cv);
	while (mi->mi_manager_thread != NULL)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
}

/*
 * Queue an asynchronous read-ahead request for vp.
 *
 * The request records blkoff, addr, seg, cr and the readahead callback and
 * is appended to the mount's NFS_READ_AHEAD queue; the manager thread is
 * then signalled.  A vnode hold, a credential hold and an r_count bump are
 * taken on behalf of the request.
 *
 * Returns 0 if the request was queued.  Returns -1, with nothing queued and
 * all holds/counts undone, when addr lies at or beyond the end of seg, when
 * no request structure can be allocated without sleeping, when r_lkserlock
 * cannot be entered as a reader without blocking, or when mi_max_threads
 * is zero.
 */
int
nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
	struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
	u_offset_t, caddr_t, struct seg *, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	rp = VTOR(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI(vp);

	/*
	 * If addr falls in a different segment, don't bother doing readahead.
	 */
	if (addr >= seg->s_base + seg->s_size)
		return (-1);

	/*
	 * If we can't allocate a request structure, punt on the readahead.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		return (-1);

	/*
	 * If a lock operation is pending, don't initiate any new
	 * readaheads.  Otherwise, bump r_count to indicate the new
	 * asynchronous I/O.
	 */
	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
		kmem_free(args, sizeof (*args));
		return (-1);
	}
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	nfs_rw_exit(&rp->r_lkserlock);

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_READ_AHEAD;
	args->a_nfs_readahead = readahead;
	args->a_nfs_blkoff = blkoff;
	args->a_nfs_seg = seg;
	args->a_nfs_addr = addr;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, don't bother readahead.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS_READ_AHEAD] = args;
		mi->mi_async_tail[NFS_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/*
	 * Undo the r_count bump and the vnode/credential holds taken
	 * above, then discard the request.
	 */
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	VN_RELE(vp);
	crfree(cr);
	kmem_free(args, sizeof (*args));
	return (-1);
}

/*
 * Queue an asynchronous putapage request for the page list pp of vp.
 *
 * On success the request is appended to the NFS_PUTAPAGE queue, r_count
 * and r_awcount are bumped, the manager thread is signalled and 0 is
 * returned.
 *
 * If the request cannot be queued (no memory without sleeping, or
 * mi_max_threads is zero) the fallback depends on the caller:
 *   - pageout/fsflush: the pages are handed to pvn_write_done() with
 *     B_ERROR (and with B_INVAL/B_FORCE cleared if B_FORCE was set) and
 *     0 is returned;
 *   - a process whose zone is not the mount's zone: the pages are handed
 *     to pvn_write_done() with B_ERROR and EPERM is returned;
 *   - anyone else: putapage is called directly and its result returned.
 */
int
nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
	int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
	u_offset_t, size_t, int, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_PUTAPAGE;
	args->a_nfs_putapage = putapage;
	args->a_nfs_pp = pp;
	args->a_nfs_off = off;
	args->a_nfs_len = (uint_t)len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS_PUTAPAGE] = args;
		mi->mi_async_tail[NFS_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/* args is NULL when we got here because the allocation failed */
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout (and the machine). In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}
	if (curproc->p_zone != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync putpage.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*putapage)(vp, pp, off, len, flags, cr));
}

/*
 * Queue an asynchronous pageio request for the page list pp of vp.
 *
 * On success the request is appended to the NFS_PAGEIO queue, r_count and
 * r_awcount are bumped, the manager thread is signalled and 0 is returned.
 *
 * If the request cannot be queued (no memory without sleeping, or
 * mi_max_threads is zero):
 *   - for B_READ requests the pages are handed to pvn_read_done() with
 *     B_ERROR and 0 is returned;
 *   - for writes from pageout/fsflush the pages are handed to
 *     pvn_write_done() with B_ERROR (B_INVAL/B_FORCE cleared if B_FORCE
 *     was set) and 0 is returned;
 *   - for writes from a process in a zone other than the mount's, the
 *     pages are handed to pvn_write_done() with B_ERROR and EPERM is
 *     returned;
 *   - otherwise pageio is called directly and its result returned.
 */
int
nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
	size_t, int, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_PAGEIO;
	args->a_nfs_pageio = pageio;
	args->a_nfs_pp = pp;
	args->a_nfs_off = io_off;
	args->a_nfs_len = (uint_t)io_len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS_PAGEIO] = args;
		mi->mi_async_tail[NFS_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS_PAGEIO] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/* args is NULL when we got here because the allocation failed */
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine). In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc->p_zone != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
}

/*
 * Queue an asynchronous readdir request to fill the cache entry rdc.
 *
 * On success the request is appended to the NFS_READDIR queue, r_count is
 * bumped and the manager thread is signalled.
 *
 * If the request cannot be queued (no memory without sleeping, or
 * mi_max_threads is zero) the readdir callback is NOT invoked here.
 * Instead the cache entry is marked as still needing a request: entries is
 * cleared, RDDIR is replaced by RDDIRREQ, any RDDIRWAIT waiter is woken,
 * and the reference on rdc is released.
 */
void
nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
	int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	rp = VTOR(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the readdir
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_READDIR;
	args->a_nfs_readdir = readdir;
	args->a_nfs_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
		mi->mi_async_reqs[NFS_READDIR] = args;
		mi->mi_async_tail[NFS_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS_READDIR]->a_next = args;
		mi->mi_async_tail[NFS_READDIR] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	/* args is NULL when we got here because the allocation failed */
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	rdc->entries = NULL;
	mutex_enter(&rp->r_statelock);
	ASSERT(rdc->flags & RDDIR);
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	/*
	 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
	 * is set, wakeup the thread sleeping in cv_wait_sig().
	 * The woken up thread will reset the flag to RDDIR and will
	 * continue with the readdir operation.
	 */
	if (rdc->flags & RDDIRWAIT) {
		rdc->flags &= ~RDDIRWAIT;
		cv_broadcast(&rdc->cv);
	}
	mutex_exit(&rp->r_statelock);
	rddir_cache_rele(rdc);
}

/*
 * Queue an asynchronous commit request for the page list plist of vp.
 *
 * On success the request is appended to the NFS_COMMIT queue, r_count is
 * bumped and the manager thread is signalled.
 *
 * If the request cannot be queued (no memory without sleeping, or
 * mi_max_threads is zero):
 *   - when called from pageout, fsflush or a process in a zone other than
 *     the mount's, every page is removed from plist, has p_fsdata set to
 *     C_COMMIT and is unlocked; no commit is issued;
 *   - otherwise commit is called directly in this thread.
 */
void
nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
	cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
	cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;
	page_t *pp;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_COMMIT;
	args->a_nfs_commit = commit;
	args->a_nfs_plist = plist;
	args->a_nfs_offset = offset;
	args->a_nfs_count = count;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
		mi->mi_async_reqs[NFS_COMMIT] = args;
		mi->mi_async_tail[NFS_COMMIT] = args;
	} else {
		mi->mi_async_tail[NFS_COMMIT]->a_next = args;
		mi->mi_async_tail[NFS_COMMIT] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	/* args is NULL when we got here because the allocation failed */
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    curproc->p_zone != mi->mi_zone) {
		/*
		 * No commit is issued from these contexts; each page is
		 * tagged C_COMMIT in p_fsdata and unlocked instead.
		 */
		while (plist != NULL) {
			pp = plist;
			page_sub(&plist, pp);
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
		}
		return;
	}
	(*commit)(vp, plist, offset, count, cr);
}

/*
 * Queue an asynchronous inactive request for vp.
 *
 * Unlike the other queueing routines this one allocates with KM_SLEEP,
 * takes no additional hold on vp, does not bump r_count and does not
 * look at mi_max_threads.
 *
 * If the mount no longer has a manager thread, nothing is queued: any
 * pending r_unldvp/r_unlname/r_unlcred state is released and the rnode is
 * passed to rp_addfree().
 */
void
nfs_async_inactive(vnode_t *vp, cred_t *cr,
    void (*inactive)(vnode_t *, cred_t *))
{
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	mi = VTOMI(vp);

	args = kmem_alloc(sizeof (*args), KM_SLEEP);
	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_INACTIVE;
	args->a_nfs_inactive = inactive;

	/*
	 * Note that we don't check mi->mi_max_threads here, since we
	 * *need* to get rid of this vnode regardless of whether someone
	 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
	 *
	 * The manager thread knows about this and is willing to create
	 * at least one thread to accommodate us.
	 */
	mutex_enter(&mi->mi_async_lock);
	if (mi->mi_manager_thread == NULL) {
		rnode_t *rp = VTOR(vp);

		mutex_exit(&mi->mi_async_lock);
		crfree(cr);	/* drop our reference */
		kmem_free(args, sizeof (*args));
		/*
		 * We can't do an over-the-wire call since we're in the wrong
		 * zone, so we need to clean up state as best we can and then
		 * throw away the vnode.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			vnode_t *unldvp;
			char *unlname;
			cred_t *unlcred;

			/*
			 * Detach the fields under r_statelock, then
			 * release them after the lock is dropped.
			 */
			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);

			VN_RELE(unldvp);
			kmem_free(unlname, MAXNAMELEN);
			crfree(unlcred);
		} else {
			mutex_exit(&rp->r_statelock);
		}
		/*
		 * No need to explicitly throw away any cached pages.  The
		 * eventual rinactive() will attempt a synchronous
		 * VOP_PUTPAGE() which will immediately fail since the request
		 * is coming from the wrong zone, and then will proceed to call
		 * nfs_invalidate_pages() which will clean things up for us.
		 */
		rp_addfree(VTOR(vp), cr);
		return;
	}

	if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
		mi->mi_async_reqs[NFS_INACTIVE] = args;
	} else {
		mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
	}
	mi->mi_async_tail[NFS_INACTIVE] = args;
	/*
	 * Don't increment r_count, since we're trying to get rid of the vnode.
	 */

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
}

/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type.  Requests are taken
 * from the queues in a round-robin fashion.  A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.  This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs_async_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system. Specifically over the
 * wire calls are cpr-unsafe. The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
static void
nfs_async_start(struct vfs *vfsp)
{
	struct nfs_async_reqs *args;
	mntinfo_t *mi = VFTOMI(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < NFS_ASYNC_TYPES; i++) {
			args = *mi->mi_async_curr;
			if (args != NULL)
				break;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				if (--mi->mi_threads == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				zthread_exit();
				/* NOTREACHED */
			}
			time_left = cv_timedwait(&mi->mi_async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout + lbolt);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		}
		/* found work: re-arm so the next idle pass waits again */
		time_left = 1;

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr = args->a_next;
		if (*mi->mi_async_curr == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}

		if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 *
		 * A dequeued NFS_READ_AHEAD request is only carried out
		 * while mi_max_threads is non-zero; otherwise it matches
		 * none of the branches below and is simply freed.
		 */
		if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
			    args->a_nfs_addr, args->a_nfs_seg,
			    args->a_cred);
		} else if (args->a_io == NFS_PUTAPAGE) {
			(void) (*args->a_nfs_putapage)(args->a_vp,
			    args->a_nfs_pp, args->a_nfs_off,
			    args->a_nfs_len, args->a_nfs_flags,
			    args->a_cred);
		} else if (args->a_io == NFS_PAGEIO) {
			(void) (*args->a_nfs_pageio)(args->a_vp,
			    args->a_nfs_pp, args->a_nfs_off,
			    args->a_nfs_len, args->a_nfs_flags,
			    args->a_cred);
		} else if (args->a_io == NFS_READDIR) {
			(void) ((*args->a_nfs_readdir)(args->a_vp,
			    args->a_nfs_rdc, args->a_cred));
		} else if (args->a_io == NFS_COMMIT) {
			(*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
			    args->a_nfs_offset, args->a_nfs_count,
			    args->a_cred);
		} else if (args->a_io == NFS_INACTIVE) {
			(*args->a_nfs_inactive)(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}

/*
 * Disable async i/o on the mount and wait, uninterruptibly, for the worker
 * threads to go away: mi_max_threads is set to zero, the workers are woken
 * through mi_async_work_cv, and the caller sleeps on mi_async_cv until
 * mi_threads reaches zero.  mi_max_threads is left at zero on return.
 */
void
nfs_async_stop(struct vfs *vfsp)
{
	mntinfo_t *mi = VFTOMI(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for the
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs_async_stop_sig:
 * Wait for all outstanding putpage operations to complete. If a signal
 * is delivered we will abort and return non-zero. If we can put all the
 * pages we will return 0. This routine is called from nfs_unmount and
 * nfs3_unmount to make these operations interruptible.
 *
 * When the wait is interrupted, mi_max_threads is restored to the value
 * it had on entry; otherwise it is left at zero.
 */
int
nfs_async_stop_sig(struct vfs *vfsp)
{
	mntinfo_t *mi = VFTOMI(vfsp);
	ushort_t omax;
	int rval;

	/*
	 * Wait for all outstanding async operations to complete and for the
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	/*
	 * Tell all the worker threads to exit.
	 */
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
			break;
	}
	rval = (mi->mi_threads != 0);	/* Interrupted */
	if (rval)
		mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);

	return (rval);
}

/*
 * Copy up to tcount bytes from uio into the mapping at base, one chunk at
 * a time, where no chunk crosses a page boundary.
 *
 * After each chunk r_size is raised to uio_loffset if it was smaller and
 * RDIRTY is set in r_flags.  RMODINPROGRESS is set in r_flags, and
 * r_modaddr recorded, for the duration of each uiomove().
 *
 * A nonzero pgcreated tells us the caller has already created and mapped
 * a page at base; it applies to the first chunk only and is cleared after
 * that chunk.
 *
 * The caller must hold r_rwlock as a writer (asserted).
 *
 * Returns 0, or the first error reported by uiomove() or, if uiomove()
 * succeeded, by segmap_fault().
 */
int
writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
{
	int pagecreate;
	int n;
	int saved_n;
	caddr_t saved_base;
	u_offset_t offset;
	int error;
	int sm_error;

	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
	ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));

	/*
	 * Move bytes in at most PAGESIZE chunks. We must avoid
	 * spanning pages in uiomove() because page faults may cause
	 * the cache to be invalidated out from under us. The r_size is not
	 * updated until after the uiomove. If we push the last page of a
	 * file before r_size is correct, we will lose the data written past
	 * the current (and invalid) r_size.
	 */
	do {
		offset = uio->uio_loffset;
		pagecreate = 0;

		/*
		 * n is the number of bytes required to satisfy the request
		 *   or the number of bytes to fill out the page.
		 */
		n = (int)MIN((PAGESIZE - ((uintptr_t)base & PAGEOFFSET)),
		    tcount);

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to or beyond the current
		 * end of file from the beginning of the mapping.
		 *
		 * The read of r_size is now protected by r_statelock.
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * When pgcreated is nonzero the caller has already done
		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
		 * segkpm this means we already have at least one page
		 * created and mapped at base.
		 */
		pagecreate = pgcreated ||
		    (((uintptr_t)base & PAGEOFFSET) == 0 &&
		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));

		mutex_exit(&rp->r_statelock);
		if (pagecreate) {
			/*
			 * The last argument tells segmap_pagecreate() to
			 * always lock the page, as opposed to sometimes
			 * returning with the page locked. This way we avoid a
			 * fault on the ensuing uiomove(), but also
			 * more importantly (to fix bug 1094402) we can
			 * call segmap_fault() to unlock the page in all
			 * cases. An alternative would be to modify
			 * segmap_pagecreate() to tell us when it is
			 * locking a page, but that's a fairly major
			 * interface change.
			 */
			if (pgcreated == 0)
				(void) segmap_pagecreate(segkmap, base,
				    (uint_t)n, 1);
			/* remembered for the segmap_fault() unlock below */
			saved_base = base;
			saved_n = n;
		}

		/*
		 * The number of bytes of data in the last page cannot
		 * be accurately determined while the page is being
		 * uiomove'd to and the size of the file being updated.
		 * Thus, inform threads which need to know accurately
		 * how much data is in the last page of the file.  They
		 * will not do the i/o immediately, but will arrange for
		 * the i/o to happen later when this modify operation
		 * will have finished.
		 */
		ASSERT(!(rp->r_flags & RMODINPROGRESS));
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= RMODINPROGRESS;
		rp->r_modaddr = (offset & MAXBMASK);
		mutex_exit(&rp->r_statelock);

		error = uiomove(base, n, UIO_WRITE, uio);

		/*
		 * r_size is the maximum number of
		 * bytes known to be in the file.
		 * Make sure it is at least as high as the
		 * first unwritten byte pointed to by uio_loffset.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_size < uio->uio_loffset)
			rp->r_size = uio->uio_loffset;
		rp->r_flags &= ~RMODINPROGRESS;
		rp->r_flags |= RDIRTY;
		mutex_exit(&rp->r_statelock);

		/* n = # of bytes written */
		n = (int)(uio->uio_loffset - offset);
		base += n;
		tcount -= n;
		/*
		 * If we created pages w/o initializing them completely,
		 * we need to zero the part that wasn't set up.
		 * This happens in most EOF write cases and if
		 * we had some sort of error during the uiomove.
		 */
		if (pagecreate) {
			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
				(void) kzero(base, PAGESIZE - n);

			if (pgcreated) {
				/*
				 * Caller is responsible for this page,
				 * it was not created in this loop.
				 */
				pgcreated = 0;
			} else {
				/*
				 * For bug 1094402: segmap_pagecreate locks
				 * page. Unlock it. This also unlocks the
				 * pages allocated by page_create_va() in
				 * segmap_pagecreate().
				 */
				sm_error = segmap_fault(kas.a_hat, segkmap,
				    saved_base, saved_n,
				    F_SOFTUNLOCK, S_WRITE);
				/* keep the uiomove() error if there was one */
				if (error == 0)
					error = sm_error;
			}
		}
	} while (tcount > 0 && error == 0);

	return (error);
}

int
nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
{
	rnode_t *rp;
	page_t *pp;
	u_offset_t eoff;
	u_offset_t io_off;
	size_t io_len;
	int error;
	int rdirty;
	int err;

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	if (!vn_has_cached_data(vp))
		return (0);

	ASSERT(vp->v_type != VCHR);

	/*
	 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
	 * writes.  B_FORCE is set to force the VM system to actually
	 * invalidate the pages, even if the i/o failed.
The pages 2267 * need to get invalidated because they can't be written out 2268 * because there isn't any space left on either the server's 2269 * file system or in the user's disk quota. The B_FREE bit 2270 * is cleared to avoid confusion as to whether this is a 2271 * request to place the page on the freelist or to destroy 2272 * it. 2273 */ 2274 if ((rp->r_flags & ROUTOFSPACE) || 2275 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2276 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2277 2278 if (len == 0) { 2279 /* 2280 * If doing a full file synchronous operation, then clear 2281 * the RDIRTY bit. If a page gets dirtied while the flush 2282 * is happening, then RDIRTY will get set again. The 2283 * RDIRTY bit must get cleared before the flush so that 2284 * we don't lose this information. 2285 */ 2286 if (off == (u_offset_t)0 && 2287 !(flags & B_ASYNC) && 2288 (rp->r_flags & RDIRTY)) { 2289 mutex_enter(&rp->r_statelock); 2290 rdirty = (rp->r_flags & RDIRTY); 2291 rp->r_flags &= ~RDIRTY; 2292 mutex_exit(&rp->r_statelock); 2293 } else 2294 rdirty = 0; 2295 2296 /* 2297 * Search the entire vp list for pages >= off, and flush 2298 * the dirty pages. 2299 */ 2300 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2301 flags, cr); 2302 2303 /* 2304 * If an error occured and the file was marked as dirty 2305 * before and we aren't forcibly invalidating pages, then 2306 * reset the RDIRTY flag. 2307 */ 2308 if (error && rdirty && 2309 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2310 mutex_enter(&rp->r_statelock); 2311 rp->r_flags |= RDIRTY; 2312 mutex_exit(&rp->r_statelock); 2313 } 2314 } else { 2315 /* 2316 * Do a range from [off...off + len) looking for pages 2317 * to deal with. 
2318 */ 2319 error = 0; 2320 #ifdef lint 2321 io_len = 0; 2322 #endif 2323 eoff = off + len; 2324 mutex_enter(&rp->r_statelock); 2325 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2326 io_off += io_len) { 2327 mutex_exit(&rp->r_statelock); 2328 /* 2329 * If we are not invalidating, synchronously 2330 * freeing or writing pages use the routine 2331 * page_lookup_nowait() to prevent reclaiming 2332 * them from the free list. 2333 */ 2334 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2335 pp = page_lookup(vp, io_off, 2336 (flags & (B_INVAL | B_FREE)) ? 2337 SE_EXCL : SE_SHARED); 2338 } else { 2339 pp = page_lookup_nowait(vp, io_off, 2340 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2341 } 2342 2343 if (pp == NULL || !pvn_getdirty(pp, flags)) 2344 io_len = PAGESIZE; 2345 else { 2346 err = (*rp->r_putapage)(vp, pp, &io_off, 2347 &io_len, flags, cr); 2348 if (!error) 2349 error = err; 2350 /* 2351 * "io_off" and "io_len" are returned as 2352 * the range of pages we actually wrote. 2353 * This allows us to skip ahead more quickly 2354 * since several pages may've been dealt 2355 * with by this iteration of the loop. 
2356 */ 2357 } 2358 mutex_enter(&rp->r_statelock); 2359 } 2360 mutex_exit(&rp->r_statelock); 2361 } 2362 2363 return (error); 2364 } 2365 2366 void 2367 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2368 { 2369 rnode_t *rp; 2370 2371 rp = VTOR(vp); 2372 mutex_enter(&rp->r_statelock); 2373 while (rp->r_flags & RTRUNCATE) 2374 cv_wait(&rp->r_cv, &rp->r_statelock); 2375 rp->r_flags |= RTRUNCATE; 2376 if (off == (u_offset_t)0) { 2377 rp->r_flags &= ~RDIRTY; 2378 if (!(rp->r_flags & RSTALE)) 2379 rp->r_error = 0; 2380 } 2381 rp->r_truncaddr = off; 2382 mutex_exit(&rp->r_statelock); 2383 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2384 B_INVAL | B_TRUNC, cr); 2385 mutex_enter(&rp->r_statelock); 2386 rp->r_flags &= ~RTRUNCATE; 2387 cv_broadcast(&rp->r_cv); 2388 mutex_exit(&rp->r_statelock); 2389 } 2390 2391 static int nfs_write_error_to_cons_only = 0; 2392 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1) 2393 2394 /* 2395 * Print a file handle 2396 */ 2397 void 2398 nfs_printfhandle(nfs_fhandle *fhp) 2399 { 2400 int *ip; 2401 char *buf; 2402 size_t bufsize; 2403 char *cp; 2404 2405 /* 2406 * 13 == "(file handle:" 2407 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times 2408 * 1 == ' ' 2409 * 8 == maximum strlen of "%x" 2410 * 3 == ")\n\0" 2411 */ 2412 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3; 2413 buf = kmem_alloc(bufsize, KM_NOSLEEP); 2414 if (buf == NULL) 2415 return; 2416 2417 cp = buf; 2418 (void) strcpy(cp, "(file handle:"); 2419 while (*cp != '\0') 2420 cp++; 2421 for (ip = (int *)fhp->fh_buf; 2422 ip < (int *)&fhp->fh_buf[fhp->fh_len]; 2423 ip++) { 2424 (void) sprintf(cp, " %x", *ip); 2425 while (*cp != '\0') 2426 cp++; 2427 } 2428 (void) strcpy(cp, ")\n"); 2429 2430 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf); 2431 2432 kmem_free(buf, bufsize); 2433 } 2434 2435 /* 2436 * Notify the system administrator that an NFS write error has 2437 * occurred. 
2438 */ 2439 2440 /* seconds between ENOSPC/EDQUOT messages */ 2441 clock_t nfs_write_error_interval = 5; 2442 2443 void 2444 nfs_write_error(vnode_t *vp, int error, cred_t *cr) 2445 { 2446 mntinfo_t *mi; 2447 2448 mi = VTOMI(vp); 2449 /* 2450 * In case of forced unmount or zone shutdown, do not print any 2451 * messages since it can flood the console with error messages. 2452 */ 2453 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) 2454 return; 2455 2456 /* 2457 * No use in flooding the console with ENOSPC 2458 * messages from the same file system. 2459 */ 2460 if ((error != ENOSPC && error != EDQUOT) || 2461 lbolt - mi->mi_printftime > 0) { 2462 zoneid_t zoneid = mi->mi_zone->zone_id; 2463 2464 #ifdef DEBUG 2465 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2466 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL); 2467 #else 2468 nfs_perror(error, "NFS write error on host %s: %m.\n", 2469 VTOR(vp)->r_server->sv_hostname, NULL); 2470 #endif 2471 if (error == ENOSPC || error == EDQUOT) { 2472 zcmn_err(zoneid, CE_CONT, 2473 MSG("^File: userid=%d, groupid=%d\n"), 2474 crgetuid(cr), crgetgid(cr)); 2475 if (crgetuid(CRED()) != crgetuid(cr) || 2476 crgetgid(CRED()) != crgetgid(cr)) { 2477 zcmn_err(zoneid, CE_CONT, 2478 MSG("^User: userid=%d, groupid=%d\n"), 2479 crgetuid(CRED()), crgetgid(CRED())); 2480 } 2481 mi->mi_printftime = lbolt + 2482 nfs_write_error_interval * hz; 2483 } 2484 nfs_printfhandle(&VTOR(vp)->r_fh); 2485 #ifdef DEBUG 2486 if (error == EACCES) { 2487 zcmn_err(zoneid, CE_CONT, 2488 MSG("^nfs_bio: cred is%s kcred\n"), 2489 cr == kcred ? 
"" : " not"); 2490 } 2491 #endif 2492 } 2493 } 2494 2495 /* ARGSUSED */ 2496 static void * 2497 nfs_mi_init(zoneid_t zoneid) 2498 { 2499 struct mi_globals *mig; 2500 2501 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2502 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2503 list_create(&mig->mig_list, sizeof (mntinfo_t), 2504 offsetof(mntinfo_t, mi_zone_node)); 2505 mig->mig_destructor_called = B_FALSE; 2506 return (mig); 2507 } 2508 2509 /* 2510 * Callback routine to tell all NFS mounts in the zone to stop creating new 2511 * threads. Existing threads should exit. 2512 */ 2513 /* ARGSUSED */ 2514 static void 2515 nfs_mi_shutdown(zoneid_t zoneid, void *data) 2516 { 2517 struct mi_globals *mig = data; 2518 mntinfo_t *mi; 2519 2520 ASSERT(mig != NULL); 2521 mutex_enter(&mig->mig_lock); 2522 for (mi = list_head(&mig->mig_list); mi != NULL; 2523 mi = list_next(&mig->mig_list, mi)) { 2524 /* 2525 * purge the DNLC for this filesystem 2526 */ 2527 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2528 2529 mutex_enter(&mi->mi_async_lock); 2530 /* 2531 * Tell existing async worker threads to exit. 2532 */ 2533 mi->mi_max_threads = 0; 2534 cv_broadcast(&mi->mi_async_work_cv); 2535 /* 2536 * Set MI_ASYNC_MGR_STOP so the async manager thread starts 2537 * getting ready to exit when it's done with its current work. 2538 */ 2539 mutex_enter(&mi->mi_lock); 2540 mi->mi_flags |= MI_ASYNC_MGR_STOP; 2541 mutex_exit(&mi->mi_lock); 2542 /* 2543 * Wake up the async manager thread. 
2544 */ 2545 cv_broadcast(&mi->mi_async_reqs_cv); 2546 mutex_exit(&mi->mi_async_lock); 2547 } 2548 mutex_exit(&mig->mig_lock); 2549 } 2550 2551 static void 2552 nfs_mi_free_globals(struct mi_globals *mig) 2553 { 2554 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2555 mutex_destroy(&mig->mig_lock); 2556 kmem_free(mig, sizeof (*mig)); 2557 2558 } 2559 2560 /* ARGSUSED */ 2561 static void 2562 nfs_mi_destroy(zoneid_t zoneid, void *data) 2563 { 2564 struct mi_globals *mig = data; 2565 2566 ASSERT(mig != NULL); 2567 mutex_enter(&mig->mig_lock); 2568 if (list_head(&mig->mig_list) != NULL) { 2569 /* Still waiting for VFS_FREEVFS() */ 2570 mig->mig_destructor_called = B_TRUE; 2571 mutex_exit(&mig->mig_lock); 2572 return; 2573 } 2574 nfs_mi_free_globals(mig); 2575 } 2576 2577 /* 2578 * Add an NFS mount to the per-zone list of NFS mounts. 2579 */ 2580 void 2581 nfs_mi_zonelist_add(mntinfo_t *mi) 2582 { 2583 struct mi_globals *mig; 2584 2585 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2586 mutex_enter(&mig->mig_lock); 2587 list_insert_head(&mig->mig_list, mi); 2588 mutex_exit(&mig->mig_lock); 2589 } 2590 2591 /* 2592 * Remove an NFS mount from the per-zone list of NFS mounts. 2593 */ 2594 static void 2595 nfs_mi_zonelist_remove(mntinfo_t *mi) 2596 { 2597 struct mi_globals *mig; 2598 2599 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2600 mutex_enter(&mig->mig_lock); 2601 list_remove(&mig->mig_list, mi); 2602 /* 2603 * We can be called asynchronously by VFS_FREEVFS() after the zone 2604 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2605 * mi globals. 2606 */ 2607 if (list_head(&mig->mig_list) == NULL && 2608 mig->mig_destructor_called == B_TRUE) { 2609 nfs_mi_free_globals(mig); 2610 return; 2611 } 2612 mutex_exit(&mig->mig_lock); 2613 } 2614 2615 /* 2616 * NFS Client initialization routine. This routine should only be called 2617 * once. 
It performs the following tasks: 2618 * - Initalize all global locks 2619 * - Call sub-initialization routines (localize access to variables) 2620 */ 2621 int 2622 nfs_clntinit(void) 2623 { 2624 #ifdef DEBUG 2625 static boolean_t nfs_clntup = B_FALSE; 2626 #endif 2627 int error; 2628 2629 #ifdef DEBUG 2630 ASSERT(nfs_clntup == B_FALSE); 2631 #endif 2632 2633 error = nfs_subrinit(); 2634 if (error) 2635 return (error); 2636 2637 error = nfs_vfsinit(); 2638 if (error) { 2639 /* 2640 * Cleanup nfs_subrinit() work 2641 */ 2642 nfs_subrfini(); 2643 return (error); 2644 } 2645 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown, 2646 nfs_mi_destroy); 2647 2648 nfs4_clnt_init(); 2649 2650 #ifdef DEBUG 2651 nfs_clntup = B_TRUE; 2652 #endif 2653 2654 return (0); 2655 } 2656 2657 /* 2658 * This routine is only called if the NFS Client has been initialized but 2659 * the module failed to be installed. This routine will cleanup the previously 2660 * allocated/initialized work. 2661 */ 2662 void 2663 nfs_clntfini(void) 2664 { 2665 (void) zone_key_delete(mi_list_key); 2666 nfs_subrfini(); 2667 nfs_vfsfini(); 2668 nfs4_clnt_fini(); 2669 } 2670 2671 /* 2672 * nfs_lockrelease: 2673 * 2674 * Release any locks on the given vnode that are held by the current 2675 * process. 2676 */ 2677 void 2678 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 2679 { 2680 flock64_t ld; 2681 struct shrlock shr; 2682 char *buf; 2683 int remote_lock_possible; 2684 int ret; 2685 2686 ASSERT((uintptr_t)vp > KERNELBASE); 2687 2688 /* 2689 * Generate an explicit unlock operation for the entire file. As a 2690 * partial optimization, only generate the unlock if there is a 2691 * lock registered for the file. We could check whether this 2692 * particular process has any locks on the file, but that would 2693 * require the local locking code to provide yet another query 2694 * routine. Note that no explicit synchronization is needed here. 
2695 * At worst, flk_has_remote_locks() will return a false positive, 2696 * in which case the unlock call wastes time but doesn't harm 2697 * correctness. 2698 * 2699 * In addition, an unlock request is generated if the process 2700 * is listed as possibly having a lock on the file because the 2701 * server and client lock managers may have gotten out of sync. 2702 * N.B. It is important to make sure nfs_remove_locking_id() is 2703 * called here even if flk_has_remote_locks(vp) reports true. 2704 * If it is not called and there is an entry on the process id 2705 * list, that entry will never get removed. 2706 */ 2707 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID, 2708 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2709 if (remote_lock_possible || flk_has_remote_locks(vp)) { 2710 ld.l_type = F_UNLCK; /* set to unlock entire file */ 2711 ld.l_whence = 0; /* unlock from start of file */ 2712 ld.l_start = 0; 2713 ld.l_len = 0; /* do entire file */ 2714 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr); 2715 2716 if (ret != 0) { 2717 /* 2718 * If VOP_FRLOCK fails, make sure we unregister 2719 * local locks before we continue. 2720 */ 2721 ld.l_pid = ttoproc(curthread)->p_pid; 2722 lm_register_lock_locally(vp, NULL, &ld, flag, offset); 2723 #ifdef DEBUG 2724 nfs_perror(ret, 2725 "NFS lock release error on vp %p: %m.\n", 2726 (void *)vp, NULL); 2727 #endif 2728 } 2729 2730 /* 2731 * The call to VOP_FRLOCK may put the pid back on the 2732 * list. We need to remove it. 2733 */ 2734 (void) nfs_remove_locking_id(vp, RLMPL_PID, 2735 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL); 2736 } 2737 2738 /* 2739 * As long as the vp has a share matching our pid, 2740 * pluck it off and unshare it. There are circumstances in 2741 * which the call to nfs_remove_locking_id() may put the 2742 * owner back on the list, in which case we simply do a 2743 * redundant and harmless unshare. 
2744 */ 2745 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP); 2746 while (nfs_remove_locking_id(vp, RLMPL_OWNER, 2747 (char *)NULL, buf, &shr.s_own_len)) { 2748 shr.s_owner = buf; 2749 shr.s_access = 0; 2750 shr.s_deny = 0; 2751 shr.s_sysid = 0; 2752 shr.s_pid = curproc->p_pid; 2753 2754 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr); 2755 #ifdef DEBUG 2756 if (ret != 0) { 2757 nfs_perror(ret, 2758 "NFS share release error on vp %p: %m.\n", 2759 (void *)vp, NULL); 2760 } 2761 #endif 2762 } 2763 kmem_free(buf, MAX_SHR_OWNER_LEN); 2764 } 2765 2766 /* 2767 * nfs_lockcompletion: 2768 * 2769 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2770 * as non cachable (set VNOCACHE bit). 2771 */ 2772 2773 void 2774 nfs_lockcompletion(vnode_t *vp, int cmd) 2775 { 2776 #ifdef DEBUG 2777 rnode_t *rp = VTOR(vp); 2778 2779 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2780 #endif 2781 2782 if (cmd == F_SETLK || cmd == F_SETLKW) { 2783 if (!lm_safemap(vp)) { 2784 mutex_enter(&vp->v_lock); 2785 vp->v_flag |= VNOCACHE; 2786 mutex_exit(&vp->v_lock); 2787 } else { 2788 mutex_enter(&vp->v_lock); 2789 vp->v_flag &= ~VNOCACHE; 2790 mutex_exit(&vp->v_lock); 2791 } 2792 } 2793 /* 2794 * The cached attributes of the file are stale after acquiring 2795 * the lock on the file. They were updated when the file was 2796 * opened, but not updated when the lock was acquired. Therefore the 2797 * cached attributes are invalidated after the lock is obtained. 2798 */ 2799 PURGE_ATTRCACHE(vp); 2800 } 2801 2802 /* 2803 * The lock manager holds state making it possible for the client 2804 * and server to be out of sync. For example, if the response from 2805 * the server granting a lock request is lost, the server will think 2806 * the lock is granted and the client will think the lock is lost. 2807 * The client can tell when it is not positive if it is in sync with 2808 * the server. 
2809 * 2810 * To deal with this, a list of processes for which the client is 2811 * not sure if the server holds a lock is attached to the rnode. 2812 * When such a process closes the rnode, an unlock request is sent 2813 * to the server to unlock the entire file. 2814 * 2815 * The list is kept as a singularly linked NULL terminated list. 2816 * Because it is only added to under extreme error conditions, the 2817 * list shouldn't get very big. DEBUG kernels print a message if 2818 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily 2819 * choosen to be 8, but can be tuned at runtime. 2820 */ 2821 #ifdef DEBUG 2822 /* int nfs_lmpl_high_water = 8; */ 2823 int nfs_lmpl_high_water = 128; 2824 int nfs_cnt_add_locking_id = 0; 2825 int nfs_len_add_locking_id = 0; 2826 #endif /* DEBUG */ 2827 2828 /* 2829 * Record that the nfs lock manager server may be holding a lock on 2830 * a vnode for a process. 2831 * 2832 * Because the nfs lock manager server holds state, it is possible 2833 * for the server to get out of sync with the client. This routine is called 2834 * from the client when it is no longer sure if the server is in sync 2835 * with the client. 
nfs_lockrelease() will then notice this and send 2836 * an unlock request when the file is closed 2837 */ 2838 void 2839 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len) 2840 { 2841 rnode_t *rp; 2842 lmpl_t *new; 2843 lmpl_t *cur; 2844 lmpl_t **lmplp; 2845 #ifdef DEBUG 2846 int list_len = 1; 2847 #endif /* DEBUG */ 2848 2849 #ifdef DEBUG 2850 ++nfs_cnt_add_locking_id; 2851 #endif /* DEBUG */ 2852 /* 2853 * allocate new lmpl_t now so we don't sleep 2854 * later after grabbing mutexes 2855 */ 2856 ASSERT(len < MAX_SHR_OWNER_LEN); 2857 new = kmem_alloc(sizeof (*new), KM_SLEEP); 2858 new->lmpl_type = type; 2859 new->lmpl_pid = pid; 2860 new->lmpl_owner = kmem_alloc(len, KM_SLEEP); 2861 bcopy(id, new->lmpl_owner, len); 2862 new->lmpl_own_len = len; 2863 new->lmpl_next = (lmpl_t *)NULL; 2864 #ifdef DEBUG 2865 if (type == RLMPL_PID) { 2866 ASSERT(len == sizeof (pid_t)); 2867 ASSERT(pid == *(pid_t *)new->lmpl_owner); 2868 } else { 2869 ASSERT(type == RLMPL_OWNER); 2870 } 2871 #endif 2872 2873 rp = VTOR(vp); 2874 mutex_enter(&rp->r_statelock); 2875 2876 /* 2877 * Add this id to the list for this rnode only if the 2878 * rnode is active and the id is not already there. 
2879 */ 2880 ASSERT(rp->r_flags & RHASHED); 2881 lmplp = &(rp->r_lmpl); 2882 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 2883 if (cur->lmpl_pid == pid && 2884 cur->lmpl_type == type && 2885 cur->lmpl_own_len == len && 2886 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) { 2887 kmem_free(new->lmpl_owner, len); 2888 kmem_free(new, sizeof (*new)); 2889 break; 2890 } 2891 lmplp = &cur->lmpl_next; 2892 #ifdef DEBUG 2893 ++list_len; 2894 #endif /* DEBUG */ 2895 } 2896 if (cur == (lmpl_t *)NULL) { 2897 *lmplp = new; 2898 #ifdef DEBUG 2899 if (list_len > nfs_len_add_locking_id) { 2900 nfs_len_add_locking_id = list_len; 2901 } 2902 if (list_len > nfs_lmpl_high_water) { 2903 cmn_err(CE_WARN, "nfs_add_locking_id: long list " 2904 "vp=%p is %d", (void *)vp, list_len); 2905 } 2906 #endif /* DEBUG */ 2907 } 2908 2909 #ifdef DEBUG 2910 if (share_debug) { 2911 int nitems = 0; 2912 int npids = 0; 2913 int nowners = 0; 2914 2915 /* 2916 * Count the number of things left on r_lmpl after the remove. 2917 */ 2918 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 2919 cur = cur->lmpl_next) { 2920 nitems++; 2921 if (cur->lmpl_type == RLMPL_PID) { 2922 npids++; 2923 } else if (cur->lmpl_type == RLMPL_OWNER) { 2924 nowners++; 2925 } else { 2926 cmn_err(CE_PANIC, "nfs_add_locking_id: " 2927 "unrecognised lmpl_type %d", 2928 cur->lmpl_type); 2929 } 2930 } 2931 2932 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d " 2933 "OWNs = %d items left on r_lmpl\n", 2934 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems); 2935 } 2936 #endif 2937 2938 mutex_exit(&rp->r_statelock); 2939 } 2940 2941 /* 2942 * Remove an id from the lock manager id list. 2943 * 2944 * If the id is not in the list return 0. If it was found and 2945 * removed, return 1. 
2946 */ 2947 static int 2948 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen) 2949 { 2950 lmpl_t *cur; 2951 lmpl_t **lmplp; 2952 rnode_t *rp; 2953 int rv = 0; 2954 2955 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER); 2956 2957 rp = VTOR(vp); 2958 2959 mutex_enter(&rp->r_statelock); 2960 ASSERT(rp->r_flags & RHASHED); 2961 lmplp = &(rp->r_lmpl); 2962 2963 /* 2964 * Search through the list and remove the entry for this id 2965 * if it is there. The special case id == NULL allows removal 2966 * of the first share on the r_lmpl list belonging to the 2967 * current process (if any), without regard to further details 2968 * of its identity. 2969 */ 2970 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 2971 if (cur->lmpl_type == type && 2972 cur->lmpl_pid == curproc->p_pid && 2973 (id == (char *)NULL || 2974 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) { 2975 *lmplp = cur->lmpl_next; 2976 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN); 2977 if (rid != NULL) { 2978 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len); 2979 *rlen = cur->lmpl_own_len; 2980 } 2981 kmem_free(cur->lmpl_owner, cur->lmpl_own_len); 2982 kmem_free(cur, sizeof (*cur)); 2983 rv = 1; 2984 break; 2985 } 2986 lmplp = &cur->lmpl_next; 2987 } 2988 2989 #ifdef DEBUG 2990 if (share_debug) { 2991 int nitems = 0; 2992 int npids = 0; 2993 int nowners = 0; 2994 2995 /* 2996 * Count the number of things left on r_lmpl after the remove. 2997 */ 2998 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 2999 cur = cur->lmpl_next) { 3000 nitems++; 3001 if (cur->lmpl_type == RLMPL_PID) { 3002 npids++; 3003 } else if (cur->lmpl_type == RLMPL_OWNER) { 3004 nowners++; 3005 } else { 3006 cmn_err(CE_PANIC, 3007 "nrli: unrecognised lmpl_type %d", 3008 cur->lmpl_type); 3009 } 3010 } 3011 3012 cmn_err(CE_CONT, 3013 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n", 3014 (type == RLMPL_PID) ? 
"P" : "O", 3015 npids, 3016 nowners, 3017 nitems); 3018 } 3019 #endif 3020 3021 mutex_exit(&rp->r_statelock); 3022 return (rv); 3023 } 3024 3025 void 3026 nfs_free_mi(mntinfo_t *mi) 3027 { 3028 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP); 3029 ASSERT(mi->mi_manager_thread == NULL); 3030 ASSERT(mi->mi_threads == 0); 3031 3032 /* 3033 * Remove the node from the global list before we start tearing it down. 3034 */ 3035 nfs_mi_zonelist_remove(mi); 3036 if (mi->mi_klmconfig) { 3037 lm_free_config(mi->mi_klmconfig); 3038 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig)); 3039 } 3040 mutex_destroy(&mi->mi_lock); 3041 mutex_destroy(&mi->mi_async_lock); 3042 cv_destroy(&mi->mi_failover_cv); 3043 cv_destroy(&mi->mi_async_work_cv); 3044 cv_destroy(&mi->mi_async_reqs_cv); 3045 cv_destroy(&mi->mi_async_cv); 3046 zone_rele(mi->mi_zone); 3047 kmem_free(mi, sizeof (*mi)); 3048 } 3049 3050 static int 3051 mnt_kstat_update(kstat_t *ksp, int rw) 3052 { 3053 mntinfo_t *mi; 3054 struct mntinfo_kstat *mik; 3055 vfs_t *vfsp; 3056 int i; 3057 3058 /* this is a read-only kstat. Bail out on a write */ 3059 if (rw == KSTAT_WRITE) 3060 return (EACCES); 3061 3062 /* 3063 * We don't want to wait here as kstat_chain_lock could be held by 3064 * dounmount(). dounmount() takes vfs_reflock before the chain lock 3065 * and thus could lead to a deadlock. 
3066 */ 3067 vfsp = (struct vfs *)ksp->ks_private; 3068 3069 3070 mi = VFTOMI(vfsp); 3071 3072 mik = (struct mntinfo_kstat *)ksp->ks_data; 3073 3074 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 3075 mik->mik_vers = (uint32_t)mi->mi_vers; 3076 mik->mik_flags = mi->mi_flags; 3077 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod; 3078 mik->mik_curread = (uint32_t)mi->mi_curread; 3079 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 3080 mik->mik_retrans = mi->mi_retrans; 3081 mik->mik_timeo = mi->mi_timeo; 3082 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 3083 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 3084 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 3085 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 3086 for (i = 0; i < NFS_CALLTYPES + 1; i++) { 3087 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt; 3088 mik->mik_timers[i].deviate = 3089 (uint32_t)mi->mi_timers[i].rt_deviate; 3090 mik->mik_timers[i].rtxcur = 3091 (uint32_t)mi->mi_timers[i].rt_rtxcur; 3092 } 3093 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 3094 mik->mik_failover = (uint32_t)mi->mi_failover; 3095 mik->mik_remap = (uint32_t)mi->mi_remap; 3096 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 3097 3098 return (0); 3099 } 3100 3101 void 3102 nfs_mnt_kstat_init(struct vfs *vfsp) 3103 { 3104 mntinfo_t *mi = VFTOMI(vfsp); 3105 3106 /* 3107 * Create the version specific kstats. 3108 * 3109 * PSARC 2001/697 Contract Private Interface 3110 * All nfs kstats are under SunMC contract 3111 * Please refer to the PSARC listed above and contact 3112 * SunMC before making any changes! 
3113 * 3114 * Changes must be reviewed by Solaris File Sharing 3115 * Changes must be communicated to contract-2001-697@sun.com 3116 * 3117 */ 3118 3119 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 3120 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 3121 if (mi->mi_io_kstats) { 3122 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3123 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 3124 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 3125 kstat_install(mi->mi_io_kstats); 3126 } 3127 3128 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 3129 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 3130 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 3131 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3132 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 3133 mi->mi_ro_kstats->ks_update = mnt_kstat_update; 3134 mi->mi_ro_kstats->ks_private = (void *)vfsp; 3135 kstat_install(mi->mi_ro_kstats); 3136 } 3137 } 3138 3139 nfs_delmapcall_t * 3140 nfs_init_delmapcall() 3141 { 3142 nfs_delmapcall_t *delmap_call; 3143 3144 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP); 3145 delmap_call->call_id = curthread; 3146 delmap_call->error = 0; 3147 3148 return (delmap_call); 3149 } 3150 3151 void 3152 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call) 3153 { 3154 kmem_free(delmap_call, sizeof (nfs_delmapcall_t)); 3155 } 3156 3157 /* 3158 * Searches for the current delmap caller (based on curthread) in the list of 3159 * callers. If it is found, we remove it and free the delmap caller. 3160 * Returns: 3161 * 0 if the caller wasn't found 3162 * 1 if the caller was found, removed and freed. *errp is set to what 3163 * the result of the delmap was. 3164 */ 3165 int 3166 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp) 3167 { 3168 nfs_delmapcall_t *delmap_call; 3169 3170 /* 3171 * If the list doesn't exist yet, we create it and return 3172 * that the caller wasn't found. No list = no callers. 
3173 */ 3174 mutex_enter(&rp->r_statelock); 3175 if (!(rp->r_flags & RDELMAPLIST)) { 3176 /* The list does not exist */ 3177 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t), 3178 offsetof(nfs_delmapcall_t, call_node)); 3179 rp->r_flags |= RDELMAPLIST; 3180 mutex_exit(&rp->r_statelock); 3181 return (0); 3182 } else { 3183 /* The list exists so search it */ 3184 for (delmap_call = list_head(&rp->r_indelmap); 3185 delmap_call != NULL; 3186 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 3187 if (delmap_call->call_id == curthread) { 3188 /* current caller is in the list */ 3189 *errp = delmap_call->error; 3190 list_remove(&rp->r_indelmap, delmap_call); 3191 mutex_exit(&rp->r_statelock); 3192 nfs_free_delmapcall(delmap_call); 3193 return (1); 3194 } 3195 } 3196 } 3197 mutex_exit(&rp->r_statelock); 3198 return (0); 3199 } 3200