1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 
28 * All Rights Reserved 29 */ 30 31 #include <sys/param.h> 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/thread.h> 35 #include <sys/t_lock.h> 36 #include <sys/time.h> 37 #include <sys/vnode.h> 38 #include <sys/vfs.h> 39 #include <sys/errno.h> 40 #include <sys/buf.h> 41 #include <sys/stat.h> 42 #include <sys/cred.h> 43 #include <sys/kmem.h> 44 #include <sys/debug.h> 45 #include <sys/dnlc.h> 46 #include <sys/vmsystm.h> 47 #include <sys/flock.h> 48 #include <sys/share.h> 49 #include <sys/cmn_err.h> 50 #include <sys/tiuser.h> 51 #include <sys/sysmacros.h> 52 #include <sys/callb.h> 53 #include <sys/acl.h> 54 #include <sys/kstat.h> 55 #include <sys/signal.h> 56 #include <sys/disp.h> 57 #include <sys/atomic.h> 58 #include <sys/list.h> 59 #include <sys/sdt.h> 60 61 #include <rpc/types.h> 62 #include <rpc/xdr.h> 63 #include <rpc/auth.h> 64 #include <rpc/clnt.h> 65 66 #include <nfs/nfs.h> 67 #include <nfs/nfs_clnt.h> 68 #include <nfs/nfs_acl.h> 69 70 #include <nfs/nfs4.h> 71 #include <nfs/rnode4.h> 72 #include <nfs/nfs4_clnt.h> 73 74 #include <vm/hat.h> 75 #include <vm/as.h> 76 #include <vm/page.h> 77 #include <vm/pvn.h> 78 #include <vm/seg.h> 79 #include <vm/seg_map.h> 80 #include <vm/seg_vn.h> 81 82 #include <sys/ddi.h> 83 84 /* 85 * Arguments to page-flush thread. 
86 */ 87 typedef struct { 88 vnode_t *vp; 89 cred_t *cr; 90 } pgflush_t; 91 92 #ifdef DEBUG 93 int nfs4_client_lease_debug; 94 int nfs4_sharedfh_debug; 95 int nfs4_fname_debug; 96 97 /* temporary: panic if v_type is inconsistent with r_attr va_type */ 98 int nfs4_vtype_debug; 99 100 uint_t nfs4_tsd_key; 101 #endif 102 103 static time_t nfs4_client_resumed = 0; 104 static callb_id_t cid = 0; 105 106 static int nfs4renew(nfs4_server_t *); 107 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int); 108 static void nfs4_pgflush_thread(pgflush_t *); 109 110 static boolean_t nfs4_client_cpr_callb(void *, int); 111 112 struct mi4_globals { 113 kmutex_t mig_lock; /* lock protecting mig_list */ 114 list_t mig_list; /* list of NFS v4 mounts in zone */ 115 boolean_t mig_destructor_called; 116 }; 117 118 static zone_key_t mi4_list_key; 119 120 /* 121 * Attributes caching: 122 * 123 * Attributes are cached in the rnode in struct vattr form. 124 * There is a time associated with the cached attributes (r_time_attr_inval) 125 * which tells whether the attributes are valid. The time is initialized 126 * to the difference between current time and the modify time of the vnode 127 * when new attributes are cached. This allows the attributes for 128 * files that have changed recently to be timed out sooner than for files 129 * that have not changed for a long time. There are minimum and maximum 130 * timeout values that can be set per mount point. 131 */ 132 133 /* 134 * If a cache purge is in progress, wait for it to finish. 135 * 136 * The current thread must not be in the middle of an 137 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock 138 * between this thread, a recovery thread, and the page flush thread. 
139 */ 140 int 141 nfs4_waitfor_purge_complete(vnode_t *vp) 142 { 143 rnode4_t *rp; 144 k_sigset_t smask; 145 146 rp = VTOR4(vp); 147 if ((rp->r_serial != NULL && rp->r_serial != curthread) || 148 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) { 149 mutex_enter(&rp->r_statelock); 150 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 151 while ((rp->r_serial != NULL && rp->r_serial != curthread) || 152 ((rp->r_flags & R4PGFLUSH) && 153 rp->r_pgflush != curthread)) { 154 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 155 sigunintr(&smask); 156 mutex_exit(&rp->r_statelock); 157 return (EINTR); 158 } 159 } 160 sigunintr(&smask); 161 mutex_exit(&rp->r_statelock); 162 } 163 return (0); 164 } 165 166 /* 167 * Validate caches by checking cached attributes. If they have timed out, 168 * then get new attributes from the server. As a side effect, cache 169 * invalidation is done if the attributes have changed. 170 * 171 * If the attributes have not timed out and if there is a cache 172 * invalidation being done by some other thread, then wait until that 173 * thread has completed the cache invalidation. 174 */ 175 int 176 nfs4_validate_caches(vnode_t *vp, cred_t *cr) 177 { 178 int error; 179 nfs4_ga_res_t gar; 180 181 if (ATTRCACHE4_VALID(vp)) { 182 error = nfs4_waitfor_purge_complete(vp); 183 if (error) 184 return (error); 185 return (0); 186 } 187 188 gar.n4g_va.va_mask = AT_ALL; 189 return (nfs4_getattr_otw(vp, &gar, cr, 0)); 190 } 191 192 /* 193 * Fill in attribute from the cache. 194 * If valid, then return 0 to indicate that no error occurred, 195 * otherwise return 1 to indicate that an error occurred. 
196 */ 197 static int 198 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap) 199 { 200 rnode4_t *rp; 201 202 rp = VTOR4(vp); 203 mutex_enter(&rp->r_statelock); 204 mutex_enter(&rp->r_statev4_lock); 205 if (ATTRCACHE4_VALID(vp)) { 206 mutex_exit(&rp->r_statev4_lock); 207 /* 208 * Cached attributes are valid 209 */ 210 *vap = rp->r_attr; 211 mutex_exit(&rp->r_statelock); 212 return (0); 213 } 214 mutex_exit(&rp->r_statev4_lock); 215 mutex_exit(&rp->r_statelock); 216 return (1); 217 } 218 219 220 /* 221 * If returned error is ESTALE flush all caches. The nfs4_purge_caches() 222 * call is synchronous because all the pages were invalidated by the 223 * nfs4_invalidate_pages() call. 224 */ 225 void 226 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr) 227 { 228 struct rnode4 *rp = VTOR4(vp); 229 230 /* Ensure that the ..._end_op() call has been done */ 231 ASSERT(tsd_get(nfs4_tsd_key) == NULL); 232 233 if (errno != ESTALE) 234 return; 235 236 mutex_enter(&rp->r_statelock); 237 rp->r_flags |= R4STALE; 238 if (!rp->r_error) 239 rp->r_error = errno; 240 mutex_exit(&rp->r_statelock); 241 if (nfs4_has_pages(vp)) 242 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 243 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE); 244 } 245 246 /* 247 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the 248 * page purge is done asynchronously. 249 */ 250 void 251 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg) 252 { 253 rnode4_t *rp; 254 char *contents; 255 vnode_t *xattr; 256 int size; 257 int pgflush; /* are we the page flush thread? */ 258 259 /* 260 * Purge the DNLC for any entries which refer to this file. 261 */ 262 if (vp->v_count > 1 && 263 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC)) 264 dnlc_purge_vp(vp); 265 266 /* 267 * Clear any readdir state bits and purge the readlink response cache. 
268 */ 269 rp = VTOR4(vp); 270 mutex_enter(&rp->r_statelock); 271 rp->r_flags &= ~R4LOOKUP; 272 contents = rp->r_symlink.contents; 273 size = rp->r_symlink.size; 274 rp->r_symlink.contents = NULL; 275 276 xattr = rp->r_xattr_dir; 277 rp->r_xattr_dir = NULL; 278 279 /* 280 * Purge pathconf cache too. 281 */ 282 rp->r_pathconf.pc4_xattr_valid = 0; 283 rp->r_pathconf.pc4_cache_valid = 0; 284 285 pgflush = (curthread == rp->r_pgflush); 286 mutex_exit(&rp->r_statelock); 287 288 if (contents != NULL) { 289 290 kmem_free((void *)contents, size); 291 } 292 293 if (xattr != NULL) 294 VN_RELE(xattr); 295 296 /* 297 * Flush the page cache. If the current thread is the page flush 298 * thread, don't initiate a new page flush. There's no need for 299 * it, and doing it correctly is hard. 300 */ 301 if (nfs4_has_pages(vp) && !pgflush) { 302 if (!asyncpg) { 303 (void) nfs4_waitfor_purge_complete(vp); 304 nfs4_flush_pages(vp, cr); 305 } else { 306 pgflush_t *args; 307 308 /* 309 * We don't hold r_statelock while creating the 310 * thread, in case the call blocks. So we use a 311 * flag to indicate that a page flush thread is 312 * active. 313 */ 314 mutex_enter(&rp->r_statelock); 315 if (rp->r_flags & R4PGFLUSH) { 316 mutex_exit(&rp->r_statelock); 317 } else { 318 rp->r_flags |= R4PGFLUSH; 319 mutex_exit(&rp->r_statelock); 320 321 args = kmem_alloc(sizeof (pgflush_t), 322 KM_SLEEP); 323 args->vp = vp; 324 VN_HOLD(args->vp); 325 args->cr = cr; 326 crhold(args->cr); 327 (void) zthread_create(NULL, 0, 328 nfs4_pgflush_thread, args, 0, 329 minclsyspri); 330 } 331 } 332 } 333 334 /* 335 * Flush the readdir response cache. 336 */ 337 nfs4_purge_rddir_cache(vp); 338 } 339 340 /* 341 * Invalidate all pages for the given file, after writing back the dirty 342 * ones. 
343 */ 344 345 void 346 nfs4_flush_pages(vnode_t *vp, cred_t *cr) 347 { 348 int error; 349 rnode4_t *rp = VTOR4(vp); 350 351 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 352 if (error == ENOSPC || error == EDQUOT) { 353 mutex_enter(&rp->r_statelock); 354 if (!rp->r_error) 355 rp->r_error = error; 356 mutex_exit(&rp->r_statelock); 357 } 358 } 359 360 /* 361 * Page flush thread. 362 */ 363 364 static void 365 nfs4_pgflush_thread(pgflush_t *args) 366 { 367 rnode4_t *rp = VTOR4(args->vp); 368 369 /* remember which thread we are, so we don't deadlock ourselves */ 370 mutex_enter(&rp->r_statelock); 371 ASSERT(rp->r_pgflush == NULL); 372 rp->r_pgflush = curthread; 373 mutex_exit(&rp->r_statelock); 374 375 nfs4_flush_pages(args->vp, args->cr); 376 377 mutex_enter(&rp->r_statelock); 378 rp->r_pgflush = NULL; 379 rp->r_flags &= ~R4PGFLUSH; 380 cv_broadcast(&rp->r_cv); 381 mutex_exit(&rp->r_statelock); 382 383 VN_RELE(args->vp); 384 crfree(args->cr); 385 kmem_free(args, sizeof (pgflush_t)); 386 zthread_exit(); 387 } 388 389 /* 390 * Purge the readdir cache of all entries which are not currently 391 * being filled. 392 */ 393 void 394 nfs4_purge_rddir_cache(vnode_t *vp) 395 { 396 rnode4_t *rp; 397 398 rp = VTOR4(vp); 399 400 mutex_enter(&rp->r_statelock); 401 rp->r_direof = NULL; 402 rp->r_flags &= ~R4LOOKUP; 403 rp->r_flags |= R4READDIRWATTR; 404 rddir4_cache_purge(rp); 405 mutex_exit(&rp->r_statelock); 406 } 407 408 /* 409 * Set attributes cache for given vnode using virtual attributes. There is 410 * no cache validation, but if the attributes are deemed to be stale, they 411 * are ignored. This corresponds to nfs3_attrcache(). 412 * 413 * Set the timeout value on the attribute cache and fill it 414 * with the passed in attributes. 
415 */ 416 void 417 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t) 418 { 419 rnode4_t *rp = VTOR4(vp); 420 421 mutex_enter(&rp->r_statelock); 422 if (rp->r_time_attr_saved <= t) 423 nfs4_attrcache_va(vp, garp, FALSE); 424 mutex_exit(&rp->r_statelock); 425 } 426 427 /* 428 * Use the passed in virtual attributes to check to see whether the 429 * data and metadata caches are valid, cache the new attributes, and 430 * then do the cache invalidation if required. 431 * 432 * The cache validation and caching of the new attributes is done 433 * atomically via the use of the mutex, r_statelock. If required, 434 * the cache invalidation is done atomically w.r.t. the cache 435 * validation and caching of the attributes via the pseudo lock, 436 * r_serial. 437 * 438 * This routine is used to do cache validation and attributes caching 439 * for operations with a single set of post operation attributes. 440 */ 441 442 void 443 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp, 444 hrtime_t t, cred_t *cr, int async, 445 change_info4 *cinfo) 446 { 447 rnode4_t *rp; 448 int mtime_changed = 0; 449 int ctime_changed = 0; 450 vsecattr_t *vsp; 451 int was_serial, set_time_cache_inval, recov; 452 vattr_t *vap = &garp->n4g_va; 453 mntinfo4_t *mi = VTOMI4(vp); 454 len_t preattr_rsize; 455 boolean_t writemodify_set = B_FALSE; 456 boolean_t cachepurge_set = B_FALSE; 457 458 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid); 459 460 /* Is curthread the recovery thread? 
*/ 461 mutex_enter(&mi->mi_lock); 462 recov = (VTOMI4(vp)->mi_recovthread == curthread); 463 mutex_exit(&mi->mi_lock); 464 465 rp = VTOR4(vp); 466 mutex_enter(&rp->r_statelock); 467 was_serial = (rp->r_serial == curthread); 468 if (rp->r_serial && !was_serial) { 469 klwp_t *lwp = ttolwp(curthread); 470 471 /* 472 * If we're the recovery thread, then purge current attrs 473 * and bail out to avoid potential deadlock between another 474 * thread caching attrs (r_serial thread), recov thread, 475 * and an async writer thread. 476 */ 477 if (recov) { 478 PURGE_ATTRCACHE4_LOCKED(rp); 479 mutex_exit(&rp->r_statelock); 480 return; 481 } 482 483 if (lwp != NULL) 484 lwp->lwp_nostop++; 485 while (rp->r_serial != NULL) { 486 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 487 mutex_exit(&rp->r_statelock); 488 if (lwp != NULL) 489 lwp->lwp_nostop--; 490 return; 491 } 492 } 493 if (lwp != NULL) 494 lwp->lwp_nostop--; 495 } 496 497 /* 498 * If there is a page flush thread, the current thread needs to 499 * bail out, to prevent a possible deadlock between the current 500 * thread (which might be in a start_op/end_op region), the 501 * recovery thread, and the page flush thread. Expire the 502 * attribute cache, so that any attributes the current thread was 503 * going to set are not lost. 504 */ 505 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) { 506 PURGE_ATTRCACHE4_LOCKED(rp); 507 mutex_exit(&rp->r_statelock); 508 return; 509 } 510 511 if (rp->r_time_attr_saved > t) { 512 /* 513 * Attributes have been cached since these attributes were 514 * probably made. If there is an inconsistency in what is 515 * cached, mark them invalid. If not, don't act on them. 516 */ 517 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size)) 518 PURGE_ATTRCACHE4_LOCKED(rp); 519 mutex_exit(&rp->r_statelock); 520 return; 521 } 522 set_time_cache_inval = 0; 523 if (cinfo) { 524 /* 525 * Only directory modifying callers pass non-NULL cinfo. 
526 */ 527 ASSERT(vp->v_type == VDIR); 528 /* 529 * If the cache timeout either doesn't exist or hasn't expired, 530 * and dir didn't changed on server before dirmod op 531 * and dir didn't change after dirmod op but before getattr 532 * then there's a chance that the client's cached data for 533 * this object is current (not stale). No immediate cache 534 * flush is required. 535 * 536 */ 537 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) && 538 cinfo->before == rp->r_change && 539 (garp->n4g_change_valid && 540 cinfo->after == garp->n4g_change)) { 541 542 /* 543 * If atomic isn't set, then the before/after info 544 * cannot be blindly trusted. For this case, we tell 545 * nfs4_attrcache_va to cache the attrs but also 546 * establish an absolute maximum cache timeout. When 547 * the timeout is reached, caches will be flushed. 548 */ 549 if (! cinfo->atomic) 550 set_time_cache_inval = 1; 551 } else { 552 553 /* 554 * We're not sure exactly what changed, but we know 555 * what to do. flush all caches for dir. remove the 556 * attr timeout. 557 * 558 * a) timeout expired. flush all caches. 559 * b) r_change != cinfo.before. flush all caches. 560 * c) r_change == cinfo.before, but cinfo.after != 561 * post-op getattr(change). flush all caches. 562 * d) post-op getattr(change) not provided by server. 563 * flush all caches. 564 */ 565 mtime_changed = 1; 566 ctime_changed = 1; 567 rp->r_time_cache_inval = 0; 568 } 569 } else { 570 /* 571 * Write thread after writing data to file on remote server, 572 * will always set R4WRITEMODIFIED to indicate that file on 573 * remote server was modified with a WRITE operation and would 574 * have marked attribute cache as timed out. If R4WRITEMODIFIED 575 * is set, then do not check for mtime and ctime change. 
576 */ 577 if (!(rp->r_flags & R4WRITEMODIFIED)) { 578 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size)) 579 mtime_changed = 1; 580 581 if (rp->r_attr.va_ctime.tv_sec != 582 vap->va_ctime.tv_sec || 583 rp->r_attr.va_ctime.tv_nsec != 584 vap->va_ctime.tv_nsec) 585 ctime_changed = 1; 586 } else { 587 writemodify_set = B_TRUE; 588 } 589 } 590 591 preattr_rsize = rp->r_size; 592 593 nfs4_attrcache_va(vp, garp, set_time_cache_inval); 594 595 /* 596 * If we have updated filesize in nfs4_attrcache_va, as soon as we 597 * drop statelock we will be in transition of purging all 598 * our caches and updating them. It is possible for another 599 * thread to pick this new file size and read in zeroed data. 600 * stall other threads till cache purge is complete. 601 */ 602 if ((!cinfo) && (rp->r_size != preattr_rsize)) { 603 /* 604 * If R4WRITEMODIFIED was set and we have updated the file 605 * size, Server's returned file size need not necessarily 606 * be because of this Client's WRITE. We need to purge 607 * all caches. 608 */ 609 if (writemodify_set) 610 mtime_changed = 1; 611 612 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) { 613 rp->r_flags |= R4INCACHEPURGE; 614 cachepurge_set = B_TRUE; 615 } 616 } 617 618 if (!mtime_changed && !ctime_changed) { 619 mutex_exit(&rp->r_statelock); 620 return; 621 } 622 623 rp->r_serial = curthread; 624 625 mutex_exit(&rp->r_statelock); 626 627 /* 628 * If we're the recov thread, then force async nfs4_purge_caches 629 * to avoid potential deadlock. 630 */ 631 if (mtime_changed) 632 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 
1 : async); 633 634 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) { 635 mutex_enter(&rp->r_statelock); 636 rp->r_flags &= ~R4INCACHEPURGE; 637 cv_broadcast(&rp->r_cv); 638 mutex_exit(&rp->r_statelock); 639 cachepurge_set = B_FALSE; 640 } 641 642 if (ctime_changed) { 643 (void) nfs4_access_purge_rp(rp); 644 if (rp->r_secattr != NULL) { 645 mutex_enter(&rp->r_statelock); 646 vsp = rp->r_secattr; 647 rp->r_secattr = NULL; 648 mutex_exit(&rp->r_statelock); 649 if (vsp != NULL) 650 nfs4_acl_free_cache(vsp); 651 } 652 } 653 654 if (!was_serial) { 655 mutex_enter(&rp->r_statelock); 656 rp->r_serial = NULL; 657 cv_broadcast(&rp->r_cv); 658 mutex_exit(&rp->r_statelock); 659 } 660 } 661 662 /* 663 * Set attributes cache for given vnode using virtual attributes. 664 * 665 * Set the timeout value on the attribute cache and fill it 666 * with the passed in attributes. 667 * 668 * The caller must be holding r_statelock. 669 */ 670 static void 671 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout) 672 { 673 rnode4_t *rp; 674 mntinfo4_t *mi; 675 hrtime_t delta; 676 hrtime_t now; 677 vattr_t *vap = &garp->n4g_va; 678 679 rp = VTOR4(vp); 680 681 ASSERT(MUTEX_HELD(&rp->r_statelock)); 682 ASSERT(vap->va_mask == AT_ALL); 683 684 /* Switch to master before checking v_flag */ 685 if (IS_SHADOW(vp, rp)) 686 vp = RTOV4(rp); 687 688 now = gethrtime(); 689 690 mi = VTOMI4(vp); 691 692 /* 693 * Only establish a new cache timeout (if requested). Never 694 * extend a timeout. Never clear a timeout. Clearing a timeout 695 * is done by nfs4_update_dircaches (ancestor in our call chain) 696 */ 697 if (set_cache_timeout && ! rp->r_time_cache_inval) 698 rp->r_time_cache_inval = now + mi->mi_acdirmax; 699 700 /* 701 * Delta is the number of nanoseconds that we will 702 * cache the attributes of the file. It is based on 703 * the number of nanoseconds since the last time that 704 * we detected a change. 
The assumption is that files 705 * that changed recently are likely to change again. 706 * There is a minimum and a maximum for regular files 707 * and for directories which is enforced though. 708 * 709 * Using the time since last change was detected 710 * eliminates direct comparison or calculation 711 * using mixed client and server times. NFS does 712 * not make any assumptions regarding the client 713 * and server clocks being synchronized. 714 */ 715 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec || 716 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec || 717 vap->va_size != rp->r_attr.va_size) { 718 rp->r_time_attr_saved = now; 719 } 720 721 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE)) 722 delta = 0; 723 else { 724 delta = now - rp->r_time_attr_saved; 725 if (vp->v_type == VDIR) { 726 if (delta < mi->mi_acdirmin) 727 delta = mi->mi_acdirmin; 728 else if (delta > mi->mi_acdirmax) 729 delta = mi->mi_acdirmax; 730 } else { 731 if (delta < mi->mi_acregmin) 732 delta = mi->mi_acregmin; 733 else if (delta > mi->mi_acregmax) 734 delta = mi->mi_acregmax; 735 } 736 } 737 rp->r_time_attr_inval = now + delta; 738 739 rp->r_attr = *vap; 740 if (garp->n4g_change_valid) 741 rp->r_change = garp->n4g_change; 742 743 /* 744 * The attributes that were returned may be valid and can 745 * be used, but they may not be allowed to be cached. 746 * Reset the timers to cause immediate invalidation and 747 * clear r_change so no VERIFY operations will suceed 748 */ 749 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) { 750 rp->r_time_attr_inval = now; 751 rp->r_time_attr_saved = now; 752 rp->r_change = 0; 753 } 754 755 /* 756 * If mounted_on_fileid returned AND the object is a stub, 757 * then set object's va_nodeid to the mounted over fid 758 * returned by server. 759 * 760 * If mounted_on_fileid not provided/supported, then 761 * just set it to 0 for now. Eventually it would be 762 * better to set it to a hashed version of FH. 
This 763 * would probably be good enough to provide a unique 764 * fid/d_ino within a dir. 765 * 766 * We don't need to carry mounted_on_fileid in the 767 * rnode as long as the client never requests fileid 768 * without also requesting mounted_on_fileid. For 769 * now, it stays. 770 */ 771 if (garp->n4g_mon_fid_valid) { 772 rp->r_mntd_fid = garp->n4g_mon_fid; 773 774 if (RP_ISSTUB(rp)) 775 rp->r_attr.va_nodeid = rp->r_mntd_fid; 776 } 777 778 /* 779 * Check to see if there are valid pathconf bits to 780 * cache in the rnode. 781 */ 782 if (garp->n4g_ext_res) { 783 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) { 784 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4; 785 } else { 786 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) { 787 rp->r_pathconf.pc4_xattr_valid = TRUE; 788 rp->r_pathconf.pc4_xattr_exists = 789 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists; 790 } 791 } 792 } 793 /* 794 * Update the size of the file if there is no cached data or if 795 * the cached data is clean and there is no data being written 796 * out. 797 */ 798 if (rp->r_size != vap->va_size && 799 (!vn_has_cached_data(vp) || 800 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) { 801 rp->r_size = vap->va_size; 802 } 803 nfs_setswaplike(vp, vap); 804 rp->r_flags &= ~R4WRITEMODIFIED; 805 } 806 807 /* 808 * Get attributes over-the-wire and update attributes cache 809 * if no error occurred in the over-the-wire operation. 810 * Return 0 if successful, otherwise error. 
811 */ 812 int 813 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl) 814 { 815 mntinfo4_t *mi = VTOMI4(vp); 816 hrtime_t t; 817 nfs4_recov_state_t recov_state; 818 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 819 820 recov_state.rs_flags = 0; 821 recov_state.rs_num_retry_despite_err = 0; 822 823 /* Save the original mount point security flavor */ 824 (void) save_mnt_secinfo(mi->mi_curr_serv); 825 826 recov_retry: 827 828 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, 829 &recov_state, NULL))) { 830 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 831 return (e.error); 832 } 833 834 t = gethrtime(); 835 836 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl); 837 838 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) { 839 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 840 NULL, OP_GETATTR, NULL) == FALSE) { 841 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, 842 &recov_state, 1); 843 goto recov_retry; 844 } 845 } 846 847 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0); 848 849 if (!e.error) { 850 if (e.stat == NFS4_OK) { 851 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 852 } else { 853 e.error = geterrno4(e.stat); 854 855 nfs4_purge_stale_fh(e.error, vp, cr); 856 } 857 } 858 859 /* 860 * If getattr a node that is a stub for a crossed 861 * mount point, keep the original secinfo flavor for 862 * the current file system, not the crossed one. 863 */ 864 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 865 866 return (e.error); 867 } 868 869 /* 870 * Generate a compound to get attributes over-the-wire. 
871 */ 872 void 873 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp, 874 nfs4_error_t *ep, cred_t *cr, int get_acl) 875 { 876 COMPOUND4args_clnt args; 877 COMPOUND4res_clnt res; 878 int doqueue; 879 rnode4_t *rp = VTOR4(vp); 880 nfs_argop4 argop[2]; 881 882 args.ctag = TAG_GETATTR; 883 884 args.array_len = 2; 885 args.array = argop; 886 887 /* putfh */ 888 argop[0].argop = OP_CPUTFH; 889 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 890 891 /* getattr */ 892 /* 893 * Unlike nfs version 2 and 3, where getattr returns all the 894 * attributes, nfs version 4 returns only the ones explicitly 895 * asked for. This creates problems, as some system functions 896 * (e.g. cache check) require certain attributes and if the 897 * cached node lacks some attributes such as uid/gid, it can 898 * affect system utilities (e.g. "ls") that rely on the information 899 * to be there. This can lead to anything from system crashes to 900 * corrupted information processed by user apps. 901 * So to ensure that all bases are covered, request at least 902 * the AT_ALL attribute mask. 903 */ 904 argop[1].argop = OP_GETATTR; 905 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 906 if (get_acl) 907 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK; 908 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 909 910 doqueue = 1; 911 912 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep); 913 914 if (ep->error) 915 return; 916 917 if (res.status != NFS4_OK) { 918 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 919 return; 920 } 921 922 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res; 923 924 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 925 } 926 927 /* 928 * Return either cached or remote attributes. If get remote attr 929 * use them to check and invalidate caches, then cache the new attributes. 
930 */ 931 int 932 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr) 933 { 934 int error; 935 rnode4_t *rp; 936 nfs4_ga_res_t gar; 937 938 ASSERT(nfs4_consistent_type(vp)); 939 940 /* 941 * If we've got cached attributes, we're done, otherwise go 942 * to the server to get attributes, which will update the cache 943 * in the process. Either way, use the cached attributes for 944 * the caller's vattr_t. 945 * 946 * Note that we ignore the gar set by the OTW call: the attr caching 947 * code may make adjustments when storing to the rnode, and we want 948 * to see those changes here. 949 */ 950 rp = VTOR4(vp); 951 error = 0; 952 mutex_enter(&rp->r_statelock); 953 if (!ATTRCACHE4_VALID(vp)) { 954 mutex_exit(&rp->r_statelock); 955 error = nfs4_getattr_otw(vp, &gar, cr, 0); 956 mutex_enter(&rp->r_statelock); 957 } 958 959 if (!error) 960 *vap = rp->r_attr; 961 962 /* Return the client's view of file size */ 963 vap->va_size = rp->r_size; 964 965 mutex_exit(&rp->r_statelock); 966 967 ASSERT(nfs4_consistent_type(vp)); 968 969 return (error); 970 } 971 972 int 973 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type, 974 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr) 975 { 976 COMPOUND4args_clnt args; 977 COMPOUND4res_clnt res; 978 int doqueue; 979 nfs_argop4 argop[2]; 980 mntinfo4_t *mi = VTOMI4(vp); 981 bool_t needrecov = FALSE; 982 nfs4_recov_state_t recov_state; 983 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 984 nfs4_ga_ext_res_t *gerp; 985 986 recov_state.rs_flags = 0; 987 recov_state.rs_num_retry_despite_err = 0; 988 989 recov_retry: 990 args.ctag = tag_type; 991 992 args.array_len = 2; 993 args.array = argop; 994 995 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL); 996 if (e.error) 997 return (e.error); 998 999 /* putfh */ 1000 argop[0].argop = OP_CPUTFH; 1001 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 1002 1003 /* getattr */ 1004 argop[1].argop = OP_GETATTR; 1005 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap; 1006 
argop[1].nfs_argop4_u.opgetattr.mi = mi; 1007 1008 doqueue = 1; 1009 1010 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1011 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first", 1012 rnode4info(VTOR4(vp)))); 1013 1014 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 1015 1016 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 1017 if (!needrecov && e.error) { 1018 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1019 needrecov); 1020 return (e.error); 1021 } 1022 1023 if (needrecov) { 1024 bool_t abort; 1025 1026 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1027 "nfs4_attr_otw: initiating recovery\n")); 1028 1029 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 1030 NULL, OP_GETATTR, NULL); 1031 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1032 needrecov); 1033 if (!e.error) { 1034 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1035 e.error = geterrno4(res.status); 1036 } 1037 if (abort == FALSE) 1038 goto recov_retry; 1039 return (e.error); 1040 } 1041 1042 if (res.status) { 1043 e.error = geterrno4(res.status); 1044 } else { 1045 gerp = garp->n4g_ext_res; 1046 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res, 1047 garp, sizeof (nfs4_ga_res_t)); 1048 garp->n4g_ext_res = gerp; 1049 if (garp->n4g_ext_res && 1050 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res) 1051 bcopy(res.array[1].nfs_resop4_u.opgetattr. 1052 ga_res.n4g_ext_res, 1053 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t)); 1054 } 1055 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1056 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1057 needrecov); 1058 return (e.error); 1059 } 1060 1061 /* 1062 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1063 * for the demand-based allocation of async threads per-mount. The 1064 * nfs_async_timeout is the amount of time a thread will live after it 1065 * becomes idle, unless new I/O requests are received before the thread 1066 * dies. 
See nfs4_async_putpage and nfs4_async_start. 1067 */ 1068 1069 static void nfs4_async_start(struct vfs *); 1070 1071 static void 1072 free_async_args4(struct nfs4_async_reqs *args) 1073 { 1074 rnode4_t *rp; 1075 1076 if (args->a_io != NFS4_INACTIVE) { 1077 rp = VTOR4(args->a_vp); 1078 mutex_enter(&rp->r_statelock); 1079 rp->r_count--; 1080 if (args->a_io == NFS4_PUTAPAGE || 1081 args->a_io == NFS4_PAGEIO) 1082 rp->r_awcount--; 1083 cv_broadcast(&rp->r_cv); 1084 mutex_exit(&rp->r_statelock); 1085 VN_RELE(args->a_vp); 1086 } 1087 crfree(args->a_cred); 1088 kmem_free(args, sizeof (*args)); 1089 } 1090 1091 /* 1092 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and 1093 * pageout(), running in the global zone, have legitimate reasons to do 1094 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by 1095 * use of a a per-mount "asynchronous requests manager thread" which is 1096 * signaled by the various asynchronous work routines when there is 1097 * asynchronous work to be done. It is responsible for creating new 1098 * worker threads if necessary, and notifying existing worker threads 1099 * that there is work to be done. 1100 * 1101 * In other words, it will "take the specifications from the customers and 1102 * give them to the engineers." 1103 * 1104 * Worker threads die off of their own accord if they are no longer 1105 * needed. 1106 * 1107 * This thread is killed when the zone is going away or the filesystem 1108 * is being unmounted. 
 */
void
nfs4_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo4_t *mi;
	uint_t max_threads;

	mi = VFTOMI4(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount is really going away.
	 *
	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	for (;;) {
		/*
		 * mi_async_req_count is incremented by the queueing routines
		 * for each request they hand to us; each pass through this
		 * loop consumes one unit of it.
		 */
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value.  This means that future
			 * (debugger-induced) alterations of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads <
			    MAX(mi->mi_max_threads, max_threads)) {
				/*
				 * Count the worker before dropping
				 * mi_async_lock; the lock is not held
				 * across the thread creation.
				 */
				mi->mi_threads++;
				mutex_exit(&mi->mi_async_lock);
				/*
				 * Both holds are released by the worker
				 * (nfs4_async_start) when it exits.
				 */
				MI4_HOLD(mi);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0, nfs4_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			/* Wake one worker waiting in nfs4_async_start(). */
			cv_signal(&mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}

		/*
		 * The stop flag is examined under mi_lock, the same lock
		 * nfs4_async_manager_stop() holds while setting it.
		 */
		mutex_enter(&mi->mi_lock);
		if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
			mutex_exit(&mi->mi_lock);
			break;
		}
		mutex_exit(&mi->mi_lock);

		/*
		 * Sleep until a queueing routine or
		 * nfs4_async_manager_stop() signals mi_async_reqs_cv.
		 */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
	}

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	/*
	 * Wake up the inactive thread.
	 */
	cv_broadcast(&mi->mi_inact_req_cv);
	/*
	 * Wake up anyone sitting in nfs4_async_manager_stop()
	 */
	cv_broadcast(&mi->mi_async_cv);
	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);	/* release thread's hold */
	MI4_RELE(mi);
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
1222 */ 1223 void 1224 nfs4_async_manager_stop(vfs_t *vfsp) 1225 { 1226 mntinfo4_t *mi = VFTOMI4(vfsp); 1227 1228 mutex_enter(&mi->mi_async_lock); 1229 mutex_enter(&mi->mi_lock); 1230 mi->mi_flags |= MI4_ASYNC_MGR_STOP; 1231 mutex_exit(&mi->mi_lock); 1232 cv_broadcast(&mi->mi_async_reqs_cv); 1233 /* 1234 * Wait for the async manager thread to die. 1235 */ 1236 while (mi->mi_manager_thread != NULL) 1237 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1238 mutex_exit(&mi->mi_async_lock); 1239 } 1240 1241 int 1242 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1243 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1244 u_offset_t, caddr_t, struct seg *, cred_t *)) 1245 { 1246 rnode4_t *rp; 1247 mntinfo4_t *mi; 1248 struct nfs4_async_reqs *args; 1249 1250 rp = VTOR4(vp); 1251 ASSERT(rp->r_freef == NULL); 1252 1253 mi = VTOMI4(vp); 1254 1255 /* 1256 * If addr falls in a different segment, don't bother doing readahead. 1257 */ 1258 if (addr >= seg->s_base + seg->s_size) 1259 return (-1); 1260 1261 /* 1262 * If we can't allocate a request structure, punt on the readahead. 1263 */ 1264 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1265 return (-1); 1266 1267 /* 1268 * If a lock operation is pending, don't initiate any new 1269 * readaheads. Otherwise, bump r_count to indicate the new 1270 * asynchronous I/O. 
1271 */ 1272 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1273 kmem_free(args, sizeof (*args)); 1274 return (-1); 1275 } 1276 mutex_enter(&rp->r_statelock); 1277 rp->r_count++; 1278 mutex_exit(&rp->r_statelock); 1279 nfs_rw_exit(&rp->r_lkserlock); 1280 1281 args->a_next = NULL; 1282 #ifdef DEBUG 1283 args->a_queuer = curthread; 1284 #endif 1285 VN_HOLD(vp); 1286 args->a_vp = vp; 1287 ASSERT(cr != NULL); 1288 crhold(cr); 1289 args->a_cred = cr; 1290 args->a_io = NFS4_READ_AHEAD; 1291 args->a_nfs4_readahead = readahead; 1292 args->a_nfs4_blkoff = blkoff; 1293 args->a_nfs4_seg = seg; 1294 args->a_nfs4_addr = addr; 1295 1296 mutex_enter(&mi->mi_async_lock); 1297 1298 /* 1299 * If asyncio has been disabled, don't bother readahead. 1300 */ 1301 if (mi->mi_max_threads == 0) { 1302 mutex_exit(&mi->mi_async_lock); 1303 goto noasync; 1304 } 1305 1306 /* 1307 * Link request structure into the async list and 1308 * wakeup async thread to do the i/o. 1309 */ 1310 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) { 1311 mi->mi_async_reqs[NFS4_READ_AHEAD] = args; 1312 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1313 } else { 1314 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args; 1315 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1316 } 1317 1318 if (mi->mi_io_kstats) { 1319 mutex_enter(&mi->mi_lock); 1320 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1321 mutex_exit(&mi->mi_lock); 1322 } 1323 1324 mi->mi_async_req_count++; 1325 ASSERT(mi->mi_async_req_count != 0); 1326 cv_signal(&mi->mi_async_reqs_cv); 1327 mutex_exit(&mi->mi_async_lock); 1328 return (0); 1329 1330 noasync: 1331 mutex_enter(&rp->r_statelock); 1332 rp->r_count--; 1333 cv_broadcast(&rp->r_cv); 1334 mutex_exit(&rp->r_statelock); 1335 VN_RELE(vp); 1336 crfree(cr); 1337 kmem_free(args, sizeof (*args)); 1338 return (-1); 1339 } 1340 1341 /* 1342 * The async queues for each mounted file system are arranged as a 1343 * set of queues, one for each async i/o type. 
Requests are taken
 * from the queues in a round-robin fashion.  A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.  This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs4_async_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system.  Specifically over the
 * wire calls are cpr-unsafe.  The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
static void
nfs4_async_start(struct vfs *vfsp)
{
	struct nfs4_async_reqs *args;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	extern int nfs_async_timeout;

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
			args = *mi->mi_async_curr;
			if (args != NULL)
				break;
			mi->mi_async_curr++;
			/* wrap around past the last queue */
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				if (--mi->mi_threads == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				MI4_RELE(mi);
				zthread_exit();
				/* NOTREACHED */
			}
			/*
			 * time_left is examined on the next pass through
			 * the loop, after the queues have been rescanned.
			 */
			time_left = cv_reltimedwait(&mi->mi_async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout,
			    TR_CLOCK_TICK);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		} else {
			time_left = 1;
		}

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr = args->a_next;
		if (*mi->mi_async_curr == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}

		/*
		 * NFS4_INACTIVE requests are excluded here; nothing in
		 * nfs4_async_inactive() does a kstat_waitq_enter() for them.
		 */
		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 *
		 * A readahead request is only carried out while
		 * mi_max_threads is still non-zero; otherwise it matches
		 * none of the branches below and is just freed.
		 */
		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs4_readahead)(args->a_vp,
			    args->a_nfs4_blkoff, args->a_nfs4_addr,
			    args->a_nfs4_seg, args->a_cred);
		} else if (args->a_io == NFS4_PUTAPAGE) {
			(void) (*args->a_nfs4_putapage)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_PAGEIO) {
			(void) (*args->a_nfs4_pageio)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_READDIR) {
			(void) ((*args->a_nfs4_readdir)(args->a_vp,
			    args->a_nfs4_rdc, args->a_cred));
		} else if (args->a_io == NFS4_COMMIT) {
			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
			    args->a_nfs4_offset, args->a_nfs4_count,
			    args->a_cred);
		} else if (args->a_io == NFS4_INACTIVE) {
			nfs4_inactive_otw(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args4(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}

/*
 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
 * part of VOP_INACTIVE.
 *
 * Requests are taken from the head of the NFS4_INACTIVE queue one at a
 * time and passed to nfs4_inactive_otw().  While the queue is empty the
 * thread marks itself MI4_INACTIVE_IDLE and sleeps on mi_inact_req_cv.
 */

void
nfs4_inactive_thread(mntinfo4_t *mi)
{
	struct nfs4_async_reqs *args;
	callb_cpr_t cprinfo;
	vfs_t *vfsp = mi->mi_vfsp;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_inactive_thread");

	for (;;) {
		mutex_enter(&mi->mi_async_lock);
		args = mi->mi_async_reqs[NFS4_INACTIVE];
		if (args == NULL) {
			mutex_enter(&mi->mi_lock);
			/*
			 * We don't want to exit until the async manager is done
			 * with its work; hence the check for mi_manager_thread
			 * being NULL.
			 *
			 * The async manager thread will cv_broadcast() on
			 * mi_inact_req_cv when it's done, at which point we'll
			 * wake up and exit.
			 */
			if (mi->mi_manager_thread == NULL)
				goto die;
			mi->mi_flags |= MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			/*
			 * nfs4_async_stop() and nfs4_async_stop_sig() sleep
			 * on mi_async_cv waiting for us to go idle.
			 */
			cv_signal(&mi->mi_async_cv);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
			mutex_exit(&mi->mi_async_lock);
		} else {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
			mutex_exit(&mi->mi_async_lock);
			nfs4_inactive_otw(args->a_vp, args->a_cred);
			crfree(args->a_cred);
			kmem_free(args, sizeof (*args));
		}
	}
die:
	/* Reached with both mi_async_lock and mi_lock held. */
	mutex_exit(&mi->mi_lock);
	mi->mi_inactive_thread = NULL;
	cv_signal(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));

	MI4_RELE(mi);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * nfs4_async_stop:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete; nfs4_async_stop_sig() without interruptibility.
 *
 * mi_max_threads is set to zero and is not restored by this routine.
 */
void
nfs4_async_stop(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			/* mi_lock is not held across the wait */
			mutex_exit(&mi->mi_lock);
			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs4_async_stop_sig:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete. If a signal is delivered we will abort and return non-zero;
 * otherwise return 0. Since this routine is called from nfs4_unmount, we
 * need to make it interruptible.
 *
 * When interrupted, mi_max_threads is restored to the value it had on
 * entry; otherwise it is left at zero.
 */
int
nfs4_async_stop_sig(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	ushort_t omax;
	bool_t intr = FALSE;

	/*
	 * Wait for all outstanding putpage operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
			intr = TRUE;
			goto interrupted;
		}
	}

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			/* only mi_async_lock is held if we bail out here */
			if (!cv_wait_sig(&mi->mi_async_cv,
			    &mi->mi_async_lock)) {
				intr = TRUE;
				goto interrupted;
			}
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
interrupted:
	if (intr)
		mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);

	return (intr);
}

/*
 * Queue an asynchronous putapage request for the pages at pp; a worker
 * thread later calls the supplied putapage routine with (vp, pp, off, len,
 * flags, cred).
 *
 * Returns 0 if the request was queued.  If it could not be queued (no
 * memory for the request, or async i/o disabled) nothing is written: the
 * pages are passed to pvn_write_done() with B_ERROR and the return value
 * is 0 when called from pageout, fsflush or the mount's own zone, and
 * EPERM otherwise.
 */
int
nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PUTAPAGE;
	args->a_nfs4_putapage = putapage;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = off;
	args->a_nfs4_len = (uint_t)len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	}

	/* Both counts are dropped again by free_async_args4(). */
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() == mi->mi_zone) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * or we have run out of memory or we're attempting to
		 * unmount we refuse to do a sync write, because this may
		 * hang pageout/fsflush and the machine. In this case,
		 * we just re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	/*
	 * We'll get here only if (nfs_zone() != mi->mi_zone)
	 * which means that this was a cross-zone sync putpage.
	 *
	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
	 * as dirty and unlock them.
	 *
	 * We don't want to clear B_FORCE here as the caller presumably
	 * knows what they're doing if they set it.
	 */
	pvn_write_done(pp, flags | B_ERROR);
	return (EPERM);
}

/*
 * Queue an asynchronous pageio request; a worker thread later calls the
 * supplied pageio routine with (vp, pp, io_off, io_len, flags, cred).
 *
 * Returns 0 if the request was queued.  If it could not be queued:
 *	- a read (B_READ) is abandoned via pvn_read_done() with B_ERROR
 *	  and 0 is returned;
 *	- a write from pageout or fsflush is handed back via
 *	  pvn_write_done() with B_ERROR and 0 is returned;
 *	- a write from a zone other than the mount's is handed back the
 *	  same way and EPERM is returned;
 *	- any other write is done synchronously by calling pageio directly,
 *	  and its return value is returned.
 */
int
nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PAGEIO;
	args->a_nfs4_pageio = pageio;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = io_off;
	args->a_nfs4_len = (uint_t)io_len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS4_PAGEIO] = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	}

	/* Both counts are dropped again by free_async_args4(). */
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine). In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
}

/*
 * Queue an asynchronous readdir to fill the cache entry rdc; a worker
 * thread later calls the supplied readdir routine with (vp, rdc, cred).
 *
 * If the request cannot be queued the readdir is not performed at all:
 * under r_statelock, rdc->entries is cleared, RDDIR is cleared and
 * RDDIRREQ is set in rdc->flags, and rddir4_cache_rele() is called on the
 * entry.
 */
void
nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
    int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, skip the readdir.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READDIR;
	args->a_nfs4_readdir = readdir;
	args->a_nfs4_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then skip this request
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
		mi->mi_async_reqs[NFS4_READDIR] = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	}

	/* Dropped again by free_async_args4(). */
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	mutex_enter(&rp->r_statelock);
	rdc->entries = NULL;
	/*
	 * Indicate that no one is trying to fill this entry and
	 * it still needs to be filled.
	 */
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
}

/*
 * Queue an asynchronous commit of the pages on plist; a worker thread
 * later calls the supplied commit routine with (vp, plist, offset, count,
 * cred).
 *
 * If the request cannot be queued, the commit is done synchronously in
 * the caller's context, except when called from pageout, fsflush or a
 * zone other than the mount's: in that case every page is taken off
 * plist, has p_fsdata set to C_COMMIT and is unlocked.
 */
void
nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
    cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;
	page_t *pp;

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_COMMIT;
	args->a_nfs4_commit = commit;
	args->a_nfs4_plist = plist;
	args->a_nfs4_offset = offset;
	args->a_nfs4_count = count;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
		mi->mi_async_reqs[NFS4_COMMIT] = args;
		mi->mi_async_tail[NFS4_COMMIT] = args;
	} else {
		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
		mi->mi_async_tail[NFS4_COMMIT] = args;
	}

	/* Dropped again by free_async_args4(). */
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != mi->mi_zone) {
		/*
		 * No synchronous commit from these contexts; tag each
		 * page with C_COMMIT and let go of it instead.
		 */
		while (plist != NULL) {
			pp = plist;
			page_sub(&plist, pp);
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
		}
		return;
	}
	(*commit)(vp, plist, offset, count, cr);
}

/*
 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
 * reference to the vnode is handed over to the thread; the caller should
 * no longer refer to the vnode.
 *
 * Unlike most of the async routines, this handoff is needed for
 * correctness reasons, not just performance.  So doing operations in the
 * context of the current thread is not an option.
 */
void
nfs4_async_inactive(vnode_t *vp, cred_t *cr)
{
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;
	boolean_t signal_inactive_thread = B_FALSE;

	mi = VTOMI4(vp);

	/*
	 * KM_SLEEP: unlike the other queueing routines there is no
	 * fallback for a failed allocation here.
	 */
	args = kmem_alloc(sizeof (*args), KM_SLEEP);
	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	/* no VN_HOLD: the caller's reference travels with the request */
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_INACTIVE;

	/*
	 * Note that we don't check mi->mi_max_threads here, since we
	 * *need* to get rid of this vnode regardless of whether someone
	 * set nfs4_max_threads to zero in /etc/system.
	 *
	 * The manager thread knows about this and is willing to create
	 * at least one thread to accommodate us.
	 */
	mutex_enter(&mi->mi_async_lock);
	if (mi->mi_inactive_thread == NULL) {
		rnode4_t *rp;
		vnode_t *unldvp = NULL;
		/* unlname and unlcred are only set when unldvp is non-NULL */
		char *unlname;
		cred_t *unlcred;

		mutex_exit(&mi->mi_async_lock);
		/*
		 * We just need to free up the memory associated with the
		 * vnode, which can be safely done from within the current
		 * context.
		 */
		crfree(cr);	/* drop our reference */
		kmem_free(args, sizeof (*args));
		rp = VTOR4(vp);
		/*
		 * Detach any r_unldvp/r_unlname/r_unlcred from the rnode
		 * under r_statelock; they are released further down.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
		}
		mutex_exit(&rp->r_statelock);
		/*
		 * No need to explicitly throw away any cached pages.  The
		 * eventual r4inactive() will attempt a synchronous
		 * VOP_PUTPAGE() which will immediately fail since the request
		 * is coming from the wrong zone, and then will proceed to call
		 * nfs4_invalidate_pages() which will clean things up for us.
		 *
		 * Throw away the delegation here so rp4_addfree()'s attempt to
		 * return any existing delegations becomes a no-op.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
			    FALSE);
			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
			nfs_rw_exit(&mi->mi_recovlock);
		}
		nfs4_clear_open_streams(rp);

		rp4_addfree(rp, cr);
		if (unldvp != NULL) {
			kmem_free(unlname, MAXNAMELEN);
			VN_RELE(unldvp);
			crfree(unlcred);
		}
		return;
	}

	if (mi->mi_manager_thread == NULL) {
		/*
		 * We want to talk to the inactive thread.
		 */
		signal_inactive_thread = B_TRUE;
	}

	/*
	 * Enqueue the vnode and wake up either the special thread (empty
	 * list) or an async thread.
	 */
	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
		mi->mi_async_reqs[NFS4_INACTIVE] = args;
		mi->mi_async_tail[NFS4_INACTIVE] = args;
		signal_inactive_thread = B_TRUE;
	} else {
		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
		mi->mi_async_tail[NFS4_INACTIVE] = args;
	}
	if (signal_inactive_thread) {
		cv_signal(&mi->mi_inact_req_cv);
	} else {
		/* let the manager thread dispatch it to a worker */
		mi->mi_async_req_count++;
		ASSERT(mi->mi_async_req_count != 0);
		cv_signal(&mi->mi_async_reqs_cv);
	}

	mutex_exit(&mi->mi_async_lock);
}

int
writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
{
	int pagecreate;
	int n;
	int saved_n;
	caddr_t saved_base;
	u_offset_t offset;
	int error;
	int sm_error;
	vnode_t *vp = RTOV(rp);

	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
	if (!vpm_enable) {
		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
	}

	/*
	 * Move bytes in at most PAGESIZE chunks.
We must avoid 2231 * spanning pages in uiomove() because page faults may cause 2232 * the cache to be invalidated out from under us. The r_size is not 2233 * updated until after the uiomove. If we push the last page of a 2234 * file before r_size is correct, we will lose the data written past 2235 * the current (and invalid) r_size. 2236 */ 2237 do { 2238 offset = uio->uio_loffset; 2239 pagecreate = 0; 2240 2241 /* 2242 * n is the number of bytes required to satisfy the request 2243 * or the number of bytes to fill out the page. 2244 */ 2245 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2246 2247 /* 2248 * Check to see if we can skip reading in the page 2249 * and just allocate the memory. We can do this 2250 * if we are going to rewrite the entire mapping 2251 * or if we are going to write to or beyond the current 2252 * end of file from the beginning of the mapping. 2253 * 2254 * The read of r_size is now protected by r_statelock. 2255 */ 2256 mutex_enter(&rp->r_statelock); 2257 /* 2258 * When pgcreated is nonzero the caller has already done 2259 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2260 * segkpm this means we already have at least one page 2261 * created and mapped at base. 2262 */ 2263 pagecreate = pgcreated || 2264 ((offset & PAGEOFFSET) == 0 && 2265 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2266 2267 mutex_exit(&rp->r_statelock); 2268 2269 if (!vpm_enable && pagecreate) { 2270 /* 2271 * The last argument tells segmap_pagecreate() to 2272 * always lock the page, as opposed to sometimes 2273 * returning with the page locked. This way we avoid a 2274 * fault on the ensuing uiomove(), but also 2275 * more importantly (to fix bug 1094402) we can 2276 * call segmap_fault() to unlock the page in all 2277 * cases. An alternative would be to modify 2278 * segmap_pagecreate() to tell us when it is 2279 * locking a page, but that's a fairly major 2280 * interface change. 
2281 */ 2282 if (pgcreated == 0) 2283 (void) segmap_pagecreate(segkmap, base, 2284 (uint_t)n, 1); 2285 saved_base = base; 2286 saved_n = n; 2287 } 2288 2289 /* 2290 * The number of bytes of data in the last page can not 2291 * be accurately be determined while page is being 2292 * uiomove'd to and the size of the file being updated. 2293 * Thus, inform threads which need to know accurately 2294 * how much data is in the last page of the file. They 2295 * will not do the i/o immediately, but will arrange for 2296 * the i/o to happen later when this modify operation 2297 * will have finished. 2298 */ 2299 ASSERT(!(rp->r_flags & R4MODINPROGRESS)); 2300 mutex_enter(&rp->r_statelock); 2301 rp->r_flags |= R4MODINPROGRESS; 2302 rp->r_modaddr = (offset & MAXBMASK); 2303 mutex_exit(&rp->r_statelock); 2304 2305 if (vpm_enable) { 2306 /* 2307 * Copy data. If new pages are created, part of 2308 * the page that is not written will be initizliazed 2309 * with zeros. 2310 */ 2311 error = vpm_data_copy(vp, offset, n, uio, 2312 !pagecreate, NULL, 0, S_WRITE); 2313 } else { 2314 error = uiomove(base, n, UIO_WRITE, uio); 2315 } 2316 2317 /* 2318 * r_size is the maximum number of 2319 * bytes known to be in the file. 2320 * Make sure it is at least as high as the 2321 * first unwritten byte pointed to by uio_loffset. 2322 */ 2323 mutex_enter(&rp->r_statelock); 2324 if (rp->r_size < uio->uio_loffset) 2325 rp->r_size = uio->uio_loffset; 2326 rp->r_flags &= ~R4MODINPROGRESS; 2327 rp->r_flags |= R4DIRTY; 2328 mutex_exit(&rp->r_statelock); 2329 2330 /* n = # of bytes written */ 2331 n = (int)(uio->uio_loffset - offset); 2332 2333 if (!vpm_enable) { 2334 base += n; 2335 } 2336 2337 tcount -= n; 2338 /* 2339 * If we created pages w/o initializing them completely, 2340 * we need to zero the part that wasn't set up. 2341 * This happens on a most EOF write cases and if 2342 * we had some sort of error during the uiomove. 
2343 */ 2344 if (!vpm_enable && pagecreate) { 2345 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2346 (void) kzero(base, PAGESIZE - n); 2347 2348 if (pgcreated) { 2349 /* 2350 * Caller is responsible for this page, 2351 * it was not created in this loop. 2352 */ 2353 pgcreated = 0; 2354 } else { 2355 /* 2356 * For bug 1094402: segmap_pagecreate locks 2357 * page. Unlock it. This also unlocks the 2358 * pages allocated by page_create_va() in 2359 * segmap_pagecreate(). 2360 */ 2361 sm_error = segmap_fault(kas.a_hat, segkmap, 2362 saved_base, saved_n, 2363 F_SOFTUNLOCK, S_WRITE); 2364 if (error == 0) 2365 error = sm_error; 2366 } 2367 } 2368 } while (tcount > 0 && error == 0); 2369 2370 return (error); 2371 } 2372 2373 int 2374 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2375 { 2376 rnode4_t *rp; 2377 page_t *pp; 2378 u_offset_t eoff; 2379 u_offset_t io_off; 2380 size_t io_len; 2381 int error; 2382 int rdirty; 2383 int err; 2384 2385 rp = VTOR4(vp); 2386 ASSERT(rp->r_count > 0); 2387 2388 if (!nfs4_has_pages(vp)) 2389 return (0); 2390 2391 ASSERT(vp->v_type != VCHR); 2392 2393 /* 2394 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2395 * writes. B_FORCE is set to force the VM system to actually 2396 * invalidate the pages, even if the i/o failed. The pages 2397 * need to get invalidated because they can't be written out 2398 * because there isn't any space left on either the server's 2399 * file system or in the user's disk quota. The B_FREE bit 2400 * is cleared to avoid confusion as to whether this is a 2401 * request to place the page on the freelist or to destroy 2402 * it. 2403 */ 2404 if ((rp->r_flags & R4OUTOFSPACE) || 2405 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2406 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2407 2408 if (len == 0) { 2409 /* 2410 * If doing a full file synchronous operation, then clear 2411 * the R4DIRTY bit. 
If a page gets dirtied while the flush 2412 * is happening, then R4DIRTY will get set again. The 2413 * R4DIRTY bit must get cleared before the flush so that 2414 * we don't lose this information. 2415 * 2416 * If there are no full file async write operations 2417 * pending and RDIRTY bit is set, clear it. 2418 */ 2419 if (off == (u_offset_t)0 && 2420 !(flags & B_ASYNC) && 2421 (rp->r_flags & R4DIRTY)) { 2422 mutex_enter(&rp->r_statelock); 2423 rdirty = (rp->r_flags & R4DIRTY); 2424 rp->r_flags &= ~R4DIRTY; 2425 mutex_exit(&rp->r_statelock); 2426 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2427 mutex_enter(&rp->r_statelock); 2428 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2429 rdirty = (rp->r_flags & R4DIRTY); 2430 rp->r_flags &= ~R4DIRTY; 2431 } 2432 mutex_exit(&rp->r_statelock); 2433 } else 2434 rdirty = 0; 2435 2436 /* 2437 * Search the entire vp list for pages >= off, and flush 2438 * the dirty pages. 2439 */ 2440 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2441 flags, cr); 2442 2443 /* 2444 * If an error occurred and the file was marked as dirty 2445 * before and we aren't forcibly invalidating pages, then 2446 * reset the R4DIRTY flag. 2447 */ 2448 if (error && rdirty && 2449 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2450 mutex_enter(&rp->r_statelock); 2451 rp->r_flags |= R4DIRTY; 2452 mutex_exit(&rp->r_statelock); 2453 } 2454 } else { 2455 /* 2456 * Do a range from [off...off + len) looking for pages 2457 * to deal with. 2458 */ 2459 error = 0; 2460 io_len = 0; 2461 eoff = off + len; 2462 mutex_enter(&rp->r_statelock); 2463 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2464 io_off += io_len) { 2465 mutex_exit(&rp->r_statelock); 2466 /* 2467 * If we are not invalidating, synchronously 2468 * freeing or writing pages use the routine 2469 * page_lookup_nowait() to prevent reclaiming 2470 * them from the free list. 
2471 */ 2472 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2473 pp = page_lookup(vp, io_off, 2474 (flags & (B_INVAL | B_FREE)) ? 2475 SE_EXCL : SE_SHARED); 2476 } else { 2477 pp = page_lookup_nowait(vp, io_off, 2478 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2479 } 2480 2481 if (pp == NULL || !pvn_getdirty(pp, flags)) 2482 io_len = PAGESIZE; 2483 else { 2484 err = (*rp->r_putapage)(vp, pp, &io_off, 2485 &io_len, flags, cr); 2486 if (!error) 2487 error = err; 2488 /* 2489 * "io_off" and "io_len" are returned as 2490 * the range of pages we actually wrote. 2491 * This allows us to skip ahead more quickly 2492 * since several pages may've been dealt 2493 * with by this iteration of the loop. 2494 */ 2495 } 2496 mutex_enter(&rp->r_statelock); 2497 } 2498 mutex_exit(&rp->r_statelock); 2499 } 2500 2501 return (error); 2502 } 2503 2504 void 2505 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2506 { 2507 rnode4_t *rp; 2508 2509 rp = VTOR4(vp); 2510 if (IS_SHADOW(vp, rp)) 2511 vp = RTOV4(rp); 2512 mutex_enter(&rp->r_statelock); 2513 while (rp->r_flags & R4TRUNCATE) 2514 cv_wait(&rp->r_cv, &rp->r_statelock); 2515 rp->r_flags |= R4TRUNCATE; 2516 if (off == (u_offset_t)0) { 2517 rp->r_flags &= ~R4DIRTY; 2518 if (!(rp->r_flags & R4STALE)) 2519 rp->r_error = 0; 2520 } 2521 rp->r_truncaddr = off; 2522 mutex_exit(&rp->r_statelock); 2523 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2524 B_INVAL | B_TRUNC, cr); 2525 mutex_enter(&rp->r_statelock); 2526 rp->r_flags &= ~R4TRUNCATE; 2527 cv_broadcast(&rp->r_cv); 2528 mutex_exit(&rp->r_statelock); 2529 } 2530 2531 static int 2532 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2533 { 2534 mntinfo4_t *mi; 2535 struct mntinfo_kstat *mik; 2536 vfs_t *vfsp; 2537 2538 /* this is a read-only kstat. Bail out on a write */ 2539 if (rw == KSTAT_WRITE) 2540 return (EACCES); 2541 2542 2543 /* 2544 * We don't want to wait here as kstat_chain_lock could be held by 2545 * dounmount(). 
dounmount() takes vfs_reflock before the chain lock 2546 * and thus could lead to a deadlock. 2547 */ 2548 vfsp = (struct vfs *)ksp->ks_private; 2549 2550 mi = VFTOMI4(vfsp); 2551 mik = (struct mntinfo_kstat *)ksp->ks_data; 2552 2553 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2554 2555 mik->mik_vers = (uint32_t)mi->mi_vers; 2556 mik->mik_flags = mi->mi_flags; 2557 /* 2558 * The sv_secdata holds the flavor the client specifies. 2559 * If the client uses default and a security negotiation 2560 * occurs, sv_currsec will point to the current flavor 2561 * selected from the server flavor list. 2562 * sv_currsec is NULL if no security negotiation takes place. 2563 */ 2564 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 2565 mi->mi_curr_serv->sv_currsec->secmod : 2566 mi->mi_curr_serv->sv_secdata->secmod; 2567 mik->mik_curread = (uint32_t)mi->mi_curread; 2568 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2569 mik->mik_retrans = mi->mi_retrans; 2570 mik->mik_timeo = mi->mi_timeo; 2571 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2572 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2573 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2574 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2575 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2576 mik->mik_failover = (uint32_t)mi->mi_failover; 2577 mik->mik_remap = (uint32_t)mi->mi_remap; 2578 2579 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2580 2581 return (0); 2582 } 2583 2584 void 2585 nfs4_mnt_kstat_init(struct vfs *vfsp) 2586 { 2587 mntinfo4_t *mi = VFTOMI4(vfsp); 2588 2589 /* 2590 * PSARC 2001/697 Contract Private Interface 2591 * All nfs kstats are under SunMC contract 2592 * Please refer to the PSARC listed above and contact 2593 * SunMC before making any changes! 
2594 * 2595 * Changes must be reviewed by Solaris File Sharing 2596 * Changes must be communicated to contract-2001-697@sun.com 2597 * 2598 */ 2599 2600 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2601 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2602 if (mi->mi_io_kstats) { 2603 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2604 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2605 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2606 kstat_install(mi->mi_io_kstats); 2607 } 2608 2609 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2610 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2611 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2612 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2613 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2614 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2615 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2616 kstat_install(mi->mi_ro_kstats); 2617 } 2618 2619 nfs4_mnt_recov_kstat_init(vfsp); 2620 } 2621 2622 void 2623 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2624 { 2625 mntinfo4_t *mi; 2626 clock_t now = ddi_get_lbolt(); 2627 2628 mi = VTOMI4(vp); 2629 /* 2630 * In case of forced unmount, do not print any messages 2631 * since it can flood the console with error messages. 2632 */ 2633 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2634 return; 2635 2636 /* 2637 * If the mount point is dead, not recoverable, do not 2638 * print error messages that can flood the console. 2639 */ 2640 if (mi->mi_flags & MI4_RECOV_FAIL) 2641 return; 2642 2643 /* 2644 * No use in flooding the console with ENOSPC 2645 * messages from the same file system. 
2646 */ 2647 if ((error != ENOSPC && error != EDQUOT) || 2648 now - mi->mi_printftime > 0) { 2649 zoneid_t zoneid = mi->mi_zone->zone_id; 2650 2651 #ifdef DEBUG 2652 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2653 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2654 #else 2655 nfs_perror(error, "NFS write error on host %s: %m.\n", 2656 VTOR4(vp)->r_server->sv_hostname, NULL); 2657 #endif 2658 if (error == ENOSPC || error == EDQUOT) { 2659 zcmn_err(zoneid, CE_CONT, 2660 "^File: userid=%d, groupid=%d\n", 2661 crgetuid(cr), crgetgid(cr)); 2662 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2663 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2664 zcmn_err(zoneid, CE_CONT, 2665 "^User: userid=%d, groupid=%d\n", 2666 crgetuid(curthread->t_cred), 2667 crgetgid(curthread->t_cred)); 2668 } 2669 mi->mi_printftime = now + 2670 nfs_write_error_interval * hz; 2671 } 2672 sfh4_printfhandle(VTOR4(vp)->r_fh); 2673 #ifdef DEBUG 2674 if (error == EACCES) { 2675 zcmn_err(zoneid, CE_CONT, 2676 "nfs_bio: cred is%s kcred\n", 2677 cr == kcred ? "" : " not"); 2678 } 2679 #endif 2680 } 2681 } 2682 2683 /* 2684 * Return non-zero if the given file can be safely memory mapped. Locks 2685 * are safe if whole-file (length and offset are both zero). 2686 */ 2687 2688 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2689 2690 static int 2691 nfs4_safemap(const vnode_t *vp) 2692 { 2693 locklist_t *llp, *next_llp; 2694 int safe = 1; 2695 rnode4_t *rp = VTOR4(vp); 2696 2697 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2698 2699 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2700 "vp = %p", (void *)vp)); 2701 2702 /* 2703 * Review all the locks for the vnode, both ones that have been 2704 * acquired and ones that are pending. We assume that 2705 * flk_active_locks_for_vp() has merged any locks that can be 2706 * merged (so that if a process has the entire file locked, it is 2707 * represented as a single lock). 
2708 * 2709 * Note that we can't bail out of the loop if we find a non-safe 2710 * lock, because we have to free all the elements in the llp list. 2711 * We might be able to speed up this code slightly by not looking 2712 * at each lock's l_start and l_len fields once we've found a 2713 * non-safe lock. 2714 */ 2715 2716 llp = flk_active_locks_for_vp(vp); 2717 while (llp) { 2718 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2719 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2720 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2721 if (!SAFE_LOCK(llp->ll_flock)) { 2722 safe = 0; 2723 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2724 "nfs4_safemap: unsafe active lock (%" PRId64 2725 ", %" PRId64 ")", llp->ll_flock.l_start, 2726 llp->ll_flock.l_len)); 2727 } 2728 next_llp = llp->ll_next; 2729 VN_RELE(llp->ll_vp); 2730 kmem_free(llp, sizeof (*llp)); 2731 llp = next_llp; 2732 } 2733 2734 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2735 safe ? "safe" : "unsafe")); 2736 return (safe); 2737 } 2738 2739 /* 2740 * Return whether there is a lost LOCK or LOCKU queued up for the given 2741 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 
*/

bool_t
nfs4_map_lost_lock_conflict(vnode_t *vp)
{
	bool_t conflict = FALSE;
	nfs4_lost_rqst_t *lrp;
	mntinfo4_t *mi = VTOMI4(vp);

	/* The mount's lost-request list is walked under mi_lock. */
	mutex_enter(&mi->mi_lock);
	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
	    lrp = list_next(&mi->mi_lost_state, lrp)) {
		/* Only lost LOCK/LOCKU requests are of interest. */
		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
			continue;
		ASSERT(lrp->lr_vp != NULL);
		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
			continue;	/* different file */
		/* One non-whole-file lock is enough to report a conflict. */
		if (!SAFE_LOCK(*lrp->lr_flk)) {
			conflict = TRUE;
			break;
		}
	}

	mutex_exit(&mi->mi_lock);
	return (conflict);
}

/*
 * nfs4_lockcompletion:
 *
 * If the vnode has a lock that makes it unsafe to cache the file, mark it
 * as non cachable (set VNOCACHE bit).  If every lock is safe, the bit is
 * cleared instead.  This is only evaluated for F_SETLK and F_SETLKW; the
 * attribute cache is purged for every cmd.
 *
 * The caller must hold r_lkserlock as writer, and vp must not be a
 * shadow vnode.
 */

void
nfs4_lockcompletion(vnode_t *vp, int cmd)
{
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
	ASSERT(!IS_SHADOW(vp, rp));

	if (cmd == F_SETLK || cmd == F_SETLKW) {

		if (!nfs4_safemap(vp)) {
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VNOCACHE;
			mutex_exit(&vp->v_lock);
		} else {
			mutex_enter(&vp->v_lock);
			vp->v_flag &= ~VNOCACHE;
			mutex_exit(&vp->v_lock);
		}
	}
	/*
	 * The cached attributes of the file are stale after acquiring
	 * the lock on the file. They were updated when the file was
	 * opened, but not updated when the lock was acquired. Therefore the
	 * cached attributes are invalidated after the lock is obtained.
	 */
	PURGE_ATTRCACHE4(vp);
}

/*
 * Zone-key create callback (registered in nfs4_clnt_init()).  Allocates
 * and initializes the per-zone mi4_globals: its lock, an empty list of
 * mntinfo4_t linked through mi_zone_node, and mig_destructor_called set
 * to B_FALSE.  Returns the new structure.
 */
/* ARGSUSED */
static void *
nfs4_mi_init(zoneid_t zoneid)
{
	struct mi4_globals *mig;

	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&mig->mig_list, sizeof (mntinfo4_t),
	    offsetof(mntinfo4_t, mi_zone_node));
	mig->mig_destructor_called = B_FALSE;
	return (mig);
}

/*
 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
 * state and killing off threads.
 *
 * Each mount is taken from the head of the zone's list in turn.  Note
 * that mig_lock is held from the list_head() call until the mount has
 * been removed from the list, i.e. across all of the waits below.
 */
/* ARGSUSED */
static void
nfs4_mi_shutdown(zoneid_t zoneid, void *data)
{
	struct mi4_globals *mig = data;
	mntinfo4_t *mi;
	nfs4_server_t *np;

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_mi_shutdown zone %d\n", zoneid));
	ASSERT(mig != NULL);
	for (;;) {
		mutex_enter(&mig->mig_lock);
		mi = list_head(&mig->mig_list);
		if (mi == NULL) {
			mutex_exit(&mig->mig_lock);
			break;
		}

		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
		/*
		 * purge the DNLC for this filesystem
		 */
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		/*
		 * Tell existing async worker threads to exit.
		 */
		mutex_enter(&mi->mi_async_lock);
		mi->mi_max_threads = 0;
		cv_broadcast(&mi->mi_async_work_cv);
		/*
		 * Set the appropriate flags, signal and wait for both the
		 * async manager and the inactive thread to exit when they're
		 * done with their current work.
		 */
		mutex_enter(&mi->mi_lock);
		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
		mutex_exit(&mi->mi_lock);
		mutex_exit(&mi->mi_async_lock);
		if (mi->mi_manager_thread) {
			nfs4_async_manager_stop(mi->mi_vfsp);
		}
		if (mi->mi_inactive_thread) {
			mutex_enter(&mi->mi_async_lock);
			cv_signal(&mi->mi_inact_req_cv);
			/*
			 * Wait for the inactive thread to exit.
			 */
			while (mi->mi_inactive_thread != NULL) {
				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			}
			mutex_exit(&mi->mi_async_lock);
		}
		/*
		 * Wait for the recovery thread to complete, that is, it will
		 * signal when it is done using the "mi" structure and about
		 * to exit
		 */
		mutex_enter(&mi->mi_lock);
		while (mi->mi_in_recovery > 0)
			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
		mutex_exit(&mi->mi_lock);
		/*
		 * We're done when every mi has been done or the list is empty.
		 * This one is done, remove it from the list.
		 */
		list_remove(&mig->mig_list, mi);
		mutex_exit(&mig->mig_lock);
		zone_rele(mi->mi_zone);
		/*
		 * Release hold on vfs and mi done to prevent race with zone
		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
		 */
		VFS_RELE(mi->mi_vfsp);
		MI4_RELE(mi);
	}
	/*
	 * Tell each renew thread in the zone to exit
	 */
	mutex_enter(&nfs4_server_lst_lock);
	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
		mutex_enter(&np->s_lock);
		if (np->zoneid == zoneid) {
			/*
			 * We add another hold onto the nfs4_server_t
			 * because this will make sure that the nfs4_server_t
			 * stays around until nfs4_callback_fini_zone destroys
			 * the zone. This way, the renew thread can
			 * unconditionally release its holds on the
			 * nfs4_server_t.
			 */
			np->s_refcnt++;
			nfs4_mark_srv_dead(np);
		}
		mutex_exit(&np->s_lock);
	}
	mutex_exit(&nfs4_server_lst_lock);
}

/*
 * Tear down and free a zone's mi4_globals.  Both callers in this file
 * enter with mig_lock held and do not touch mig afterwards; the lock is
 * destroyed here along with the (necessarily empty) list.
 */
static void
nfs4_mi_free_globals(struct mi4_globals *mig)
{
	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
	mutex_destroy(&mig->mig_lock);
	kmem_free(mig, sizeof (*mig));
}

/*
 * Zone-key destroy callback (registered in nfs4_clnt_init()).  If mounts
 * are still on the zone's list, only record that the destructor has run
 * (mig_destructor_called) so that nfs4_mi_zonelist_remove() frees the
 * globals once the list is empty; otherwise free them now.
 */
/* ARGSUSED */
static void
nfs4_mi_destroy(zoneid_t zoneid, void *data)
{
	struct mi4_globals *mig = data;

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_mi_destroy zone %d\n", zoneid));
	ASSERT(mig != NULL);
	mutex_enter(&mig->mig_lock);
	if (list_head(&mig->mig_list) != NULL) {
		/* Still waiting for VFS_FREEVFS() */
		mig->mig_destructor_called = B_TRUE;
		mutex_exit(&mig->mig_lock);
		return;
	}
	/* mig_lock is destroyed, not released, by the call below. */
	nfs4_mi_free_globals(mig);
}

/*
 * Add an NFS mount to the per-zone list of NFS mounts.
 */
void
nfs4_mi_zonelist_add(mntinfo4_t *mi)
{
	struct mi4_globals *mig;

	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
	mutex_enter(&mig->mig_lock);
	list_insert_head(&mig->mig_list, mi);
	/*
	 * hold added to eliminate race with zone shutdown -this will be
	 * released in mi_shutdown
	 */
	MI4_HOLD(mi);
	VFS_HOLD(mi->mi_vfsp);
	mutex_exit(&mig->mig_lock);
}

/*
 * Remove an NFS mount from the per-zone list of NFS mounts.
 *
 * Returns 1 if this call removed the mount and dropped the holds taken
 * by nfs4_mi_zonelist_add(), or 0 if the mount was already marked
 * MI4_DEAD (in which case the zone shutdown callback has dealt with it).
 */
int
nfs4_mi_zonelist_remove(mntinfo4_t *mi)
{
	struct mi4_globals *mig;
	int ret = 0;

	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
	mutex_enter(&mig->mig_lock);
	mutex_enter(&mi->mi_lock);
	/* if this mi is marked dead, then the zone already released it */
	if (!(mi->mi_flags & MI4_DEAD)) {
		list_remove(&mig->mig_list, mi);
		mutex_exit(&mi->mi_lock);

		/* release the holds put on in zonelist_add(). */
		VFS_RELE(mi->mi_vfsp);
		MI4_RELE(mi);
		ret = 1;
	} else {
		mutex_exit(&mi->mi_lock);
	}

	/*
	 * We can be called asynchronously by VFS_FREEVFS() after the zone
	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
	 * mi globals.
	 */
	if (list_head(&mig->mig_list) == NULL &&
	    mig->mig_destructor_called == B_TRUE) {
		/* mig_lock is destroyed, not released, by the call below. */
		nfs4_mi_free_globals(mig);
		return (ret);
	}
	mutex_exit(&mig->mig_lock);
	return (ret);
}

/*
 * Free a mntinfo4_t and everything hanging off it: kstats, queued debug
 * messages, held names and file handles, the server list, locks and
 * condition variables, open-owner lists, and finally the structure
 * itself.  Called from mi_rele() when the reference count reaches zero.
 * The asserts require that the async manager has been told to stop and
 * that no recovery, manager or async worker threads remain.
 */
void
nfs_free_mi4(mntinfo4_t *mi)
{
	nfs4_open_owner_t *foop;
	nfs4_oo_hash_bucket_t *bucketp;
	nfs4_debug_msg_t *msgp;
	int i;
	servinfo4_t *svp;

	/*
	 * Code introduced here should be carefully evaluated to make
	 * sure none of the freed resources are accessed either directly
	 * or indirectly after freeing them. For eg: Introducing calls to
	 * NFS4_DEBUG that use mntinfo4_t structure member after freeing
	 * the structure members or other routines calling back into NFS
	 * accessing freed mntinfo4_t structure member.
	 */
	mutex_enter(&mi->mi_lock);
	ASSERT(mi->mi_recovthread == NULL);
	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
	mutex_exit(&mi->mi_lock);
	mutex_enter(&mi->mi_async_lock);
	ASSERT(mi->mi_threads == 0);
	ASSERT(mi->mi_manager_thread == NULL);
	mutex_exit(&mi->mi_async_lock);
	if (mi->mi_io_kstats) {
		kstat_delete(mi->mi_io_kstats);
		mi->mi_io_kstats = NULL;
	}
	if (mi->mi_ro_kstats) {
		kstat_delete(mi->mi_ro_kstats);
		mi->mi_ro_kstats = NULL;
	}
	if (mi->mi_recov_ksp) {
		kstat_delete(mi->mi_recov_ksp);
		mi->mi_recov_ksp = NULL;
	}
	/* Drain and free any queued debug messages. */
	mutex_enter(&mi->mi_msg_list_lock);
	while (msgp = list_head(&mi->mi_msg_list)) {
		list_remove(&mi->mi_msg_list, msgp);
		nfs4_free_msg(msgp);
	}
	mutex_exit(&mi->mi_msg_list_lock);
	list_destroy(&mi->mi_msg_list);
	if (mi->mi_fname != NULL)
		fn_rele(&mi->mi_fname);
	if (mi->mi_rootfh != NULL)
		sfh4_rele(&mi->mi_rootfh);
	if (mi->mi_srvparentfh != NULL)
		sfh4_rele(&mi->mi_srvparentfh);
	svp = mi->mi_servers;
	sv4_free(svp);
	mutex_destroy(&mi->mi_lock);
	mutex_destroy(&mi->mi_async_lock);
	mutex_destroy(&mi->mi_msg_list_lock);
	nfs_rw_destroy(&mi->mi_recovlock);
	nfs_rw_destroy(&mi->mi_rename_lock);
	nfs_rw_destroy(&mi->mi_fh_lock);
	cv_destroy(&mi->mi_failover_cv);
	cv_destroy(&mi->mi_async_reqs_cv);
	cv_destroy(&mi->mi_async_work_cv);
	cv_destroy(&mi->mi_async_cv);
	cv_destroy(&mi->mi_inact_req_cv);
	/*
	 * Destroy the oo hash lists and mutexes for the cred hash table.
	 */
	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
		bucketp = &(mi->mi_oo_list[i]);
		/* Destroy any remaining open owners on the list */
		foop = list_head(&bucketp->b_oo_hash_list);
		while (foop != NULL) {
			list_remove(&bucketp->b_oo_hash_list, foop);
			nfs4_destroy_open_owner(foop);
			foop = list_head(&bucketp->b_oo_hash_list);
		}
		list_destroy(&bucketp->b_oo_hash_list);
		mutex_destroy(&bucketp->b_lock);
	}
	/*
	 * Empty and destroy the freed open owner list.
	 */
	foop = list_head(&mi->mi_foo_list);
	while (foop != NULL) {
		list_remove(&mi->mi_foo_list, foop);
		nfs4_destroy_open_owner(foop);
		foop = list_head(&mi->mi_foo_list);
	}
	list_destroy(&mi->mi_foo_list);
	list_destroy(&mi->mi_bseqid_list);
	list_destroy(&mi->mi_lost_state);
	avl_destroy(&mi->mi_filehandles);
	kmem_free(mi, sizeof (*mi));
}

/*
 * Take a reference on mi by atomically incrementing mi_count.
 */
void
mi_hold(mntinfo4_t *mi)
{
	atomic_add_32(&mi->mi_count, 1);
	ASSERT(mi->mi_count != 0);
}

/*
 * Drop a reference on mi.  The caller that brings mi_count to zero frees
 * the structure through nfs_free_mi4().
 */
void
mi_rele(mntinfo4_t *mi)
{
	ASSERT(mi->mi_count != 0);
	if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
		nfs_free_mi4(mi);
	}
}

/* See nfs4_clnt_init(): its v_count is pinned at 1 so it never goes away. */
vnode_t nfs4_xattr_notsupp_vnode;

/*
 * One-time initialization of the NFSv4 client: runs the init routines of
 * the client subsystems, registers the CPR (suspend/resume) callback and
 * the per-zone key whose callbacks are nfs4_mi_init(), nfs4_mi_shutdown()
 * and nfs4_mi_destroy(), and pins nfs4_xattr_notsupp_vnode.
 */
void
nfs4_clnt_init(void)
{
	nfs4_vnops_init();
	(void) nfs4_rnode_init();
	(void) nfs4_shadow_init();
	(void) nfs4_acache_init();
	(void) nfs4_subr_init();
	nfs4_acl_init();
	nfs_idmap_init();
	nfs4_callback_init();
	nfs4_secinfo_init();
#ifdef DEBUG
	tsd_create(&nfs4_tsd_key, NULL);
#endif

	/*
	 * Add a CPR callback so that we can update client
	 * lease after a suspend and resume.
	 */
	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");

	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
	    nfs4_mi_destroy);

	/*
	 * Initialise the reference count of the notsupp xattr cache vnode to 1
	 * so that it never goes away (VOP_INACTIVE isn't called on it).
	 */
	nfs4_xattr_notsupp_vnode.v_count = 1;
}

/*
 * Undo nfs4_clnt_init(): delete the zone key, run the fini routines of
 * the client subsystems, and remove the CPR callback if one was added.
 */
void
nfs4_clnt_fini(void)
{
	(void) zone_key_delete(mi4_list_key);
	nfs4_vnops_fini();
	(void) nfs4_rnode_fini();
	(void) nfs4_shadow_fini();
	(void) nfs4_acache_fini();
	(void) nfs4_subr_fini();
	nfs_idmap_fini();
	nfs4_callback_fini();
	nfs4_secinfo_fini();
#ifdef DEBUG
	tsd_destroy(&nfs4_tsd_key);
#endif
	if (cid)
		(void) callb_delete(cid);
}

/*
 * CPR callback registered by nfs4_clnt_init().  On anything other than
 * the checkpoint code it records the current time in nfs4_client_resumed,
 * which nfs4_renew_lease_thread() compares against the last renewal
 * time.  Always returns B_TRUE.
 */
/*ARGSUSED*/
static boolean_t
nfs4_client_cpr_callb(void *arg, int code)
{
	/*
	 * We get called for Suspend and Resume events.
	 * For the suspend case we simply don't care!
	 */
	if (code == CB_CODE_CPR_CHKPT) {
		return (B_TRUE);
	}

	/*
	 * When we get to here we are in the process of
	 * resuming the system from a previous suspend.
	 */
	nfs4_client_resumed = gethrestime_sec();
	return (B_TRUE);
}

/*
 * Body of the lease renewal thread for the server sp.
 *
 * The thread loops, sleeping on cv_thread_exit (with s_lock as the
 * associated mutex) for about half the lease time less an allowance for
 * propagation delay, and at least one second.  After a sleep with a
 * valid lease it calls nfs4renew() if last_renewal_time did not change
 * while it slept, or if the system has resumed from a suspend since the
 * last renewal.
 *
 * When s_thread_exit becomes NFS4_THREAD_EXIT the thread waits for
 * s_otw_call_count to reach zero, releases two references on sp, tears
 * down its CPR state and exits through zthread_exit().  This function
 * does not return.
 */
void
nfs4_renew_lease_thread(nfs4_server_t *sp)
{
	int	error = 0;
	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
	clock_t	tick_delay = 0;
	clock_t	time_left = 0;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;

	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");

	mutex_enter(&sp->s_lock);
	/* sp->s_lease_time is set via a GETATTR */
	sp->last_renewal_time = gethrestime_sec();
	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
	ASSERT(sp->s_refcnt >= 1);

	for (;;) {
		/*
		 * Nothing to renew (no state held, or no valid lease):
		 * just sleep and re-check.
		 */
		if (!sp->state_ref_count ||
		    sp->lease_valid != NFS4_LEASE_VALID) {

			kip_secs = MAX((sp->s_lease_time >> 1) -
			    (3 * sp->propagation_delay.tv_sec), 1);

			tick_delay = SEC_TO_TICK(kip_secs);

			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
			    "nfs4_renew_lease_thread: no renew : thread "
			    "wait %ld secs", kip_secs));

			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
			    "nfs4_renew_lease_thread: no renew : "
			    "state_ref_count %d, lease_valid %d",
			    sp->state_ref_count, sp->lease_valid));

			mutex_enter(&cpr_lock);
			CALLB_CPR_SAFE_BEGIN(&cpr_info);
			mutex_exit(&cpr_lock);
			time_left = cv_reltimedwait(&sp->cv_thread_exit,
			    &sp->s_lock, tick_delay, TR_CLOCK_TICK);
			mutex_enter(&cpr_lock);
			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
			mutex_exit(&cpr_lock);

			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
			    "nfs4_renew_lease_thread: no renew: "
			    "time left %ld", time_left));

			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
				goto die;
			continue;
		}

		/* Snapshot, to detect a renewal that happens while asleep. */
		tmp_last_renewal_time = sp->last_renewal_time;

		tmp_time = gethrestime_sec() - sp->last_renewal_time +
		    (3 * sp->propagation_delay.tv_sec);

		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: tmp_time %ld, "
		    "sp->last_renewal_time %ld", tmp_time,
		    sp->last_renewal_time));

		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);

		tick_delay = SEC_TO_TICK(kip_secs);

		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
		    "secs", kip_secs));

		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&cpr_lock);
		time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
		    tick_delay, TR_CLOCK_TICK);
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
		mutex_exit(&cpr_lock);

		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
		    "tmp_last_renewal_time %ld", time_left,
		    sp->last_renewal_time, nfs4_client_resumed,
		    tmp_last_renewal_time));

		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
			goto die;

		if (tmp_last_renewal_time == sp->last_renewal_time ||
		    (nfs4_client_resumed != 0 &&
		    nfs4_client_resumed > sp->last_renewal_time)) {
			/*
			 * Issue RENEW op since we haven't renewed the lease
			 * since we slept.
			 */
			tmp_now_time = gethrestime_sec();
			error = nfs4renew(sp);
			/*
			 * Need to re-acquire sp's lock, nfs4renew()
			 * relinquishes it.
			 */
			mutex_enter(&sp->s_lock);

			/*
			 * See if someone changed s_thread_exit while we gave
			 * up s_lock.
			 */
			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
				goto die;

			if (!error) {
				/*
				 * check to see if we implicitly renewed while
				 * we waited for a reply for our RENEW call.
				 */
				if (tmp_last_renewal_time ==
				    sp->last_renewal_time) {
					/* no implicit renew came */
					sp->last_renewal_time = tmp_now_time;
				} else {
					NFS4_DEBUG(nfs4_client_lease_debug,
					    (CE_NOTE, "renew_thread: did "
					    "implicit renewal before reply "
					    "from server for RENEW"));
				}
			} else {
				/* figure out error */
				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
				    "renew_thread: nfs4renew returned error"
				    " %d", error));
			}

		}
	}

die:
	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
	    "nfs4_renew_lease_thread: thread exiting"));

	/* s_lock is held here; wait for over-the-wire calls to drain. */
	while (sp->s_otw_call_count != 0) {
		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
		    "nfs4_renew_lease_thread: waiting for outstanding "
		    "otw calls to finish for sp 0x%p, current "
		    "s_otw_call_count %d", (void *)sp,
		    sp->s_otw_call_count));
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&cpr_lock);
		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
		mutex_exit(&cpr_lock);
	}
	mutex_exit(&sp->s_lock);

	nfs4_server_rele(sp);		/* free the thread's reference */
	nfs4_server_rele(sp);		/* free the list's reference */
	sp = NULL;

	/* No goto in this function targets the label below. */
done:
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
	mutex_destroy(&cpr_lock);

	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
	    "nfs4_renew_lease_thread: renew thread exit officially"));

	zthread_exit();
	/* NOT REACHED */
}

/*
 * Send out a RENEW op to the server.
 * Assumes sp is locked down.
3371 */ 3372 static int 3373 nfs4renew(nfs4_server_t *sp) 3374 { 3375 COMPOUND4args_clnt args; 3376 COMPOUND4res_clnt res; 3377 nfs_argop4 argop[1]; 3378 int doqueue = 1; 3379 int rpc_error; 3380 cred_t *cr; 3381 mntinfo4_t *mi; 3382 timespec_t prop_time, after_time; 3383 int needrecov = FALSE; 3384 nfs4_recov_state_t recov_state; 3385 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3386 3387 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3388 3389 recov_state.rs_flags = 0; 3390 recov_state.rs_num_retry_despite_err = 0; 3391 3392 recov_retry: 3393 mi = sp->mntinfo4_list; 3394 VFS_HOLD(mi->mi_vfsp); 3395 mutex_exit(&sp->s_lock); 3396 ASSERT(mi != NULL); 3397 3398 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3399 if (e.error) { 3400 VFS_RELE(mi->mi_vfsp); 3401 return (e.error); 3402 } 3403 3404 /* Check to see if we're dealing with a marked-dead sp */ 3405 mutex_enter(&sp->s_lock); 3406 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3407 mutex_exit(&sp->s_lock); 3408 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3409 VFS_RELE(mi->mi_vfsp); 3410 return (0); 3411 } 3412 3413 /* Make sure mi hasn't changed on us */ 3414 if (mi != sp->mntinfo4_list) { 3415 /* Must drop sp's lock to avoid a recursive mutex enter */ 3416 mutex_exit(&sp->s_lock); 3417 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3418 VFS_RELE(mi->mi_vfsp); 3419 mutex_enter(&sp->s_lock); 3420 goto recov_retry; 3421 } 3422 mutex_exit(&sp->s_lock); 3423 3424 args.ctag = TAG_RENEW; 3425 3426 args.array_len = 1; 3427 args.array = argop; 3428 3429 argop[0].argop = OP_RENEW; 3430 3431 mutex_enter(&sp->s_lock); 3432 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3433 cr = sp->s_cred; 3434 crhold(cr); 3435 mutex_exit(&sp->s_lock); 3436 3437 ASSERT(cr != NULL); 3438 3439 /* used to figure out RTT for sp */ 3440 gethrestime(&prop_time); 3441 3442 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3443 "nfs4renew: %s call, sp 0x%p", needrecov ? 
"recov" : "first", 3444 (void*)sp)); 3445 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3446 prop_time.tv_sec, prop_time.tv_nsec)); 3447 3448 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3449 mntinfo4_t *, mi); 3450 3451 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3452 crfree(cr); 3453 3454 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3455 mntinfo4_t *, mi); 3456 3457 gethrestime(&after_time); 3458 3459 mutex_enter(&sp->s_lock); 3460 sp->propagation_delay.tv_sec = 3461 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3462 mutex_exit(&sp->s_lock); 3463 3464 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3465 after_time.tv_sec, after_time.tv_nsec)); 3466 3467 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3468 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3469 nfs4_delegreturn_all(sp); 3470 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3471 VFS_RELE(mi->mi_vfsp); 3472 /* 3473 * If the server returns CB_PATH_DOWN, it has renewed 3474 * the lease and informed us that the callback path is 3475 * down. Since the lease is renewed, just return 0 and 3476 * let the renew thread proceed as normal. 
3477 */ 3478 return (0); 3479 } 3480 3481 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3482 if (!needrecov && e.error) { 3483 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3484 VFS_RELE(mi->mi_vfsp); 3485 return (e.error); 3486 } 3487 3488 rpc_error = e.error; 3489 3490 if (needrecov) { 3491 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3492 "nfs4renew: initiating recovery\n")); 3493 3494 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3495 OP_RENEW, NULL) == FALSE) { 3496 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3497 VFS_RELE(mi->mi_vfsp); 3498 if (!e.error) 3499 (void) xdr_free(xdr_COMPOUND4res_clnt, 3500 (caddr_t)&res); 3501 mutex_enter(&sp->s_lock); 3502 goto recov_retry; 3503 } 3504 /* fall through for res.status case */ 3505 } 3506 3507 if (res.status) { 3508 if (res.status == NFS4ERR_LEASE_MOVED) { 3509 /*EMPTY*/ 3510 /* 3511 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3512 * to renew the lease on that server 3513 */ 3514 } 3515 e.error = geterrno4(res.status); 3516 } 3517 3518 if (!rpc_error) 3519 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3520 3521 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3522 3523 VFS_RELE(mi->mi_vfsp); 3524 3525 return (e.error); 3526 } 3527 3528 void 3529 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3530 { 3531 nfs4_server_t *sp; 3532 3533 /* this locks down sp if it is found */ 3534 sp = find_nfs4_server(mi); 3535 3536 if (sp != NULL) { 3537 nfs4_inc_state_ref_count_nolock(sp, mi); 3538 mutex_exit(&sp->s_lock); 3539 nfs4_server_rele(sp); 3540 } 3541 } 3542 3543 /* 3544 * Bump the number of OPEN files (ie: those with state) so we know if this 3545 * nfs4_server has any state to maintain a lease for or not. 3546 * 3547 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 
3548 */ 3549 void 3550 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3551 { 3552 ASSERT(mutex_owned(&sp->s_lock)); 3553 3554 sp->state_ref_count++; 3555 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3556 "nfs4_inc_state_ref_count: state_ref_count now %d", 3557 sp->state_ref_count)); 3558 3559 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3560 sp->lease_valid = NFS4_LEASE_VALID; 3561 3562 /* 3563 * If this call caused the lease to be marked valid and/or 3564 * took the state_ref_count from 0 to 1, then start the time 3565 * on lease renewal. 3566 */ 3567 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3568 sp->last_renewal_time = gethrestime_sec(); 3569 3570 /* update the number of open files for mi */ 3571 mi->mi_open_files++; 3572 } 3573 3574 void 3575 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3576 { 3577 nfs4_server_t *sp; 3578 3579 /* this locks down sp if it is found */ 3580 sp = find_nfs4_server_all(mi, 1); 3581 3582 if (sp != NULL) { 3583 nfs4_dec_state_ref_count_nolock(sp, mi); 3584 mutex_exit(&sp->s_lock); 3585 nfs4_server_rele(sp); 3586 } 3587 } 3588 3589 /* 3590 * Decrement the number of OPEN files (ie: those with state) so we know if 3591 * this nfs4_server has any state to maintain a lease for or not. 
3592 */ 3593 void 3594 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3595 { 3596 ASSERT(mutex_owned(&sp->s_lock)); 3597 ASSERT(sp->state_ref_count != 0); 3598 sp->state_ref_count--; 3599 3600 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3601 "nfs4_dec_state_ref_count: state ref count now %d", 3602 sp->state_ref_count)); 3603 3604 mi->mi_open_files--; 3605 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3606 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3607 mi->mi_open_files, mi->mi_flags)); 3608 3609 /* We don't have to hold the mi_lock to test mi_flags */ 3610 if (mi->mi_open_files == 0 && 3611 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3612 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3613 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3614 "we have closed the last open file", (void*)mi)); 3615 nfs4_remove_mi_from_server(mi, sp); 3616 } 3617 } 3618 3619 bool_t 3620 inlease(nfs4_server_t *sp) 3621 { 3622 bool_t result; 3623 3624 ASSERT(mutex_owned(&sp->s_lock)); 3625 3626 if (sp->lease_valid == NFS4_LEASE_VALID && 3627 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3628 result = TRUE; 3629 else 3630 result = FALSE; 3631 3632 return (result); 3633 } 3634 3635 3636 /* 3637 * Return non-zero if the given nfs4_server_t is going through recovery. 3638 */ 3639 3640 int 3641 nfs4_server_in_recovery(nfs4_server_t *sp) 3642 { 3643 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3644 } 3645 3646 /* 3647 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3648 * first is less than, equal to, or greater than the second. 3649 */ 3650 3651 int 3652 sfh4cmp(const void *p1, const void *p2) 3653 { 3654 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3655 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3656 3657 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3658 } 3659 3660 /* 3661 * Create a table for shared filehandle objects. 
3662 */ 3663 3664 void 3665 sfh4_createtab(avl_tree_t *tab) 3666 { 3667 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3668 offsetof(nfs4_sharedfh_t, sfh_tree)); 3669 } 3670 3671 /* 3672 * Return a shared filehandle object for the given filehandle. The caller 3673 * is responsible for eventually calling sfh4_rele(). 3674 */ 3675 3676 nfs4_sharedfh_t * 3677 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3678 { 3679 nfs4_sharedfh_t *sfh, *nsfh; 3680 avl_index_t where; 3681 nfs4_sharedfh_t skey; 3682 3683 if (!key) { 3684 skey.sfh_fh = *fh; 3685 key = &skey; 3686 } 3687 3688 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3689 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3690 /* 3691 * We allocate the largest possible filehandle size because it's 3692 * not that big, and it saves us from possibly having to resize the 3693 * buffer later. 3694 */ 3695 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3696 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3697 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3698 nsfh->sfh_refcnt = 1; 3699 nsfh->sfh_flags = SFH4_IN_TREE; 3700 nsfh->sfh_mi = mi; 3701 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3702 (void *)nsfh)); 3703 3704 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3705 sfh = avl_find(&mi->mi_filehandles, key, &where); 3706 if (sfh != NULL) { 3707 mutex_enter(&sfh->sfh_lock); 3708 sfh->sfh_refcnt++; 3709 mutex_exit(&sfh->sfh_lock); 3710 nfs_rw_exit(&mi->mi_fh_lock); 3711 /* free our speculative allocs */ 3712 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3713 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3714 return (sfh); 3715 } 3716 3717 avl_insert(&mi->mi_filehandles, nsfh, where); 3718 nfs_rw_exit(&mi->mi_fh_lock); 3719 3720 return (nsfh); 3721 } 3722 3723 /* 3724 * Return a shared filehandle object for the given filehandle. The caller 3725 * is responsible for eventually calling sfh4_rele(). 
3726 */ 3727 3728 nfs4_sharedfh_t * 3729 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3730 { 3731 nfs4_sharedfh_t *sfh; 3732 nfs4_sharedfh_t key; 3733 3734 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3735 3736 #ifdef DEBUG 3737 if (nfs4_sharedfh_debug) { 3738 nfs4_fhandle_t fhandle; 3739 3740 fhandle.fh_len = fh->nfs_fh4_len; 3741 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3742 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3743 nfs4_printfhandle(&fhandle); 3744 } 3745 #endif 3746 3747 /* 3748 * If there's already an object for the given filehandle, bump the 3749 * reference count and return it. Otherwise, create a new object 3750 * and add it to the AVL tree. 3751 */ 3752 3753 key.sfh_fh = *fh; 3754 3755 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3756 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3757 if (sfh != NULL) { 3758 mutex_enter(&sfh->sfh_lock); 3759 sfh->sfh_refcnt++; 3760 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3761 "sfh4_get: found existing %p, new refcnt=%d", 3762 (void *)sfh, sfh->sfh_refcnt)); 3763 mutex_exit(&sfh->sfh_lock); 3764 nfs_rw_exit(&mi->mi_fh_lock); 3765 return (sfh); 3766 } 3767 nfs_rw_exit(&mi->mi_fh_lock); 3768 3769 return (sfh4_put(fh, mi, &key)); 3770 } 3771 3772 /* 3773 * Get a reference to the given shared filehandle object. 3774 */ 3775 3776 void 3777 sfh4_hold(nfs4_sharedfh_t *sfh) 3778 { 3779 ASSERT(sfh->sfh_refcnt > 0); 3780 3781 mutex_enter(&sfh->sfh_lock); 3782 sfh->sfh_refcnt++; 3783 NFS4_DEBUG(nfs4_sharedfh_debug, 3784 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3785 (void *)sfh, sfh->sfh_refcnt)); 3786 mutex_exit(&sfh->sfh_lock); 3787 } 3788 3789 /* 3790 * Release a reference to the given shared filehandle object and null out 3791 * the given pointer. 
3792 */ 3793 3794 void 3795 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3796 { 3797 mntinfo4_t *mi; 3798 nfs4_sharedfh_t *sfh = *sfhpp; 3799 3800 ASSERT(sfh->sfh_refcnt > 0); 3801 3802 mutex_enter(&sfh->sfh_lock); 3803 if (sfh->sfh_refcnt > 1) { 3804 sfh->sfh_refcnt--; 3805 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3806 "sfh4_rele %p, new refcnt=%d", 3807 (void *)sfh, sfh->sfh_refcnt)); 3808 mutex_exit(&sfh->sfh_lock); 3809 goto finish; 3810 } 3811 mutex_exit(&sfh->sfh_lock); 3812 3813 /* 3814 * Possibly the last reference, so get the lock for the table in 3815 * case it's time to remove the object from the table. 3816 */ 3817 mi = sfh->sfh_mi; 3818 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3819 mutex_enter(&sfh->sfh_lock); 3820 sfh->sfh_refcnt--; 3821 if (sfh->sfh_refcnt > 0) { 3822 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3823 "sfh4_rele %p, new refcnt=%d", 3824 (void *)sfh, sfh->sfh_refcnt)); 3825 mutex_exit(&sfh->sfh_lock); 3826 nfs_rw_exit(&mi->mi_fh_lock); 3827 goto finish; 3828 } 3829 3830 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3831 "sfh4_rele %p, last ref", (void *)sfh)); 3832 if (sfh->sfh_flags & SFH4_IN_TREE) { 3833 avl_remove(&mi->mi_filehandles, sfh); 3834 sfh->sfh_flags &= ~SFH4_IN_TREE; 3835 } 3836 mutex_exit(&sfh->sfh_lock); 3837 nfs_rw_exit(&mi->mi_fh_lock); 3838 mutex_destroy(&sfh->sfh_lock); 3839 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3840 kmem_free(sfh, sizeof (nfs4_sharedfh_t)); 3841 3842 finish: 3843 *sfhpp = NULL; 3844 } 3845 3846 /* 3847 * Update the filehandle for the given shared filehandle object. 
3848 */ 3849 3850 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */ 3851 3852 void 3853 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh) 3854 { 3855 mntinfo4_t *mi = sfh->sfh_mi; 3856 nfs4_sharedfh_t *dupsfh; 3857 avl_index_t where; 3858 nfs4_sharedfh_t key; 3859 3860 #ifdef DEBUG 3861 mutex_enter(&sfh->sfh_lock); 3862 ASSERT(sfh->sfh_refcnt > 0); 3863 mutex_exit(&sfh->sfh_lock); 3864 #endif 3865 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE); 3866 3867 /* 3868 * The basic plan is to remove the shared filehandle object from 3869 * the table, update it to have the new filehandle, then reinsert 3870 * it. 3871 */ 3872 3873 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3874 mutex_enter(&sfh->sfh_lock); 3875 if (sfh->sfh_flags & SFH4_IN_TREE) { 3876 avl_remove(&mi->mi_filehandles, sfh); 3877 sfh->sfh_flags &= ~SFH4_IN_TREE; 3878 } 3879 mutex_exit(&sfh->sfh_lock); 3880 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len; 3881 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val, 3882 sfh->sfh_fh.nfs_fh4_len); 3883 3884 /* 3885 * XXX If there is already a shared filehandle object with the new 3886 * filehandle, we're in trouble, because the rnode code assumes 3887 * that there is only one shared filehandle object for a given 3888 * filehandle. So issue a warning (for read-write mounts only) 3889 * and don't try to re-insert the given object into the table. 3890 * Hopefully the given object will quickly go away and everyone 3891 * will use the new object. 
3892 */ 3893 key.sfh_fh = *newfh; 3894 dupsfh = avl_find(&mi->mi_filehandles, &key, &where); 3895 if (dupsfh != NULL) { 3896 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) { 3897 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: " 3898 "duplicate filehandle detected"); 3899 sfh4_printfhandle(dupsfh); 3900 } 3901 } else { 3902 avl_insert(&mi->mi_filehandles, sfh, where); 3903 mutex_enter(&sfh->sfh_lock); 3904 sfh->sfh_flags |= SFH4_IN_TREE; 3905 mutex_exit(&sfh->sfh_lock); 3906 } 3907 nfs_rw_exit(&mi->mi_fh_lock); 3908 } 3909 3910 /* 3911 * Copy out the current filehandle for the given shared filehandle object. 3912 */ 3913 3914 void 3915 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp) 3916 { 3917 mntinfo4_t *mi = sfh->sfh_mi; 3918 3919 ASSERT(sfh->sfh_refcnt > 0); 3920 3921 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3922 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len; 3923 ASSERT(fhp->fh_len <= NFS4_FHSIZE); 3924 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len); 3925 nfs_rw_exit(&mi->mi_fh_lock); 3926 } 3927 3928 /* 3929 * Print out the filehandle for the given shared filehandle object. 3930 */ 3931 3932 void 3933 sfh4_printfhandle(const nfs4_sharedfh_t *sfh) 3934 { 3935 nfs4_fhandle_t fhandle; 3936 3937 sfh4_copyval(sfh, &fhandle); 3938 nfs4_printfhandle(&fhandle); 3939 } 3940 3941 /* 3942 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0 3943 * if they're the same, +1 if the first is "greater" than the second. The 3944 * caller (or whoever's calling the AVL package) is responsible for 3945 * handling locking issues. 3946 */ 3947 3948 static int 3949 fncmp(const void *p1, const void *p2) 3950 { 3951 const nfs4_fname_t *f1 = p1; 3952 const nfs4_fname_t *f2 = p2; 3953 int res; 3954 3955 res = strcmp(f1->fn_name, f2->fn_name); 3956 /* 3957 * The AVL package wants +/-1, not arbitrary positive or negative 3958 * integers. 
3959 */ 3960 if (res > 0) 3961 res = 1; 3962 else if (res < 0) 3963 res = -1; 3964 return (res); 3965 } 3966 3967 /* 3968 * Get or create an fname with the given name, as a child of the given 3969 * fname. The caller is responsible for eventually releasing the reference 3970 * (fn_rele()). parent may be NULL. 3971 */ 3972 3973 nfs4_fname_t * 3974 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh) 3975 { 3976 nfs4_fname_t key; 3977 nfs4_fname_t *fnp; 3978 avl_index_t where; 3979 3980 key.fn_name = name; 3981 3982 /* 3983 * If there's already an fname registered with the given name, bump 3984 * its reference count and return it. Otherwise, create a new one 3985 * and add it to the parent's AVL tree. 3986 * 3987 * fname entries we are looking for should match both name 3988 * and sfh stored in the fname. 3989 */ 3990 again: 3991 if (parent != NULL) { 3992 mutex_enter(&parent->fn_lock); 3993 fnp = avl_find(&parent->fn_children, &key, &where); 3994 if (fnp != NULL) { 3995 /* 3996 * This hold on fnp is released below later, 3997 * in case this is not the fnp we want. 3998 */ 3999 fn_hold(fnp); 4000 4001 if (fnp->fn_sfh == sfh) { 4002 /* 4003 * We have found our entry. 4004 * put an hold and return it. 4005 */ 4006 mutex_exit(&parent->fn_lock); 4007 return (fnp); 4008 } 4009 4010 /* 4011 * We have found an entry that has a mismatching 4012 * fn_sfh. This could be a stale entry due to 4013 * server side rename. We will remove this entry 4014 * and make sure no such entries exist. 4015 */ 4016 mutex_exit(&parent->fn_lock); 4017 mutex_enter(&fnp->fn_lock); 4018 if (fnp->fn_parent == parent) { 4019 /* 4020 * Remove ourselves from parent's 4021 * fn_children tree. 
4022 */ 4023 mutex_enter(&parent->fn_lock); 4024 avl_remove(&parent->fn_children, fnp); 4025 mutex_exit(&parent->fn_lock); 4026 fn_rele(&fnp->fn_parent); 4027 } 4028 mutex_exit(&fnp->fn_lock); 4029 fn_rele(&fnp); 4030 goto again; 4031 } 4032 } 4033 4034 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP); 4035 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL); 4036 fnp->fn_parent = parent; 4037 if (parent != NULL) 4038 fn_hold(parent); 4039 fnp->fn_len = strlen(name); 4040 ASSERT(fnp->fn_len < MAXNAMELEN); 4041 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP); 4042 (void) strcpy(fnp->fn_name, name); 4043 fnp->fn_refcnt = 1; 4044 4045 /* 4046 * This hold on sfh is later released 4047 * when we do the final fn_rele() on this fname. 4048 */ 4049 sfh4_hold(sfh); 4050 fnp->fn_sfh = sfh; 4051 4052 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t), 4053 offsetof(nfs4_fname_t, fn_tree)); 4054 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4055 "fn_get %p:%s, a new nfs4_fname_t!", 4056 (void *)fnp, fnp->fn_name)); 4057 if (parent != NULL) { 4058 avl_insert(&parent->fn_children, fnp, where); 4059 mutex_exit(&parent->fn_lock); 4060 } 4061 4062 return (fnp); 4063 } 4064 4065 void 4066 fn_hold(nfs4_fname_t *fnp) 4067 { 4068 atomic_add_32(&fnp->fn_refcnt, 1); 4069 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4070 "fn_hold %p:%s, new refcnt=%d", 4071 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 4072 } 4073 4074 /* 4075 * Decrement the reference count of the given fname, and destroy it if its 4076 * reference count goes to zero. Nulls out the given pointer. 
4077 */ 4078 4079 void 4080 fn_rele(nfs4_fname_t **fnpp) 4081 { 4082 nfs4_fname_t *parent; 4083 uint32_t newref; 4084 nfs4_fname_t *fnp; 4085 4086 recur: 4087 fnp = *fnpp; 4088 *fnpp = NULL; 4089 4090 mutex_enter(&fnp->fn_lock); 4091 parent = fnp->fn_parent; 4092 if (parent != NULL) 4093 mutex_enter(&parent->fn_lock); /* prevent new references */ 4094 newref = atomic_add_32_nv(&fnp->fn_refcnt, -1); 4095 if (newref > 0) { 4096 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4097 "fn_rele %p:%s, new refcnt=%d", 4098 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 4099 if (parent != NULL) 4100 mutex_exit(&parent->fn_lock); 4101 mutex_exit(&fnp->fn_lock); 4102 return; 4103 } 4104 4105 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4106 "fn_rele %p:%s, last reference, deleting...", 4107 (void *)fnp, fnp->fn_name)); 4108 if (parent != NULL) { 4109 avl_remove(&parent->fn_children, fnp); 4110 mutex_exit(&parent->fn_lock); 4111 } 4112 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4113 sfh4_rele(&fnp->fn_sfh); 4114 mutex_destroy(&fnp->fn_lock); 4115 avl_destroy(&fnp->fn_children); 4116 kmem_free(fnp, sizeof (nfs4_fname_t)); 4117 /* 4118 * Recursivly fn_rele the parent. 4119 * Use goto instead of a recursive call to avoid stack overflow. 4120 */ 4121 if (parent != NULL) { 4122 fnpp = &parent; 4123 goto recur; 4124 } 4125 } 4126 4127 /* 4128 * Returns the single component name of the given fname, in a MAXNAMELEN 4129 * string buffer, which the caller is responsible for freeing. Note that 4130 * the name may become invalid as a result of fn_move(). 
4131 */ 4132 4133 char * 4134 fn_name(nfs4_fname_t *fnp) 4135 { 4136 char *name; 4137 4138 ASSERT(fnp->fn_len < MAXNAMELEN); 4139 name = kmem_alloc(MAXNAMELEN, KM_SLEEP); 4140 mutex_enter(&fnp->fn_lock); 4141 (void) strcpy(name, fnp->fn_name); 4142 mutex_exit(&fnp->fn_lock); 4143 4144 return (name); 4145 } 4146 4147 4148 /* 4149 * fn_path_realloc 4150 * 4151 * This function, used only by fn_path, constructs 4152 * a new string which looks like "prepend" + "/" + "current". 4153 * by allocating a new string and freeing the old one. 4154 */ 4155 static void 4156 fn_path_realloc(char **curses, char *prepend) 4157 { 4158 int len, curlen = 0; 4159 char *news; 4160 4161 if (*curses == NULL) { 4162 /* 4163 * Prime the pump, allocate just the 4164 * space for prepend and return that. 4165 */ 4166 len = strlen(prepend) + 1; 4167 news = kmem_alloc(len, KM_SLEEP); 4168 (void) strncpy(news, prepend, len); 4169 } else { 4170 /* 4171 * Allocate the space for a new string 4172 * +1 +1 is for the "/" and the NULL 4173 * byte at the end of it all. 4174 */ 4175 curlen = strlen(*curses); 4176 len = curlen + strlen(prepend) + 1 + 1; 4177 news = kmem_alloc(len, KM_SLEEP); 4178 (void) strncpy(news, prepend, len); 4179 (void) strcat(news, "/"); 4180 (void) strcat(news, *curses); 4181 kmem_free(*curses, curlen + 1); 4182 } 4183 *curses = news; 4184 } 4185 4186 /* 4187 * Returns the path name (starting from the fs root) for the given fname. 4188 * The caller is responsible for freeing. Note that the path may be or 4189 * become invalid as a result of fn_move(). 4190 */ 4191 4192 char * 4193 fn_path(nfs4_fname_t *fnp) 4194 { 4195 char *path; 4196 nfs4_fname_t *nextfnp; 4197 4198 if (fnp == NULL) 4199 return (NULL); 4200 4201 path = NULL; 4202 4203 /* walk up the tree constructing the pathname. 
*/ 4204 4205 fn_hold(fnp); /* adjust for later rele */ 4206 do { 4207 mutex_enter(&fnp->fn_lock); 4208 /* 4209 * Add fn_name in front of the current path 4210 */ 4211 fn_path_realloc(&path, fnp->fn_name); 4212 nextfnp = fnp->fn_parent; 4213 if (nextfnp != NULL) 4214 fn_hold(nextfnp); 4215 mutex_exit(&fnp->fn_lock); 4216 fn_rele(&fnp); 4217 fnp = nextfnp; 4218 } while (fnp != NULL); 4219 4220 return (path); 4221 } 4222 4223 /* 4224 * Return a reference to the parent of the given fname, which the caller is 4225 * responsible for eventually releasing. 4226 */ 4227 4228 nfs4_fname_t * 4229 fn_parent(nfs4_fname_t *fnp) 4230 { 4231 nfs4_fname_t *parent; 4232 4233 mutex_enter(&fnp->fn_lock); 4234 parent = fnp->fn_parent; 4235 if (parent != NULL) 4236 fn_hold(parent); 4237 mutex_exit(&fnp->fn_lock); 4238 4239 return (parent); 4240 } 4241 4242 /* 4243 * Update fnp so that its parent is newparent and its name is newname. 4244 */ 4245 4246 void 4247 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname) 4248 { 4249 nfs4_fname_t *parent, *tmpfnp; 4250 ssize_t newlen; 4251 nfs4_fname_t key; 4252 avl_index_t where; 4253 4254 /* 4255 * This assert exists to catch the client trying to rename 4256 * a dir to be a child of itself. This happened at a recent 4257 * bakeoff against a 3rd party (broken) server which allowed 4258 * the rename to succeed. If it trips it means that: 4259 * a) the code in nfs4rename that detects this case is broken 4260 * b) the server is broken (since it allowed the bogus rename) 4261 * 4262 * For non-DEBUG kernels, prepare for a recursive mutex_enter 4263 * panic below from: mutex_enter(&newparent->fn_lock); 4264 */ 4265 ASSERT(fnp != newparent); 4266 4267 /* 4268 * Remove fnp from its current parent, change its name, then add it 4269 * to newparent. It might happen that fnp was replaced by another 4270 * nfs4_fname_t with the same fn_name in parent->fn_children. 
4271 * In such case, fnp->fn_parent is NULL and we skip the removal 4272 * of fnp from its current parent. 4273 */ 4274 mutex_enter(&fnp->fn_lock); 4275 parent = fnp->fn_parent; 4276 if (parent != NULL) { 4277 mutex_enter(&parent->fn_lock); 4278 avl_remove(&parent->fn_children, fnp); 4279 mutex_exit(&parent->fn_lock); 4280 fn_rele(&fnp->fn_parent); 4281 } 4282 4283 newlen = strlen(newname); 4284 if (newlen != fnp->fn_len) { 4285 ASSERT(newlen < MAXNAMELEN); 4286 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4287 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP); 4288 fnp->fn_len = newlen; 4289 } 4290 (void) strcpy(fnp->fn_name, newname); 4291 4292 again: 4293 mutex_enter(&newparent->fn_lock); 4294 key.fn_name = fnp->fn_name; 4295 tmpfnp = avl_find(&newparent->fn_children, &key, &where); 4296 if (tmpfnp != NULL) { 4297 /* 4298 * This could be due to a file that was unlinked while 4299 * open, or perhaps the rnode is in the free list. Remove 4300 * it from newparent and let it go away on its own. The 4301 * contorted code is to deal with lock order issues and 4302 * race conditions. 4303 */ 4304 fn_hold(tmpfnp); 4305 mutex_exit(&newparent->fn_lock); 4306 mutex_enter(&tmpfnp->fn_lock); 4307 if (tmpfnp->fn_parent == newparent) { 4308 mutex_enter(&newparent->fn_lock); 4309 avl_remove(&newparent->fn_children, tmpfnp); 4310 mutex_exit(&newparent->fn_lock); 4311 fn_rele(&tmpfnp->fn_parent); 4312 } 4313 mutex_exit(&tmpfnp->fn_lock); 4314 fn_rele(&tmpfnp); 4315 goto again; 4316 } 4317 fnp->fn_parent = newparent; 4318 fn_hold(newparent); 4319 avl_insert(&newparent->fn_children, fnp, where); 4320 mutex_exit(&newparent->fn_lock); 4321 mutex_exit(&fnp->fn_lock); 4322 } 4323 4324 #ifdef DEBUG 4325 /* 4326 * Return non-zero if the type information makes sense for the given vnode. 4327 * Otherwise panic. 
4328 */ 4329 int 4330 nfs4_consistent_type(vnode_t *vp) 4331 { 4332 rnode4_t *rp = VTOR4(vp); 4333 4334 if (nfs4_vtype_debug && vp->v_type != VNON && 4335 rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) { 4336 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, " 4337 "rnode attr type=%d", (void *)vp, vp->v_type, 4338 rp->r_attr.va_type); 4339 } 4340 4341 return (1); 4342 } 4343 #endif /* DEBUG */ 4344