1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 
28 * All Rights Reserved 29 */ 30 31 #include <sys/param.h> 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/thread.h> 35 #include <sys/t_lock.h> 36 #include <sys/time.h> 37 #include <sys/vnode.h> 38 #include <sys/vfs.h> 39 #include <sys/errno.h> 40 #include <sys/buf.h> 41 #include <sys/stat.h> 42 #include <sys/cred.h> 43 #include <sys/kmem.h> 44 #include <sys/debug.h> 45 #include <sys/dnlc.h> 46 #include <sys/vmsystm.h> 47 #include <sys/flock.h> 48 #include <sys/share.h> 49 #include <sys/cmn_err.h> 50 #include <sys/tiuser.h> 51 #include <sys/sysmacros.h> 52 #include <sys/callb.h> 53 #include <sys/acl.h> 54 #include <sys/kstat.h> 55 #include <sys/signal.h> 56 #include <sys/disp.h> 57 #include <sys/atomic.h> 58 #include <sys/list.h> 59 #include <sys/sdt.h> 60 61 #include <rpc/types.h> 62 #include <rpc/xdr.h> 63 #include <rpc/auth.h> 64 #include <rpc/clnt.h> 65 66 #include <nfs/nfs.h> 67 #include <nfs/nfs_clnt.h> 68 #include <nfs/nfs_acl.h> 69 70 #include <nfs/nfs4.h> 71 #include <nfs/rnode4.h> 72 #include <nfs/nfs4_clnt.h> 73 74 #include <vm/hat.h> 75 #include <vm/as.h> 76 #include <vm/page.h> 77 #include <vm/pvn.h> 78 #include <vm/seg.h> 79 #include <vm/seg_map.h> 80 #include <vm/seg_vn.h> 81 82 #include <sys/ddi.h> 83 84 /* 85 * Arguments to page-flush thread. 
*/
typedef struct {
	vnode_t *vp;	/* vnode whose pages the flush thread invalidates */
	cred_t *cr;	/* credentials handed to flush_pages() */
} pgflush_t;

#ifdef DEBUG
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

/* TSD key; nfs4_purge_stale_fh() asserts that its value is NULL */
uint_t nfs4_tsd_key;
#endif

/* NOTE(review): neither variable is used in the visible part of the file */
static time_t	nfs4_client_resumed = 0;
static	callb_id_t cid = 0;

static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);
static void	flush_pages(vnode_t *, cred_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);

/*
 * Per-zone list of NFS v4 mounts.
 */
struct mi4_globals {
	kmutex_t	mig_lock;  /* lock protecting mig_list */
	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	boolean_t	mig_destructor_called;
};

static zone_key_t mi4_list_key;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_time_attr_inval)
 * which tells whether the attributes are valid.  The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached.  This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time.  There are minimum and maximum
 * timeout values that can be set per mount point.
 */

/*
 * If a cache purge is in progress, wait for it to finish.
 *
 * The current thread must not be in the middle of an
 * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
 * between this thread, a recovery thread, and the page flush thread.
140 */ 141 int 142 nfs4_waitfor_purge_complete(vnode_t *vp) 143 { 144 rnode4_t *rp; 145 k_sigset_t smask; 146 147 rp = VTOR4(vp); 148 if ((rp->r_serial != NULL && rp->r_serial != curthread) || 149 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) { 150 mutex_enter(&rp->r_statelock); 151 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 152 while ((rp->r_serial != NULL && rp->r_serial != curthread) || 153 ((rp->r_flags & R4PGFLUSH) && 154 rp->r_pgflush != curthread)) { 155 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 156 sigunintr(&smask); 157 mutex_exit(&rp->r_statelock); 158 return (EINTR); 159 } 160 } 161 sigunintr(&smask); 162 mutex_exit(&rp->r_statelock); 163 } 164 return (0); 165 } 166 167 /* 168 * Validate caches by checking cached attributes. If they have timed out, 169 * then get new attributes from the server. As a side effect, cache 170 * invalidation is done if the attributes have changed. 171 * 172 * If the attributes have not timed out and if there is a cache 173 * invalidation being done by some other thread, then wait until that 174 * thread has completed the cache invalidation. 175 */ 176 int 177 nfs4_validate_caches(vnode_t *vp, cred_t *cr) 178 { 179 int error; 180 nfs4_ga_res_t gar; 181 182 if (ATTRCACHE4_VALID(vp)) { 183 error = nfs4_waitfor_purge_complete(vp); 184 if (error) 185 return (error); 186 return (0); 187 } 188 189 gar.n4g_va.va_mask = AT_ALL; 190 return (nfs4_getattr_otw(vp, &gar, cr, 0)); 191 } 192 193 /* 194 * Fill in attribute from the cache. 195 * If valid, then return 0 to indicate that no error occurred, 196 * otherwise return 1 to indicate that an error occurred. 
197 */ 198 static int 199 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap) 200 { 201 rnode4_t *rp; 202 203 rp = VTOR4(vp); 204 mutex_enter(&rp->r_statelock); 205 mutex_enter(&rp->r_statev4_lock); 206 if (ATTRCACHE4_VALID(vp)) { 207 mutex_exit(&rp->r_statev4_lock); 208 /* 209 * Cached attributes are valid 210 */ 211 *vap = rp->r_attr; 212 mutex_exit(&rp->r_statelock); 213 return (0); 214 } 215 mutex_exit(&rp->r_statev4_lock); 216 mutex_exit(&rp->r_statelock); 217 return (1); 218 } 219 220 221 /* 222 * If returned error is ESTALE flush all caches. The nfs4_purge_caches() 223 * call is synchronous because all the pages were invalidated by the 224 * nfs4_invalidate_pages() call. 225 */ 226 void 227 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr) 228 { 229 struct rnode4 *rp = VTOR4(vp); 230 231 /* Ensure that the ..._end_op() call has been done */ 232 ASSERT(tsd_get(nfs4_tsd_key) == NULL); 233 234 if (errno != ESTALE) 235 return; 236 237 mutex_enter(&rp->r_statelock); 238 rp->r_flags |= R4STALE; 239 if (!rp->r_error) 240 rp->r_error = errno; 241 mutex_exit(&rp->r_statelock); 242 if (nfs4_has_pages(vp)) 243 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 244 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE); 245 } 246 247 /* 248 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the 249 * page purge is done asynchronously. 250 */ 251 void 252 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg) 253 { 254 rnode4_t *rp; 255 char *contents; 256 vnode_t *xattr; 257 int size; 258 int pgflush; /* are we the page flush thread? */ 259 260 /* 261 * Purge the DNLC for any entries which refer to this file. 262 */ 263 if (vp->v_count > 1 && 264 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC)) 265 dnlc_purge_vp(vp); 266 267 /* 268 * Clear any readdir state bits and purge the readlink response cache. 
269 */ 270 rp = VTOR4(vp); 271 mutex_enter(&rp->r_statelock); 272 rp->r_flags &= ~R4LOOKUP; 273 contents = rp->r_symlink.contents; 274 size = rp->r_symlink.size; 275 rp->r_symlink.contents = NULL; 276 277 xattr = rp->r_xattr_dir; 278 rp->r_xattr_dir = NULL; 279 280 /* 281 * Purge pathconf cache too. 282 */ 283 rp->r_pathconf.pc4_xattr_valid = 0; 284 rp->r_pathconf.pc4_cache_valid = 0; 285 286 pgflush = (curthread == rp->r_pgflush); 287 mutex_exit(&rp->r_statelock); 288 289 if (contents != NULL) { 290 291 kmem_free((void *)contents, size); 292 } 293 294 if (xattr != NULL) 295 VN_RELE(xattr); 296 297 /* 298 * Flush the page cache. If the current thread is the page flush 299 * thread, don't initiate a new page flush. There's no need for 300 * it, and doing it correctly is hard. 301 */ 302 if (nfs4_has_pages(vp) && !pgflush) { 303 if (!asyncpg) { 304 (void) nfs4_waitfor_purge_complete(vp); 305 flush_pages(vp, cr); 306 } else { 307 pgflush_t *args; 308 309 /* 310 * We don't hold r_statelock while creating the 311 * thread, in case the call blocks. So we use a 312 * flag to indicate that a page flush thread is 313 * active. 314 */ 315 mutex_enter(&rp->r_statelock); 316 if (rp->r_flags & R4PGFLUSH) { 317 mutex_exit(&rp->r_statelock); 318 } else { 319 rp->r_flags |= R4PGFLUSH; 320 mutex_exit(&rp->r_statelock); 321 322 args = kmem_alloc(sizeof (pgflush_t), 323 KM_SLEEP); 324 args->vp = vp; 325 VN_HOLD(args->vp); 326 args->cr = cr; 327 crhold(args->cr); 328 (void) zthread_create(NULL, 0, 329 nfs4_pgflush_thread, args, 0, 330 minclsyspri); 331 } 332 } 333 } 334 335 /* 336 * Flush the readdir response cache. 337 */ 338 nfs4_purge_rddir_cache(vp); 339 } 340 341 /* 342 * Invalidate all pages for the given file, after writing back the dirty 343 * ones. 
344 */ 345 346 static void 347 flush_pages(vnode_t *vp, cred_t *cr) 348 { 349 int error; 350 rnode4_t *rp = VTOR4(vp); 351 352 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 353 if (error == ENOSPC || error == EDQUOT) { 354 mutex_enter(&rp->r_statelock); 355 if (!rp->r_error) 356 rp->r_error = error; 357 mutex_exit(&rp->r_statelock); 358 } 359 } 360 361 /* 362 * Page flush thread. 363 */ 364 365 static void 366 nfs4_pgflush_thread(pgflush_t *args) 367 { 368 rnode4_t *rp = VTOR4(args->vp); 369 370 /* remember which thread we are, so we don't deadlock ourselves */ 371 mutex_enter(&rp->r_statelock); 372 ASSERT(rp->r_pgflush == NULL); 373 rp->r_pgflush = curthread; 374 mutex_exit(&rp->r_statelock); 375 376 flush_pages(args->vp, args->cr); 377 378 mutex_enter(&rp->r_statelock); 379 rp->r_pgflush = NULL; 380 rp->r_flags &= ~R4PGFLUSH; 381 cv_broadcast(&rp->r_cv); 382 mutex_exit(&rp->r_statelock); 383 384 VN_RELE(args->vp); 385 crfree(args->cr); 386 kmem_free(args, sizeof (pgflush_t)); 387 zthread_exit(); 388 } 389 390 /* 391 * Purge the readdir cache of all entries which are not currently 392 * being filled. 393 */ 394 void 395 nfs4_purge_rddir_cache(vnode_t *vp) 396 { 397 rnode4_t *rp; 398 399 rp = VTOR4(vp); 400 401 mutex_enter(&rp->r_statelock); 402 rp->r_direof = NULL; 403 rp->r_flags &= ~R4LOOKUP; 404 rp->r_flags |= R4READDIRWATTR; 405 rddir4_cache_purge(rp); 406 mutex_exit(&rp->r_statelock); 407 } 408 409 /* 410 * Set attributes cache for given vnode using virtual attributes. There is 411 * no cache validation, but if the attributes are deemed to be stale, they 412 * are ignored. This corresponds to nfs3_attrcache(). 413 * 414 * Set the timeout value on the attribute cache and fill it 415 * with the passed in attributes. 
*/
void
nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
{
	rnode4_t *rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	/*
	 * Skip the update when the cache holds attributes saved after
	 * time "t" (r_time_attr_saved > t).
	 */
	if (rp->r_time_attr_saved <= t)
		nfs4_attrcache_va(vp, garp, FALSE);
	mutex_exit(&rp->r_statelock);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 *
 * Arguments:
 *	vp	vnode whose caches are being validated
 *	garp	attributes to cache
 *	t	timestamp compared against r_time_attr_saved and
 *		r_time_cache_inval; nfs4_getattr_otw() passes the
 *		gethrtime() value taken just before its over-the-wire call
 *	cr	credentials passed on to nfs4_purge_caches()
 *	async	passed to nfs4_purge_caches() as "asyncpg" (forced to 1 when
 *		the caller is the recovery thread)
 *	cinfo	directory change info, or NULL; must be non-NULL only for
 *		directories (asserted below)
 *
 * The routine returns without caching anything when the wait for r_serial
 * is interrupted, and it only expires the attribute cache when it must bail
 * out to avoid a deadlock (see the comments in the body).
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed = 0;
	int ctime_changed = 0;
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);
	len_t preattr_rsize;
	boolean_t writemodify_set = B_FALSE;
	boolean_t cachepurge_set = B_FALSE;

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	/*
	 * was_serial records whether this thread already owns r_serial on
	 * entry; if so, r_serial is left set when this routine returns.
	 */
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		/* lwp_nostop is raised for the duration of the wait */
		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				/*
				 * cv_wait_sig() returned 0: give up without
				 * caching the new attributes.
				 */
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * probably made.  If there is an inconsistency in what is
		 * cached, mark them invalid.  If not, don't act on them.
		 */
		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and dir didn't change on server before dirmod op
		 * and dir didn't change after dirmod op but before getattr
		 * then there's a chance that the client's cached data for
		 * this object is current (not stale).  No immediate cache
		 * flush is required.
		 *
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do.  flush all caches for dir.  remove the
			 * attr timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		/*
		 * Write thread after writing data to file on remote server,
		 * will always set R4WRITEMODIFIED to indicate that file on
		 * remote server was modified with a WRITE operation and would
		 * have marked attribute cache as timed out.  If R4WRITEMODIFIED
		 * is set, then do not check for mtime and ctime change.
		 */
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;

			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;
		} else {
			writemodify_set = B_TRUE;
		}
	}

	/* remember the size so a change made by nfs4_attrcache_va is seen */
	preattr_rsize = rp->r_size;

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	/*
	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
	 * drop statelock we will be in transition of purging all
	 * our caches and updating them.  It is possible for another
	 * thread to pick this new file size and read in zeroed data.
	 * stall other threads till cache purge is complete.
	 */
	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
		/*
		 * If R4WRITEMODIFIED was set and we have updated the file
		 * size, Server's returned file size need not necessarily
		 * be because of this Client's WRITE.  We need to purge
		 * all caches.
		 */
		if (writemodify_set)
			mtime_changed = 1;

		/*
		 * cachepurge_set notes that this call set R4INCACHEPURGE,
		 * so that only this call clears it again below.
		 */
		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
			rp->r_flags |= R4INCACHEPURGE;
			cachepurge_set = B_TRUE;
		}
	}

	/* Nothing changed: the new attributes are cached and we are done. */
	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/* Take the r_serial pseudo lock for the invalidation that follows. */
	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~R4INCACHEPURGE;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		cachepurge_set = B_FALSE;
	}

	/* A ctime change drops the access cache and the cached ACL. */
	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			/* r_secattr is re-read under the lock, so recheck */
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	/* Release r_serial unless this thread already held it on entry. */
	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * If set_cache_timeout is non-zero and no r_time_cache_inval is
 * currently set, one is established at now + mi_acdirmax.
 *
 * The caller must be holding r_statelock.
 */
static void
nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	hrtime_t delta;
	hrtime_t now;
	vattr_t *vap = &garp->n4g_va;

	rp = VTOR4(vp);

	ASSERT(MUTEX_HELD(&rp->r_statelock));
	ASSERT(vap->va_mask == AT_ALL);

	/* Switch to master before checking v_flag */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	now = gethrtime();

	mi = VTOMI4(vp);

	/*
	 * Only establish a new cache timeout (if requested).  Never
	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
	 * is done by nfs4_update_dircaches (ancestor in our call chain)
	 */
	if (set_cache_timeout && ! rp->r_time_cache_inval)
		rp->r_time_cache_inval = now + mi->mi_acdirmax;

	/*
	 * Delta is the number of nanoseconds that we will
	 * cache the attributes of the file.  It is based on
	 * the number of nanoseconds since the last time that
	 * we detected a change.  The assumption is that files
	 * that changed recently are likely to change again.
	 * There is a minimum and a maximum for regular files
	 * and for directories which is enforced though.
	 *
	 * Using the time since last change was detected
	 * eliminates direct comparison or calculation
	 * using mixed client and server times.  NFS does
	 * not make any assumptions regarding the client
	 * and server clocks being synchronized.
	 */
	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    vap->va_size != rp->r_attr.va_size) {
		rp->r_time_attr_saved = now;
	}

	/* No attribute caching for MI4_NOAC mounts or VNOCACHE vnodes. */
	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_time_attr_saved;
		/* clamp to the directory or regular-file min/max */
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_time_attr_inval = now + delta;

	rp->r_attr = *vap;
	if (garp->n4g_change_valid)
		rp->r_change = garp->n4g_change;

	/*
	 * The attributes that were returned may be valid and can
	 * be used, but they may not be allowed to be cached.
	 * Reset the timers to cause immediate invalidation and
	 * clear r_change so no VERIFY operations will succeed
	 */
	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
		rp->r_time_attr_inval = now;
		rp->r_time_attr_saved = now;
		rp->r_change = 0;
	}

	/*
	 * If mounted_on_fileid returned AND the object is a stub,
	 * then set object's va_nodeid to the mounted over fid
	 * returned by server.
	 *
	 * If mounted_on_fileid not provided/supported, then
	 * just set it to 0 for now.  Eventually it would be
	 * better to set it to a hashed version of FH.  This
	 * would probably be good enough to provide a unique
	 * fid/d_ino within a dir.
	 *
	 * We don't need to carry mounted_on_fileid in the
	 * rnode as long as the client never requests fileid
	 * without also requesting mounted_on_fileid.  For
	 * now, it stays.
	 */
	if (garp->n4g_mon_fid_valid) {
		rp->r_mntd_fid = garp->n4g_mon_fid;

		if (RP_ISSTUB(rp))
			rp->r_attr.va_nodeid = rp->r_mntd_fid;
	}

	/*
	 * Check to see if there are valid pathconf bits to
	 * cache in the rnode.
	 */
	if (garp->n4g_ext_res) {
		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
		} else {
			/* only the xattr-existence bits are available */
			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
				rp->r_pathconf.pc4_xattr_valid = TRUE;
				rp->r_pathconf.pc4_xattr_exists =
				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
			}
		}
	}
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != vap->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
		rp->r_size = vap->va_size;
	}
	nfs_setswaplike(vp, vap);
	rp->r_flags &= ~R4WRITEMODIFIED;
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
812 */ 813 int 814 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl) 815 { 816 mntinfo4_t *mi = VTOMI4(vp); 817 hrtime_t t; 818 nfs4_recov_state_t recov_state; 819 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 820 821 recov_state.rs_flags = 0; 822 recov_state.rs_num_retry_despite_err = 0; 823 824 /* Save the original mount point security flavor */ 825 (void) save_mnt_secinfo(mi->mi_curr_serv); 826 827 recov_retry: 828 829 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, 830 &recov_state, NULL))) { 831 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 832 return (e.error); 833 } 834 835 t = gethrtime(); 836 837 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl); 838 839 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) { 840 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 841 NULL, OP_GETATTR, NULL) == FALSE) { 842 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, 843 &recov_state, 1); 844 goto recov_retry; 845 } 846 } 847 848 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0); 849 850 if (!e.error) { 851 if (e.stat == NFS4_OK) { 852 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 853 } else { 854 e.error = geterrno4(e.stat); 855 856 nfs4_purge_stale_fh(e.error, vp, cr); 857 } 858 } 859 860 /* 861 * If getattr a node that is a stub for a crossed 862 * mount point, keep the original secinfo flavor for 863 * the current file system, not the crossed one. 864 */ 865 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 866 867 return (e.error); 868 } 869 870 /* 871 * Generate a compound to get attributes over-the-wire. 
872 */ 873 void 874 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp, 875 nfs4_error_t *ep, cred_t *cr, int get_acl) 876 { 877 COMPOUND4args_clnt args; 878 COMPOUND4res_clnt res; 879 int doqueue; 880 rnode4_t *rp = VTOR4(vp); 881 nfs_argop4 argop[2]; 882 883 args.ctag = TAG_GETATTR; 884 885 args.array_len = 2; 886 args.array = argop; 887 888 /* putfh */ 889 argop[0].argop = OP_CPUTFH; 890 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 891 892 /* getattr */ 893 /* 894 * Unlike nfs version 2 and 3, where getattr returns all the 895 * attributes, nfs version 4 returns only the ones explicitly 896 * asked for. This creates problems, as some system functions 897 * (e.g. cache check) require certain attributes and if the 898 * cached node lacks some attributes such as uid/gid, it can 899 * affect system utilities (e.g. "ls") that rely on the information 900 * to be there. This can lead to anything from system crashes to 901 * corrupted information processed by user apps. 902 * So to ensure that all bases are covered, request at least 903 * the AT_ALL attribute mask. 904 */ 905 argop[1].argop = OP_GETATTR; 906 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 907 if (get_acl) 908 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK; 909 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 910 911 doqueue = 1; 912 913 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep); 914 915 if (ep->error) 916 return; 917 918 if (res.status != NFS4_OK) { 919 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 920 return; 921 } 922 923 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res; 924 925 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 926 } 927 928 /* 929 * Return either cached or remote attributes. If get remote attr 930 * use them to check and invalidate caches, then cache the new attributes. 
931 */ 932 int 933 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr) 934 { 935 int error; 936 rnode4_t *rp; 937 nfs4_ga_res_t gar; 938 939 ASSERT(nfs4_consistent_type(vp)); 940 941 /* 942 * If we've got cached attributes, we're done, otherwise go 943 * to the server to get attributes, which will update the cache 944 * in the process. Either way, use the cached attributes for 945 * the caller's vattr_t. 946 * 947 * Note that we ignore the gar set by the OTW call: the attr caching 948 * code may make adjustments when storing to the rnode, and we want 949 * to see those changes here. 950 */ 951 rp = VTOR4(vp); 952 error = 0; 953 mutex_enter(&rp->r_statelock); 954 if (!ATTRCACHE4_VALID(vp)) { 955 mutex_exit(&rp->r_statelock); 956 error = nfs4_getattr_otw(vp, &gar, cr, 0); 957 mutex_enter(&rp->r_statelock); 958 } 959 960 if (!error) 961 *vap = rp->r_attr; 962 963 /* Return the client's view of file size */ 964 vap->va_size = rp->r_size; 965 966 mutex_exit(&rp->r_statelock); 967 968 ASSERT(nfs4_consistent_type(vp)); 969 970 return (error); 971 } 972 973 int 974 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type, 975 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr) 976 { 977 COMPOUND4args_clnt args; 978 COMPOUND4res_clnt res; 979 int doqueue; 980 nfs_argop4 argop[2]; 981 mntinfo4_t *mi = VTOMI4(vp); 982 bool_t needrecov = FALSE; 983 nfs4_recov_state_t recov_state; 984 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 985 nfs4_ga_ext_res_t *gerp; 986 987 recov_state.rs_flags = 0; 988 recov_state.rs_num_retry_despite_err = 0; 989 990 recov_retry: 991 args.ctag = tag_type; 992 993 args.array_len = 2; 994 args.array = argop; 995 996 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL); 997 if (e.error) 998 return (e.error); 999 1000 /* putfh */ 1001 argop[0].argop = OP_CPUTFH; 1002 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 1003 1004 /* getattr */ 1005 argop[1].argop = OP_GETATTR; 1006 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap; 
1007 argop[1].nfs_argop4_u.opgetattr.mi = mi; 1008 1009 doqueue = 1; 1010 1011 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1012 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first", 1013 rnode4info(VTOR4(vp)))); 1014 1015 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 1016 1017 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 1018 if (!needrecov && e.error) { 1019 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1020 needrecov); 1021 return (e.error); 1022 } 1023 1024 if (needrecov) { 1025 bool_t abort; 1026 1027 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1028 "nfs4_attr_otw: initiating recovery\n")); 1029 1030 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 1031 NULL, OP_GETATTR, NULL); 1032 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1033 needrecov); 1034 if (!e.error) { 1035 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1036 e.error = geterrno4(res.status); 1037 } 1038 if (abort == FALSE) 1039 goto recov_retry; 1040 return (e.error); 1041 } 1042 1043 if (res.status) { 1044 e.error = geterrno4(res.status); 1045 } else { 1046 gerp = garp->n4g_ext_res; 1047 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res, 1048 garp, sizeof (nfs4_ga_res_t)); 1049 garp->n4g_ext_res = gerp; 1050 if (garp->n4g_ext_res && 1051 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res) 1052 bcopy(res.array[1].nfs_resop4_u.opgetattr. 1053 ga_res.n4g_ext_res, 1054 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t)); 1055 } 1056 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1057 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1058 needrecov); 1059 return (e.error); 1060 } 1061 1062 /* 1063 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1064 * for the demand-based allocation of async threads per-mount. The 1065 * nfs_async_timeout is the amount of time a thread will live after it 1066 * becomes idle, unless new I/O requests are received before the thread 1067 * dies. 
See nfs4_async_putpage and nfs4_async_start. 1068 */ 1069 1070 static void nfs4_async_start(struct vfs *); 1071 1072 static void 1073 free_async_args4(struct nfs4_async_reqs *args) 1074 { 1075 rnode4_t *rp; 1076 1077 if (args->a_io != NFS4_INACTIVE) { 1078 rp = VTOR4(args->a_vp); 1079 mutex_enter(&rp->r_statelock); 1080 rp->r_count--; 1081 if (args->a_io == NFS4_PUTAPAGE || 1082 args->a_io == NFS4_PAGEIO) 1083 rp->r_awcount--; 1084 cv_broadcast(&rp->r_cv); 1085 mutex_exit(&rp->r_statelock); 1086 VN_RELE(args->a_vp); 1087 } 1088 crfree(args->a_cred); 1089 kmem_free(args, sizeof (*args)); 1090 } 1091 1092 /* 1093 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and 1094 * pageout(), running in the global zone, have legitimate reasons to do 1095 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by 1096 * use of a a per-mount "asynchronous requests manager thread" which is 1097 * signaled by the various asynchronous work routines when there is 1098 * asynchronous work to be done. It is responsible for creating new 1099 * worker threads if necessary, and notifying existing worker threads 1100 * that there is work to be done. 1101 * 1102 * In other words, it will "take the specifications from the customers and 1103 * give them to the engineers." 1104 * 1105 * Worker threads die off of their own accord if they are no longer 1106 * needed. 1107 * 1108 * This thread is killed when the zone is going away or the filesystem 1109 * is being unmounted. 
1110 */ 1111 void 1112 nfs4_async_manager(vfs_t *vfsp) 1113 { 1114 callb_cpr_t cprinfo; 1115 mntinfo4_t *mi; 1116 uint_t max_threads; 1117 1118 mi = VFTOMI4(vfsp); 1119 1120 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1121 "nfs4_async_manager"); 1122 1123 mutex_enter(&mi->mi_async_lock); 1124 /* 1125 * We want to stash the max number of threads that this mount was 1126 * allowed so we can use it later when the variable is set to zero as 1127 * part of the zone/mount going away. 1128 * 1129 * We want to be able to create at least one thread to handle 1130 * asyncrhonous inactive calls. 1131 */ 1132 max_threads = MAX(mi->mi_max_threads, 1); 1133 mutex_enter(&mi->mi_lock); 1134 /* 1135 * We don't want to wait for mi_max_threads to go to zero, since that 1136 * happens as part of a failed unmount, but this thread should only 1137 * exit when the mount is really going away. 1138 * 1139 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be 1140 * attempted: the various _async_*() functions know to do things 1141 * inline if mi_max_threads == 0. Henceforth we just drain out the 1142 * outstanding requests. 1143 * 1144 * Note that we still create zthreads even if we notice the zone is 1145 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone 1146 * shutdown sequence to take slightly longer in some cases, but 1147 * doesn't violate the protocol, as all threads will exit as soon as 1148 * they're done processing the remaining requests. 
1149 */ 1150 while (!(mi->mi_flags & MI4_ASYNC_MGR_STOP) || 1151 mi->mi_async_req_count > 0) { 1152 mutex_exit(&mi->mi_lock); 1153 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1154 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock); 1155 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1156 while (mi->mi_async_req_count > 0) { 1157 /* 1158 * Paranoia: If the mount started out having 1159 * (mi->mi_max_threads == 0), and the value was 1160 * later changed (via a debugger or somesuch), 1161 * we could be confused since we will think we 1162 * can't create any threads, and the calling 1163 * code (which looks at the current value of 1164 * mi->mi_max_threads, now non-zero) thinks we 1165 * can. 1166 * 1167 * So, because we're paranoid, we create threads 1168 * up to the maximum of the original and the 1169 * current value. This means that future 1170 * (debugger-induced) alterations of 1171 * mi->mi_max_threads are ignored for our 1172 * purposes, but who told them they could change 1173 * random values on a live kernel anyhow? 1174 */ 1175 if (mi->mi_threads < 1176 MAX(mi->mi_max_threads, max_threads)) { 1177 mi->mi_threads++; 1178 mutex_exit(&mi->mi_async_lock); 1179 MI4_HOLD(mi); 1180 VFS_HOLD(vfsp); /* hold for new thread */ 1181 (void) zthread_create(NULL, 0, nfs4_async_start, 1182 vfsp, 0, minclsyspri); 1183 mutex_enter(&mi->mi_async_lock); 1184 } 1185 cv_signal(&mi->mi_async_work_cv); 1186 ASSERT(mi->mi_async_req_count != 0); 1187 mi->mi_async_req_count--; 1188 } 1189 mutex_enter(&mi->mi_lock); 1190 } 1191 mutex_exit(&mi->mi_lock); 1192 1193 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 1194 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp)); 1195 /* 1196 * Let everyone know we're done. 1197 */ 1198 mi->mi_manager_thread = NULL; 1199 /* 1200 * Wake up the inactive thread. 
1201 */ 1202 cv_broadcast(&mi->mi_inact_req_cv); 1203 /* 1204 * Wake up anyone sitting in nfs4_async_manager_stop() 1205 */ 1206 cv_broadcast(&mi->mi_async_cv); 1207 /* 1208 * There is no explicit call to mutex_exit(&mi->mi_async_lock) 1209 * since CALLB_CPR_EXIT is actually responsible for releasing 1210 * 'mi_async_lock'. 1211 */ 1212 CALLB_CPR_EXIT(&cprinfo); 1213 VFS_RELE(vfsp); /* release thread's hold */ 1214 MI4_RELE(mi); 1215 zthread_exit(); 1216 } 1217 1218 /* 1219 * Signal (and wait for) the async manager thread to clean up and go away. 1220 */ 1221 void 1222 nfs4_async_manager_stop(vfs_t *vfsp) 1223 { 1224 mntinfo4_t *mi = VFTOMI4(vfsp); 1225 1226 mutex_enter(&mi->mi_async_lock); 1227 mutex_enter(&mi->mi_lock); 1228 mi->mi_flags |= MI4_ASYNC_MGR_STOP; 1229 mutex_exit(&mi->mi_lock); 1230 cv_broadcast(&mi->mi_async_reqs_cv); 1231 /* 1232 * Wait for the async manager thread to die. 1233 */ 1234 while (mi->mi_manager_thread != NULL) 1235 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1236 mutex_exit(&mi->mi_async_lock); 1237 } 1238 1239 int 1240 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1241 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1242 u_offset_t, caddr_t, struct seg *, cred_t *)) 1243 { 1244 rnode4_t *rp; 1245 mntinfo4_t *mi; 1246 struct nfs4_async_reqs *args; 1247 1248 rp = VTOR4(vp); 1249 ASSERT(rp->r_freef == NULL); 1250 1251 mi = VTOMI4(vp); 1252 1253 /* 1254 * If addr falls in a different segment, don't bother doing readahead. 1255 */ 1256 if (addr >= seg->s_base + seg->s_size) 1257 return (-1); 1258 1259 /* 1260 * If we can't allocate a request structure, punt on the readahead. 1261 */ 1262 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1263 return (-1); 1264 1265 /* 1266 * If a lock operation is pending, don't initiate any new 1267 * readaheads. Otherwise, bump r_count to indicate the new 1268 * asynchronous I/O. 
1269 */ 1270 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1271 kmem_free(args, sizeof (*args)); 1272 return (-1); 1273 } 1274 mutex_enter(&rp->r_statelock); 1275 rp->r_count++; 1276 mutex_exit(&rp->r_statelock); 1277 nfs_rw_exit(&rp->r_lkserlock); 1278 1279 args->a_next = NULL; 1280 #ifdef DEBUG 1281 args->a_queuer = curthread; 1282 #endif 1283 VN_HOLD(vp); 1284 args->a_vp = vp; 1285 ASSERT(cr != NULL); 1286 crhold(cr); 1287 args->a_cred = cr; 1288 args->a_io = NFS4_READ_AHEAD; 1289 args->a_nfs4_readahead = readahead; 1290 args->a_nfs4_blkoff = blkoff; 1291 args->a_nfs4_seg = seg; 1292 args->a_nfs4_addr = addr; 1293 1294 mutex_enter(&mi->mi_async_lock); 1295 1296 /* 1297 * If asyncio has been disabled, don't bother readahead. 1298 */ 1299 if (mi->mi_max_threads == 0) { 1300 mutex_exit(&mi->mi_async_lock); 1301 goto noasync; 1302 } 1303 1304 /* 1305 * Link request structure into the async list and 1306 * wakeup async thread to do the i/o. 1307 */ 1308 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) { 1309 mi->mi_async_reqs[NFS4_READ_AHEAD] = args; 1310 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1311 } else { 1312 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args; 1313 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1314 } 1315 1316 if (mi->mi_io_kstats) { 1317 mutex_enter(&mi->mi_lock); 1318 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1319 mutex_exit(&mi->mi_lock); 1320 } 1321 1322 mi->mi_async_req_count++; 1323 ASSERT(mi->mi_async_req_count != 0); 1324 cv_signal(&mi->mi_async_reqs_cv); 1325 mutex_exit(&mi->mi_async_lock); 1326 return (0); 1327 1328 noasync: 1329 mutex_enter(&rp->r_statelock); 1330 rp->r_count--; 1331 cv_broadcast(&rp->r_cv); 1332 mutex_exit(&rp->r_statelock); 1333 VN_RELE(vp); 1334 crfree(cr); 1335 kmem_free(args, sizeof (*args)); 1336 return (-1); 1337 } 1338 1339 /* 1340 * The async queues for each mounted file system are arranged as a 1341 * set of queues, one for each async i/o type. 
 * Requests are taken from the queues in a round-robin fashion.  A number
 * of consecutive requests are taken from each queue before moving on to
 * the next queue.  This functionality may allow the NFS Version 2 server
 * to do write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue before
 * processing any of the other async i/o types.
 *
 * XXX The nfs4_async_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system. Specifically over the
 * wire calls are cpr-unsafe. The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
static void
nfs4_async_start(struct vfs *vfsp)
{
	struct nfs4_async_reqs *args;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	extern int nfs_async_timeout;

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
			args = *mi->mi_async_curr;
			if (args != NULL)
				break;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				if (--mi->mi_threads == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				MI4_RELE(mi);
				zthread_exit();
				/* NOTREACHED */
			}
			time_left = cv_timedwait(&mi->mi_async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout + lbolt);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		} else {
			/* found work, so the idle timeout starts over */
			time_left = 1;
		}

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr = args->a_next;
		if (*mi->mi_async_curr == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}

		/*
		 * NFS4_INACTIVE requests are not counted in the kstat wait
		 * queue when they are queued, so don't count them out here.
		 */
		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 *
		 * A readahead request is only performed while async i/o is
		 * still enabled (mi_max_threads > 0); otherwise it matches
		 * none of the branches below and is just freed.
		 */
		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs4_readahead)(args->a_vp,
			    args->a_nfs4_blkoff, args->a_nfs4_addr,
			    args->a_nfs4_seg, args->a_cred);
		} else if (args->a_io == NFS4_PUTAPAGE) {
			(void) (*args->a_nfs4_putapage)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_PAGEIO) {
			(void) (*args->a_nfs4_pageio)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_READDIR) {
			(void) ((*args->a_nfs4_readdir)(args->a_vp,
			    args->a_nfs4_rdc, args->a_cred));
		} else if (args->a_io == NFS4_COMMIT) {
			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
			    args->a_nfs4_offset, args->a_nfs4_count,
			    args->a_cred);
		} else if (args->a_io == NFS4_INACTIVE) {
			nfs4_inactive_otw(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args4(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}

/*
 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
 * part of VOP_INACTIVE.
 *
 * The thread takes requests from the head of the NFS4_INACTIVE queue.
 * When the queue is empty it marks itself MI4_INACTIVE_IDLE, signals
 * mi_async_cv and sleeps on mi_inact_req_cv; it exits once the queue is
 * empty and the async manager thread is gone.
 */

void
nfs4_inactive_thread(mntinfo4_t *mi)
{
	struct nfs4_async_reqs *args;
	callb_cpr_t cprinfo;
	vfs_t *vfsp = mi->mi_vfsp;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_inactive_thread");

	for (;;) {
		mutex_enter(&mi->mi_async_lock);
		args = mi->mi_async_reqs[NFS4_INACTIVE];
		if (args == NULL) {
			mutex_enter(&mi->mi_lock);
			/*
			 * We don't want to exit until the async manager is done
			 * with its work; hence the check for mi_manager_thread
			 * being NULL.
			 *
			 * The async manager thread will cv_broadcast() on
			 * mi_inact_req_cv when it's done, at which point we'll
			 * wake up and exit.
			 */
			if (mi->mi_manager_thread == NULL)
				goto die;
			mi->mi_flags |= MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			/* let waiters on mi_async_cv see that we are idle */
			cv_signal(&mi->mi_async_cv);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
			mutex_exit(&mi->mi_async_lock);
		} else {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			/* unlink the request before dropping the queue lock */
			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
			mutex_exit(&mi->mi_async_lock);
			nfs4_inactive_otw(args->a_vp, args->a_cred);
			crfree(args->a_cred);
			kmem_free(args, sizeof (*args));
		}
	}
die:
	/* reached with both mi_async_lock and mi_lock held */
	mutex_exit(&mi->mi_lock);
	mi->mi_inactive_thread = NULL;
	cv_signal(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));

	MI4_RELE(mi);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * nfs4_async_stop:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete; nfs4_async_stop_sig() without interruptibility.
 */
void
nfs4_async_stop(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			/* mi_lock is not held across the wait */
			mutex_exit(&mi->mi_lock);
			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs4_async_stop_sig:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete. If a signal is delivered we will abort and return non-zero;
 * otherwise return 0. Since this routine is called from nfs4_unmount, we
 * need to make it interruptible.
 *
 * When interrupted, mi_max_threads is restored to the value it had on
 * entry.
 */
int
nfs4_async_stop_sig(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	ushort_t omax;
	bool_t intr = FALSE;

	/*
	 * Wait for all outstanding putpage operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
			intr = TRUE;
			goto interrupted;
		}
	}

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			/* mi_lock is dropped before waiting (and bailing) */
			mutex_exit(&mi->mi_lock);
			if (!cv_wait_sig(&mi->mi_async_cv,
			    &mi->mi_async_lock)) {
				intr = TRUE;
				goto interrupted;
			}
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
interrupted:
	if (intr)
		mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);

	return (intr);
}

/*
 * Queue an asynchronous putapage request for the page list 'pp' of 'vp'.
 *
 * Returns 0 when the request was queued.  When it cannot be queued (no
 * memory for the request, or async i/o disabled) nothing is written: the
 * pages are handed to pvn_write_done() with B_ERROR, and the result is 0
 * for pageout, fsflush or a caller in the mount's own zone, and EPERM
 * for a caller from another zone.
 */
int
nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PUTAPAGE;
	args->a_nfs4_putapage = putapage;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = off;
	args->a_nfs4_len = (uint_t)len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() == mi->mi_zone) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * or we have run out of memory or we're attempting to
		 * unmount we refuse to do a sync write, because this may
		 * hang pageout/fsflush and the machine. In this case,
		 * we just re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	/*
	 * We'll get here only if (nfs_zone() != mi->mi_zone)
	 * which means that this was a cross-zone sync putpage.
	 *
	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
	 * as dirty and unlock them.
	 *
	 * We don't want to clear B_FORCE here as the caller presumably
	 * knows what they're doing if they set it.
	 */
	pvn_write_done(pp, flags | B_ERROR);
	return (EPERM);
}

/*
 * Queue an asynchronous pageio request for the page list 'pp' of 'vp'.
 *
 * Returns 0 when the request was queued.  When it cannot be queued:
 * reads are abandoned via pvn_read_done() (returns 0); writes from
 * pageout/fsflush are re-marked dirty (returns 0); cross-zone writes are
 * re-marked dirty and fail with EPERM; any other write is performed
 * synchronously by calling 'pageio' and its result is returned.
 */
int
nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PAGEIO;
	args->a_nfs4_pageio = pageio;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = io_off;
	args->a_nfs4_len = (uint_t)io_len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS4_PAGEIO] = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine). In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
}

/*
 * Queue an asynchronous readdir that is to fill the cache entry 'rdc'.
 *
 * If the request cannot be queued, the entry is marked as still needing
 * to be filled (RDDIR cleared, RDDIRREQ set) and released with
 * rddir4_cache_rele() under r_statelock.
 */
void
nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
    int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, skip the readdir.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READDIR;
	args->a_nfs4_readdir = readdir;
	args->a_nfs4_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then skip this request
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
		mi->mi_async_reqs[NFS4_READDIR] = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	mutex_enter(&rp->r_statelock);
	rdc->entries = NULL;
	/*
	 * Indicate that no one is trying to fill this entry and
	 * it still needs to be filled.
	 */
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
}

/*
 * Queue an asynchronous commit of the page list 'plist' for 'vp'.
 *
 * If the request cannot be queued, the commit is done synchronously by
 * calling 'commit', except for pageout, fsflush and cross-zone callers:
 * for those each page is marked C_COMMIT and unlocked instead.
 */
void
nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
    cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;
	page_t *pp;

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_COMMIT;
	args->a_nfs4_commit = commit;
	args->a_nfs4_plist = plist;
	args->a_nfs4_offset = offset;
	args->a_nfs4_count = count;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
		mi->mi_async_reqs[NFS4_COMMIT] = args;
		mi->mi_async_tail[NFS4_COMMIT] = args;
	} else {
		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
		mi->mi_async_tail[NFS4_COMMIT] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != mi->mi_zone) {
		/*
		 * No synchronous commit from these contexts: tag every
		 * page C_COMMIT and unlock it.
		 */
		while (plist != NULL) {
			pp = plist;
			page_sub(&plist, pp);
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
		}
		return;
	}
	(*commit)(vp, plist, offset, count, cr);
}

/*
 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
 * reference to the vnode is handed over to the thread; the caller should
 * no longer refer to the vnode.
 *
 * Unlike most of the async routines, this handoff is needed for
 * correctness reasons, not just performance.  So doing operations in the
 * context of the current thread is not an option.
2096 */ 2097 void 2098 nfs4_async_inactive(vnode_t *vp, cred_t *cr) 2099 { 2100 mntinfo4_t *mi; 2101 struct nfs4_async_reqs *args; 2102 boolean_t signal_inactive_thread = B_FALSE; 2103 2104 mi = VTOMI4(vp); 2105 2106 args = kmem_alloc(sizeof (*args), KM_SLEEP); 2107 args->a_next = NULL; 2108 #ifdef DEBUG 2109 args->a_queuer = curthread; 2110 #endif 2111 args->a_vp = vp; 2112 ASSERT(cr != NULL); 2113 crhold(cr); 2114 args->a_cred = cr; 2115 args->a_io = NFS4_INACTIVE; 2116 2117 /* 2118 * Note that we don't check mi->mi_max_threads here, since we 2119 * *need* to get rid of this vnode regardless of whether someone 2120 * set nfs4_max_threads to zero in /etc/system. 2121 * 2122 * The manager thread knows about this and is willing to create 2123 * at least one thread to accommodate us. 2124 */ 2125 mutex_enter(&mi->mi_async_lock); 2126 if (mi->mi_inactive_thread == NULL) { 2127 rnode4_t *rp; 2128 vnode_t *unldvp = NULL; 2129 char *unlname; 2130 cred_t *unlcred; 2131 2132 mutex_exit(&mi->mi_async_lock); 2133 /* 2134 * We just need to free up the memory associated with the 2135 * vnode, which can be safely done from within the current 2136 * context. 2137 */ 2138 crfree(cr); /* drop our reference */ 2139 kmem_free(args, sizeof (*args)); 2140 rp = VTOR4(vp); 2141 mutex_enter(&rp->r_statelock); 2142 if (rp->r_unldvp != NULL) { 2143 unldvp = rp->r_unldvp; 2144 rp->r_unldvp = NULL; 2145 unlname = rp->r_unlname; 2146 rp->r_unlname = NULL; 2147 unlcred = rp->r_unlcred; 2148 rp->r_unlcred = NULL; 2149 } 2150 mutex_exit(&rp->r_statelock); 2151 /* 2152 * No need to explicitly throw away any cached pages. The 2153 * eventual r4inactive() will attempt a synchronous 2154 * VOP_PUTPAGE() which will immediately fail since the request 2155 * is coming from the wrong zone, and then will proceed to call 2156 * nfs4_invalidate_pages() which will clean things up for us. 
2157 * 2158 * Throw away the delegation here so rp4_addfree()'s attempt to 2159 * return any existing delegations becomes a no-op. 2160 */ 2161 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 2162 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 2163 FALSE); 2164 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 2165 nfs_rw_exit(&mi->mi_recovlock); 2166 } 2167 nfs4_clear_open_streams(rp); 2168 2169 rp4_addfree(rp, cr); 2170 if (unldvp != NULL) { 2171 kmem_free(unlname, MAXNAMELEN); 2172 VN_RELE(unldvp); 2173 crfree(unlcred); 2174 } 2175 return; 2176 } 2177 2178 if (mi->mi_manager_thread == NULL) { 2179 /* 2180 * We want to talk to the inactive thread. 2181 */ 2182 signal_inactive_thread = B_TRUE; 2183 } 2184 2185 /* 2186 * Enqueue the vnode and wake up either the special thread (empty 2187 * list) or an async thread. 2188 */ 2189 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) { 2190 mi->mi_async_reqs[NFS4_INACTIVE] = args; 2191 mi->mi_async_tail[NFS4_INACTIVE] = args; 2192 signal_inactive_thread = B_TRUE; 2193 } else { 2194 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args; 2195 mi->mi_async_tail[NFS4_INACTIVE] = args; 2196 } 2197 if (signal_inactive_thread) { 2198 cv_signal(&mi->mi_inact_req_cv); 2199 } else { 2200 mi->mi_async_req_count++; 2201 ASSERT(mi->mi_async_req_count != 0); 2202 cv_signal(&mi->mi_async_reqs_cv); 2203 } 2204 2205 mutex_exit(&mi->mi_async_lock); 2206 } 2207 2208 int 2209 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2210 { 2211 int pagecreate; 2212 int n; 2213 int saved_n; 2214 caddr_t saved_base; 2215 u_offset_t offset; 2216 int error; 2217 int sm_error; 2218 vnode_t *vp = RTOV(rp); 2219 2220 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2221 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2222 if (!vpm_enable) { 2223 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2224 } 2225 2226 /* 2227 * Move bytes in at most PAGESIZE chunks. 
We must avoid 2228 * spanning pages in uiomove() because page faults may cause 2229 * the cache to be invalidated out from under us. The r_size is not 2230 * updated until after the uiomove. If we push the last page of a 2231 * file before r_size is correct, we will lose the data written past 2232 * the current (and invalid) r_size. 2233 */ 2234 do { 2235 offset = uio->uio_loffset; 2236 pagecreate = 0; 2237 2238 /* 2239 * n is the number of bytes required to satisfy the request 2240 * or the number of bytes to fill out the page. 2241 */ 2242 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2243 2244 /* 2245 * Check to see if we can skip reading in the page 2246 * and just allocate the memory. We can do this 2247 * if we are going to rewrite the entire mapping 2248 * or if we are going to write to or beyond the current 2249 * end of file from the beginning of the mapping. 2250 * 2251 * The read of r_size is now protected by r_statelock. 2252 */ 2253 mutex_enter(&rp->r_statelock); 2254 /* 2255 * When pgcreated is nonzero the caller has already done 2256 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2257 * segkpm this means we already have at least one page 2258 * created and mapped at base. 2259 */ 2260 pagecreate = pgcreated || 2261 ((offset & PAGEOFFSET) == 0 && 2262 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2263 2264 mutex_exit(&rp->r_statelock); 2265 2266 if (!vpm_enable && pagecreate) { 2267 /* 2268 * The last argument tells segmap_pagecreate() to 2269 * always lock the page, as opposed to sometimes 2270 * returning with the page locked. This way we avoid a 2271 * fault on the ensuing uiomove(), but also 2272 * more importantly (to fix bug 1094402) we can 2273 * call segmap_fault() to unlock the page in all 2274 * cases. An alternative would be to modify 2275 * segmap_pagecreate() to tell us when it is 2276 * locking a page, but that's a fairly major 2277 * interface change. 
2278 */ 2279 if (pgcreated == 0) 2280 (void) segmap_pagecreate(segkmap, base, 2281 (uint_t)n, 1); 2282 saved_base = base; 2283 saved_n = n; 2284 } 2285 2286 /* 2287 * The number of bytes of data in the last page can not 2288 * be accurately be determined while page is being 2289 * uiomove'd to and the size of the file being updated. 2290 * Thus, inform threads which need to know accurately 2291 * how much data is in the last page of the file. They 2292 * will not do the i/o immediately, but will arrange for 2293 * the i/o to happen later when this modify operation 2294 * will have finished. 2295 */ 2296 ASSERT(!(rp->r_flags & R4MODINPROGRESS)); 2297 mutex_enter(&rp->r_statelock); 2298 rp->r_flags |= R4MODINPROGRESS; 2299 rp->r_modaddr = (offset & MAXBMASK); 2300 mutex_exit(&rp->r_statelock); 2301 2302 if (vpm_enable) { 2303 /* 2304 * Copy data. If new pages are created, part of 2305 * the page that is not written will be initizliazed 2306 * with zeros. 2307 */ 2308 error = vpm_data_copy(vp, offset, n, uio, 2309 !pagecreate, NULL, 0, S_WRITE); 2310 } else { 2311 error = uiomove(base, n, UIO_WRITE, uio); 2312 } 2313 2314 /* 2315 * r_size is the maximum number of 2316 * bytes known to be in the file. 2317 * Make sure it is at least as high as the 2318 * first unwritten byte pointed to by uio_loffset. 2319 */ 2320 mutex_enter(&rp->r_statelock); 2321 if (rp->r_size < uio->uio_loffset) 2322 rp->r_size = uio->uio_loffset; 2323 rp->r_flags &= ~R4MODINPROGRESS; 2324 rp->r_flags |= R4DIRTY; 2325 mutex_exit(&rp->r_statelock); 2326 2327 /* n = # of bytes written */ 2328 n = (int)(uio->uio_loffset - offset); 2329 2330 if (!vpm_enable) { 2331 base += n; 2332 } 2333 2334 tcount -= n; 2335 /* 2336 * If we created pages w/o initializing them completely, 2337 * we need to zero the part that wasn't set up. 2338 * This happens on a most EOF write cases and if 2339 * we had some sort of error during the uiomove. 
2340 */ 2341 if (!vpm_enable && pagecreate) { 2342 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2343 (void) kzero(base, PAGESIZE - n); 2344 2345 if (pgcreated) { 2346 /* 2347 * Caller is responsible for this page, 2348 * it was not created in this loop. 2349 */ 2350 pgcreated = 0; 2351 } else { 2352 /* 2353 * For bug 1094402: segmap_pagecreate locks 2354 * page. Unlock it. This also unlocks the 2355 * pages allocated by page_create_va() in 2356 * segmap_pagecreate(). 2357 */ 2358 sm_error = segmap_fault(kas.a_hat, segkmap, 2359 saved_base, saved_n, 2360 F_SOFTUNLOCK, S_WRITE); 2361 if (error == 0) 2362 error = sm_error; 2363 } 2364 } 2365 } while (tcount > 0 && error == 0); 2366 2367 return (error); 2368 } 2369 2370 int 2371 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2372 { 2373 rnode4_t *rp; 2374 page_t *pp; 2375 u_offset_t eoff; 2376 u_offset_t io_off; 2377 size_t io_len; 2378 int error; 2379 int rdirty; 2380 int err; 2381 2382 rp = VTOR4(vp); 2383 ASSERT(rp->r_count > 0); 2384 2385 if (!nfs4_has_pages(vp)) 2386 return (0); 2387 2388 ASSERT(vp->v_type != VCHR); 2389 2390 /* 2391 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2392 * writes. B_FORCE is set to force the VM system to actually 2393 * invalidate the pages, even if the i/o failed. The pages 2394 * need to get invalidated because they can't be written out 2395 * because there isn't any space left on either the server's 2396 * file system or in the user's disk quota. The B_FREE bit 2397 * is cleared to avoid confusion as to whether this is a 2398 * request to place the page on the freelist or to destroy 2399 * it. 2400 */ 2401 if ((rp->r_flags & R4OUTOFSPACE) || 2402 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2403 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2404 2405 if (len == 0) { 2406 /* 2407 * If doing a full file synchronous operation, then clear 2408 * the R4DIRTY bit. 
If a page gets dirtied while the flush 2409 * is happening, then R4DIRTY will get set again. The 2410 * R4DIRTY bit must get cleared before the flush so that 2411 * we don't lose this information. 2412 * 2413 * If there are no full file async write operations 2414 * pending and RDIRTY bit is set, clear it. 2415 */ 2416 if (off == (u_offset_t)0 && 2417 !(flags & B_ASYNC) && 2418 (rp->r_flags & R4DIRTY)) { 2419 mutex_enter(&rp->r_statelock); 2420 rdirty = (rp->r_flags & R4DIRTY); 2421 rp->r_flags &= ~R4DIRTY; 2422 mutex_exit(&rp->r_statelock); 2423 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2424 mutex_enter(&rp->r_statelock); 2425 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2426 rdirty = (rp->r_flags & R4DIRTY); 2427 rp->r_flags &= ~R4DIRTY; 2428 } 2429 mutex_exit(&rp->r_statelock); 2430 } else 2431 rdirty = 0; 2432 2433 /* 2434 * Search the entire vp list for pages >= off, and flush 2435 * the dirty pages. 2436 */ 2437 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2438 flags, cr); 2439 2440 /* 2441 * If an error occurred and the file was marked as dirty 2442 * before and we aren't forcibly invalidating pages, then 2443 * reset the R4DIRTY flag. 2444 */ 2445 if (error && rdirty && 2446 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2447 mutex_enter(&rp->r_statelock); 2448 rp->r_flags |= R4DIRTY; 2449 mutex_exit(&rp->r_statelock); 2450 } 2451 } else { 2452 /* 2453 * Do a range from [off...off + len) looking for pages 2454 * to deal with. 2455 */ 2456 error = 0; 2457 io_len = 0; 2458 eoff = off + len; 2459 mutex_enter(&rp->r_statelock); 2460 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2461 io_off += io_len) { 2462 mutex_exit(&rp->r_statelock); 2463 /* 2464 * If we are not invalidating, synchronously 2465 * freeing or writing pages use the routine 2466 * page_lookup_nowait() to prevent reclaiming 2467 * them from the free list. 
2468 */ 2469 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2470 pp = page_lookup(vp, io_off, 2471 (flags & (B_INVAL | B_FREE)) ? 2472 SE_EXCL : SE_SHARED); 2473 } else { 2474 pp = page_lookup_nowait(vp, io_off, 2475 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2476 } 2477 2478 if (pp == NULL || !pvn_getdirty(pp, flags)) 2479 io_len = PAGESIZE; 2480 else { 2481 err = (*rp->r_putapage)(vp, pp, &io_off, 2482 &io_len, flags, cr); 2483 if (!error) 2484 error = err; 2485 /* 2486 * "io_off" and "io_len" are returned as 2487 * the range of pages we actually wrote. 2488 * This allows us to skip ahead more quickly 2489 * since several pages may've been dealt 2490 * with by this iteration of the loop. 2491 */ 2492 } 2493 mutex_enter(&rp->r_statelock); 2494 } 2495 mutex_exit(&rp->r_statelock); 2496 } 2497 2498 return (error); 2499 } 2500 2501 void 2502 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2503 { 2504 rnode4_t *rp; 2505 2506 rp = VTOR4(vp); 2507 if (IS_SHADOW(vp, rp)) 2508 vp = RTOV4(rp); 2509 mutex_enter(&rp->r_statelock); 2510 while (rp->r_flags & R4TRUNCATE) 2511 cv_wait(&rp->r_cv, &rp->r_statelock); 2512 rp->r_flags |= R4TRUNCATE; 2513 if (off == (u_offset_t)0) { 2514 rp->r_flags &= ~R4DIRTY; 2515 if (!(rp->r_flags & R4STALE)) 2516 rp->r_error = 0; 2517 } 2518 rp->r_truncaddr = off; 2519 mutex_exit(&rp->r_statelock); 2520 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2521 B_INVAL | B_TRUNC, cr); 2522 mutex_enter(&rp->r_statelock); 2523 rp->r_flags &= ~R4TRUNCATE; 2524 cv_broadcast(&rp->r_cv); 2525 mutex_exit(&rp->r_statelock); 2526 } 2527 2528 static int 2529 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2530 { 2531 mntinfo4_t *mi; 2532 struct mntinfo_kstat *mik; 2533 vfs_t *vfsp; 2534 2535 /* this is a read-only kstat. Bail out on a write */ 2536 if (rw == KSTAT_WRITE) 2537 return (EACCES); 2538 2539 2540 /* 2541 * We don't want to wait here as kstat_chain_lock could be held by 2542 * dounmount(). 
dounmount() takes vfs_reflock before the chain lock 2543 * and thus could lead to a deadlock. 2544 */ 2545 vfsp = (struct vfs *)ksp->ks_private; 2546 2547 mi = VFTOMI4(vfsp); 2548 mik = (struct mntinfo_kstat *)ksp->ks_data; 2549 2550 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2551 2552 mik->mik_vers = (uint32_t)mi->mi_vers; 2553 mik->mik_flags = mi->mi_flags; 2554 /* 2555 * The sv_secdata holds the flavor the client specifies. 2556 * If the client uses default and a security negotiation 2557 * occurs, sv_currsec will point to the current flavor 2558 * selected from the server flavor list. 2559 * sv_currsec is NULL if no security negotiation takes place. 2560 */ 2561 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 2562 mi->mi_curr_serv->sv_currsec->secmod : 2563 mi->mi_curr_serv->sv_secdata->secmod; 2564 mik->mik_curread = (uint32_t)mi->mi_curread; 2565 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2566 mik->mik_retrans = mi->mi_retrans; 2567 mik->mik_timeo = mi->mi_timeo; 2568 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2569 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2570 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2571 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2572 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2573 mik->mik_failover = (uint32_t)mi->mi_failover; 2574 mik->mik_remap = (uint32_t)mi->mi_remap; 2575 2576 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2577 2578 return (0); 2579 } 2580 2581 void 2582 nfs4_mnt_kstat_init(struct vfs *vfsp) 2583 { 2584 mntinfo4_t *mi = VFTOMI4(vfsp); 2585 2586 /* 2587 * PSARC 2001/697 Contract Private Interface 2588 * All nfs kstats are under SunMC contract 2589 * Please refer to the PSARC listed above and contact 2590 * SunMC before making any changes! 
2591 * 2592 * Changes must be reviewed by Solaris File Sharing 2593 * Changes must be communicated to contract-2001-697@sun.com 2594 * 2595 */ 2596 2597 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2598 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2599 if (mi->mi_io_kstats) { 2600 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2601 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2602 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2603 kstat_install(mi->mi_io_kstats); 2604 } 2605 2606 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2607 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2608 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2609 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2610 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2611 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2612 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2613 kstat_install(mi->mi_ro_kstats); 2614 } 2615 2616 nfs4_mnt_recov_kstat_init(vfsp); 2617 } 2618 2619 void 2620 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2621 { 2622 mntinfo4_t *mi; 2623 2624 mi = VTOMI4(vp); 2625 /* 2626 * In case of forced unmount, do not print any messages 2627 * since it can flood the console with error messages. 2628 */ 2629 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2630 return; 2631 2632 /* 2633 * If the mount point is dead, not recoverable, do not 2634 * print error messages that can flood the console. 2635 */ 2636 if (mi->mi_flags & MI4_RECOV_FAIL) 2637 return; 2638 2639 /* 2640 * No use in flooding the console with ENOSPC 2641 * messages from the same file system. 
2642 */ 2643 if ((error != ENOSPC && error != EDQUOT) || 2644 lbolt - mi->mi_printftime > 0) { 2645 zoneid_t zoneid = mi->mi_zone->zone_id; 2646 2647 #ifdef DEBUG 2648 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2649 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2650 #else 2651 nfs_perror(error, "NFS write error on host %s: %m.\n", 2652 VTOR4(vp)->r_server->sv_hostname, NULL); 2653 #endif 2654 if (error == ENOSPC || error == EDQUOT) { 2655 zcmn_err(zoneid, CE_CONT, 2656 "^File: userid=%d, groupid=%d\n", 2657 crgetuid(cr), crgetgid(cr)); 2658 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2659 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2660 zcmn_err(zoneid, CE_CONT, 2661 "^User: userid=%d, groupid=%d\n", 2662 crgetuid(curthread->t_cred), 2663 crgetgid(curthread->t_cred)); 2664 } 2665 mi->mi_printftime = lbolt + 2666 nfs_write_error_interval * hz; 2667 } 2668 sfh4_printfhandle(VTOR4(vp)->r_fh); 2669 #ifdef DEBUG 2670 if (error == EACCES) { 2671 zcmn_err(zoneid, CE_CONT, 2672 "nfs_bio: cred is%s kcred\n", 2673 cr == kcred ? "" : " not"); 2674 } 2675 #endif 2676 } 2677 } 2678 2679 /* 2680 * Return non-zero if the given file can be safely memory mapped. Locks 2681 * are safe if whole-file (length and offset are both zero). 2682 */ 2683 2684 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2685 2686 static int 2687 nfs4_safemap(const vnode_t *vp) 2688 { 2689 locklist_t *llp, *next_llp; 2690 int safe = 1; 2691 rnode4_t *rp = VTOR4(vp); 2692 2693 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2694 2695 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2696 "vp = %p", (void *)vp)); 2697 2698 /* 2699 * Review all the locks for the vnode, both ones that have been 2700 * acquired and ones that are pending. We assume that 2701 * flk_active_locks_for_vp() has merged any locks that can be 2702 * merged (so that if a process has the entire file locked, it is 2703 * represented as a single lock). 
2704 * 2705 * Note that we can't bail out of the loop if we find a non-safe 2706 * lock, because we have to free all the elements in the llp list. 2707 * We might be able to speed up this code slightly by not looking 2708 * at each lock's l_start and l_len fields once we've found a 2709 * non-safe lock. 2710 */ 2711 2712 llp = flk_active_locks_for_vp(vp); 2713 while (llp) { 2714 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2715 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2716 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2717 if (!SAFE_LOCK(llp->ll_flock)) { 2718 safe = 0; 2719 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2720 "nfs4_safemap: unsafe active lock (%" PRId64 2721 ", %" PRId64 ")", llp->ll_flock.l_start, 2722 llp->ll_flock.l_len)); 2723 } 2724 next_llp = llp->ll_next; 2725 VN_RELE(llp->ll_vp); 2726 kmem_free(llp, sizeof (*llp)); 2727 llp = next_llp; 2728 } 2729 2730 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2731 safe ? "safe" : "unsafe")); 2732 return (safe); 2733 } 2734 2735 /* 2736 * Return whether there is a lost LOCK or LOCKU queued up for the given 2737 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 
2738 */ 2739 2740 bool_t 2741 nfs4_map_lost_lock_conflict(vnode_t *vp) 2742 { 2743 bool_t conflict = FALSE; 2744 nfs4_lost_rqst_t *lrp; 2745 mntinfo4_t *mi = VTOMI4(vp); 2746 2747 mutex_enter(&mi->mi_lock); 2748 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2749 lrp = list_next(&mi->mi_lost_state, lrp)) { 2750 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2751 continue; 2752 ASSERT(lrp->lr_vp != NULL); 2753 if (!VOP_CMP(lrp->lr_vp, vp, NULL)) 2754 continue; /* different file */ 2755 if (!SAFE_LOCK(*lrp->lr_flk)) { 2756 conflict = TRUE; 2757 break; 2758 } 2759 } 2760 2761 mutex_exit(&mi->mi_lock); 2762 return (conflict); 2763 } 2764 2765 /* 2766 * nfs_lockcompletion: 2767 * 2768 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2769 * as non cachable (set VNOCACHE bit). 2770 */ 2771 2772 void 2773 nfs4_lockcompletion(vnode_t *vp, int cmd) 2774 { 2775 rnode4_t *rp = VTOR4(vp); 2776 2777 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2778 ASSERT(!IS_SHADOW(vp, rp)); 2779 2780 if (cmd == F_SETLK || cmd == F_SETLKW) { 2781 2782 if (!nfs4_safemap(vp)) { 2783 mutex_enter(&vp->v_lock); 2784 vp->v_flag |= VNOCACHE; 2785 mutex_exit(&vp->v_lock); 2786 } else { 2787 mutex_enter(&vp->v_lock); 2788 vp->v_flag &= ~VNOCACHE; 2789 mutex_exit(&vp->v_lock); 2790 } 2791 } 2792 /* 2793 * The cached attributes of the file are stale after acquiring 2794 * the lock on the file. They were updated when the file was 2795 * opened, but not updated when the lock was acquired. Therefore the 2796 * cached attributes are invalidated after the lock is obtained. 
2797 */ 2798 PURGE_ATTRCACHE4(vp); 2799 } 2800 2801 /* ARGSUSED */ 2802 static void * 2803 nfs4_mi_init(zoneid_t zoneid) 2804 { 2805 struct mi4_globals *mig; 2806 2807 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2808 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2809 list_create(&mig->mig_list, sizeof (mntinfo4_t), 2810 offsetof(mntinfo4_t, mi_zone_node)); 2811 mig->mig_destructor_called = B_FALSE; 2812 return (mig); 2813 } 2814 2815 /* 2816 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down 2817 * state and killing off threads. 2818 */ 2819 /* ARGSUSED */ 2820 static void 2821 nfs4_mi_shutdown(zoneid_t zoneid, void *data) 2822 { 2823 struct mi4_globals *mig = data; 2824 mntinfo4_t *mi; 2825 nfs4_server_t *np; 2826 2827 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2828 "nfs4_mi_shutdown zone %d\n", zoneid)); 2829 ASSERT(mig != NULL); 2830 for (;;) { 2831 mutex_enter(&mig->mig_lock); 2832 mi = list_head(&mig->mig_list); 2833 if (mi == NULL) { 2834 mutex_exit(&mig->mig_lock); 2835 break; 2836 } 2837 2838 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2839 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp)); 2840 /* 2841 * purge the DNLC for this filesystem 2842 */ 2843 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2844 /* 2845 * Tell existing async worker threads to exit. 2846 */ 2847 mutex_enter(&mi->mi_async_lock); 2848 mi->mi_max_threads = 0; 2849 cv_broadcast(&mi->mi_async_work_cv); 2850 /* 2851 * Set the appropriate flags, signal and wait for both the 2852 * async manager and the inactive thread to exit when they're 2853 * done with their current work. 
2854 */ 2855 mutex_enter(&mi->mi_lock); 2856 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD); 2857 mutex_exit(&mi->mi_lock); 2858 mutex_exit(&mi->mi_async_lock); 2859 if (mi->mi_manager_thread) { 2860 nfs4_async_manager_stop(mi->mi_vfsp); 2861 } 2862 if (mi->mi_inactive_thread) { 2863 mutex_enter(&mi->mi_async_lock); 2864 cv_signal(&mi->mi_inact_req_cv); 2865 /* 2866 * Wait for the inactive thread to exit. 2867 */ 2868 while (mi->mi_inactive_thread != NULL) { 2869 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2870 } 2871 mutex_exit(&mi->mi_async_lock); 2872 } 2873 /* 2874 * Wait for the recovery thread to complete, that is, it will 2875 * signal when it is done using the "mi" structure and about 2876 * to exit 2877 */ 2878 mutex_enter(&mi->mi_lock); 2879 while (mi->mi_in_recovery > 0) 2880 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock); 2881 mutex_exit(&mi->mi_lock); 2882 /* 2883 * We're done when every mi has been done or the list is empty. 2884 * This one is done, remove it from the list. 2885 */ 2886 list_remove(&mig->mig_list, mi); 2887 mutex_exit(&mig->mig_lock); 2888 zone_rele(mi->mi_zone); 2889 /* 2890 * Release hold on vfs and mi done to prevent race with zone 2891 * shutdown. This releases the hold in nfs4_mi_zonelist_add. 2892 */ 2893 VFS_RELE(mi->mi_vfsp); 2894 MI4_RELE(mi); 2895 } 2896 /* 2897 * Tell each renew thread in the zone to exit 2898 */ 2899 mutex_enter(&nfs4_server_lst_lock); 2900 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) { 2901 mutex_enter(&np->s_lock); 2902 if (np->zoneid == zoneid) { 2903 /* 2904 * We add another hold onto the nfs4_server_t 2905 * because this will make sure tha the nfs4_server_t 2906 * stays around until nfs4_callback_fini_zone destroys 2907 * the zone. This way, the renew thread can 2908 * unconditionally release its holds on the 2909 * nfs4_server_t. 
2910 */ 2911 np->s_refcnt++; 2912 nfs4_mark_srv_dead(np); 2913 } 2914 mutex_exit(&np->s_lock); 2915 } 2916 mutex_exit(&nfs4_server_lst_lock); 2917 } 2918 2919 static void 2920 nfs4_mi_free_globals(struct mi4_globals *mig) 2921 { 2922 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2923 mutex_destroy(&mig->mig_lock); 2924 kmem_free(mig, sizeof (*mig)); 2925 } 2926 2927 /* ARGSUSED */ 2928 static void 2929 nfs4_mi_destroy(zoneid_t zoneid, void *data) 2930 { 2931 struct mi4_globals *mig = data; 2932 2933 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2934 "nfs4_mi_destroy zone %d\n", zoneid)); 2935 ASSERT(mig != NULL); 2936 mutex_enter(&mig->mig_lock); 2937 if (list_head(&mig->mig_list) != NULL) { 2938 /* Still waiting for VFS_FREEVFS() */ 2939 mig->mig_destructor_called = B_TRUE; 2940 mutex_exit(&mig->mig_lock); 2941 return; 2942 } 2943 nfs4_mi_free_globals(mig); 2944 } 2945 2946 /* 2947 * Add an NFS mount to the per-zone list of NFS mounts. 2948 */ 2949 void 2950 nfs4_mi_zonelist_add(mntinfo4_t *mi) 2951 { 2952 struct mi4_globals *mig; 2953 2954 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2955 mutex_enter(&mig->mig_lock); 2956 list_insert_head(&mig->mig_list, mi); 2957 /* 2958 * hold added to eliminate race with zone shutdown -this will be 2959 * released in mi_shutdown 2960 */ 2961 MI4_HOLD(mi); 2962 VFS_HOLD(mi->mi_vfsp); 2963 mutex_exit(&mig->mig_lock); 2964 } 2965 2966 /* 2967 * Remove an NFS mount from the per-zone list of NFS mounts. 2968 */ 2969 int 2970 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 2971 { 2972 struct mi4_globals *mig; 2973 int ret = 0; 2974 2975 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2976 mutex_enter(&mig->mig_lock); 2977 mutex_enter(&mi->mi_lock); 2978 /* if this mi is marked dead, then the zone already released it */ 2979 if (!(mi->mi_flags & MI4_DEAD)) { 2980 list_remove(&mig->mig_list, mi); 2981 mutex_exit(&mi->mi_lock); 2982 2983 /* release the holds put on in zonelist_add(). 
*/ 2984 VFS_RELE(mi->mi_vfsp); 2985 MI4_RELE(mi); 2986 ret = 1; 2987 } else { 2988 mutex_exit(&mi->mi_lock); 2989 } 2990 2991 /* 2992 * We can be called asynchronously by VFS_FREEVFS() after the zone 2993 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2994 * mi globals. 2995 */ 2996 if (list_head(&mig->mig_list) == NULL && 2997 mig->mig_destructor_called == B_TRUE) { 2998 nfs4_mi_free_globals(mig); 2999 return (ret); 3000 } 3001 mutex_exit(&mig->mig_lock); 3002 return (ret); 3003 } 3004 3005 void 3006 nfs_free_mi4(mntinfo4_t *mi) 3007 { 3008 nfs4_open_owner_t *foop; 3009 nfs4_oo_hash_bucket_t *bucketp; 3010 nfs4_debug_msg_t *msgp; 3011 int i; 3012 servinfo4_t *svp; 3013 3014 mutex_enter(&mi->mi_lock); 3015 ASSERT(mi->mi_recovthread == NULL); 3016 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 3017 mutex_exit(&mi->mi_lock); 3018 mutex_enter(&mi->mi_async_lock); 3019 ASSERT(mi->mi_threads == 0); 3020 ASSERT(mi->mi_manager_thread == NULL); 3021 mutex_exit(&mi->mi_async_lock); 3022 svp = mi->mi_servers; 3023 sv4_free(svp); 3024 if (mi->mi_io_kstats) { 3025 kstat_delete(mi->mi_io_kstats); 3026 mi->mi_io_kstats = NULL; 3027 } 3028 if (mi->mi_ro_kstats) { 3029 kstat_delete(mi->mi_ro_kstats); 3030 mi->mi_ro_kstats = NULL; 3031 } 3032 if (mi->mi_recov_ksp) { 3033 kstat_delete(mi->mi_recov_ksp); 3034 mi->mi_recov_ksp = NULL; 3035 } 3036 mutex_enter(&mi->mi_msg_list_lock); 3037 while (msgp = list_head(&mi->mi_msg_list)) { 3038 list_remove(&mi->mi_msg_list, msgp); 3039 nfs4_free_msg(msgp); 3040 } 3041 mutex_exit(&mi->mi_msg_list_lock); 3042 list_destroy(&mi->mi_msg_list); 3043 if (mi->mi_fname != NULL) 3044 fn_rele(&mi->mi_fname); 3045 if (mi->mi_rootfh != NULL) 3046 sfh4_rele(&mi->mi_rootfh); 3047 if (mi->mi_srvparentfh != NULL) 3048 sfh4_rele(&mi->mi_srvparentfh); 3049 mutex_destroy(&mi->mi_lock); 3050 mutex_destroy(&mi->mi_async_lock); 3051 mutex_destroy(&mi->mi_msg_list_lock); 3052 nfs_rw_destroy(&mi->mi_recovlock); 3053 
nfs_rw_destroy(&mi->mi_rename_lock); 3054 nfs_rw_destroy(&mi->mi_fh_lock); 3055 cv_destroy(&mi->mi_failover_cv); 3056 cv_destroy(&mi->mi_async_reqs_cv); 3057 cv_destroy(&mi->mi_async_work_cv); 3058 cv_destroy(&mi->mi_async_cv); 3059 cv_destroy(&mi->mi_inact_req_cv); 3060 /* 3061 * Destroy the oo hash lists and mutexes for the cred hash table. 3062 */ 3063 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) { 3064 bucketp = &(mi->mi_oo_list[i]); 3065 /* Destroy any remaining open owners on the list */ 3066 foop = list_head(&bucketp->b_oo_hash_list); 3067 while (foop != NULL) { 3068 list_remove(&bucketp->b_oo_hash_list, foop); 3069 nfs4_destroy_open_owner(foop); 3070 foop = list_head(&bucketp->b_oo_hash_list); 3071 } 3072 list_destroy(&bucketp->b_oo_hash_list); 3073 mutex_destroy(&bucketp->b_lock); 3074 } 3075 /* 3076 * Empty and destroy the freed open owner list. 3077 */ 3078 foop = list_head(&mi->mi_foo_list); 3079 while (foop != NULL) { 3080 list_remove(&mi->mi_foo_list, foop); 3081 nfs4_destroy_open_owner(foop); 3082 foop = list_head(&mi->mi_foo_list); 3083 } 3084 list_destroy(&mi->mi_foo_list); 3085 list_destroy(&mi->mi_bseqid_list); 3086 list_destroy(&mi->mi_lost_state); 3087 avl_destroy(&mi->mi_filehandles); 3088 kmem_free(mi, sizeof (*mi)); 3089 } 3090 void 3091 mi_hold(mntinfo4_t *mi) 3092 { 3093 atomic_add_32(&mi->mi_count, 1); 3094 ASSERT(mi->mi_count != 0); 3095 } 3096 3097 void 3098 mi_rele(mntinfo4_t *mi) 3099 { 3100 ASSERT(mi->mi_count != 0); 3101 if (atomic_add_32_nv(&mi->mi_count, -1) == 0) { 3102 nfs_free_mi4(mi); 3103 } 3104 } 3105 3106 vnode_t nfs4_xattr_notsupp_vnode; 3107 3108 void 3109 nfs4_clnt_init(void) 3110 { 3111 nfs4_vnops_init(); 3112 (void) nfs4_rnode_init(); 3113 (void) nfs4_shadow_init(); 3114 (void) nfs4_acache_init(); 3115 (void) nfs4_subr_init(); 3116 nfs4_acl_init(); 3117 nfs_idmap_init(); 3118 nfs4_callback_init(); 3119 nfs4_secinfo_init(); 3120 #ifdef DEBUG 3121 tsd_create(&nfs4_tsd_key, NULL); 3122 #endif 3123 3124 /* 3125 * Add a CPR 
callback so that we can update client 3126 * lease after a suspend and resume. 3127 */ 3128 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3129 3130 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3131 nfs4_mi_destroy); 3132 3133 /* 3134 * Initialise the reference count of the notsupp xattr cache vnode to 1 3135 * so that it never goes away (VOP_INACTIVE isn't called on it). 3136 */ 3137 nfs4_xattr_notsupp_vnode.v_count = 1; 3138 } 3139 3140 void 3141 nfs4_clnt_fini(void) 3142 { 3143 (void) zone_key_delete(mi4_list_key); 3144 nfs4_vnops_fini(); 3145 (void) nfs4_rnode_fini(); 3146 (void) nfs4_shadow_fini(); 3147 (void) nfs4_acache_fini(); 3148 (void) nfs4_subr_fini(); 3149 nfs_idmap_fini(); 3150 nfs4_callback_fini(); 3151 nfs4_secinfo_fini(); 3152 #ifdef DEBUG 3153 tsd_destroy(&nfs4_tsd_key); 3154 #endif 3155 if (cid) 3156 (void) callb_delete(cid); 3157 } 3158 3159 /*ARGSUSED*/ 3160 static boolean_t 3161 nfs4_client_cpr_callb(void *arg, int code) 3162 { 3163 /* 3164 * We get called for Suspend and Resume events. 3165 * For the suspend case we simply don't care! 3166 */ 3167 if (code == CB_CODE_CPR_CHKPT) { 3168 return (B_TRUE); 3169 } 3170 3171 /* 3172 * When we get to here we are in the process of 3173 * resuming the system from a previous suspend. 
3174 */ 3175 nfs4_client_resumed = gethrestime_sec(); 3176 return (B_TRUE); 3177 } 3178 3179 void 3180 nfs4_renew_lease_thread(nfs4_server_t *sp) 3181 { 3182 int error = 0; 3183 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs; 3184 clock_t tick_delay = 0; 3185 clock_t time_left = 0; 3186 callb_cpr_t cpr_info; 3187 kmutex_t cpr_lock; 3188 3189 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3190 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp)); 3191 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 3192 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease"); 3193 3194 mutex_enter(&sp->s_lock); 3195 /* sp->s_lease_time is set via a GETATTR */ 3196 sp->last_renewal_time = gethrestime_sec(); 3197 sp->lease_valid = NFS4_LEASE_UNINITIALIZED; 3198 ASSERT(sp->s_refcnt >= 1); 3199 3200 for (;;) { 3201 if (!sp->state_ref_count || 3202 sp->lease_valid != NFS4_LEASE_VALID) { 3203 3204 kip_secs = MAX((sp->s_lease_time >> 1) - 3205 (3 * sp->propagation_delay.tv_sec), 1); 3206 3207 tick_delay = SEC_TO_TICK(kip_secs); 3208 3209 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3210 "nfs4_renew_lease_thread: no renew : thread " 3211 "wait %ld secs", kip_secs)); 3212 3213 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3214 "nfs4_renew_lease_thread: no renew : " 3215 "state_ref_count %d, lease_valid %d", 3216 sp->state_ref_count, sp->lease_valid)); 3217 3218 mutex_enter(&cpr_lock); 3219 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3220 mutex_exit(&cpr_lock); 3221 time_left = cv_timedwait(&sp->cv_thread_exit, 3222 &sp->s_lock, tick_delay + lbolt); 3223 mutex_enter(&cpr_lock); 3224 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3225 mutex_exit(&cpr_lock); 3226 3227 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3228 "nfs4_renew_lease_thread: no renew: " 3229 "time left %ld", time_left)); 3230 3231 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3232 goto die; 3233 continue; 3234 } 3235 3236 tmp_last_renewal_time = sp->last_renewal_time; 3237 3238 tmp_time = gethrestime_sec() - 
sp->last_renewal_time + 3239 (3 * sp->propagation_delay.tv_sec); 3240 3241 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3242 "nfs4_renew_lease_thread: tmp_time %ld, " 3243 "sp->last_renewal_time %ld", tmp_time, 3244 sp->last_renewal_time)); 3245 3246 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1); 3247 3248 tick_delay = SEC_TO_TICK(kip_secs); 3249 3250 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3251 "nfs4_renew_lease_thread: valid lease: sleep for %ld " 3252 "secs", kip_secs)); 3253 3254 mutex_enter(&cpr_lock); 3255 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3256 mutex_exit(&cpr_lock); 3257 time_left = cv_timedwait(&sp->cv_thread_exit, &sp->s_lock, 3258 tick_delay + lbolt); 3259 mutex_enter(&cpr_lock); 3260 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3261 mutex_exit(&cpr_lock); 3262 3263 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3264 "nfs4_renew_lease_thread: valid lease: time left %ld :" 3265 "sp last_renewal_time %ld, nfs4_client_resumed %ld, " 3266 "tmp_last_renewal_time %ld", time_left, 3267 sp->last_renewal_time, nfs4_client_resumed, 3268 tmp_last_renewal_time)); 3269 3270 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3271 goto die; 3272 3273 if (tmp_last_renewal_time == sp->last_renewal_time || 3274 (nfs4_client_resumed != 0 && 3275 nfs4_client_resumed > sp->last_renewal_time)) { 3276 /* 3277 * Issue RENEW op since we haven't renewed the lease 3278 * since we slept. 3279 */ 3280 tmp_now_time = gethrestime_sec(); 3281 error = nfs4renew(sp); 3282 /* 3283 * Need to re-acquire sp's lock, nfs4renew() 3284 * relinqueshes it. 3285 */ 3286 mutex_enter(&sp->s_lock); 3287 3288 /* 3289 * See if someone changed s_thread_exit while we gave 3290 * up s_lock. 3291 */ 3292 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3293 goto die; 3294 3295 if (!error) { 3296 /* 3297 * check to see if we implicitly renewed while 3298 * we waited for a reply for our RENEW call. 
3299 */ 3300 if (tmp_last_renewal_time == 3301 sp->last_renewal_time) { 3302 /* no implicit renew came */ 3303 sp->last_renewal_time = tmp_now_time; 3304 } else { 3305 NFS4_DEBUG(nfs4_client_lease_debug, 3306 (CE_NOTE, "renew_thread: did " 3307 "implicit renewal before reply " 3308 "from server for RENEW")); 3309 } 3310 } else { 3311 /* figure out error */ 3312 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3313 "renew_thread: nfs4renew returned error" 3314 " %d", error)); 3315 } 3316 3317 } 3318 } 3319 3320 die: 3321 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3322 "nfs4_renew_lease_thread: thread exiting")); 3323 3324 while (sp->s_otw_call_count != 0) { 3325 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3326 "nfs4_renew_lease_thread: waiting for outstanding " 3327 "otw calls to finish for sp 0x%p, current " 3328 "s_otw_call_count %d", (void *)sp, 3329 sp->s_otw_call_count)); 3330 mutex_enter(&cpr_lock); 3331 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3332 mutex_exit(&cpr_lock); 3333 cv_wait(&sp->s_cv_otw_count, &sp->s_lock); 3334 mutex_enter(&cpr_lock); 3335 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3336 mutex_exit(&cpr_lock); 3337 } 3338 mutex_exit(&sp->s_lock); 3339 3340 nfs4_server_rele(sp); /* free the thread's reference */ 3341 nfs4_server_rele(sp); /* free the list's reference */ 3342 sp = NULL; 3343 3344 done: 3345 mutex_enter(&cpr_lock); 3346 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */ 3347 mutex_destroy(&cpr_lock); 3348 3349 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3350 "nfs4_renew_lease_thread: renew thread exit officially")); 3351 3352 zthread_exit(); 3353 /* NOT REACHED */ 3354 } 3355 3356 /* 3357 * Send out a RENEW op to the server. 3358 * Assumes sp is locked down. 
3359 */ 3360 static int 3361 nfs4renew(nfs4_server_t *sp) 3362 { 3363 COMPOUND4args_clnt args; 3364 COMPOUND4res_clnt res; 3365 nfs_argop4 argop[1]; 3366 int doqueue = 1; 3367 int rpc_error; 3368 cred_t *cr; 3369 mntinfo4_t *mi; 3370 timespec_t prop_time, after_time; 3371 int needrecov = FALSE; 3372 nfs4_recov_state_t recov_state; 3373 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3374 3375 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3376 3377 recov_state.rs_flags = 0; 3378 recov_state.rs_num_retry_despite_err = 0; 3379 3380 recov_retry: 3381 mi = sp->mntinfo4_list; 3382 VFS_HOLD(mi->mi_vfsp); 3383 mutex_exit(&sp->s_lock); 3384 ASSERT(mi != NULL); 3385 3386 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3387 if (e.error) { 3388 VFS_RELE(mi->mi_vfsp); 3389 return (e.error); 3390 } 3391 3392 /* Check to see if we're dealing with a marked-dead sp */ 3393 mutex_enter(&sp->s_lock); 3394 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3395 mutex_exit(&sp->s_lock); 3396 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3397 VFS_RELE(mi->mi_vfsp); 3398 return (0); 3399 } 3400 3401 /* Make sure mi hasn't changed on us */ 3402 if (mi != sp->mntinfo4_list) { 3403 /* Must drop sp's lock to avoid a recursive mutex enter */ 3404 mutex_exit(&sp->s_lock); 3405 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3406 VFS_RELE(mi->mi_vfsp); 3407 mutex_enter(&sp->s_lock); 3408 goto recov_retry; 3409 } 3410 mutex_exit(&sp->s_lock); 3411 3412 args.ctag = TAG_RENEW; 3413 3414 args.array_len = 1; 3415 args.array = argop; 3416 3417 argop[0].argop = OP_RENEW; 3418 3419 mutex_enter(&sp->s_lock); 3420 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3421 cr = sp->s_cred; 3422 crhold(cr); 3423 mutex_exit(&sp->s_lock); 3424 3425 ASSERT(cr != NULL); 3426 3427 /* used to figure out RTT for sp */ 3428 gethrestime(&prop_time); 3429 3430 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3431 "nfs4renew: %s call, sp 0x%p", needrecov ? 
"recov" : "first", 3432 (void*)sp)); 3433 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3434 prop_time.tv_sec, prop_time.tv_nsec)); 3435 3436 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3437 mntinfo4_t *, mi); 3438 3439 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3440 crfree(cr); 3441 3442 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3443 mntinfo4_t *, mi); 3444 3445 gethrestime(&after_time); 3446 3447 mutex_enter(&sp->s_lock); 3448 sp->propagation_delay.tv_sec = 3449 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3450 mutex_exit(&sp->s_lock); 3451 3452 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3453 after_time.tv_sec, after_time.tv_nsec)); 3454 3455 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3456 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3457 nfs4_delegreturn_all(sp); 3458 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3459 VFS_RELE(mi->mi_vfsp); 3460 /* 3461 * If the server returns CB_PATH_DOWN, it has renewed 3462 * the lease and informed us that the callback path is 3463 * down. Since the lease is renewed, just return 0 and 3464 * let the renew thread proceed as normal. 
3465 */ 3466 return (0); 3467 } 3468 3469 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3470 if (!needrecov && e.error) { 3471 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3472 VFS_RELE(mi->mi_vfsp); 3473 return (e.error); 3474 } 3475 3476 rpc_error = e.error; 3477 3478 if (needrecov) { 3479 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3480 "nfs4renew: initiating recovery\n")); 3481 3482 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3483 OP_RENEW, NULL) == FALSE) { 3484 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3485 VFS_RELE(mi->mi_vfsp); 3486 if (!e.error) 3487 (void) xdr_free(xdr_COMPOUND4res_clnt, 3488 (caddr_t)&res); 3489 mutex_enter(&sp->s_lock); 3490 goto recov_retry; 3491 } 3492 /* fall through for res.status case */ 3493 } 3494 3495 if (res.status) { 3496 if (res.status == NFS4ERR_LEASE_MOVED) { 3497 /*EMPTY*/ 3498 /* 3499 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3500 * to renew the lease on that server 3501 */ 3502 } 3503 e.error = geterrno4(res.status); 3504 } 3505 3506 if (!rpc_error) 3507 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3508 3509 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3510 3511 VFS_RELE(mi->mi_vfsp); 3512 3513 return (e.error); 3514 } 3515 3516 void 3517 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3518 { 3519 nfs4_server_t *sp; 3520 3521 /* this locks down sp if it is found */ 3522 sp = find_nfs4_server(mi); 3523 3524 if (sp != NULL) { 3525 nfs4_inc_state_ref_count_nolock(sp, mi); 3526 mutex_exit(&sp->s_lock); 3527 nfs4_server_rele(sp); 3528 } 3529 } 3530 3531 /* 3532 * Bump the number of OPEN files (ie: those with state) so we know if this 3533 * nfs4_server has any state to maintain a lease for or not. 3534 * 3535 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 
3536 */ 3537 void 3538 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3539 { 3540 ASSERT(mutex_owned(&sp->s_lock)); 3541 3542 sp->state_ref_count++; 3543 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3544 "nfs4_inc_state_ref_count: state_ref_count now %d", 3545 sp->state_ref_count)); 3546 3547 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3548 sp->lease_valid = NFS4_LEASE_VALID; 3549 3550 /* 3551 * If this call caused the lease to be marked valid and/or 3552 * took the state_ref_count from 0 to 1, then start the time 3553 * on lease renewal. 3554 */ 3555 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3556 sp->last_renewal_time = gethrestime_sec(); 3557 3558 /* update the number of open files for mi */ 3559 mi->mi_open_files++; 3560 } 3561 3562 void 3563 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3564 { 3565 nfs4_server_t *sp; 3566 3567 /* this locks down sp if it is found */ 3568 sp = find_nfs4_server_all(mi, 1); 3569 3570 if (sp != NULL) { 3571 nfs4_dec_state_ref_count_nolock(sp, mi); 3572 mutex_exit(&sp->s_lock); 3573 nfs4_server_rele(sp); 3574 } 3575 } 3576 3577 /* 3578 * Decrement the number of OPEN files (ie: those with state) so we know if 3579 * this nfs4_server has any state to maintain a lease for or not. 
3580 */ 3581 void 3582 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3583 { 3584 ASSERT(mutex_owned(&sp->s_lock)); 3585 ASSERT(sp->state_ref_count != 0); 3586 sp->state_ref_count--; 3587 3588 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3589 "nfs4_dec_state_ref_count: state ref count now %d", 3590 sp->state_ref_count)); 3591 3592 mi->mi_open_files--; 3593 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3594 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3595 mi->mi_open_files, mi->mi_flags)); 3596 3597 /* We don't have to hold the mi_lock to test mi_flags */ 3598 if (mi->mi_open_files == 0 && 3599 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3600 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3601 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3602 "we have closed the last open file", (void*)mi)); 3603 nfs4_remove_mi_from_server(mi, sp); 3604 } 3605 } 3606 3607 bool_t 3608 inlease(nfs4_server_t *sp) 3609 { 3610 bool_t result; 3611 3612 ASSERT(mutex_owned(&sp->s_lock)); 3613 3614 if (sp->lease_valid == NFS4_LEASE_VALID && 3615 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3616 result = TRUE; 3617 else 3618 result = FALSE; 3619 3620 return (result); 3621 } 3622 3623 3624 /* 3625 * Return non-zero if the given nfs4_server_t is going through recovery. 3626 */ 3627 3628 int 3629 nfs4_server_in_recovery(nfs4_server_t *sp) 3630 { 3631 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3632 } 3633 3634 /* 3635 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3636 * first is less than, equal to, or greater than the second. 3637 */ 3638 3639 int 3640 sfh4cmp(const void *p1, const void *p2) 3641 { 3642 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3643 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3644 3645 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3646 } 3647 3648 /* 3649 * Create a table for shared filehandle objects. 
3650 */ 3651 3652 void 3653 sfh4_createtab(avl_tree_t *tab) 3654 { 3655 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3656 offsetof(nfs4_sharedfh_t, sfh_tree)); 3657 } 3658 3659 /* 3660 * Return a shared filehandle object for the given filehandle. The caller 3661 * is responsible for eventually calling sfh4_rele(). 3662 */ 3663 3664 nfs4_sharedfh_t * 3665 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3666 { 3667 nfs4_sharedfh_t *sfh, *nsfh; 3668 avl_index_t where; 3669 nfs4_sharedfh_t skey; 3670 3671 if (!key) { 3672 skey.sfh_fh = *fh; 3673 key = &skey; 3674 } 3675 3676 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3677 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3678 /* 3679 * We allocate the largest possible filehandle size because it's 3680 * not that big, and it saves us from possibly having to resize the 3681 * buffer later. 3682 */ 3683 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3684 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3685 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3686 nsfh->sfh_refcnt = 1; 3687 nsfh->sfh_flags = SFH4_IN_TREE; 3688 nsfh->sfh_mi = mi; 3689 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3690 (void *)nsfh)); 3691 3692 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3693 sfh = avl_find(&mi->mi_filehandles, key, &where); 3694 if (sfh != NULL) { 3695 mutex_enter(&sfh->sfh_lock); 3696 sfh->sfh_refcnt++; 3697 mutex_exit(&sfh->sfh_lock); 3698 nfs_rw_exit(&mi->mi_fh_lock); 3699 /* free our speculative allocs */ 3700 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3701 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3702 return (sfh); 3703 } 3704 3705 avl_insert(&mi->mi_filehandles, nsfh, where); 3706 nfs_rw_exit(&mi->mi_fh_lock); 3707 3708 return (nsfh); 3709 } 3710 3711 /* 3712 * Return a shared filehandle object for the given filehandle. The caller 3713 * is responsible for eventually calling sfh4_rele(). 
3714 */ 3715 3716 nfs4_sharedfh_t * 3717 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3718 { 3719 nfs4_sharedfh_t *sfh; 3720 nfs4_sharedfh_t key; 3721 3722 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3723 3724 #ifdef DEBUG 3725 if (nfs4_sharedfh_debug) { 3726 nfs4_fhandle_t fhandle; 3727 3728 fhandle.fh_len = fh->nfs_fh4_len; 3729 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3730 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3731 nfs4_printfhandle(&fhandle); 3732 } 3733 #endif 3734 3735 /* 3736 * If there's already an object for the given filehandle, bump the 3737 * reference count and return it. Otherwise, create a new object 3738 * and add it to the AVL tree. 3739 */ 3740 3741 key.sfh_fh = *fh; 3742 3743 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3744 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3745 if (sfh != NULL) { 3746 mutex_enter(&sfh->sfh_lock); 3747 sfh->sfh_refcnt++; 3748 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3749 "sfh4_get: found existing %p, new refcnt=%d", 3750 (void *)sfh, sfh->sfh_refcnt)); 3751 mutex_exit(&sfh->sfh_lock); 3752 nfs_rw_exit(&mi->mi_fh_lock); 3753 return (sfh); 3754 } 3755 nfs_rw_exit(&mi->mi_fh_lock); 3756 3757 return (sfh4_put(fh, mi, &key)); 3758 } 3759 3760 /* 3761 * Get a reference to the given shared filehandle object. 3762 */ 3763 3764 void 3765 sfh4_hold(nfs4_sharedfh_t *sfh) 3766 { 3767 ASSERT(sfh->sfh_refcnt > 0); 3768 3769 mutex_enter(&sfh->sfh_lock); 3770 sfh->sfh_refcnt++; 3771 NFS4_DEBUG(nfs4_sharedfh_debug, 3772 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3773 (void *)sfh, sfh->sfh_refcnt)); 3774 mutex_exit(&sfh->sfh_lock); 3775 } 3776 3777 /* 3778 * Release a reference to the given shared filehandle object and null out 3779 * the given pointer. 
3780 */ 3781 3782 void 3783 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3784 { 3785 mntinfo4_t *mi; 3786 nfs4_sharedfh_t *sfh = *sfhpp; 3787 3788 ASSERT(sfh->sfh_refcnt > 0); 3789 3790 mutex_enter(&sfh->sfh_lock); 3791 if (sfh->sfh_refcnt > 1) { 3792 sfh->sfh_refcnt--; 3793 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3794 "sfh4_rele %p, new refcnt=%d", 3795 (void *)sfh, sfh->sfh_refcnt)); 3796 mutex_exit(&sfh->sfh_lock); 3797 goto finish; 3798 } 3799 mutex_exit(&sfh->sfh_lock); 3800 3801 /* 3802 * Possibly the last reference, so get the lock for the table in 3803 * case it's time to remove the object from the table. 3804 */ 3805 mi = sfh->sfh_mi; 3806 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3807 mutex_enter(&sfh->sfh_lock); 3808 sfh->sfh_refcnt--; 3809 if (sfh->sfh_refcnt > 0) { 3810 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3811 "sfh4_rele %p, new refcnt=%d", 3812 (void *)sfh, sfh->sfh_refcnt)); 3813 mutex_exit(&sfh->sfh_lock); 3814 nfs_rw_exit(&mi->mi_fh_lock); 3815 goto finish; 3816 } 3817 3818 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3819 "sfh4_rele %p, last ref", (void *)sfh)); 3820 if (sfh->sfh_flags & SFH4_IN_TREE) { 3821 avl_remove(&mi->mi_filehandles, sfh); 3822 sfh->sfh_flags &= ~SFH4_IN_TREE; 3823 } 3824 mutex_exit(&sfh->sfh_lock); 3825 nfs_rw_exit(&mi->mi_fh_lock); 3826 mutex_destroy(&sfh->sfh_lock); 3827 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3828 kmem_free(sfh, sizeof (nfs4_sharedfh_t)); 3829 3830 finish: 3831 *sfhpp = NULL; 3832 } 3833 3834 /* 3835 * Update the filehandle for the given shared filehandle object. 
3836 */ 3837 3838 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */ 3839 3840 void 3841 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh) 3842 { 3843 mntinfo4_t *mi = sfh->sfh_mi; 3844 nfs4_sharedfh_t *dupsfh; 3845 avl_index_t where; 3846 nfs4_sharedfh_t key; 3847 3848 #ifdef DEBUG 3849 mutex_enter(&sfh->sfh_lock); 3850 ASSERT(sfh->sfh_refcnt > 0); 3851 mutex_exit(&sfh->sfh_lock); 3852 #endif 3853 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE); 3854 3855 /* 3856 * The basic plan is to remove the shared filehandle object from 3857 * the table, update it to have the new filehandle, then reinsert 3858 * it. 3859 */ 3860 3861 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3862 mutex_enter(&sfh->sfh_lock); 3863 if (sfh->sfh_flags & SFH4_IN_TREE) { 3864 avl_remove(&mi->mi_filehandles, sfh); 3865 sfh->sfh_flags &= ~SFH4_IN_TREE; 3866 } 3867 mutex_exit(&sfh->sfh_lock); 3868 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len; 3869 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val, 3870 sfh->sfh_fh.nfs_fh4_len); 3871 3872 /* 3873 * XXX If there is already a shared filehandle object with the new 3874 * filehandle, we're in trouble, because the rnode code assumes 3875 * that there is only one shared filehandle object for a given 3876 * filehandle. So issue a warning (for read-write mounts only) 3877 * and don't try to re-insert the given object into the table. 3878 * Hopefully the given object will quickly go away and everyone 3879 * will use the new object. 
3880 */ 3881 key.sfh_fh = *newfh; 3882 dupsfh = avl_find(&mi->mi_filehandles, &key, &where); 3883 if (dupsfh != NULL) { 3884 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) { 3885 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: " 3886 "duplicate filehandle detected"); 3887 sfh4_printfhandle(dupsfh); 3888 } 3889 } else { 3890 avl_insert(&mi->mi_filehandles, sfh, where); 3891 mutex_enter(&sfh->sfh_lock); 3892 sfh->sfh_flags |= SFH4_IN_TREE; 3893 mutex_exit(&sfh->sfh_lock); 3894 } 3895 nfs_rw_exit(&mi->mi_fh_lock); 3896 } 3897 3898 /* 3899 * Copy out the current filehandle for the given shared filehandle object. 3900 */ 3901 3902 void 3903 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp) 3904 { 3905 mntinfo4_t *mi = sfh->sfh_mi; 3906 3907 ASSERT(sfh->sfh_refcnt > 0); 3908 3909 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3910 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len; 3911 ASSERT(fhp->fh_len <= NFS4_FHSIZE); 3912 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len); 3913 nfs_rw_exit(&mi->mi_fh_lock); 3914 } 3915 3916 /* 3917 * Print out the filehandle for the given shared filehandle object. 3918 */ 3919 3920 void 3921 sfh4_printfhandle(const nfs4_sharedfh_t *sfh) 3922 { 3923 nfs4_fhandle_t fhandle; 3924 3925 sfh4_copyval(sfh, &fhandle); 3926 nfs4_printfhandle(&fhandle); 3927 } 3928 3929 /* 3930 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0 3931 * if they're the same, +1 if the first is "greater" than the second. The 3932 * caller (or whoever's calling the AVL package) is responsible for 3933 * handling locking issues. 3934 */ 3935 3936 static int 3937 fncmp(const void *p1, const void *p2) 3938 { 3939 const nfs4_fname_t *f1 = p1; 3940 const nfs4_fname_t *f2 = p2; 3941 int res; 3942 3943 res = strcmp(f1->fn_name, f2->fn_name); 3944 /* 3945 * The AVL package wants +/-1, not arbitrary positive or negative 3946 * integers. 
3947 */ 3948 if (res > 0) 3949 res = 1; 3950 else if (res < 0) 3951 res = -1; 3952 return (res); 3953 } 3954 3955 /* 3956 * Get or create an fname with the given name, as a child of the given 3957 * fname. The caller is responsible for eventually releasing the reference 3958 * (fn_rele()). parent may be NULL. 3959 */ 3960 3961 nfs4_fname_t * 3962 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh) 3963 { 3964 nfs4_fname_t key; 3965 nfs4_fname_t *fnp; 3966 avl_index_t where; 3967 3968 key.fn_name = name; 3969 3970 /* 3971 * If there's already an fname registered with the given name, bump 3972 * its reference count and return it. Otherwise, create a new one 3973 * and add it to the parent's AVL tree. 3974 * 3975 * fname entries we are looking for should match both name 3976 * and sfh stored in the fname. 3977 */ 3978 again: 3979 if (parent != NULL) { 3980 mutex_enter(&parent->fn_lock); 3981 fnp = avl_find(&parent->fn_children, &key, &where); 3982 if (fnp != NULL) { 3983 /* 3984 * This hold on fnp is released below later, 3985 * in case this is not the fnp we want. 3986 */ 3987 fn_hold(fnp); 3988 3989 if (fnp->fn_sfh == sfh) { 3990 /* 3991 * We have found our entry. 3992 * put an hold and return it. 3993 */ 3994 mutex_exit(&parent->fn_lock); 3995 return (fnp); 3996 } 3997 3998 /* 3999 * We have found an entry that has a mismatching 4000 * fn_sfh. This could be a stale entry due to 4001 * server side rename. We will remove this entry 4002 * and make sure no such entries exist. 4003 */ 4004 mutex_exit(&parent->fn_lock); 4005 mutex_enter(&fnp->fn_lock); 4006 if (fnp->fn_parent == parent) { 4007 /* 4008 * Remove ourselves from parent's 4009 * fn_children tree. 
4010 */ 4011 mutex_enter(&parent->fn_lock); 4012 avl_remove(&parent->fn_children, fnp); 4013 mutex_exit(&parent->fn_lock); 4014 fn_rele(&fnp->fn_parent); 4015 } 4016 mutex_exit(&fnp->fn_lock); 4017 fn_rele(&fnp); 4018 goto again; 4019 } 4020 } 4021 4022 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP); 4023 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL); 4024 fnp->fn_parent = parent; 4025 if (parent != NULL) 4026 fn_hold(parent); 4027 fnp->fn_len = strlen(name); 4028 ASSERT(fnp->fn_len < MAXNAMELEN); 4029 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP); 4030 (void) strcpy(fnp->fn_name, name); 4031 fnp->fn_refcnt = 1; 4032 4033 /* 4034 * This hold on sfh is later released 4035 * when we do the final fn_rele() on this fname. 4036 */ 4037 sfh4_hold(sfh); 4038 fnp->fn_sfh = sfh; 4039 4040 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t), 4041 offsetof(nfs4_fname_t, fn_tree)); 4042 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4043 "fn_get %p:%s, a new nfs4_fname_t!", 4044 (void *)fnp, fnp->fn_name)); 4045 if (parent != NULL) { 4046 avl_insert(&parent->fn_children, fnp, where); 4047 mutex_exit(&parent->fn_lock); 4048 } 4049 4050 return (fnp); 4051 } 4052 4053 void 4054 fn_hold(nfs4_fname_t *fnp) 4055 { 4056 atomic_add_32(&fnp->fn_refcnt, 1); 4057 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4058 "fn_hold %p:%s, new refcnt=%d", 4059 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 4060 } 4061 4062 /* 4063 * Decrement the reference count of the given fname, and destroy it if its 4064 * reference count goes to zero. Nulls out the given pointer. 
4065 */ 4066 4067 void 4068 fn_rele(nfs4_fname_t **fnpp) 4069 { 4070 nfs4_fname_t *parent; 4071 uint32_t newref; 4072 nfs4_fname_t *fnp; 4073 4074 recur: 4075 fnp = *fnpp; 4076 *fnpp = NULL; 4077 4078 mutex_enter(&fnp->fn_lock); 4079 parent = fnp->fn_parent; 4080 if (parent != NULL) 4081 mutex_enter(&parent->fn_lock); /* prevent new references */ 4082 newref = atomic_add_32_nv(&fnp->fn_refcnt, -1); 4083 if (newref > 0) { 4084 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4085 "fn_rele %p:%s, new refcnt=%d", 4086 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 4087 if (parent != NULL) 4088 mutex_exit(&parent->fn_lock); 4089 mutex_exit(&fnp->fn_lock); 4090 return; 4091 } 4092 4093 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4094 "fn_rele %p:%s, last reference, deleting...", 4095 (void *)fnp, fnp->fn_name)); 4096 if (parent != NULL) { 4097 avl_remove(&parent->fn_children, fnp); 4098 mutex_exit(&parent->fn_lock); 4099 } 4100 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4101 sfh4_rele(&fnp->fn_sfh); 4102 mutex_destroy(&fnp->fn_lock); 4103 avl_destroy(&fnp->fn_children); 4104 kmem_free(fnp, sizeof (nfs4_fname_t)); 4105 /* 4106 * Recursivly fn_rele the parent. 4107 * Use goto instead of a recursive call to avoid stack overflow. 4108 */ 4109 if (parent != NULL) { 4110 fnpp = &parent; 4111 goto recur; 4112 } 4113 } 4114 4115 /* 4116 * Returns the single component name of the given fname, in a MAXNAMELEN 4117 * string buffer, which the caller is responsible for freeing. Note that 4118 * the name may become invalid as a result of fn_move(). 
4119 */ 4120 4121 char * 4122 fn_name(nfs4_fname_t *fnp) 4123 { 4124 char *name; 4125 4126 ASSERT(fnp->fn_len < MAXNAMELEN); 4127 name = kmem_alloc(MAXNAMELEN, KM_SLEEP); 4128 mutex_enter(&fnp->fn_lock); 4129 (void) strcpy(name, fnp->fn_name); 4130 mutex_exit(&fnp->fn_lock); 4131 4132 return (name); 4133 } 4134 4135 4136 /* 4137 * fn_path_realloc 4138 * 4139 * This function, used only by fn_path, constructs 4140 * a new string which looks like "prepend" + "/" + "current". 4141 * by allocating a new string and freeing the old one. 4142 */ 4143 static void 4144 fn_path_realloc(char **curses, char *prepend) 4145 { 4146 int len, curlen = 0; 4147 char *news; 4148 4149 if (*curses == NULL) { 4150 /* 4151 * Prime the pump, allocate just the 4152 * space for prepend and return that. 4153 */ 4154 len = strlen(prepend) + 1; 4155 news = kmem_alloc(len, KM_SLEEP); 4156 (void) strncpy(news, prepend, len); 4157 } else { 4158 /* 4159 * Allocate the space for a new string 4160 * +1 +1 is for the "/" and the NULL 4161 * byte at the end of it all. 4162 */ 4163 curlen = strlen(*curses); 4164 len = curlen + strlen(prepend) + 1 + 1; 4165 news = kmem_alloc(len, KM_SLEEP); 4166 (void) strncpy(news, prepend, len); 4167 (void) strcat(news, "/"); 4168 (void) strcat(news, *curses); 4169 kmem_free(*curses, curlen + 1); 4170 } 4171 *curses = news; 4172 } 4173 4174 /* 4175 * Returns the path name (starting from the fs root) for the given fname. 4176 * The caller is responsible for freeing. Note that the path may be or 4177 * become invalid as a result of fn_move(). 4178 */ 4179 4180 char * 4181 fn_path(nfs4_fname_t *fnp) 4182 { 4183 char *path; 4184 nfs4_fname_t *nextfnp; 4185 4186 if (fnp == NULL) 4187 return (NULL); 4188 4189 path = NULL; 4190 4191 /* walk up the tree constructing the pathname. 
*/ 4192 4193 fn_hold(fnp); /* adjust for later rele */ 4194 do { 4195 mutex_enter(&fnp->fn_lock); 4196 /* 4197 * Add fn_name in front of the current path 4198 */ 4199 fn_path_realloc(&path, fnp->fn_name); 4200 nextfnp = fnp->fn_parent; 4201 if (nextfnp != NULL) 4202 fn_hold(nextfnp); 4203 mutex_exit(&fnp->fn_lock); 4204 fn_rele(&fnp); 4205 fnp = nextfnp; 4206 } while (fnp != NULL); 4207 4208 return (path); 4209 } 4210 4211 /* 4212 * Return a reference to the parent of the given fname, which the caller is 4213 * responsible for eventually releasing. 4214 */ 4215 4216 nfs4_fname_t * 4217 fn_parent(nfs4_fname_t *fnp) 4218 { 4219 nfs4_fname_t *parent; 4220 4221 mutex_enter(&fnp->fn_lock); 4222 parent = fnp->fn_parent; 4223 if (parent != NULL) 4224 fn_hold(parent); 4225 mutex_exit(&fnp->fn_lock); 4226 4227 return (parent); 4228 } 4229 4230 /* 4231 * Update fnp so that its parent is newparent and its name is newname. 4232 */ 4233 4234 void 4235 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname) 4236 { 4237 nfs4_fname_t *parent, *tmpfnp; 4238 ssize_t newlen; 4239 nfs4_fname_t key; 4240 avl_index_t where; 4241 4242 /* 4243 * This assert exists to catch the client trying to rename 4244 * a dir to be a child of itself. This happened at a recent 4245 * bakeoff against a 3rd party (broken) server which allowed 4246 * the rename to succeed. If it trips it means that: 4247 * a) the code in nfs4rename that detects this case is broken 4248 * b) the server is broken (since it allowed the bogus rename) 4249 * 4250 * For non-DEBUG kernels, prepare for a recursive mutex_enter 4251 * panic below from: mutex_enter(&newparent->fn_lock); 4252 */ 4253 ASSERT(fnp != newparent); 4254 4255 /* 4256 * Remove fnp from its current parent, change its name, then add it 4257 * to newparent. 
4258 */ 4259 mutex_enter(&fnp->fn_lock); 4260 parent = fnp->fn_parent; 4261 mutex_enter(&parent->fn_lock); 4262 avl_remove(&parent->fn_children, fnp); 4263 mutex_exit(&parent->fn_lock); 4264 fn_rele(&fnp->fn_parent); 4265 4266 newlen = strlen(newname); 4267 if (newlen != fnp->fn_len) { 4268 ASSERT(newlen < MAXNAMELEN); 4269 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4270 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP); 4271 fnp->fn_len = newlen; 4272 } 4273 (void) strcpy(fnp->fn_name, newname); 4274 4275 again: 4276 mutex_enter(&newparent->fn_lock); 4277 key.fn_name = fnp->fn_name; 4278 tmpfnp = avl_find(&newparent->fn_children, &key, &where); 4279 if (tmpfnp != NULL) { 4280 /* 4281 * This could be due to a file that was unlinked while 4282 * open, or perhaps the rnode is in the free list. Remove 4283 * it from newparent and let it go away on its own. The 4284 * contorted code is to deal with lock order issues and 4285 * race conditions. 4286 */ 4287 fn_hold(tmpfnp); 4288 mutex_exit(&newparent->fn_lock); 4289 mutex_enter(&tmpfnp->fn_lock); 4290 if (tmpfnp->fn_parent == newparent) { 4291 mutex_enter(&newparent->fn_lock); 4292 avl_remove(&newparent->fn_children, tmpfnp); 4293 mutex_exit(&newparent->fn_lock); 4294 fn_rele(&tmpfnp->fn_parent); 4295 } 4296 mutex_exit(&tmpfnp->fn_lock); 4297 fn_rele(&tmpfnp); 4298 goto again; 4299 } 4300 fnp->fn_parent = newparent; 4301 fn_hold(newparent); 4302 avl_insert(&newparent->fn_children, fnp, where); 4303 mutex_exit(&newparent->fn_lock); 4304 mutex_exit(&fnp->fn_lock); 4305 } 4306 4307 #ifdef DEBUG 4308 /* 4309 * Return non-zero if the type information makes sense for the given vnode. 4310 * Otherwise panic. 
4311 */ 4312 int 4313 nfs4_consistent_type(vnode_t *vp) 4314 { 4315 rnode4_t *rp = VTOR4(vp); 4316 4317 if (nfs4_vtype_debug && vp->v_type != VNON && 4318 rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) { 4319 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, " 4320 "rnode attr type=%d", (void *)vp, vp->v_type, 4321 rp->r_attr.va_type); 4322 } 4323 4324 return (1); 4325 } 4326 #endif /* DEBUG */ 4327