/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All Rights Reserved
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/dnlc.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/acl.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <sys/disp.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>

#include <sys/ddi.h>

/*
 * Arguments to page-flush thread.
 */
typedef struct {
	vnode_t *vp;
	cred_t *cr;
} pgflush_t;

#ifdef DEBUG
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

uint_t nfs4_tsd_key;
#endif

static time_t nfs4_client_resumed = 0;
static callb_id_t cid = 0;

static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);
static void	flush_pages(vnode_t *, cred_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);

struct mi4_globals {
	kmutex_t	mig_lock;	/* lock protecting mig_list */
	list_t		mig_list;	/* list of NFS v4 mounts in zone */
	boolean_t	mig_destructor_called;
};

static zone_key_t mi4_list_key;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_time_attr_inval)
 * which tells whether the attributes are valid.
 * The time is initialized to the difference between the current time and
 * the modify time of the vnode when new attributes are cached.  This allows
 * the attributes for files that have changed recently to be timed out
 * sooner than for files that have not changed for a long time.  There are
 * minimum and maximum timeout values that can be set per mount point.
 */

/*
 * If a cache purge is in progress, wait for it to finish.
 *
 * The current thread must not be in the middle of an
 * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
 * between this thread, a recovery thread, and the page flush thread.
 */
int
nfs4_waitfor_purge_complete(vnode_t *vp)
{
	rnode4_t *rp;
	k_sigset_t smask;

	rp = VTOR4(vp);
	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
		    ((rp->r_flags & R4PGFLUSH) &&
		    rp->r_pgflush != curthread)) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				sigunintr(&smask);
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		sigunintr(&smask);
		mutex_exit(&rp->r_statelock);
	}
	return (0);
}

/*
 * Validate caches by checking cached attributes.  If they have timed out,
 * then get new attributes from the server.  As a side effect, cache
 * invalidation is done if the attributes have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs4_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	nfs4_ga_res_t gar;

	if (ATTRCACHE4_VALID(vp)) {
		error = nfs4_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	gar.n4g_va.va_mask = AT_ALL;
	return (nfs4_getattr_otw(vp, &gar, cr, 0));
}

/*
 * Fill in attributes from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
{
	rnode4_t *rp;

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	mutex_enter(&rp->r_statev4_lock);
	if (ATTRCACHE4_VALID(vp)) {
		mutex_exit(&rp->r_statev4_lock);
		/*
		 * Cached attributes are valid
		 */
		*vap = rp->r_attr;
		mutex_exit(&rp->r_statelock);
		return (0);
	}
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);
	return (1);
}


/*
 * If the returned error is ESTALE, flush all caches.  The nfs4_purge_caches()
 * call is synchronous because all the pages were invalidated by the
 * nfs4_invalidate_pages() call.
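 * The rnode is also marked R4STALE and its r_error is set, so that
 * subsequent operations on the now-stale file handle fail quickly.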
 */
void
nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
{
	struct rnode4 *rp = VTOR4(vp);

	/* Ensure that the ..._end_op() call has been done */
	ASSERT(tsd_get(nfs4_tsd_key) == NULL);

	if (errno != ESTALE)
		return;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= R4STALE;
	if (!rp->r_error)
		rp->r_error = errno;
	mutex_exit(&rp->r_statelock);
	if (nfs4_has_pages(vp))
		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
}

/*
 * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
 * page purge is done asynchronously.
 */
void
nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
{
	rnode4_t *rp;
	char *contents;
	vnode_t *xattr;
	int size;
	int pgflush;	/* are we the page flush thread? */

	/*
	 * Purge the DNLC for any entries which refer to this file.
	 */
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
		dnlc_purge_vp(vp);

	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4LOOKUP;
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;

	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * Purge pathconf cache too.
	 */
	rp->r_pathconf.pc4_xattr_valid = 0;
	rp->r_pathconf.pc4_cache_valid = 0;

	pgflush = (curthread == rp->r_pgflush);
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}

	if (xattr != NULL)
		VN_RELE(xattr);

	/*
	 * Flush the page cache.  If the current thread is the page flush
	 * thread, don't initiate a new page flush.  There's no need for
	 * it, and doing it correctly is hard.
	 */
	if (nfs4_has_pages(vp) && !pgflush) {
		if (!asyncpg) {
			(void) nfs4_waitfor_purge_complete(vp);
			flush_pages(vp, cr);
		} else {
			pgflush_t *args;

			/*
			 * We don't hold r_statelock while creating the
			 * thread, in case the call blocks.  So we use a
			 * flag to indicate that a page flush thread is
			 * active.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & R4PGFLUSH) {
				mutex_exit(&rp->r_statelock);
			} else {
				rp->r_flags |= R4PGFLUSH;
				mutex_exit(&rp->r_statelock);

				args = kmem_alloc(sizeof (pgflush_t),
				    KM_SLEEP);
				args->vp = vp;
				VN_HOLD(args->vp);
				args->cr = cr;
				crhold(args->cr);
				(void) zthread_create(NULL, 0,
				    nfs4_pgflush_thread, args, 0,
				    minclsyspri);
			}
		}
	}

	/*
	 * Flush the readdir response cache.
	 */
	nfs4_purge_rddir_cache(vp);
}

/*
 * Invalidate all pages for the given file, after writing back the dirty
 * ones.
 */

static void
flush_pages(vnode_t *vp, cred_t *cr)
{
	int error;
	rnode4_t *rp = VTOR4(vp);

	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
	if (error == ENOSPC || error == EDQUOT) {
		mutex_enter(&rp->r_statelock);
		if (!rp->r_error)
			rp->r_error = error;
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Page flush thread.
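 * Records itself in r_pgflush so that other threads can detect the
 * in-progress flush, writes back and invalidates the vnode's pages, then
 * clears R4PGFLUSH and wakes any waiters before exiting.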
 */

static void
nfs4_pgflush_thread(pgflush_t *args)
{
	rnode4_t *rp = VTOR4(args->vp);

	/* remember which thread we are, so we don't deadlock ourselves */
	mutex_enter(&rp->r_statelock);
	ASSERT(rp->r_pgflush == NULL);
	rp->r_pgflush = curthread;
	mutex_exit(&rp->r_statelock);

	flush_pages(args->vp, args->cr);

	mutex_enter(&rp->r_statelock);
	rp->r_pgflush = NULL;
	rp->r_flags &= ~R4PGFLUSH;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);

	VN_RELE(args->vp);
	crfree(args->cr);
	kmem_free(args, sizeof (pgflush_t));
	zthread_exit();
}

/*
 * Purge the readdir cache of all entries which are not currently
 * being filled.
 */
void
nfs4_purge_rddir_cache(vnode_t *vp)
{
	rnode4_t *rp;

	rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	rp->r_direof = NULL;
	rp->r_flags &= ~R4LOOKUP;
	rp->r_flags |= R4READDIRWATTR;
	rddir4_cache_purge(rp);
	mutex_exit(&rp->r_statelock);
}

/*
 * Set attributes cache for given vnode using virtual attributes.  There is
 * no cache validation, but if the attributes are deemed to be stale, they
 * are ignored.  This corresponds to nfs3_attrcache().
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 */
void
nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
{
	rnode4_t *rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	if (rp->r_time_attr_saved <= t)
		nfs4_attrcache_va(vp, garp, FALSE);
	mutex_exit(&rp->r_statelock);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed = 0;
	int ctime_changed = 0;
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);
	len_t preattr_rsize;
	boolean_t writemodify_set = B_FALSE;
	boolean_t cachepurge_set = B_FALSE;

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * probably made.  If there is an inconsistency in what is
		 * cached, mark them invalid.  If not, don't act on them.
		 */
		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and the dir didn't change on the server before the dirmod op,
		 * and the dir didn't change after the dirmod op but before the
		 * getattr, then there's a chance that the client's cached data
		 * for this object is current (not stale).  No immediate cache
		 * flush is required.
		 *
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do: flush all caches for the dir and
			 * remove the attr timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		/*
		 * A write thread, after writing data to the file on the
		 * remote server, will always set R4WRITEMODIFIED to indicate
		 * that the file on the remote server was modified with a
		 * WRITE operation, and will have marked the attribute cache
		 * as timed out.  If R4WRITEMODIFIED is set, then do not
		 * check for mtime and ctime changes.
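		 * In that case writemodify_set is remembered below, so that
		 * a later change in file size still forces a full cache
		 * purge.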
		 */
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;

			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;
		} else {
			writemodify_set = B_TRUE;
		}
	}

	preattr_rsize = rp->r_size;

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	/*
	 * If we have updated the file size in nfs4_attrcache_va, then as
	 * soon as we drop the statelock we will be in the middle of purging
	 * and updating all of our caches.  It is possible for another
	 * thread to pick up the new file size and read in zeroed data, so
	 * stall other threads until the cache purge is complete.
	 */
	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
		/*
		 * If R4WRITEMODIFIED was set and we have updated the file
		 * size, the server's returned file size is not necessarily
		 * the result of this client's WRITE.  We need to purge
		 * all caches.
		 */
		if (writemodify_set)
			mtime_changed = 1;

		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
			rp->r_flags |= R4INCACHEPURGE;
			cachepurge_set = B_TRUE;
		}
	}

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~R4INCACHEPURGE;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		cachepurge_set = B_FALSE;
	}

	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
static void
nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	hrtime_t delta;
	hrtime_t now;
	vattr_t *vap = &garp->n4g_va;

	rp = VTOR4(vp);

	ASSERT(MUTEX_HELD(&rp->r_statelock));
	ASSERT(vap->va_mask == AT_ALL);

	/* Switch to master before checking v_flag */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	now = gethrtime();

	mi = VTOMI4(vp);

	/*
	 * Only establish a new cache timeout (if requested).  Never
	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
	 * is done by nfs4_update_dircaches (ancestor in our call chain).
	 */
	if (set_cache_timeout && ! rp->r_time_cache_inval)
		rp->r_time_cache_inval = now + mi->mi_acdirmax;

	/*
	 * Delta is the number of nanoseconds that we will
	 * cache the attributes of the file.  It is based on
	 * the number of nanoseconds since the last time that
	 * we detected a change.
	 * The assumption is that files that changed recently are likely to
	 * change again.  There are enforced minimum and maximum values,
	 * both for regular files and for directories.
	 *
	 * Using the time since the last change was detected
	 * eliminates direct comparison or calculation
	 * using mixed client and server times.  NFS does
	 * not make any assumptions regarding the client
	 * and server clocks being synchronized.
	 */
	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    vap->va_size != rp->r_attr.va_size) {
		rp->r_time_attr_saved = now;
	}

	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_time_attr_saved;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_time_attr_inval = now + delta;

	rp->r_attr = *vap;
	if (garp->n4g_change_valid)
		rp->r_change = garp->n4g_change;

	/*
	 * The attributes that were returned may be valid and can
	 * be used, but they may not be allowed to be cached.
	 * Reset the timers to cause immediate invalidation and
	 * clear r_change so no VERIFY operations will succeed.
	 */
	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
		rp->r_time_attr_inval = now;
		rp->r_time_attr_saved = now;
		rp->r_change = 0;
	}

	/*
	 * If mounted_on_fileid was returned AND the object is a stub,
	 * then set the object's va_nodeid to the mounted-over fid
	 * returned by the server.
	 *
	 * If mounted_on_fileid is not provided/supported, then
	 * just set it to 0 for now.  Eventually it would be
	 * better to set it to a hashed version of FH.  This
	 * would probably be good enough to provide a unique
	 * fid/d_ino within a dir.
	 *
	 * We don't need to carry mounted_on_fileid in the
	 * rnode as long as the client never requests fileid
	 * without also requesting mounted_on_fileid.  For
	 * now, it stays.
	 */
	if (garp->n4g_mon_fid_valid) {
		rp->r_mntd_fid = garp->n4g_mon_fid;

		if (RP_ISSTUB(rp))
			rp->r_attr.va_nodeid = rp->r_mntd_fid;
	}

	/*
	 * Check to see if there are valid pathconf bits to
	 * cache in the rnode.
	 */
	if (garp->n4g_ext_res) {
		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
		} else {
			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
				rp->r_pathconf.pc4_xattr_valid = TRUE;
				rp->r_pathconf.pc4_xattr_exists =
				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
			}
		}
	}
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != vap->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
		rp->r_size = vap->va_size;
	}
	nfs_setswaplike(vp, vap);
	rp->r_flags &= ~R4WRITEMODIFIED;
}

/*
 * Get attributes over-the-wire and update the attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
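 * The operation is bracketed by nfs4_start_fop()/nfs4_end_fop() and is
 * retried from recov_retry if the result indicates that recovery is needed.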
 */
int
nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
{
	mntinfo4_t *mi = VTOMI4(vp);
	hrtime_t t;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:

	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
	    &recov_state, NULL))) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		return (e.error);
	}

	t = gethrtime();

	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);

	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
			    &recov_state, 1);
			goto recov_retry;
		}
	}

	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);

	if (!e.error) {
		if (e.stat == NFS4_OK) {
			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
		} else {
			e.error = geterrno4(e.stat);

			nfs4_purge_stale_fh(e.error, vp, cr);
		}
	}

	/*
	 * When doing a getattr on a node that is a stub for a crossed
	 * mount point, keep the original secinfo flavor for the current
	 * file system, not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	return (e.error);
}

/*
 * Generate a compound to get attributes over-the-wire.
 */
void
nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
    nfs4_error_t *ep, cred_t *cr, int get_acl)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	rnode4_t *rp = VTOR4(vp);
	nfs_argop4 argop[2];

	args.ctag = TAG_GETATTR;

	args.array_len = 2;
	args.array = argop;

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr */
	/*
	 * Unlike NFS versions 2 and 3, where getattr returns all the
	 * attributes, NFS version 4 returns only the ones explicitly
	 * asked for.  This creates problems, as some system functions
	 * (e.g. cache check) require certain attributes, and if the
	 * cached node lacks some attributes such as uid/gid, it can
	 * affect system utilities (e.g. "ls") that rely on the information
	 * being there.  This can lead to anything from system crashes to
	 * corrupted information processed by user apps.
	 * So to ensure that all bases are covered, request at least
	 * the AT_ALL attribute mask.
	 */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	if (get_acl)
		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	doqueue = 1;

	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);

	if (ep->error)
		return;

	if (res.status != NFS4_OK) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}

/*
 * Return either cached or remote attributes.  If remote attributes are
 * fetched, use them to check and invalidate the caches, then cache the
 * new attributes.
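 * The returned va_size is always the client's view of the file size
 * (r_size), which may differ from the attributes most recently returned
 * by the server.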
 */
int
nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
{
	int error;
	rnode4_t *rp;
	nfs4_ga_res_t gar;

	ASSERT(nfs4_consistent_type(vp));

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.  Either way, use the cached attributes for
	 * the caller's vattr_t.
	 *
	 * Note that we ignore the gar set by the OTW call: the attr caching
	 * code may make adjustments when storing to the rnode, and we want
	 * to see those changes here.
	 */
	rp = VTOR4(vp);
	error = 0;
	mutex_enter(&rp->r_statelock);
	if (!ATTRCACHE4_VALID(vp)) {
		mutex_exit(&rp->r_statelock);
		error = nfs4_getattr_otw(vp, &gar, cr, 0);
		mutex_enter(&rp->r_statelock);
	}

	if (!error)
		*vap = rp->r_attr;

	/* Return the client's view of file size */
	vap->va_size = rp->r_size;

	mutex_exit(&rp->r_statelock);

	ASSERT(nfs4_consistent_type(vp));

	return (error);
}

int
nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
    nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	nfs_argop4 argop[2];
	mntinfo4_t *mi = VTOMI4(vp);
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_ext_res_t *gerp;

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = tag_type;

	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* getattr */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	doqueue = 1;

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (!needrecov && e.error) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
		    needrecov);
		return (e.error);
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_attr_otw: initiating recovery\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL);
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
		    needrecov);
		if (!e.error) {
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			e.error = geterrno4(res.status);
		}
		if (abort == FALSE)
			goto recov_retry;
		return (e.error);
	}

	if (res.status) {
		e.error = geterrno4(res.status);
	} else {
		gerp = garp->n4g_ext_res;
		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    garp, sizeof (nfs4_ga_res_t));
		garp->n4g_ext_res = gerp;
		if (garp->n4g_ext_res &&
		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
			bcopy(res.array[1].nfs_resop4_u.opgetattr.
			    ga_res.n4g_ext_res,
			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
	    needrecov);
	return (e.error);
}

/*
 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount.  The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs4_async_putpage and nfs4_async_start.
 */

static void	nfs4_async_start(struct vfs *);

static void
free_async_args4(struct nfs4_async_reqs *args)
{
	rnode4_t *rp;

	if (args->a_io != NFS4_INACTIVE) {
		rp = VTOR4(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS4_PUTAPAGE ||
		    args->a_io == NFS4_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done.  It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 */
void
nfs4_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo4_t *mi;
	uint_t max_threads;

	mi = VFTOMI4(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	mutex_enter(&mi->mi_lock);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount is really going away.
	 *
	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
	 * outstanding requests.
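	 * (The loop below therefore runs until the stop flag is set and
	 * mi_async_req_count has drained to zero.)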
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	while (!(mi->mi_flags & MI4_ASYNC_MGR_STOP) ||
	    mi->mi_async_req_count > 0) {
		mutex_exit(&mi->mi_lock);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value.  This means that future
			 * (debugger-induced) alterations of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads <
			    MAX(mi->mi_max_threads, max_threads)) {
				mi->mi_threads++;
				mutex_exit(&mi->mi_async_lock);
				MI4_HOLD(mi);
				VFS_HOLD(vfsp); /* hold for new thread */
				(void) zthread_create(NULL, 0,
				    nfs4_async_start, vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			cv_signal(&mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	/*
	 * Wake up the inactive thread.
	 */
	cv_broadcast(&mi->mi_inact_req_cv);
	/*
	 * Wake up anyone sitting in nfs4_async_manager_stop()
	 */
	cv_broadcast(&mi->mi_async_cv);
	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);	/* release thread's hold */
	MI4_RELE(mi);
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 */
void
nfs4_async_manager_stop(vfs_t *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	mutex_enter(&mi->mi_async_lock);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
	mutex_exit(&mi->mi_lock);
	cv_broadcast(&mi->mi_async_reqs_cv);
	/*
	 * Wait for the async manager thread to die.
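	 * The manager clears mi_manager_thread and broadcasts mi_async_cv
	 * just before it exits, which terminates the loop below.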
	 */
	while (mi->mi_manager_thread != NULL)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
}

int
nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
    struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
    u_offset_t, caddr_t, struct seg *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If addr falls in a different segment, don't bother doing readahead.
	 */
	if (addr >= seg->s_base + seg->s_size)
		return (-1);

	/*
	 * If we can't allocate a request structure, punt on the readahead.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		return (-1);

	/*
	 * If a lock operation is pending, don't initiate any new
	 * readaheads.  Otherwise, bump r_count to indicate the new
	 * asynchronous I/O.
	 */
	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
		kmem_free(args, sizeof (*args));
		return (-1);
	}
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	nfs_rw_exit(&rp->r_lkserlock);

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READ_AHEAD;
	args->a_nfs4_readahead = readahead;
	args->a_nfs4_blkoff = blkoff;
	args->a_nfs4_seg = seg;
	args->a_nfs4_addr = addr;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, don't bother with readahead.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	VN_RELE(vp);
	crfree(cr);
	kmem_free(args, sizeof (*args));
	return (-1);
}

/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type.  Requests are taken
 * from the queues in a round-robin fashion.  A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.  This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs4_async_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system.
 * Specifically, over-the-wire calls are cpr-unsafe.  The thread should be
 * reevaluated in case of future updates to the cpr model.
 */
static void
nfs4_async_start(struct vfs *vfsp)
{
	struct nfs4_async_reqs *args;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	extern int nfs_async_timeout;

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
			args = *mi->mi_async_curr;
			if (args != NULL)
				break;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				if (--mi->mi_threads == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				MI4_RELE(mi);
				zthread_exit();
				/* NOTREACHED */
			}
			time_left = cv_timedwait(&mi->mi_async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout + lbolt);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		} else {
			time_left = 1;
		}

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr = args->a_next;
		if (*mi->mi_async_curr == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}

		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
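		 * The request is dispatched according to a_io: read-ahead,
		 * putpage, pageio, readdir, commit, or inactive.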
		 */
		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs4_readahead)(args->a_vp,
			    args->a_nfs4_blkoff, args->a_nfs4_addr,
			    args->a_nfs4_seg, args->a_cred);
		} else if (args->a_io == NFS4_PUTAPAGE) {
			(void) (*args->a_nfs4_putapage)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_PAGEIO) {
			(void) (*args->a_nfs4_pageio)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_READDIR) {
			(void) ((*args->a_nfs4_readdir)(args->a_vp,
			    args->a_nfs4_rdc, args->a_cred));
		} else if (args->a_io == NFS4_COMMIT) {
			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
			    args->a_nfs4_offset, args->a_nfs4_count,
			    args->a_cred);
		} else if (args->a_io == NFS4_INACTIVE) {
			nfs4_inactive_otw(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args4(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}

/*
 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
 * part of VOP_INACTIVE.
 */

void
nfs4_inactive_thread(mntinfo4_t *mi)
{
	struct nfs4_async_reqs *args;
	callb_cpr_t cprinfo;
	vfs_t *vfsp = mi->mi_vfsp;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_inactive_thread");

	for (;;) {
		mutex_enter(&mi->mi_async_lock);
		args = mi->mi_async_reqs[NFS4_INACTIVE];
		if (args == NULL) {
			mutex_enter(&mi->mi_lock);
			/*
			 * We don't want to exit until the async manager is
			 * done with its work; hence the check for
			 * mi_manager_thread being NULL.
			 *
			 * The async manager thread will cv_broadcast() on
			 * mi_inact_req_cv when it's done, at which point
			 * we'll wake up and exit.
			 */
			if (mi->mi_manager_thread == NULL)
				goto die;
			mi->mi_flags |= MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			cv_signal(&mi->mi_async_cv);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
			mutex_exit(&mi->mi_async_lock);
		} else {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
			mutex_exit(&mi->mi_async_lock);
			nfs4_inactive_otw(args->a_vp, args->a_cred);
			crfree(args->a_cred);
			kmem_free(args, sizeof (*args));
		}
	}
die:
	mutex_exit(&mi->mi_lock);
	mi->mi_inactive_thread = NULL;
	cv_signal(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
	 * CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));

	MI4_RELE(mi);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * nfs4_async_stop:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete; nfs4_async_stop_sig() without interruptibility.
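 * Setting mi_max_threads to zero makes the various _async_*() routines
 * fall back to their synchronous paths instead of queueing new work.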
 */
void
nfs4_async_stop(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs4_async_stop_sig:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete.  If a signal is delivered we will abort and return non-zero;
 * otherwise return 0.  Since this routine is called from nfs4_unmount, we
 * need to make it interruptible.
 */
int
nfs4_async_stop_sig(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	ushort_t omax;
	bool_t intr = FALSE;

	/*
	 * Wait for all outstanding putpage operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
			intr = TRUE;
			goto interrupted;
		}
	}

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			if (!cv_wait_sig(&mi->mi_async_cv,
			    &mi->mi_async_lock)) {
				intr = TRUE;
				goto interrupted;
			}
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
interrupted:
	if (intr)
		mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);

	return (intr);
}

int
nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PUTAPAGE;
	args->a_nfs4_putapage = putapage;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = off;
	args->a_nfs4_len = (uint_t)len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() == mi->mi_zone) {
		/*
		 * If we get here in the context of pageout/fsflush, or we
		 * have run out of memory, or we're attempting to unmount,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush and the machine.  In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	/*
	 * We'll get here only if (nfs_zone() != mi->mi_zone)
	 * which means that this was a cross-zone sync putpage.
	 *
	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
	 * as dirty and unlock them.
	 *
	 * We don't want to clear B_FORCE here as the caller presumably
	 * knows what they're doing if they set it.
	 */
	pvn_write_done(pp, flags | B_ERROR);
	return (EPERM);
}

int
nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PAGEIO;
	args->a_nfs4_pageio = pageio;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = io_off;
	args->a_nfs4_len = (uint_t)io_len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS4_PAGEIO] = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine).  In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
}

void
nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
    int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, skip the readdir.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READDIR;
	args->a_nfs4_readdir = readdir;
	args->a_nfs4_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then skip this request
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
		mi->mi_async_reqs[NFS4_READDIR] = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	mutex_enter(&rp->r_statelock);
	rdc->entries = NULL;
	/*
	 * Indicate that no one is trying to fill this entry and
	 * it still needs to be filled.
	 */
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
}

void
nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
    cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;
	page_t *pp;

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
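	 * (Unless this is pageout/fsflush or a cross-zone request, in which
	 * case the pages are simply marked C_COMMIT and unlocked; see the
	 * noasync path below.)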
2012 */ 2013 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 2014 goto noasync; 2015 2016 args->a_next = NULL; 2017 #ifdef DEBUG 2018 args->a_queuer = curthread; 2019 #endif 2020 VN_HOLD(vp); 2021 args->a_vp = vp; 2022 ASSERT(cr != NULL); 2023 crhold(cr); 2024 args->a_cred = cr; 2025 args->a_io = NFS4_COMMIT; 2026 args->a_nfs4_commit = commit; 2027 args->a_nfs4_plist = plist; 2028 args->a_nfs4_offset = offset; 2029 args->a_nfs4_count = count; 2030 2031 mutex_enter(&mi->mi_async_lock); 2032 2033 /* 2034 * If asyncio has been disabled, then make a synchronous request. 2035 * This check is done a second time in case async io was disabled 2036 * while this thread was blocked waiting for memory pressure to 2037 * reduce or for the queue to drain. 2038 */ 2039 if (mi->mi_max_threads == 0) { 2040 mutex_exit(&mi->mi_async_lock); 2041 2042 VN_RELE(vp); 2043 crfree(cr); 2044 kmem_free(args, sizeof (*args)); 2045 goto noasync; 2046 } 2047 2048 /* 2049 * Link request structure into the async list and 2050 * wakeup async thread to do the i/o. 2051 */ 2052 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) { 2053 mi->mi_async_reqs[NFS4_COMMIT] = args; 2054 mi->mi_async_tail[NFS4_COMMIT] = args; 2055 } else { 2056 mi->mi_async_tail[NFS4_COMMIT]->a_next = args; 2057 mi->mi_async_tail[NFS4_COMMIT] = args; 2058 } 2059 2060 mutex_enter(&rp->r_statelock); 2061 rp->r_count++; 2062 mutex_exit(&rp->r_statelock); 2063 2064 if (mi->mi_io_kstats) { 2065 mutex_enter(&mi->mi_lock); 2066 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2067 mutex_exit(&mi->mi_lock); 2068 } 2069 2070 mi->mi_async_req_count++; 2071 ASSERT(mi->mi_async_req_count != 0); 2072 cv_signal(&mi->mi_async_reqs_cv); 2073 mutex_exit(&mi->mi_async_lock); 2074 return; 2075 2076 noasync: 2077 if (curproc == proc_pageout || curproc == proc_fsflush || 2078 nfs_zone() != mi->mi_zone) { 2079 while (plist != NULL) { 2080 pp = plist; 2081 page_sub(&plist, pp); 2082 pp->p_fsdata = C_COMMIT; 2083 page_unlock(pp); 2084 } 2085 return; 2086 } 2087 (*commit)(vp, plist, offset, count, cr); 2088 }
2126 */ 2127 mutex_enter(&mi->mi_async_lock); 2128 if (mi->mi_inactive_thread == NULL) { 2129 rnode4_t *rp; 2130 vnode_t *unldvp = NULL; 2131 char *unlname; 2132 cred_t *unlcred; 2133 2134 mutex_exit(&mi->mi_async_lock); 2135 /* 2136 * We just need to free up the memory associated with the 2137 * vnode, which can be safely done from within the current 2138 * context. 2139 */ 2140 crfree(cr); /* drop our reference */ 2141 kmem_free(args, sizeof (*args)); 2142 rp = VTOR4(vp); 2143 mutex_enter(&rp->r_statelock); 2144 if (rp->r_unldvp != NULL) { 2145 unldvp = rp->r_unldvp; 2146 rp->r_unldvp = NULL; 2147 unlname = rp->r_unlname; 2148 rp->r_unlname = NULL; 2149 unlcred = rp->r_unlcred; 2150 rp->r_unlcred = NULL; 2151 } 2152 mutex_exit(&rp->r_statelock); 2153 /* 2154 * No need to explicitly throw away any cached pages. The 2155 * eventual r4inactive() will attempt a synchronous 2156 * VOP_PUTPAGE() which will immediately fail since the request 2157 * is coming from the wrong zone, and then will proceed to call 2158 * nfs4_invalidate_pages() which will clean things up for us. 2159 * 2160 * Throw away the delegation here so rp4_addfree()'s attempt to 2161 * return any existing delegations becomes a no-op. 2162 */ 2163 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 2164 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 2165 FALSE); 2166 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 2167 nfs_rw_exit(&mi->mi_recovlock); 2168 } 2169 nfs4_clear_open_streams(rp); 2170 2171 rp4_addfree(rp, cr); 2172 if (unldvp != NULL) { 2173 kmem_free(unlname, MAXNAMELEN); 2174 VN_RELE(unldvp); 2175 crfree(unlcred); 2176 } 2177 return; 2178 } 2179 2180 if (mi->mi_manager_thread == NULL) { 2181 /* 2182 * We want to talk to the inactive thread. 2183 */ 2184 signal_inactive_thread = B_TRUE; 2185 } 2186 2187 /* 2188 * Enqueue the vnode and wake up either the special thread (empty 2189 * list) or an async thread. 2190 */ 2191 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) { 2192 mi->mi_async_reqs[NFS4_INACTIVE] = args; 2193 mi->mi_async_tail[NFS4_INACTIVE] = args; 2194 signal_inactive_thread = B_TRUE; 2195 } else { 2196 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args; 2197 mi->mi_async_tail[NFS4_INACTIVE] = args; 2198 } 2199 if (signal_inactive_thread) { 2200 cv_signal(&mi->mi_inact_req_cv); 2201 } else { 2202 mi->mi_async_req_count++; 2203 ASSERT(mi->mi_async_req_count != 0); 2204 cv_signal(&mi->mi_async_reqs_cv); 2205 } 2206 2207 mutex_exit(&mi->mi_async_lock); 2208 } 2209 2210 int 2211 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2212 { 2213 int pagecreate; 2214 int n; 2215 int saved_n; 2216 caddr_t saved_base; 2217 u_offset_t offset; 2218 int error; 2219 int sm_error; 2220 vnode_t *vp = RTOV(rp); 2221 2222 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2223 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2224 if (!vpm_enable) { 2225 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2226 } 2227 2228 /* 2229 * Move bytes in at most PAGESIZE chunks. We must avoid 2230 * spanning pages in uiomove() because page faults may cause 2231 * the cache to be invalidated out from under us. The r_size is not 2232 * updated until after the uiomove. If we push the last page of a 2233 * file before r_size is correct, we will lose the data written past 2234 * the current (and invalid) r_size. 
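 *
 * A worked example of the chunking (assuming a 4K PAGESIZE for
 * illustration): with uio_loffset sitting 4000 bytes into its page
 * and tcount = 1000, the first pass computes
 *	n = MIN(PAGESIZE - (offset & PAGEOFFSET), tcount)
 *	  = MIN(4096 - 4000, 1000) = 96
 * so only the 96 bytes left in that page are moved, and the next
 * iteration starts on a page boundary with tcount = 904.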
2235 */ 2236 do { 2237 offset = uio->uio_loffset; 2238 pagecreate = 0; 2239 2240 /* 2241 * n is the number of bytes required to satisfy the request 2242 * or the number of bytes to fill out the page. 2243 */ 2244 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2245 2246 /* 2247 * Check to see if we can skip reading in the page 2248 * and just allocate the memory. We can do this 2249 * if we are going to rewrite the entire mapping 2250 * or if we are going to write to or beyond the current 2251 * end of file from the beginning of the mapping. 2252 * 2253 * The read of r_size is now protected by r_statelock. 2254 */ 2255 mutex_enter(&rp->r_statelock); 2256 /* 2257 * When pgcreated is nonzero the caller has already done 2258 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2259 * segkpm this means we already have at least one page 2260 * created and mapped at base. 2261 */ 2262 pagecreate = pgcreated || 2263 ((offset & PAGEOFFSET) == 0 && 2264 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2265 2266 mutex_exit(&rp->r_statelock); 2267 2268 if (!vpm_enable && pagecreate) { 2269 /* 2270 * The last argument tells segmap_pagecreate() to 2271 * always lock the page, as opposed to sometimes 2272 * returning with the page locked. This way we avoid a 2273 * fault on the ensuing uiomove(), but also 2274 * more importantly (to fix bug 1094402) we can 2275 * call segmap_fault() to unlock the page in all 2276 * cases. An alternative would be to modify 2277 * segmap_pagecreate() to tell us when it is 2278 * locking a page, but that's a fairly major 2279 * interface change. 2280 */ 2281 if (pgcreated == 0) 2282 (void) segmap_pagecreate(segkmap, base, 2283 (uint_t)n, 1); 2284 saved_base = base; 2285 saved_n = n; 2286 } 2287 2288 /* 2289 * The number of bytes of data in the last page cannot 2290 * be accurately determined while the page is being 2291 * uiomove'd to and the size of the file is being updated. 2292 * Thus, inform threads which need to know accurately 2293 * how much data is in the last page of the file. They 2294 * will not do the i/o immediately, but will arrange for 2295 * the i/o to happen later when this modify operation 2296 * has finished. 2297 */ 2298 ASSERT(!(rp->r_flags & R4MODINPROGRESS)); 2299 mutex_enter(&rp->r_statelock); 2300 rp->r_flags |= R4MODINPROGRESS; 2301 rp->r_modaddr = (offset & MAXBMASK); 2302 mutex_exit(&rp->r_statelock); 2303 2304 if (vpm_enable) { 2305 /* 2306 * Copy data. If new pages are created, part of 2307 * the page that is not written will be initialized 2308 * with zeros. 2309 */ 2310 error = vpm_data_copy(vp, offset, n, uio, 2311 !pagecreate, NULL, 0, S_WRITE); 2312 } else { 2313 error = uiomove(base, n, UIO_WRITE, uio); 2314 } 2315 2316 /* 2317 * r_size is the maximum number of 2318 * bytes known to be in the file. 2319 * Make sure it is at least as high as the 2320 * first unwritten byte pointed to by uio_loffset. 2321 */ 2322 mutex_enter(&rp->r_statelock); 2323 if (rp->r_size < uio->uio_loffset) 2324 rp->r_size = uio->uio_loffset; 2325 rp->r_flags &= ~R4MODINPROGRESS; 2326 rp->r_flags |= R4DIRTY; 2327 mutex_exit(&rp->r_statelock); 2328 2329 /* n = # of bytes written */ 2330 n = (int)(uio->uio_loffset - offset); 2331 2332 if (!vpm_enable) { 2333 base += n; 2334 } 2335 2336 tcount -= n; 2337 /* 2338 * If we created pages w/o initializing them completely, 2339 * we need to zero the part that wasn't set up. 2340 * This happens in most EOF write cases and if 2341 * we had some sort of error during the uiomove.
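 *
 * For example (again assuming a 4K PAGESIZE): if uiomove() copied
 * only n = 100 bytes into a page created above before failing, the
 *	kzero(base, PAGESIZE - n);
 * below clears the remaining 3996 bytes so the unwritten tail of
 * the page never exposes stale memory contents.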
2342 */ 2343 if (!vpm_enable && pagecreate) { 2344 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2345 (void) kzero(base, PAGESIZE - n); 2346 2347 if (pgcreated) { 2348 /* 2349 * Caller is responsible for this page, 2350 * it was not created in this loop. 2351 */ 2352 pgcreated = 0; 2353 } else { 2354 /* 2355 * For bug 1094402: segmap_pagecreate locks 2356 * page. Unlock it. This also unlocks the 2357 * pages allocated by page_create_va() in 2358 * segmap_pagecreate(). 2359 */ 2360 sm_error = segmap_fault(kas.a_hat, segkmap, 2361 saved_base, saved_n, 2362 F_SOFTUNLOCK, S_WRITE); 2363 if (error == 0) 2364 error = sm_error; 2365 } 2366 } 2367 } while (tcount > 0 && error == 0); 2368 2369 return (error); 2370 } 2371 2372 int 2373 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2374 { 2375 rnode4_t *rp; 2376 page_t *pp; 2377 u_offset_t eoff; 2378 u_offset_t io_off; 2379 size_t io_len; 2380 int error; 2381 int rdirty; 2382 int err; 2383 2384 rp = VTOR4(vp); 2385 ASSERT(rp->r_count > 0); 2386 2387 if (!nfs4_has_pages(vp)) 2388 return (0); 2389 2390 ASSERT(vp->v_type != VCHR); 2391 2392 /* 2393 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2394 * writes. B_FORCE is set to force the VM system to actually 2395 * invalidate the pages, even if the i/o failed. The pages 2396 * need to get invalidated because they can't be written out 2397 * because there isn't any space left on either the server's 2398 * file system or in the user's disk quota. The B_FREE bit 2399 * is cleared to avoid confusion as to whether this is a 2400 * request to place the page on the freelist or to destroy 2401 * it. 2402 */ 2403 if ((rp->r_flags & R4OUTOFSPACE) || 2404 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2405 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2406 2407 if (len == 0) { 2408 /* 2409 * If doing a full file synchronous operation, then clear 2410 * the R4DIRTY bit. If a page gets dirtied while the flush 2411 * is happening, then R4DIRTY will get set again. The 2412 * R4DIRTY bit must get cleared before the flush so that 2413 * we don't lose this information. 2414 * 2415 * If there are no full file async write operations 2416 * pending and RDIRTY bit is set, clear it. 2417 */ 2418 if (off == (u_offset_t)0 && 2419 !(flags & B_ASYNC) && 2420 (rp->r_flags & R4DIRTY)) { 2421 mutex_enter(&rp->r_statelock); 2422 rdirty = (rp->r_flags & R4DIRTY); 2423 rp->r_flags &= ~R4DIRTY; 2424 mutex_exit(&rp->r_statelock); 2425 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2426 mutex_enter(&rp->r_statelock); 2427 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2428 rdirty = (rp->r_flags & R4DIRTY); 2429 rp->r_flags &= ~R4DIRTY; 2430 } 2431 mutex_exit(&rp->r_statelock); 2432 } else 2433 rdirty = 0; 2434 2435 /* 2436 * Search the entire vp list for pages >= off, and flush 2437 * the dirty pages. 2438 */ 2439 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2440 flags, cr); 2441 2442 /* 2443 * If an error occurred and the file was marked as dirty 2444 * before and we aren't forcibly invalidating pages, then 2445 * reset the R4DIRTY flag. 2446 */ 2447 if (error && rdirty && 2448 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2449 mutex_enter(&rp->r_statelock); 2450 rp->r_flags |= R4DIRTY; 2451 mutex_exit(&rp->r_statelock); 2452 } 2453 } else { 2454 /* 2455 * Do a range from [off...off + len) looking for pages 2456 * to deal with. 
2457 */ 2458 error = 0; 2459 io_len = 0; 2460 eoff = off + len; 2461 mutex_enter(&rp->r_statelock); 2462 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2463 io_off += io_len) { 2464 mutex_exit(&rp->r_statelock); 2465 /* 2466 * If we are not invalidating, synchronously 2467 * freeing or writing pages use the routine 2468 * page_lookup_nowait() to prevent reclaiming 2469 * them from the free list. 2470 */ 2471 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2472 pp = page_lookup(vp, io_off, 2473 (flags & (B_INVAL | B_FREE)) ? 2474 SE_EXCL : SE_SHARED); 2475 } else { 2476 pp = page_lookup_nowait(vp, io_off, 2477 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2478 } 2479 2480 if (pp == NULL || !pvn_getdirty(pp, flags)) 2481 io_len = PAGESIZE; 2482 else { 2483 err = (*rp->r_putapage)(vp, pp, &io_off, 2484 &io_len, flags, cr); 2485 if (!error) 2486 error = err; 2487 /* 2488 * "io_off" and "io_len" are returned as 2489 * the range of pages we actually wrote. 2490 * This allows us to skip ahead more quickly 2491 * since several pages may've been dealt 2492 * with by this iteration of the loop. 2493 */ 2494 } 2495 mutex_enter(&rp->r_statelock); 2496 } 2497 mutex_exit(&rp->r_statelock); 2498 } 2499 2500 return (error); 2501 } 2502 2503 void 2504 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2505 { 2506 rnode4_t *rp; 2507 2508 rp = VTOR4(vp); 2509 if (IS_SHADOW(vp, rp)) 2510 vp = RTOV4(rp); 2511 mutex_enter(&rp->r_statelock); 2512 while (rp->r_flags & R4TRUNCATE) 2513 cv_wait(&rp->r_cv, &rp->r_statelock); 2514 rp->r_flags |= R4TRUNCATE; 2515 if (off == (u_offset_t)0) { 2516 rp->r_flags &= ~R4DIRTY; 2517 if (!(rp->r_flags & R4STALE)) 2518 rp->r_error = 0; 2519 } 2520 rp->r_truncaddr = off; 2521 mutex_exit(&rp->r_statelock); 2522 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2523 B_INVAL | B_TRUNC, cr); 2524 mutex_enter(&rp->r_statelock); 2525 rp->r_flags &= ~R4TRUNCATE; 2526 cv_broadcast(&rp->r_cv); 2527 mutex_exit(&rp->r_statelock); 2528 } 2529 2530 static int 2531 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2532 { 2533 mntinfo4_t *mi; 2534 struct mntinfo_kstat *mik; 2535 vfs_t *vfsp; 2536 2537 /* this is a read-only kstat. Bail out on a write */ 2538 if (rw == KSTAT_WRITE) 2539 return (EACCES); 2540 2541 2542 /* 2543 * We don't want to wait here as kstat_chain_lock could be held by 2544 * dounmount(). dounmount() takes vfs_reflock before the chain lock 2545 * and thus could lead to a deadlock. 2546 */ 2547 vfsp = (struct vfs *)ksp->ks_private; 2548 2549 mi = VFTOMI4(vfsp); 2550 mik = (struct mntinfo_kstat *)ksp->ks_data; 2551 2552 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2553 2554 mik->mik_vers = (uint32_t)mi->mi_vers; 2555 mik->mik_flags = mi->mi_flags; 2556 /* 2557 * The sv_secdata holds the flavor the client specifies. 2558 * If the client uses default and a security negotiation 2559 * occurs, sv_currsec will point to the current flavor 2560 * selected from the server flavor list. 2561 * sv_currsec is NULL if no security negotiation takes place. 2562 */ 2563 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 
2564 mi->mi_curr_serv->sv_currsec->secmod : 2565 mi->mi_curr_serv->sv_secdata->secmod; 2566 mik->mik_curread = (uint32_t)mi->mi_curread; 2567 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2568 mik->mik_retrans = mi->mi_retrans; 2569 mik->mik_timeo = mi->mi_timeo; 2570 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2571 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2572 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2573 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2574 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2575 mik->mik_failover = (uint32_t)mi->mi_failover; 2576 mik->mik_remap = (uint32_t)mi->mi_remap; 2577 2578 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2579 2580 return (0); 2581 } 2582 2583 void 2584 nfs4_mnt_kstat_init(struct vfs *vfsp) 2585 { 2586 mntinfo4_t *mi = VFTOMI4(vfsp); 2587 2588 /* 2589 * PSARC 2001/697 Contract Private Interface 2590 * All nfs kstats are under SunMC contract 2591 * Please refer to the PSARC listed above and contact 2592 * SunMC before making any changes! 2593 * 2594 * Changes must be reviewed by Solaris File Sharing 2595 * Changes must be communicated to contract-2001-697@sun.com 2596 * 2597 */ 2598 2599 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2600 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2601 if (mi->mi_io_kstats) { 2602 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2603 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2604 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2605 kstat_install(mi->mi_io_kstats); 2606 } 2607 2608 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2609 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2610 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2611 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2612 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2613 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2614 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2615 kstat_install(mi->mi_ro_kstats); 2616 } 2617 2618 nfs4_mnt_recov_kstat_init(vfsp); 2619 } 2620 2621 void 2622 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2623 { 2624 mntinfo4_t *mi; 2625 2626 mi = VTOMI4(vp); 2627 /* 2628 * In case of forced unmount, do not print any messages 2629 * since it can flood the console with error messages. 2630 */ 2631 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2632 return; 2633 2634 /* 2635 * If the mount point is dead, not recoverable, do not 2636 * print error messages that can flood the console. 2637 */ 2638 if (mi->mi_flags & MI4_RECOV_FAIL) 2639 return; 2640 2641 /* 2642 * No use in flooding the console with ENOSPC 2643 * messages from the same file system. 
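 *
 * The throttle below is tick based: once an ENOSPC/EDQUOT message
 * has been printed, mi_printftime is pushed
 * (nfs_write_error_interval * hz) ticks into the future, and further
 * ENOSPC/EDQUOT messages are suppressed until lbolt catches up.
 * For instance, purely as an illustration, hz = 100 and a 5 second
 * interval give a 500 tick quiet period per file system.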
2644 */ 2645 if ((error != ENOSPC && error != EDQUOT) || 2646 lbolt - mi->mi_printftime > 0) { 2647 zoneid_t zoneid = mi->mi_zone->zone_id; 2648 2649 #ifdef DEBUG 2650 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2651 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2652 #else 2653 nfs_perror(error, "NFS write error on host %s: %m.\n", 2654 VTOR4(vp)->r_server->sv_hostname, NULL); 2655 #endif 2656 if (error == ENOSPC || error == EDQUOT) { 2657 zcmn_err(zoneid, CE_CONT, 2658 "^File: userid=%d, groupid=%d\n", 2659 crgetuid(cr), crgetgid(cr)); 2660 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2661 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2662 zcmn_err(zoneid, CE_CONT, 2663 "^User: userid=%d, groupid=%d\n", 2664 crgetuid(curthread->t_cred), 2665 crgetgid(curthread->t_cred)); 2666 } 2667 mi->mi_printftime = lbolt + 2668 nfs_write_error_interval * hz; 2669 } 2670 sfh4_printfhandle(VTOR4(vp)->r_fh); 2671 #ifdef DEBUG 2672 if (error == EACCES) { 2673 zcmn_err(zoneid, CE_CONT, 2674 "nfs_bio: cred is%s kcred\n", 2675 cr == kcred ? "" : " not"); 2676 } 2677 #endif 2678 } 2679 } 2680 2681 /* 2682 * Return non-zero if the given file can be safely memory mapped. Locks 2683 * are safe if whole-file (length and offset are both zero). 2684 */ 2685 2686 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2687 2688 static int 2689 nfs4_safemap(const vnode_t *vp) 2690 { 2691 locklist_t *llp, *next_llp; 2692 int safe = 1; 2693 rnode4_t *rp = VTOR4(vp); 2694 2695 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2696 2697 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2698 "vp = %p", (void *)vp)); 2699 2700 /* 2701 * Review all the locks for the vnode, both ones that have been 2702 * acquired and ones that are pending. We assume that 2703 * flk_active_locks_for_vp() has merged any locks that can be 2704 * merged (so that if a process has the entire file locked, it is 2705 * represented as a single lock). 2706 * 2707 * Note that we can't bail out of the loop if we find a non-safe 2708 * lock, because we have to free all the elements in the llp list. 2709 * We might be able to speed up this code slightly by not looking 2710 * at each lock's l_start and l_len fields once we've found a 2711 * non-safe lock. 2712 */ 2713 2714 llp = flk_active_locks_for_vp(vp); 2715 while (llp) { 2716 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2717 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2718 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2719 if (!SAFE_LOCK(llp->ll_flock)) { 2720 safe = 0; 2721 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2722 "nfs4_safemap: unsafe active lock (%" PRId64 2723 ", %" PRId64 ")", llp->ll_flock.l_start, 2724 llp->ll_flock.l_len)); 2725 } 2726 next_llp = llp->ll_next; 2727 VN_RELE(llp->ll_vp); 2728 kmem_free(llp, sizeof (*llp)); 2729 llp = next_llp; 2730 } 2731 2732 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2733 safe ? "safe" : "unsafe")); 2734 return (safe); 2735 } 2736 2737 /* 2738 * Return whether there is a lost LOCK or LOCKU queued up for the given 2739 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 
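 *
 * "Unsafe" here uses the same whole-file test as nfs4_safemap():
 * a lock is safe only if SAFE_LOCK(flk), i.e. l_start == 0 and
 * l_len == 0. So, for example, a lost LOCK covering just bytes
 * [0, 100) would make a later mmap of the file unsafe, while a
 * lost whole-file lock would not.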
2740 */ 2741 2742 bool_t 2743 nfs4_map_lost_lock_conflict(vnode_t *vp) 2744 { 2745 bool_t conflict = FALSE; 2746 nfs4_lost_rqst_t *lrp; 2747 mntinfo4_t *mi = VTOMI4(vp); 2748 2749 mutex_enter(&mi->mi_lock); 2750 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2751 lrp = list_next(&mi->mi_lost_state, lrp)) { 2752 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2753 continue; 2754 ASSERT(lrp->lr_vp != NULL); 2755 if (!VOP_CMP(lrp->lr_vp, vp, NULL)) 2756 continue; /* different file */ 2757 if (!SAFE_LOCK(*lrp->lr_flk)) { 2758 conflict = TRUE; 2759 break; 2760 } 2761 } 2762 2763 mutex_exit(&mi->mi_lock); 2764 return (conflict); 2765 } 2766 2767 /* 2768 * nfs_lockcompletion: 2769 * 2770 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2771 * as non cachable (set VNOCACHE bit). 2772 */ 2773 2774 void 2775 nfs4_lockcompletion(vnode_t *vp, int cmd) 2776 { 2777 rnode4_t *rp = VTOR4(vp); 2778 2779 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2780 ASSERT(!IS_SHADOW(vp, rp)); 2781 2782 if (cmd == F_SETLK || cmd == F_SETLKW) { 2783 2784 if (!nfs4_safemap(vp)) { 2785 mutex_enter(&vp->v_lock); 2786 vp->v_flag |= VNOCACHE; 2787 mutex_exit(&vp->v_lock); 2788 } else { 2789 mutex_enter(&vp->v_lock); 2790 vp->v_flag &= ~VNOCACHE; 2791 mutex_exit(&vp->v_lock); 2792 } 2793 } 2794 /* 2795 * The cached attributes of the file are stale after acquiring 2796 * the lock on the file. They were updated when the file was 2797 * opened, but not updated when the lock was acquired. Therefore the 2798 * cached attributes are invalidated after the lock is obtained. 2799 */ 2800 PURGE_ATTRCACHE4(vp); 2801 } 2802 2803 /* ARGSUSED */ 2804 static void * 2805 nfs4_mi_init(zoneid_t zoneid) 2806 { 2807 struct mi4_globals *mig; 2808 2809 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2810 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2811 list_create(&mig->mig_list, sizeof (mntinfo4_t), 2812 offsetof(mntinfo4_t, mi_zone_node)); 2813 mig->mig_destructor_called = B_FALSE; 2814 return (mig); 2815 } 2816 2817 /* 2818 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down 2819 * state and killing off threads. 2820 */ 2821 /* ARGSUSED */ 2822 static void 2823 nfs4_mi_shutdown(zoneid_t zoneid, void *data) 2824 { 2825 struct mi4_globals *mig = data; 2826 mntinfo4_t *mi; 2827 nfs4_server_t *np; 2828 2829 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2830 "nfs4_mi_shutdown zone %d\n", zoneid)); 2831 ASSERT(mig != NULL); 2832 for (;;) { 2833 mutex_enter(&mig->mig_lock); 2834 mi = list_head(&mig->mig_list); 2835 if (mi == NULL) { 2836 mutex_exit(&mig->mig_lock); 2837 break; 2838 } 2839 2840 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2841 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp)); 2842 /* 2843 * purge the DNLC for this filesystem 2844 */ 2845 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2846 /* 2847 * Tell existing async worker threads to exit. 2848 */ 2849 mutex_enter(&mi->mi_async_lock); 2850 mi->mi_max_threads = 0; 2851 cv_broadcast(&mi->mi_async_work_cv); 2852 /* 2853 * Set the appropriate flags, signal and wait for both the 2854 * async manager and the inactive thread to exit when they're 2855 * done with their current work. 
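 *
 * Note that mi_max_threads is cleared and the MI4_ASYNC_MGR_STOP /
 * MI4_DEAD flags are set before mi_async_lock is dropped, so a
 * worker woken by the cv_broadcast() above cannot resume until both
 * are visible to it.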
2856 */ 2857 mutex_enter(&mi->mi_lock); 2858 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD); 2859 mutex_exit(&mi->mi_lock); 2860 mutex_exit(&mi->mi_async_lock); 2861 if (mi->mi_manager_thread) { 2862 nfs4_async_manager_stop(mi->mi_vfsp); 2863 } 2864 if (mi->mi_inactive_thread) { 2865 mutex_enter(&mi->mi_async_lock); 2866 cv_signal(&mi->mi_inact_req_cv); 2867 /* 2868 * Wait for the inactive thread to exit. 2869 */ 2870 while (mi->mi_inactive_thread != NULL) { 2871 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2872 } 2873 mutex_exit(&mi->mi_async_lock); 2874 } 2875 /* 2876 * Wait for the recovery thread to complete, that is, it will 2877 * signal when it is done using the "mi" structure and about 2878 * to exit 2879 */ 2880 mutex_enter(&mi->mi_lock); 2881 while (mi->mi_in_recovery > 0) 2882 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock); 2883 mutex_exit(&mi->mi_lock); 2884 /* 2885 * We're done when every mi has been done or the list is empty. 2886 * This one is done, remove it from the list. 2887 */ 2888 list_remove(&mig->mig_list, mi); 2889 mutex_exit(&mig->mig_lock); 2890 zone_rele(mi->mi_zone); 2891 /* 2892 * Release hold on vfs and mi done to prevent race with zone 2893 * shutdown. This releases the hold in nfs4_mi_zonelist_add. 2894 */ 2895 VFS_RELE(mi->mi_vfsp); 2896 MI4_RELE(mi); 2897 } 2898 /* 2899 * Tell each renew thread in the zone to exit 2900 */ 2901 mutex_enter(&nfs4_server_lst_lock); 2902 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) { 2903 mutex_enter(&np->s_lock); 2904 if (np->zoneid == zoneid) { 2905 /* 2906 * We add another hold onto the nfs4_server_t 2907 * because this will make sure tha the nfs4_server_t 2908 * stays around until nfs4_callback_fini_zone destroys 2909 * the zone. This way, the renew thread can 2910 * unconditionally release its holds on the 2911 * nfs4_server_t. 2912 */ 2913 np->s_refcnt++; 2914 nfs4_mark_srv_dead(np); 2915 } 2916 mutex_exit(&np->s_lock); 2917 } 2918 mutex_exit(&nfs4_server_lst_lock); 2919 } 2920 2921 static void 2922 nfs4_mi_free_globals(struct mi4_globals *mig) 2923 { 2924 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2925 mutex_destroy(&mig->mig_lock); 2926 kmem_free(mig, sizeof (*mig)); 2927 } 2928 2929 /* ARGSUSED */ 2930 static void 2931 nfs4_mi_destroy(zoneid_t zoneid, void *data) 2932 { 2933 struct mi4_globals *mig = data; 2934 2935 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2936 "nfs4_mi_destroy zone %d\n", zoneid)); 2937 ASSERT(mig != NULL); 2938 mutex_enter(&mig->mig_lock); 2939 if (list_head(&mig->mig_list) != NULL) { 2940 /* Still waiting for VFS_FREEVFS() */ 2941 mig->mig_destructor_called = B_TRUE; 2942 mutex_exit(&mig->mig_lock); 2943 return; 2944 } 2945 nfs4_mi_free_globals(mig); 2946 } 2947 2948 /* 2949 * Add an NFS mount to the per-zone list of NFS mounts. 2950 */ 2951 void 2952 nfs4_mi_zonelist_add(mntinfo4_t *mi) 2953 { 2954 struct mi4_globals *mig; 2955 2956 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2957 mutex_enter(&mig->mig_lock); 2958 list_insert_head(&mig->mig_list, mi); 2959 /* 2960 * hold added to eliminate race with zone shutdown -this will be 2961 * released in mi_shutdown 2962 */ 2963 MI4_HOLD(mi); 2964 VFS_HOLD(mi->mi_vfsp); 2965 mutex_exit(&mig->mig_lock); 2966 } 2967 2968 /* 2969 * Remove an NFS mount from the per-zone list of NFS mounts. 
2970 */ 2971 int 2972 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 2973 { 2974 struct mi4_globals *mig; 2975 int ret = 0; 2976 2977 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2978 mutex_enter(&mig->mig_lock); 2979 mutex_enter(&mi->mi_lock); 2980 /* if this mi is marked dead, then the zone already released it */ 2981 if (!(mi->mi_flags & MI4_DEAD)) { 2982 list_remove(&mig->mig_list, mi); 2983 2984 /* release the holds put on in zonelist_add(). */ 2985 VFS_RELE(mi->mi_vfsp); 2986 MI4_RELE(mi); 2987 ret = 1; 2988 } 2989 mutex_exit(&mi->mi_lock); 2990 2991 /* 2992 * We can be called asynchronously by VFS_FREEVFS() after the zone 2993 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2994 * mi globals. 2995 */ 2996 if (list_head(&mig->mig_list) == NULL && 2997 mig->mig_destructor_called == B_TRUE) { 2998 nfs4_mi_free_globals(mig); 2999 return (ret); 3000 } 3001 mutex_exit(&mig->mig_lock); 3002 return (ret); 3003 } 3004 3005 void 3006 nfs_free_mi4(mntinfo4_t *mi) 3007 { 3008 nfs4_open_owner_t *foop; 3009 nfs4_oo_hash_bucket_t *bucketp; 3010 nfs4_debug_msg_t *msgp; 3011 int i; 3012 servinfo4_t *svp; 3013 3014 mutex_enter(&mi->mi_lock); 3015 ASSERT(mi->mi_recovthread == NULL); 3016 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 3017 mutex_exit(&mi->mi_lock); 3018 mutex_enter(&mi->mi_async_lock); 3019 ASSERT(mi->mi_threads == 0); 3020 ASSERT(mi->mi_manager_thread == NULL); 3021 mutex_exit(&mi->mi_async_lock); 3022 svp = mi->mi_servers; 3023 sv4_free(svp); 3024 if (mi->mi_io_kstats) { 3025 kstat_delete(mi->mi_io_kstats); 3026 mi->mi_io_kstats = NULL; 3027 } 3028 if (mi->mi_ro_kstats) { 3029 kstat_delete(mi->mi_ro_kstats); 3030 mi->mi_ro_kstats = NULL; 3031 } 3032 if (mi->mi_recov_ksp) { 3033 kstat_delete(mi->mi_recov_ksp); 3034 mi->mi_recov_ksp = NULL; 3035 } 3036 mutex_enter(&mi->mi_msg_list_lock); 3037 while (msgp = list_head(&mi->mi_msg_list)) { 3038 list_remove(&mi->mi_msg_list, msgp); 3039 nfs4_free_msg(msgp); 3040 } 3041 mutex_exit(&mi->mi_msg_list_lock); 3042 list_destroy(&mi->mi_msg_list); 3043 if (mi->mi_rootfh != NULL) 3044 sfh4_rele(&mi->mi_rootfh); 3045 if (mi->mi_srvparentfh != NULL) 3046 sfh4_rele(&mi->mi_srvparentfh); 3047 mutex_destroy(&mi->mi_lock); 3048 mutex_destroy(&mi->mi_async_lock); 3049 mutex_destroy(&mi->mi_msg_list_lock); 3050 nfs_rw_destroy(&mi->mi_recovlock); 3051 nfs_rw_destroy(&mi->mi_rename_lock); 3052 nfs_rw_destroy(&mi->mi_fh_lock); 3053 cv_destroy(&mi->mi_failover_cv); 3054 cv_destroy(&mi->mi_async_reqs_cv); 3055 cv_destroy(&mi->mi_async_work_cv); 3056 cv_destroy(&mi->mi_async_cv); 3057 cv_destroy(&mi->mi_inact_req_cv); 3058 /* 3059 * Destroy the oo hash lists and mutexes for the cred hash table. 3060 */ 3061 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) { 3062 bucketp = &(mi->mi_oo_list[i]); 3063 /* Destroy any remaining open owners on the list */ 3064 foop = list_head(&bucketp->b_oo_hash_list); 3065 while (foop != NULL) { 3066 list_remove(&bucketp->b_oo_hash_list, foop); 3067 nfs4_destroy_open_owner(foop); 3068 foop = list_head(&bucketp->b_oo_hash_list); 3069 } 3070 list_destroy(&bucketp->b_oo_hash_list); 3071 mutex_destroy(&bucketp->b_lock); 3072 } 3073 /* 3074 * Empty and destroy the freed open owner list. 
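 *
 * The drain loop below (remove each entry, destroy it, look at the
 * head again) mirrors the per-bucket cleanup above; list_destroy()
 * expects to be handed an empty list, so every open owner has to be
 * taken off before the list itself is torn down.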
3075 */ 3076 foop = list_head(&mi->mi_foo_list); 3077 while (foop != NULL) { 3078 list_remove(&mi->mi_foo_list, foop); 3079 nfs4_destroy_open_owner(foop); 3080 foop = list_head(&mi->mi_foo_list); 3081 } 3082 list_destroy(&mi->mi_foo_list); 3083 list_destroy(&mi->mi_bseqid_list); 3084 list_destroy(&mi->mi_lost_state); 3085 avl_destroy(&mi->mi_filehandles); 3086 fn_rele(&mi->mi_fname); 3087 kmem_free(mi, sizeof (*mi)); 3088 } 3089 void 3090 mi_hold(mntinfo4_t *mi) 3091 { 3092 atomic_add_32(&mi->mi_count, 1); 3093 ASSERT(mi->mi_count != 0); 3094 } 3095 3096 void 3097 mi_rele(mntinfo4_t *mi) 3098 { 3099 ASSERT(mi->mi_count != 0); 3100 if (atomic_add_32_nv(&mi->mi_count, -1) == 0) { 3101 nfs_free_mi4(mi); 3102 } 3103 } 3104 3105 vnode_t nfs4_xattr_notsupp_vnode; 3106 3107 void 3108 nfs4_clnt_init(void) 3109 { 3110 nfs4_vnops_init(); 3111 (void) nfs4_rnode_init(); 3112 (void) nfs4_shadow_init(); 3113 (void) nfs4_acache_init(); 3114 (void) nfs4_subr_init(); 3115 nfs4_acl_init(); 3116 nfs_idmap_init(); 3117 nfs4_callback_init(); 3118 nfs4_secinfo_init(); 3119 #ifdef DEBUG 3120 tsd_create(&nfs4_tsd_key, NULL); 3121 #endif 3122 3123 /* 3124 * Add a CPR callback so that we can update client 3125 * lease after a suspend and resume. 3126 */ 3127 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3128 3129 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3130 nfs4_mi_destroy); 3131 3132 /* 3133 * Initialise the reference count of the notsupp xattr cache vnode to 1 3134 * so that it never goes away (VOP_INACTIVE isn't called on it). 3135 */ 3136 nfs4_xattr_notsupp_vnode.v_count = 1; 3137 } 3138 3139 void 3140 nfs4_clnt_fini(void) 3141 { 3142 (void) zone_key_delete(mi4_list_key); 3143 nfs4_vnops_fini(); 3144 (void) nfs4_rnode_fini(); 3145 (void) nfs4_shadow_fini(); 3146 (void) nfs4_acache_fini(); 3147 (void) nfs4_subr_fini(); 3148 nfs_idmap_fini(); 3149 nfs4_callback_fini(); 3150 nfs4_secinfo_fini(); 3151 #ifdef DEBUG 3152 tsd_destroy(&nfs4_tsd_key); 3153 #endif 3154 if (cid) 3155 (void) callb_delete(cid); 3156 } 3157 3158 /*ARGSUSED*/ 3159 static boolean_t 3160 nfs4_client_cpr_callb(void *arg, int code) 3161 { 3162 /* 3163 * We get called for Suspend and Resume events. 3164 * For the suspend case we simply don't care! 3165 */ 3166 if (code == CB_CODE_CPR_CHKPT) { 3167 return (B_TRUE); 3168 } 3169 3170 /* 3171 * When we get to here we are in the process of 3172 * resuming the system from a previous suspend. 
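 *
 * The timestamp recorded here is consumed by
 * nfs4_renew_lease_thread(): if the resume happened after the last
 * recorded lease renewal, that thread issues an explicit RENEW
 * rather than trusting a lease that may have expired while the
 * system was suspended.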
3173 */ 3174 nfs4_client_resumed = gethrestime_sec(); 3175 return (B_TRUE); 3176 } 3177 3178 void 3179 nfs4_renew_lease_thread(nfs4_server_t *sp) 3180 { 3181 int error = 0; 3182 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs; 3183 clock_t tick_delay = 0; 3184 clock_t time_left = 0; 3185 callb_cpr_t cpr_info; 3186 kmutex_t cpr_lock; 3187 3188 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3189 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp)); 3190 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 3191 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease"); 3192 3193 mutex_enter(&sp->s_lock); 3194 /* sp->s_lease_time is set via a GETATTR */ 3195 sp->last_renewal_time = gethrestime_sec(); 3196 sp->lease_valid = NFS4_LEASE_UNINITIALIZED; 3197 ASSERT(sp->s_refcnt >= 1); 3198 3199 for (;;) { 3200 if (!sp->state_ref_count || 3201 sp->lease_valid != NFS4_LEASE_VALID) { 3202 3203 kip_secs = MAX((sp->s_lease_time >> 1) - 3204 (3 * sp->propagation_delay.tv_sec), 1); 3205 3206 tick_delay = SEC_TO_TICK(kip_secs); 3207 3208 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3209 "nfs4_renew_lease_thread: no renew : thread " 3210 "wait %ld secs", kip_secs)); 3211 3212 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3213 "nfs4_renew_lease_thread: no renew : " 3214 "state_ref_count %d, lease_valid %d", 3215 sp->state_ref_count, sp->lease_valid)); 3216 3217 mutex_enter(&cpr_lock); 3218 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3219 mutex_exit(&cpr_lock); 3220 time_left = cv_timedwait(&sp->cv_thread_exit, 3221 &sp->s_lock, tick_delay + lbolt); 3222 mutex_enter(&cpr_lock); 3223 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3224 mutex_exit(&cpr_lock); 3225 3226 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3227 "nfs4_renew_lease_thread: no renew: " 3228 "time left %ld", time_left)); 3229 3230 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3231 goto die; 3232 continue; 3233 } 3234 3235 tmp_last_renewal_time = sp->last_renewal_time; 3236 3237 tmp_time = gethrestime_sec() - sp->last_renewal_time + 3238 (3 * sp->propagation_delay.tv_sec); 3239 3240 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3241 "nfs4_renew_lease_thread: tmp_time %ld, " 3242 "sp->last_renewal_time %ld", tmp_time, 3243 sp->last_renewal_time)); 3244 3245 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1); 3246 3247 tick_delay = SEC_TO_TICK(kip_secs); 3248 3249 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3250 "nfs4_renew_lease_thread: valid lease: sleep for %ld " 3251 "secs", kip_secs)); 3252 3253 mutex_enter(&cpr_lock); 3254 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3255 mutex_exit(&cpr_lock); 3256 time_left = cv_timedwait(&sp->cv_thread_exit, &sp->s_lock, 3257 tick_delay + lbolt); 3258 mutex_enter(&cpr_lock); 3259 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3260 mutex_exit(&cpr_lock); 3261 3262 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3263 "nfs4_renew_lease_thread: valid lease: time left %ld :" 3264 "sp last_renewal_time %ld, nfs4_client_resumed %ld, " 3265 "tmp_last_renewal_time %ld", time_left, 3266 sp->last_renewal_time, nfs4_client_resumed, 3267 tmp_last_renewal_time)); 3268 3269 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3270 goto die; 3271 3272 if (tmp_last_renewal_time == sp->last_renewal_time || 3273 (nfs4_client_resumed != 0 && 3274 nfs4_client_resumed > sp->last_renewal_time)) { 3275 /* 3276 * Issue RENEW op since we haven't renewed the lease 3277 * since we slept. 3278 */ 3279 tmp_now_time = gethrestime_sec(); 3280 error = nfs4renew(sp); 3281 /* 3282 * Need to re-acquire sp's lock, nfs4renew() 3283 * relinqueshes it. 
3284 */ 3285 mutex_enter(&sp->s_lock); 3286 3287 /* 3288 * See if someone changed s_thread_exit while we gave 3289 * up s_lock. 3290 */ 3291 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3292 goto die; 3293 3294 if (!error) { 3295 /* 3296 * check to see if we implicitly renewed while 3297 * we waited for a reply for our RENEW call. 3298 */ 3299 if (tmp_last_renewal_time == 3300 sp->last_renewal_time) { 3301 /* no implicit renew came */ 3302 sp->last_renewal_time = tmp_now_time; 3303 } else { 3304 NFS4_DEBUG(nfs4_client_lease_debug, 3305 (CE_NOTE, "renew_thread: did " 3306 "implicit renewal before reply " 3307 "from server for RENEW")); 3308 } 3309 } else { 3310 /* figure out error */ 3311 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3312 "renew_thread: nfs4renew returned error" 3313 " %d", error)); 3314 } 3315 3316 } 3317 } 3318 3319 die: 3320 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3321 "nfs4_renew_lease_thread: thread exiting")); 3322 3323 while (sp->s_otw_call_count != 0) { 3324 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3325 "nfs4_renew_lease_thread: waiting for outstanding " 3326 "otw calls to finish for sp 0x%p, current " 3327 "s_otw_call_count %d", (void *)sp, 3328 sp->s_otw_call_count)); 3329 mutex_enter(&cpr_lock); 3330 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3331 mutex_exit(&cpr_lock); 3332 cv_wait(&sp->s_cv_otw_count, &sp->s_lock); 3333 mutex_enter(&cpr_lock); 3334 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3335 mutex_exit(&cpr_lock); 3336 } 3337 mutex_exit(&sp->s_lock); 3338 3339 nfs4_server_rele(sp); /* free the thread's reference */ 3340 nfs4_server_rele(sp); /* free the list's reference */ 3341 sp = NULL; 3342 3343 done: 3344 mutex_enter(&cpr_lock); 3345 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */ 3346 mutex_destroy(&cpr_lock); 3347 3348 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3349 "nfs4_renew_lease_thread: renew thread exit officially")); 3350 3351 zthread_exit(); 3352 /* NOT REACHED */ 3353 } 3354 3355 /* 3356 * Send out a RENEW op to the server. 3357 * Assumes sp is locked down. 
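 *
 * Locking contract (as relied upon by nfs4_renew_lease_thread()):
 * the caller enters with sp->s_lock held; nfs4renew() drops that
 * lock itself and returns with it released, so the caller must
 * re-enter it before touching sp again. The request issued is a
 * single-op compound, TAG_RENEW with one OP_RENEW carrying
 * sp->clientid.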
3358 */ 3359 static int 3360 nfs4renew(nfs4_server_t *sp) 3361 { 3362 COMPOUND4args_clnt args; 3363 COMPOUND4res_clnt res; 3364 nfs_argop4 argop[1]; 3365 int doqueue = 1; 3366 int rpc_error; 3367 cred_t *cr; 3368 mntinfo4_t *mi; 3369 timespec_t prop_time, after_time; 3370 int needrecov = FALSE; 3371 nfs4_recov_state_t recov_state; 3372 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3373 3374 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3375 3376 recov_state.rs_flags = 0; 3377 recov_state.rs_num_retry_despite_err = 0; 3378 3379 recov_retry: 3380 mi = sp->mntinfo4_list; 3381 VFS_HOLD(mi->mi_vfsp); 3382 mutex_exit(&sp->s_lock); 3383 ASSERT(mi != NULL); 3384 3385 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3386 if (e.error) { 3387 VFS_RELE(mi->mi_vfsp); 3388 return (e.error); 3389 } 3390 3391 /* Check to see if we're dealing with a marked-dead sp */ 3392 mutex_enter(&sp->s_lock); 3393 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3394 mutex_exit(&sp->s_lock); 3395 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3396 VFS_RELE(mi->mi_vfsp); 3397 return (0); 3398 } 3399 3400 /* Make sure mi hasn't changed on us */ 3401 if (mi != sp->mntinfo4_list) { 3402 /* Must drop sp's lock to avoid a recursive mutex enter */ 3403 mutex_exit(&sp->s_lock); 3404 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3405 VFS_RELE(mi->mi_vfsp); 3406 mutex_enter(&sp->s_lock); 3407 goto recov_retry; 3408 } 3409 mutex_exit(&sp->s_lock); 3410 3411 args.ctag = TAG_RENEW; 3412 3413 args.array_len = 1; 3414 args.array = argop; 3415 3416 argop[0].argop = OP_RENEW; 3417 3418 mutex_enter(&sp->s_lock); 3419 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3420 cr = sp->s_cred; 3421 crhold(cr); 3422 mutex_exit(&sp->s_lock); 3423 3424 ASSERT(cr != NULL); 3425 3426 /* used to figure out RTT for sp */ 3427 gethrestime(&prop_time); 3428 3429 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3430 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first", 3431 (void*)sp)); 3432 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3433 prop_time.tv_sec, prop_time.tv_nsec)); 3434 3435 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3436 mntinfo4_t *, mi); 3437 3438 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3439 crfree(cr); 3440 3441 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3442 mntinfo4_t *, mi); 3443 3444 gethrestime(&after_time); 3445 3446 mutex_enter(&sp->s_lock); 3447 sp->propagation_delay.tv_sec = 3448 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3449 mutex_exit(&sp->s_lock); 3450 3451 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3452 after_time.tv_sec, after_time.tv_nsec)); 3453 3454 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3455 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3456 nfs4_delegreturn_all(sp); 3457 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3458 VFS_RELE(mi->mi_vfsp); 3459 /* 3460 * If the server returns CB_PATH_DOWN, it has renewed 3461 * the lease and informed us that the callback path is 3462 * down. Since the lease is renewed, just return 0 and 3463 * let the renew thread proceed as normal. 
3464 */ 3465 return (0); 3466 } 3467 3468 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3469 if (!needrecov && e.error) { 3470 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3471 VFS_RELE(mi->mi_vfsp); 3472 return (e.error); 3473 } 3474 3475 rpc_error = e.error; 3476 3477 if (needrecov) { 3478 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3479 "nfs4renew: initiating recovery\n")); 3480 3481 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3482 OP_RENEW, NULL) == FALSE) { 3483 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3484 VFS_RELE(mi->mi_vfsp); 3485 if (!e.error) 3486 (void) xdr_free(xdr_COMPOUND4res_clnt, 3487 (caddr_t)&res); 3488 mutex_enter(&sp->s_lock); 3489 goto recov_retry; 3490 } 3491 /* fall through for res.status case */ 3492 } 3493 3494 if (res.status) { 3495 if (res.status == NFS4ERR_LEASE_MOVED) { 3496 /*EMPTY*/ 3497 /* 3498 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3499 * to renew the lease on that server 3500 */ 3501 } 3502 e.error = geterrno4(res.status); 3503 } 3504 3505 if (!rpc_error) 3506 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3507 3508 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3509 3510 VFS_RELE(mi->mi_vfsp); 3511 3512 return (e.error); 3513 } 3514 3515 void 3516 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3517 { 3518 nfs4_server_t *sp; 3519 3520 /* this locks down sp if it is found */ 3521 sp = find_nfs4_server(mi); 3522 3523 if (sp != NULL) { 3524 nfs4_inc_state_ref_count_nolock(sp, mi); 3525 mutex_exit(&sp->s_lock); 3526 nfs4_server_rele(sp); 3527 } 3528 } 3529 3530 /* 3531 * Bump the number of OPEN files (ie: those with state) so we know if this 3532 * nfs4_server has any state to maintain a lease for or not. 3533 * 3534 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 3535 */ 3536 void 3537 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3538 { 3539 ASSERT(mutex_owned(&sp->s_lock)); 3540 3541 sp->state_ref_count++; 3542 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3543 "nfs4_inc_state_ref_count: state_ref_count now %d", 3544 sp->state_ref_count)); 3545 3546 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3547 sp->lease_valid = NFS4_LEASE_VALID; 3548 3549 /* 3550 * If this call caused the lease to be marked valid and/or 3551 * took the state_ref_count from 0 to 1, then start the time 3552 * on lease renewal. 3553 */ 3554 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3555 sp->last_renewal_time = gethrestime_sec(); 3556 3557 /* update the number of open files for mi */ 3558 mi->mi_open_files++; 3559 } 3560 3561 void 3562 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3563 { 3564 nfs4_server_t *sp; 3565 3566 /* this locks down sp if it is found */ 3567 sp = find_nfs4_server_all(mi, 1); 3568 3569 if (sp != NULL) { 3570 nfs4_dec_state_ref_count_nolock(sp, mi); 3571 mutex_exit(&sp->s_lock); 3572 nfs4_server_rele(sp); 3573 } 3574 } 3575 3576 /* 3577 * Decrement the number of OPEN files (ie: those with state) so we know if 3578 * this nfs4_server has any state to maintain a lease for or not. 
3579 */ 3580 void 3581 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3582 { 3583 ASSERT(mutex_owned(&sp->s_lock)); 3584 ASSERT(sp->state_ref_count != 0); 3585 sp->state_ref_count--; 3586 3587 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3588 "nfs4_dec_state_ref_count: state ref count now %d", 3589 sp->state_ref_count)); 3590 3591 mi->mi_open_files--; 3592 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3593 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3594 mi->mi_open_files, mi->mi_flags)); 3595 3596 /* We don't have to hold the mi_lock to test mi_flags */ 3597 if (mi->mi_open_files == 0 && 3598 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3599 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3600 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3601 "we have closed the last open file", (void*)mi)); 3602 nfs4_remove_mi_from_server(mi, sp); 3603 } 3604 } 3605 3606 bool_t 3607 inlease(nfs4_server_t *sp) 3608 { 3609 bool_t result; 3610 3611 ASSERT(mutex_owned(&sp->s_lock)); 3612 3613 if (sp->lease_valid == NFS4_LEASE_VALID && 3614 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3615 result = TRUE; 3616 else 3617 result = FALSE; 3618 3619 return (result); 3620 } 3621 3622 3623 /* 3624 * Return non-zero if the given nfs4_server_t is going through recovery. 3625 */ 3626 3627 int 3628 nfs4_server_in_recovery(nfs4_server_t *sp) 3629 { 3630 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3631 } 3632 3633 /* 3634 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3635 * first is less than, equal to, or greater than the second. 3636 */ 3637 3638 int 3639 sfh4cmp(const void *p1, const void *p2) 3640 { 3641 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3642 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3643 3644 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3645 } 3646 3647 /* 3648 * Create a table for shared filehandle objects. 3649 */ 3650 3651 void 3652 sfh4_createtab(avl_tree_t *tab) 3653 { 3654 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3655 offsetof(nfs4_sharedfh_t, sfh_tree)); 3656 } 3657 3658 /* 3659 * Return a shared filehandle object for the given filehandle. The caller 3660 * is responsible for eventually calling sfh4_rele(). 3661 */ 3662 3663 nfs4_sharedfh_t * 3664 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3665 { 3666 nfs4_sharedfh_t *sfh, *nsfh; 3667 avl_index_t where; 3668 nfs4_sharedfh_t skey; 3669 3670 if (!key) { 3671 skey.sfh_fh = *fh; 3672 key = &skey; 3673 } 3674 3675 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3676 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3677 /* 3678 * We allocate the largest possible filehandle size because it's 3679 * not that big, and it saves us from possibly having to resize the 3680 * buffer later. 
3681 */ 3682 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3683 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3684 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3685 nsfh->sfh_refcnt = 1; 3686 nsfh->sfh_flags = SFH4_IN_TREE; 3687 nsfh->sfh_mi = mi; 3688 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3689 (void *)nsfh)); 3690 3691 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3692 sfh = avl_find(&mi->mi_filehandles, key, &where); 3693 if (sfh != NULL) { 3694 mutex_enter(&sfh->sfh_lock); 3695 sfh->sfh_refcnt++; 3696 mutex_exit(&sfh->sfh_lock); 3697 nfs_rw_exit(&mi->mi_fh_lock); 3698 /* free our speculative allocs */ 3699 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3700 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3701 return (sfh); 3702 } 3703 3704 avl_insert(&mi->mi_filehandles, nsfh, where); 3705 nfs_rw_exit(&mi->mi_fh_lock); 3706 3707 return (nsfh); 3708 } 3709 3710 /* 3711 * Return a shared filehandle object for the given filehandle. The caller 3712 * is responsible for eventually calling sfh4_rele(). 3713 */ 3714 3715 nfs4_sharedfh_t * 3716 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3717 { 3718 nfs4_sharedfh_t *sfh; 3719 nfs4_sharedfh_t key; 3720 3721 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3722 3723 #ifdef DEBUG 3724 if (nfs4_sharedfh_debug) { 3725 nfs4_fhandle_t fhandle; 3726 3727 fhandle.fh_len = fh->nfs_fh4_len; 3728 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3729 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3730 nfs4_printfhandle(&fhandle); 3731 } 3732 #endif 3733 3734 /* 3735 * If there's already an object for the given filehandle, bump the 3736 * reference count and return it. Otherwise, create a new object 3737 * and add it to the AVL tree. 3738 */ 3739 3740 key.sfh_fh = *fh; 3741 3742 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3743 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3744 if (sfh != NULL) { 3745 mutex_enter(&sfh->sfh_lock); 3746 sfh->sfh_refcnt++; 3747 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3748 "sfh4_get: found existing %p, new refcnt=%d", 3749 (void *)sfh, sfh->sfh_refcnt)); 3750 mutex_exit(&sfh->sfh_lock); 3751 nfs_rw_exit(&mi->mi_fh_lock); 3752 return (sfh); 3753 } 3754 nfs_rw_exit(&mi->mi_fh_lock); 3755 3756 return (sfh4_put(fh, mi, &key)); 3757 } 3758 3759 /* 3760 * Get a reference to the given shared filehandle object. 3761 */ 3762 3763 void 3764 sfh4_hold(nfs4_sharedfh_t *sfh) 3765 { 3766 ASSERT(sfh->sfh_refcnt > 0); 3767 3768 mutex_enter(&sfh->sfh_lock); 3769 sfh->sfh_refcnt++; 3770 NFS4_DEBUG(nfs4_sharedfh_debug, 3771 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3772 (void *)sfh, sfh->sfh_refcnt)); 3773 mutex_exit(&sfh->sfh_lock); 3774 } 3775 3776 /* 3777 * Release a reference to the given shared filehandle object and null out 3778 * the given pointer. 3779 */ 3780 3781 void 3782 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3783 { 3784 mntinfo4_t *mi; 3785 nfs4_sharedfh_t *sfh = *sfhpp; 3786 3787 ASSERT(sfh->sfh_refcnt > 0); 3788 3789 mutex_enter(&sfh->sfh_lock); 3790 if (sfh->sfh_refcnt > 1) { 3791 sfh->sfh_refcnt--; 3792 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3793 "sfh4_rele %p, new refcnt=%d", 3794 (void *)sfh, sfh->sfh_refcnt)); 3795 mutex_exit(&sfh->sfh_lock); 3796 goto finish; 3797 } 3798 mutex_exit(&sfh->sfh_lock); 3799 3800 /* 3801 * Possibly the last reference, so get the lock for the table in 3802 * case it's time to remove the object from the table. 
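 *
 * This is the usual optimistic-release shape: the fast path above
 * needed only sfh_lock, but dropping what may be the last reference
 * must be done with mi_fh_lock held as writer so that no concurrent
 * sfh4_get() can find the object while it is being pulled out of
 * the AVL tree; the refcount is re-checked under the stronger lock
 * in case such a lookup slipped in and took a new hold meanwhile.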
3803 */ 3804 mi = sfh->sfh_mi; 3805 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3806 mutex_enter(&sfh->sfh_lock); 3807 sfh->sfh_refcnt--; 3808 if (sfh->sfh_refcnt > 0) { 3809 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3810 "sfh4_rele %p, new refcnt=%d", 3811 (void *)sfh, sfh->sfh_refcnt)); 3812 mutex_exit(&sfh->sfh_lock); 3813 nfs_rw_exit(&mi->mi_fh_lock); 3814 goto finish; 3815 } 3816 3817 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3818 "sfh4_rele %p, last ref", (void *)sfh)); 3819 if (sfh->sfh_flags & SFH4_IN_TREE) { 3820 avl_remove(&mi->mi_filehandles, sfh); 3821 sfh->sfh_flags &= ~SFH4_IN_TREE; 3822 } 3823 mutex_exit(&sfh->sfh_lock); 3824 nfs_rw_exit(&mi->mi_fh_lock); 3825 mutex_destroy(&sfh->sfh_lock); 3826 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3827 kmem_free(sfh, sizeof (nfs4_sharedfh_t)); 3828 3829 finish: 3830 *sfhpp = NULL; 3831 } 3832 3833 /* 3834 * Update the filehandle for the given shared filehandle object. 3835 */ 3836 3837 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */ 3838 3839 void 3840 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh) 3841 { 3842 mntinfo4_t *mi = sfh->sfh_mi; 3843 nfs4_sharedfh_t *dupsfh; 3844 avl_index_t where; 3845 nfs4_sharedfh_t key; 3846 3847 #ifdef DEBUG 3848 mutex_enter(&sfh->sfh_lock); 3849 ASSERT(sfh->sfh_refcnt > 0); 3850 mutex_exit(&sfh->sfh_lock); 3851 #endif 3852 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE); 3853 3854 /* 3855 * The basic plan is to remove the shared filehandle object from 3856 * the table, update it to have the new filehandle, then reinsert 3857 * it. 3858 */ 3859 3860 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3861 mutex_enter(&sfh->sfh_lock); 3862 if (sfh->sfh_flags & SFH4_IN_TREE) { 3863 avl_remove(&mi->mi_filehandles, sfh); 3864 sfh->sfh_flags &= ~SFH4_IN_TREE; 3865 } 3866 mutex_exit(&sfh->sfh_lock); 3867 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len; 3868 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val, 3869 sfh->sfh_fh.nfs_fh4_len); 3870 3871 /* 3872 * XXX If there is already a shared filehandle object with the new 3873 * filehandle, we're in trouble, because the rnode code assumes 3874 * that there is only one shared filehandle object for a given 3875 * filehandle. So issue a warning (for read-write mounts only) 3876 * and don't try to re-insert the given object into the table. 3877 * Hopefully the given object will quickly go away and everyone 3878 * will use the new object. 3879 */ 3880 key.sfh_fh = *newfh; 3881 dupsfh = avl_find(&mi->mi_filehandles, &key, &where); 3882 if (dupsfh != NULL) { 3883 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) { 3884 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: " 3885 "duplicate filehandle detected"); 3886 sfh4_printfhandle(dupsfh); 3887 } 3888 } else { 3889 avl_insert(&mi->mi_filehandles, sfh, where); 3890 mutex_enter(&sfh->sfh_lock); 3891 sfh->sfh_flags |= SFH4_IN_TREE; 3892 mutex_exit(&sfh->sfh_lock); 3893 } 3894 nfs_rw_exit(&mi->mi_fh_lock); 3895 } 3896 3897 /* 3898 * Copy out the current filehandle for the given shared filehandle object. 
3899 */ 3900 3901 void 3902 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp) 3903 { 3904 mntinfo4_t *mi = sfh->sfh_mi; 3905 3906 ASSERT(sfh->sfh_refcnt > 0); 3907 3908 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3909 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len; 3910 ASSERT(fhp->fh_len <= NFS4_FHSIZE); 3911 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len); 3912 nfs_rw_exit(&mi->mi_fh_lock); 3913 } 3914 3915 /* 3916 * Print out the filehandle for the given shared filehandle object. 3917 */ 3918 3919 void 3920 sfh4_printfhandle(const nfs4_sharedfh_t *sfh) 3921 { 3922 nfs4_fhandle_t fhandle; 3923 3924 sfh4_copyval(sfh, &fhandle); 3925 nfs4_printfhandle(&fhandle); 3926 } 3927 3928 /* 3929 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0 3930 * if they're the same, +1 if the first is "greater" than the second. The 3931 * caller (or whoever's calling the AVL package) is responsible for 3932 * handling locking issues. 3933 */ 3934 3935 static int 3936 fncmp(const void *p1, const void *p2) 3937 { 3938 const nfs4_fname_t *f1 = p1; 3939 const nfs4_fname_t *f2 = p2; 3940 int res; 3941 3942 res = strcmp(f1->fn_name, f2->fn_name); 3943 /* 3944 * The AVL package wants +/-1, not arbitrary positive or negative 3945 * integers. 3946 */ 3947 if (res > 0) 3948 res = 1; 3949 else if (res < 0) 3950 res = -1; 3951 return (res); 3952 } 3953 3954 /* 3955 * Get or create an fname with the given name, as a child of the given 3956 * fname. The caller is responsible for eventually releasing the reference 3957 * (fn_rele()). parent may be NULL. 3958 */ 3959 3960 nfs4_fname_t * 3961 fn_get(nfs4_fname_t *parent, char *name) 3962 { 3963 nfs4_fname_t key; 3964 nfs4_fname_t *fnp; 3965 avl_index_t where; 3966 3967 key.fn_name = name; 3968 3969 /* 3970 * If there's already an fname registered with the given name, bump 3971 * its reference count and return it. Otherwise, create a new one 3972 * and add it to the parent's AVL tree. 3973 */ 3974 3975 if (parent != NULL) { 3976 mutex_enter(&parent->fn_lock); 3977 fnp = avl_find(&parent->fn_children, &key, &where); 3978 if (fnp != NULL) { 3979 fn_hold(fnp); 3980 mutex_exit(&parent->fn_lock); 3981 return (fnp); 3982 } 3983 } 3984 3985 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP); 3986 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL); 3987 fnp->fn_parent = parent; 3988 if (parent != NULL) 3989 fn_hold(parent); 3990 fnp->fn_len = strlen(name); 3991 ASSERT(fnp->fn_len < MAXNAMELEN); 3992 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP); 3993 (void) strcpy(fnp->fn_name, name); 3994 fnp->fn_refcnt = 1; 3995 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t), 3996 offsetof(nfs4_fname_t, fn_tree)); 3997 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3998 "fn_get %p:%s, a new nfs4_fname_t!", 3999 (void *)fnp, fnp->fn_name)); 4000 if (parent != NULL) { 4001 avl_insert(&parent->fn_children, fnp, where); 4002 mutex_exit(&parent->fn_lock); 4003 } 4004 4005 return (fnp); 4006 } 4007 4008 void 4009 fn_hold(nfs4_fname_t *fnp) 4010 { 4011 atomic_add_32(&fnp->fn_refcnt, 1); 4012 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4013 "fn_hold %p:%s, new refcnt=%d", 4014 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 4015 } 4016 4017 /* 4018 * Decrement the reference count of the given fname, and destroy it if its 4019 * reference count goes to zero. Nulls out the given pointer. 
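 *
 * Typical usage (names purely illustrative):
 *	nfs4_fname_t *fn = fn_get(parent, "somename");
 *	... use fn ...
 *	fn_rele(&fn);		(fn is NULL on return)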
/*
 * Decrement the reference count of the given fname, and destroy it if its
 * reference count goes to zero.  Nulls out the given pointer.
 */

void
fn_rele(nfs4_fname_t **fnpp)
{
	nfs4_fname_t *parent;
	uint32_t newref;
	nfs4_fname_t *fnp;

recur:
	fnp = *fnpp;
	*fnpp = NULL;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		mutex_enter(&parent->fn_lock);	/* prevent new references */
	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
	if (newref > 0) {
		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
		    "fn_rele %p:%s, new refcnt=%d",
		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
		if (parent != NULL)
			mutex_exit(&parent->fn_lock);
		mutex_exit(&fnp->fn_lock);
		return;
	}

	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_rele %p:%s, last reference, deleting...",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
	}
	kmem_free(fnp->fn_name, fnp->fn_len + 1);
	mutex_destroy(&fnp->fn_lock);
	avl_destroy(&fnp->fn_children);
	kmem_free(fnp, sizeof (nfs4_fname_t));
	/*
	 * Recursively fn_rele the parent.
	 * Use goto instead of a recursive call to avoid stack overflow.
	 */
	if (parent != NULL) {
		fnpp = &parent;
		goto recur;
	}
}

/*
 * Returns the single component name of the given fname, in a MAXNAMELEN
 * string buffer, which the caller is responsible for freeing.  Note that
 * the name may become invalid as a result of fn_move().
 */

char *
fn_name(nfs4_fname_t *fnp)
{
	char *name;

	ASSERT(fnp->fn_len < MAXNAMELEN);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	mutex_enter(&fnp->fn_lock);
	(void) strcpy(name, fnp->fn_name);
	mutex_exit(&fnp->fn_lock);

	return (name);
}


/*
 * fn_path_realloc
 *
 * This function, used only by fn_path, constructs a new string which
 * looks like "prepend" + "/" + "current", by allocating a new string
 * and freeing the old one.
 */
static void
fn_path_realloc(char **curses, char *prepend)
{
	int len, curlen = 0;
	char *news;

	if (*curses == NULL) {
		/*
		 * Prime the pump, allocate just the
		 * space for prepend and return that.
		 */
		len = strlen(prepend) + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
	} else {
		/*
		 * Allocate the space for a new string;
		 * +1 +1 is for the "/" and the NULL
		 * byte at the end of it all.
		 */
		curlen = strlen(*curses);
		len = curlen + strlen(prepend) + 1 + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
		(void) strcat(news, "/");
		(void) strcat(news, *curses);
		kmem_free(*curses, curlen + 1);
	}
	*curses = news;
}
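/*
 * Worked example of fn_path_realloc() (illustrative only): starting
 * with path == NULL and walking from a leaf named "c" up through its
 * ancestors "b" and "a", the successive calls build the string
 * front-to-back:
 *
 *	fn_path_realloc(&path, "c");	path is now "c"
 *	fn_path_realloc(&path, "b");	path is now "b/c"
 *	fn_path_realloc(&path, "a");	path is now "a/b/c"
 *
 * Each call allocates a new buffer and frees the previous one, so the
 * caller only ever owns the most recent string.
 */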
/*
 * Returns the path name (starting from the fs root) for the given fname.
 * The caller is responsible for freeing.  Note that the path may be or
 * become invalid as a result of fn_move().
 */

char *
fn_path(nfs4_fname_t *fnp)
{
	char *path;
	nfs4_fname_t *nextfnp;

	if (fnp == NULL)
		return (NULL);

	path = NULL;

	/* walk up the tree constructing the pathname. */

	fn_hold(fnp);			/* adjust for later rele */
	do {
		mutex_enter(&fnp->fn_lock);
		/*
		 * Add fn_name in front of the current path
		 */
		fn_path_realloc(&path, fnp->fn_name);
		nextfnp = fnp->fn_parent;
		if (nextfnp != NULL)
			fn_hold(nextfnp);
		mutex_exit(&fnp->fn_lock);
		fn_rele(&fnp);
		fnp = nextfnp;
	} while (fnp != NULL);

	return (path);
}

/*
 * Return a reference to the parent of the given fname, which the caller is
 * responsible for eventually releasing.
 */

nfs4_fname_t *
fn_parent(nfs4_fname_t *fnp)
{
	nfs4_fname_t *parent;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		fn_hold(parent);
	mutex_exit(&fnp->fn_lock);

	return (parent);
}

/*
 * Update fnp so that its parent is newparent and its name is newname.
 */

void
fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
{
	nfs4_fname_t *parent, *tmpfnp;
	ssize_t newlen;
	nfs4_fname_t key;
	avl_index_t where;

	/*
	 * This assert exists to catch the client trying to rename
	 * a dir to be a child of itself.  This happened at a recent
	 * bakeoff against a 3rd party (broken) server which allowed
	 * the rename to succeed.  If it trips it means that:
	 *	a) the code in nfs4rename that detects this case is broken
	 *	b) the server is broken (since it allowed the bogus rename)
	 *
	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
	 * panic below from:  mutex_enter(&newparent->fn_lock);
	 */
	ASSERT(fnp != newparent);

	/*
	 * Remove fnp from its current parent, change its name, then add it
	 * to newparent.
	 */
	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	mutex_enter(&parent->fn_lock);
	avl_remove(&parent->fn_children, fnp);
	mutex_exit(&parent->fn_lock);
	fn_rele(&fnp->fn_parent);

	newlen = strlen(newname);
	if (newlen != fnp->fn_len) {
		ASSERT(newlen < MAXNAMELEN);
		kmem_free(fnp->fn_name, fnp->fn_len + 1);
		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
		fnp->fn_len = newlen;
	}
	(void) strcpy(fnp->fn_name, newname);

again:
	mutex_enter(&newparent->fn_lock);
	key.fn_name = fnp->fn_name;
	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
	if (tmpfnp != NULL) {
		/*
		 * This could be due to a file that was unlinked while
		 * open, or perhaps the rnode is in the free list.  Remove
		 * it from newparent and let it go away on its own.  The
		 * contorted code is to deal with lock order issues and
		 * race conditions.
		 */
		fn_hold(tmpfnp);
		mutex_exit(&newparent->fn_lock);
		mutex_enter(&tmpfnp->fn_lock);
		if (tmpfnp->fn_parent == newparent) {
			mutex_enter(&newparent->fn_lock);
			avl_remove(&newparent->fn_children, tmpfnp);
			mutex_exit(&newparent->fn_lock);
			fn_rele(&tmpfnp->fn_parent);
		}
		mutex_exit(&tmpfnp->fn_lock);
		fn_rele(&tmpfnp);
		goto again;
	}
	fnp->fn_parent = newparent;
	fn_hold(newparent);
	avl_insert(&newparent->fn_children, fnp, where);
	mutex_exit(&newparent->fn_lock);
	mutex_exit(&fnp->fn_lock);
}
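/*
 * Illustrative use of fn_move() (hypothetical locals, not code from
 * this file): after a successful rename of the object named by "fnp"
 * into the directory whose fname is "newdirfn", the cached name tree
 * would be updated with
 *
 *	fn_move(fnp, newdirfn, "newname");
 *
 * fn_move() drops the hold on the old parent and takes its own hold
 * on the new one.
 */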
#ifdef DEBUG
/*
 * Return non-zero if the type information makes sense for the given vnode.
 * Otherwise panic.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	if (nfs4_vtype_debug && vp->v_type != VNON &&
	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
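/*
 * Note: since nfs4_consistent_type() panics on an inconsistency and
 * otherwise returns 1, it lends itself to being invoked under ASSERT(),
 * e.g.
 *
 *	ASSERT(nfs4_consistent_type(vp));
 *
 * so that the call is compiled out entirely in non-DEBUG kernels,
 * matching the fact that the function itself only exists #ifdef DEBUG.
 * (Illustrative usage; "vp" is whichever vnode the caller is checking.)
 */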