1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 
27 * All Rights Reserved 28 */ 29 30 #include <sys/param.h> 31 #include <sys/types.h> 32 #include <sys/systm.h> 33 #include <sys/thread.h> 34 #include <sys/t_lock.h> 35 #include <sys/time.h> 36 #include <sys/vnode.h> 37 #include <sys/vfs.h> 38 #include <sys/errno.h> 39 #include <sys/buf.h> 40 #include <sys/stat.h> 41 #include <sys/cred.h> 42 #include <sys/kmem.h> 43 #include <sys/debug.h> 44 #include <sys/dnlc.h> 45 #include <sys/vmsystm.h> 46 #include <sys/flock.h> 47 #include <sys/share.h> 48 #include <sys/cmn_err.h> 49 #include <sys/tiuser.h> 50 #include <sys/sysmacros.h> 51 #include <sys/callb.h> 52 #include <sys/acl.h> 53 #include <sys/kstat.h> 54 #include <sys/signal.h> 55 #include <sys/disp.h> 56 #include <sys/atomic.h> 57 #include <sys/list.h> 58 #include <sys/sdt.h> 59 60 #include <rpc/types.h> 61 #include <rpc/xdr.h> 62 #include <rpc/auth.h> 63 #include <rpc/clnt.h> 64 65 #include <nfs/nfs.h> 66 #include <nfs/nfs_clnt.h> 67 #include <nfs/nfs_acl.h> 68 69 #include <nfs/nfs4.h> 70 #include <nfs/rnode4.h> 71 #include <nfs/nfs4_clnt.h> 72 73 #include <vm/hat.h> 74 #include <vm/as.h> 75 #include <vm/page.h> 76 #include <vm/pvn.h> 77 #include <vm/seg.h> 78 #include <vm/seg_map.h> 79 #include <vm/seg_vn.h> 80 81 #include <sys/ddi.h> 82 83 /* 84 * Arguments to page-flush thread. 
*/
typedef struct {
	vnode_t *vp;	/* file to flush; nfs4_purge_caches() takes a hold */
	cred_t *cr;	/* creds for the flush; nfs4_purge_caches() takes a hold */
} pgflush_t;

#ifdef DEBUG
/* Debug switches; only present in DEBUG kernels. */
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

/*
 * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that the value
 * stored under it is NULL, i.e. that the ..._end_op() call has been done.
 */
uint_t nfs4_tsd_key;
#endif

/* NOTE(review): neither variable is used in the visible part of the file. */
static time_t	nfs4_client_resumed = 0;
static	callb_id_t cid = 0;

/* Forward declarations for file-local routines. */
static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);

/*
 * Per-zone list of NFS v4 mounts.
 * NOTE(review): presumably looked up through mi4_list_key -- confirm.
 */
struct mi4_globals {
	kmutex_t	mig_lock;  /* lock protecting mig_list */
	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	boolean_t	mig_destructor_called;
};

static zone_key_t mi4_list_key;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_time_attr_inval)
 * which tells whether the attributes are valid. The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached. This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time. There are minimum and maximum
 * timeout values that can be set per mount point.
 */

/*
 * If a cache purge is in progress, wait for it to finish.
 *
 * The current thread must not be in the middle of an
 * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
 * between this thread, a recovery thread, and the page flush thread.
138 */ 139 int 140 nfs4_waitfor_purge_complete(vnode_t *vp) 141 { 142 rnode4_t *rp; 143 k_sigset_t smask; 144 145 rp = VTOR4(vp); 146 if ((rp->r_serial != NULL && rp->r_serial != curthread) || 147 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) { 148 mutex_enter(&rp->r_statelock); 149 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 150 while ((rp->r_serial != NULL && rp->r_serial != curthread) || 151 ((rp->r_flags & R4PGFLUSH) && 152 rp->r_pgflush != curthread)) { 153 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 154 sigunintr(&smask); 155 mutex_exit(&rp->r_statelock); 156 return (EINTR); 157 } 158 } 159 sigunintr(&smask); 160 mutex_exit(&rp->r_statelock); 161 } 162 return (0); 163 } 164 165 /* 166 * Validate caches by checking cached attributes. If they have timed out, 167 * then get new attributes from the server. As a side effect, cache 168 * invalidation is done if the attributes have changed. 169 * 170 * If the attributes have not timed out and if there is a cache 171 * invalidation being done by some other thread, then wait until that 172 * thread has completed the cache invalidation. 173 */ 174 int 175 nfs4_validate_caches(vnode_t *vp, cred_t *cr) 176 { 177 int error; 178 nfs4_ga_res_t gar; 179 180 if (ATTRCACHE4_VALID(vp)) { 181 error = nfs4_waitfor_purge_complete(vp); 182 if (error) 183 return (error); 184 return (0); 185 } 186 187 return (nfs4_getattr_otw(vp, &gar, cr, 0)); 188 } 189 190 /* 191 * Fill in attribute from the cache. 192 * If valid, then return 0 to indicate that no error occurred, 193 * otherwise return 1 to indicate that an error occurred. 
194 */ 195 static int 196 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap) 197 { 198 rnode4_t *rp; 199 200 rp = VTOR4(vp); 201 mutex_enter(&rp->r_statelock); 202 mutex_enter(&rp->r_statev4_lock); 203 if (ATTRCACHE4_VALID(vp)) { 204 mutex_exit(&rp->r_statev4_lock); 205 /* 206 * Cached attributes are valid 207 */ 208 *vap = rp->r_attr; 209 mutex_exit(&rp->r_statelock); 210 return (0); 211 } 212 mutex_exit(&rp->r_statev4_lock); 213 mutex_exit(&rp->r_statelock); 214 return (1); 215 } 216 217 218 /* 219 * If returned error is ESTALE flush all caches. The nfs4_purge_caches() 220 * call is synchronous because all the pages were invalidated by the 221 * nfs4_invalidate_pages() call. 222 */ 223 void 224 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr) 225 { 226 struct rnode4 *rp = VTOR4(vp); 227 228 /* Ensure that the ..._end_op() call has been done */ 229 ASSERT(tsd_get(nfs4_tsd_key) == NULL); 230 231 if (errno != ESTALE) 232 return; 233 234 mutex_enter(&rp->r_statelock); 235 rp->r_flags |= R4STALE; 236 if (!rp->r_error) 237 rp->r_error = errno; 238 mutex_exit(&rp->r_statelock); 239 if (nfs4_has_pages(vp)) 240 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 241 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE); 242 } 243 244 /* 245 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the 246 * page purge is done asynchronously. 247 */ 248 void 249 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg) 250 { 251 rnode4_t *rp; 252 char *contents; 253 vnode_t *xattr; 254 int size; 255 int pgflush; /* are we the page flush thread? */ 256 257 /* 258 * Purge the DNLC for any entries which refer to this file. 259 */ 260 if (vp->v_count > 1 && 261 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC)) 262 dnlc_purge_vp(vp); 263 264 /* 265 * Clear any readdir state bits and purge the readlink response cache. 
266 */ 267 rp = VTOR4(vp); 268 mutex_enter(&rp->r_statelock); 269 rp->r_flags &= ~R4LOOKUP; 270 contents = rp->r_symlink.contents; 271 size = rp->r_symlink.size; 272 rp->r_symlink.contents = NULL; 273 274 xattr = rp->r_xattr_dir; 275 rp->r_xattr_dir = NULL; 276 277 /* 278 * Purge pathconf cache too. 279 */ 280 rp->r_pathconf.pc4_xattr_valid = 0; 281 rp->r_pathconf.pc4_cache_valid = 0; 282 283 pgflush = (curthread == rp->r_pgflush); 284 mutex_exit(&rp->r_statelock); 285 286 if (contents != NULL) { 287 288 kmem_free((void *)contents, size); 289 } 290 291 if (xattr != NULL) 292 VN_RELE(xattr); 293 294 /* 295 * Flush the page cache. If the current thread is the page flush 296 * thread, don't initiate a new page flush. There's no need for 297 * it, and doing it correctly is hard. 298 */ 299 if (nfs4_has_pages(vp) && !pgflush) { 300 if (!asyncpg) { 301 (void) nfs4_waitfor_purge_complete(vp); 302 nfs4_flush_pages(vp, cr); 303 } else { 304 pgflush_t *args; 305 306 /* 307 * We don't hold r_statelock while creating the 308 * thread, in case the call blocks. So we use a 309 * flag to indicate that a page flush thread is 310 * active. 311 */ 312 mutex_enter(&rp->r_statelock); 313 if (rp->r_flags & R4PGFLUSH) { 314 mutex_exit(&rp->r_statelock); 315 } else { 316 rp->r_flags |= R4PGFLUSH; 317 mutex_exit(&rp->r_statelock); 318 319 args = kmem_alloc(sizeof (pgflush_t), 320 KM_SLEEP); 321 args->vp = vp; 322 VN_HOLD(args->vp); 323 args->cr = cr; 324 crhold(args->cr); 325 (void) zthread_create(NULL, 0, 326 nfs4_pgflush_thread, args, 0, 327 minclsyspri); 328 } 329 } 330 } 331 332 /* 333 * Flush the readdir response cache. 334 */ 335 nfs4_purge_rddir_cache(vp); 336 } 337 338 /* 339 * Invalidate all pages for the given file, after writing back the dirty 340 * ones. 
341 */ 342 343 void 344 nfs4_flush_pages(vnode_t *vp, cred_t *cr) 345 { 346 int error; 347 rnode4_t *rp = VTOR4(vp); 348 349 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 350 if (error == ENOSPC || error == EDQUOT) { 351 mutex_enter(&rp->r_statelock); 352 if (!rp->r_error) 353 rp->r_error = error; 354 mutex_exit(&rp->r_statelock); 355 } 356 } 357 358 /* 359 * Page flush thread. 360 */ 361 362 static void 363 nfs4_pgflush_thread(pgflush_t *args) 364 { 365 rnode4_t *rp = VTOR4(args->vp); 366 367 /* remember which thread we are, so we don't deadlock ourselves */ 368 mutex_enter(&rp->r_statelock); 369 ASSERT(rp->r_pgflush == NULL); 370 rp->r_pgflush = curthread; 371 mutex_exit(&rp->r_statelock); 372 373 nfs4_flush_pages(args->vp, args->cr); 374 375 mutex_enter(&rp->r_statelock); 376 rp->r_pgflush = NULL; 377 rp->r_flags &= ~R4PGFLUSH; 378 cv_broadcast(&rp->r_cv); 379 mutex_exit(&rp->r_statelock); 380 381 VN_RELE(args->vp); 382 crfree(args->cr); 383 kmem_free(args, sizeof (pgflush_t)); 384 zthread_exit(); 385 } 386 387 /* 388 * Purge the readdir cache of all entries which are not currently 389 * being filled. 390 */ 391 void 392 nfs4_purge_rddir_cache(vnode_t *vp) 393 { 394 rnode4_t *rp; 395 396 rp = VTOR4(vp); 397 398 mutex_enter(&rp->r_statelock); 399 rp->r_direof = NULL; 400 rp->r_flags &= ~R4LOOKUP; 401 rp->r_flags |= R4READDIRWATTR; 402 rddir4_cache_purge(rp); 403 mutex_exit(&rp->r_statelock); 404 } 405 406 /* 407 * Set attributes cache for given vnode using virtual attributes. There is 408 * no cache validation, but if the attributes are deemed to be stale, they 409 * are ignored. This corresponds to nfs3_attrcache(). 410 * 411 * Set the timeout value on the attribute cache and fill it 412 * with the passed in attributes. 
413 */ 414 void 415 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t) 416 { 417 rnode4_t *rp = VTOR4(vp); 418 419 mutex_enter(&rp->r_statelock); 420 if (rp->r_time_attr_saved <= t) 421 nfs4_attrcache_va(vp, garp, FALSE); 422 mutex_exit(&rp->r_statelock); 423 } 424 425 /* 426 * Use the passed in virtual attributes to check to see whether the 427 * data and metadata caches are valid, cache the new attributes, and 428 * then do the cache invalidation if required. 429 * 430 * The cache validation and caching of the new attributes is done 431 * atomically via the use of the mutex, r_statelock. If required, 432 * the cache invalidation is done atomically w.r.t. the cache 433 * validation and caching of the attributes via the pseudo lock, 434 * r_serial. 435 * 436 * This routine is used to do cache validation and attributes caching 437 * for operations with a single set of post operation attributes. 438 */ 439 440 void 441 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp, 442 hrtime_t t, cred_t *cr, int async, 443 change_info4 *cinfo) 444 { 445 rnode4_t *rp; 446 int mtime_changed = 0; 447 int ctime_changed = 0; 448 vsecattr_t *vsp; 449 int was_serial, set_time_cache_inval, recov; 450 vattr_t *vap = &garp->n4g_va; 451 mntinfo4_t *mi = VTOMI4(vp); 452 len_t preattr_rsize; 453 boolean_t writemodify_set = B_FALSE; 454 boolean_t cachepurge_set = B_FALSE; 455 456 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid); 457 458 /* Is curthread the recovery thread? */ 459 mutex_enter(&mi->mi_lock); 460 recov = (VTOMI4(vp)->mi_recovthread == curthread); 461 mutex_exit(&mi->mi_lock); 462 463 rp = VTOR4(vp); 464 mutex_enter(&rp->r_statelock); 465 was_serial = (rp->r_serial == curthread); 466 if (rp->r_serial != NULL && !was_serial) { 467 /* 468 * Purge current attrs and bail out to avoid potential deadlock 469 * between another thread caching attrs (r_serial thread), this 470 * thread, and a thread trying to read or write pages. 
471 */ 472 PURGE_ATTRCACHE4_LOCKED(rp); 473 mutex_exit(&rp->r_statelock); 474 return; 475 } 476 477 /* 478 * If there is a page flush thread, the current thread needs to 479 * bail out, to prevent a possible deadlock between the current 480 * thread (which might be in a start_op/end_op region), the 481 * recovery thread, and the page flush thread. Expire the 482 * attribute cache, so that any attributes the current thread was 483 * going to set are not lost. 484 */ 485 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) { 486 PURGE_ATTRCACHE4_LOCKED(rp); 487 mutex_exit(&rp->r_statelock); 488 return; 489 } 490 491 if (rp->r_time_attr_saved > t) { 492 /* 493 * Attributes have been cached since these attributes were 494 * probably made. If there is an inconsistency in what is 495 * cached, mark them invalid. If not, don't act on them. 496 */ 497 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size)) 498 PURGE_ATTRCACHE4_LOCKED(rp); 499 mutex_exit(&rp->r_statelock); 500 return; 501 } 502 set_time_cache_inval = 0; 503 if (cinfo) { 504 /* 505 * Only directory modifying callers pass non-NULL cinfo. 506 */ 507 ASSERT(vp->v_type == VDIR); 508 /* 509 * If the cache timeout either doesn't exist or hasn't expired, 510 * and dir didn't changed on server before dirmod op 511 * and dir didn't change after dirmod op but before getattr 512 * then there's a chance that the client's cached data for 513 * this object is current (not stale). No immediate cache 514 * flush is required. 515 * 516 */ 517 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) && 518 cinfo->before == rp->r_change && 519 (garp->n4g_change_valid && 520 cinfo->after == garp->n4g_change)) { 521 522 /* 523 * If atomic isn't set, then the before/after info 524 * cannot be blindly trusted. For this case, we tell 525 * nfs4_attrcache_va to cache the attrs but also 526 * establish an absolute maximum cache timeout. When 527 * the timeout is reached, caches will be flushed. 528 */ 529 if (! 
cinfo->atomic) 530 set_time_cache_inval = 1; 531 } else { 532 533 /* 534 * We're not sure exactly what changed, but we know 535 * what to do. flush all caches for dir. remove the 536 * attr timeout. 537 * 538 * a) timeout expired. flush all caches. 539 * b) r_change != cinfo.before. flush all caches. 540 * c) r_change == cinfo.before, but cinfo.after != 541 * post-op getattr(change). flush all caches. 542 * d) post-op getattr(change) not provided by server. 543 * flush all caches. 544 */ 545 mtime_changed = 1; 546 ctime_changed = 1; 547 rp->r_time_cache_inval = 0; 548 } 549 } else { 550 /* 551 * Write thread after writing data to file on remote server, 552 * will always set R4WRITEMODIFIED to indicate that file on 553 * remote server was modified with a WRITE operation and would 554 * have marked attribute cache as timed out. If R4WRITEMODIFIED 555 * is set, then do not check for mtime and ctime change. 556 */ 557 if (!(rp->r_flags & R4WRITEMODIFIED)) { 558 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size)) 559 mtime_changed = 1; 560 561 if (rp->r_attr.va_ctime.tv_sec != 562 vap->va_ctime.tv_sec || 563 rp->r_attr.va_ctime.tv_nsec != 564 vap->va_ctime.tv_nsec) 565 ctime_changed = 1; 566 567 /* 568 * If the change attribute was not provided by server 569 * or it differs, then flush all caches. 570 */ 571 if (!garp->n4g_change_valid || 572 rp->r_change != garp->n4g_change) { 573 mtime_changed = 1; 574 ctime_changed = 1; 575 } 576 } else { 577 writemodify_set = B_TRUE; 578 } 579 } 580 581 preattr_rsize = rp->r_size; 582 583 nfs4_attrcache_va(vp, garp, set_time_cache_inval); 584 585 /* 586 * If we have updated filesize in nfs4_attrcache_va, as soon as we 587 * drop statelock we will be in transition of purging all 588 * our caches and updating them. It is possible for another 589 * thread to pick this new file size and read in zeroed data. 590 * stall other threads till cache purge is complete. 
591 */ 592 if ((!cinfo) && (rp->r_size != preattr_rsize)) { 593 /* 594 * If R4WRITEMODIFIED was set and we have updated the file 595 * size, Server's returned file size need not necessarily 596 * be because of this Client's WRITE. We need to purge 597 * all caches. 598 */ 599 if (writemodify_set) 600 mtime_changed = 1; 601 602 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) { 603 rp->r_flags |= R4INCACHEPURGE; 604 cachepurge_set = B_TRUE; 605 } 606 } 607 608 if (!mtime_changed && !ctime_changed) { 609 mutex_exit(&rp->r_statelock); 610 return; 611 } 612 613 rp->r_serial = curthread; 614 615 mutex_exit(&rp->r_statelock); 616 617 /* 618 * If we're the recov thread, then force async nfs4_purge_caches 619 * to avoid potential deadlock. 620 */ 621 if (mtime_changed) 622 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async); 623 624 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) { 625 mutex_enter(&rp->r_statelock); 626 rp->r_flags &= ~R4INCACHEPURGE; 627 cv_broadcast(&rp->r_cv); 628 mutex_exit(&rp->r_statelock); 629 cachepurge_set = B_FALSE; 630 } 631 632 if (ctime_changed) { 633 (void) nfs4_access_purge_rp(rp); 634 if (rp->r_secattr != NULL) { 635 mutex_enter(&rp->r_statelock); 636 vsp = rp->r_secattr; 637 rp->r_secattr = NULL; 638 mutex_exit(&rp->r_statelock); 639 if (vsp != NULL) 640 nfs4_acl_free_cache(vsp); 641 } 642 } 643 644 if (!was_serial) { 645 mutex_enter(&rp->r_statelock); 646 rp->r_serial = NULL; 647 cv_broadcast(&rp->r_cv); 648 mutex_exit(&rp->r_statelock); 649 } 650 } 651 652 /* 653 * Set attributes cache for given vnode using virtual attributes. 654 * 655 * Set the timeout value on the attribute cache and fill it 656 * with the passed in attributes. 657 * 658 * The caller must be holding r_statelock. 
659 */ 660 static void 661 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout) 662 { 663 rnode4_t *rp; 664 mntinfo4_t *mi; 665 hrtime_t delta; 666 hrtime_t now; 667 vattr_t *vap = &garp->n4g_va; 668 669 rp = VTOR4(vp); 670 671 ASSERT(MUTEX_HELD(&rp->r_statelock)); 672 ASSERT(vap->va_mask == AT_ALL); 673 674 /* Switch to master before checking v_flag */ 675 if (IS_SHADOW(vp, rp)) 676 vp = RTOV4(rp); 677 678 now = gethrtime(); 679 680 mi = VTOMI4(vp); 681 682 /* 683 * Only establish a new cache timeout (if requested). Never 684 * extend a timeout. Never clear a timeout. Clearing a timeout 685 * is done by nfs4_update_dircaches (ancestor in our call chain) 686 */ 687 if (set_cache_timeout && ! rp->r_time_cache_inval) 688 rp->r_time_cache_inval = now + mi->mi_acdirmax; 689 690 /* 691 * Delta is the number of nanoseconds that we will 692 * cache the attributes of the file. It is based on 693 * the number of nanoseconds since the last time that 694 * we detected a change. The assumption is that files 695 * that changed recently are likely to change again. 696 * There is a minimum and a maximum for regular files 697 * and for directories which is enforced though. 698 * 699 * Using the time since last change was detected 700 * eliminates direct comparison or calculation 701 * using mixed client and server times. NFS does 702 * not make any assumptions regarding the client 703 * and server clocks being synchronized. 
704 */ 705 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec || 706 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec || 707 vap->va_size != rp->r_attr.va_size) { 708 rp->r_time_attr_saved = now; 709 } 710 711 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE)) 712 delta = 0; 713 else { 714 delta = now - rp->r_time_attr_saved; 715 if (vp->v_type == VDIR) { 716 if (delta < mi->mi_acdirmin) 717 delta = mi->mi_acdirmin; 718 else if (delta > mi->mi_acdirmax) 719 delta = mi->mi_acdirmax; 720 } else { 721 if (delta < mi->mi_acregmin) 722 delta = mi->mi_acregmin; 723 else if (delta > mi->mi_acregmax) 724 delta = mi->mi_acregmax; 725 } 726 } 727 rp->r_time_attr_inval = now + delta; 728 729 rp->r_attr = *vap; 730 if (garp->n4g_change_valid) 731 rp->r_change = garp->n4g_change; 732 733 /* 734 * The attributes that were returned may be valid and can 735 * be used, but they may not be allowed to be cached. 736 * Reset the timers to cause immediate invalidation and 737 * clear r_change so no VERIFY operations will suceed 738 */ 739 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) { 740 rp->r_time_attr_inval = now; 741 rp->r_time_attr_saved = now; 742 rp->r_change = 0; 743 } 744 745 /* 746 * If mounted_on_fileid returned AND the object is a stub, 747 * then set object's va_nodeid to the mounted over fid 748 * returned by server. 749 * 750 * If mounted_on_fileid not provided/supported, then 751 * just set it to 0 for now. Eventually it would be 752 * better to set it to a hashed version of FH. This 753 * would probably be good enough to provide a unique 754 * fid/d_ino within a dir. 755 * 756 * We don't need to carry mounted_on_fileid in the 757 * rnode as long as the client never requests fileid 758 * without also requesting mounted_on_fileid. For 759 * now, it stays. 
760 */ 761 if (garp->n4g_mon_fid_valid) { 762 rp->r_mntd_fid = garp->n4g_mon_fid; 763 764 if (RP_ISSTUB(rp)) 765 rp->r_attr.va_nodeid = rp->r_mntd_fid; 766 } 767 768 /* 769 * Check to see if there are valid pathconf bits to 770 * cache in the rnode. 771 */ 772 if (garp->n4g_ext_res) { 773 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) { 774 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4; 775 } else { 776 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) { 777 rp->r_pathconf.pc4_xattr_valid = TRUE; 778 rp->r_pathconf.pc4_xattr_exists = 779 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists; 780 } 781 } 782 } 783 /* 784 * Update the size of the file if there is no cached data or if 785 * the cached data is clean and there is no data being written 786 * out. 787 */ 788 if (rp->r_size != vap->va_size && 789 (!vn_has_cached_data(vp) || 790 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) { 791 rp->r_size = vap->va_size; 792 } 793 nfs_setswaplike(vp, vap); 794 rp->r_flags &= ~R4WRITEMODIFIED; 795 } 796 797 /* 798 * Get attributes over-the-wire and update attributes cache 799 * if no error occurred in the over-the-wire operation. 800 * Return 0 if successful, otherwise error. 
801 */ 802 int 803 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl) 804 { 805 mntinfo4_t *mi = VTOMI4(vp); 806 hrtime_t t; 807 nfs4_recov_state_t recov_state; 808 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 809 810 recov_state.rs_flags = 0; 811 recov_state.rs_num_retry_despite_err = 0; 812 813 /* Save the original mount point security flavor */ 814 (void) save_mnt_secinfo(mi->mi_curr_serv); 815 816 recov_retry: 817 818 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, 819 &recov_state, NULL))) { 820 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 821 return (e.error); 822 } 823 824 t = gethrtime(); 825 826 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl); 827 828 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) { 829 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 830 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) { 831 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, 832 &recov_state, 1); 833 goto recov_retry; 834 } 835 } 836 837 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0); 838 839 if (!e.error) { 840 if (e.stat == NFS4_OK) { 841 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 842 } else { 843 e.error = geterrno4(e.stat); 844 845 nfs4_purge_stale_fh(e.error, vp, cr); 846 } 847 } 848 849 /* 850 * If getattr a node that is a stub for a crossed 851 * mount point, keep the original secinfo flavor for 852 * the current file system, not the crossed one. 853 */ 854 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 855 856 return (e.error); 857 } 858 859 /* 860 * Generate a compound to get attributes over-the-wire. 
861 */ 862 void 863 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp, 864 nfs4_error_t *ep, cred_t *cr, int get_acl) 865 { 866 COMPOUND4args_clnt args; 867 COMPOUND4res_clnt res; 868 int doqueue; 869 rnode4_t *rp = VTOR4(vp); 870 nfs_argop4 argop[2]; 871 872 args.ctag = TAG_GETATTR; 873 874 args.array_len = 2; 875 args.array = argop; 876 877 /* putfh */ 878 argop[0].argop = OP_CPUTFH; 879 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 880 881 /* getattr */ 882 /* 883 * Unlike nfs version 2 and 3, where getattr returns all the 884 * attributes, nfs version 4 returns only the ones explicitly 885 * asked for. This creates problems, as some system functions 886 * (e.g. cache check) require certain attributes and if the 887 * cached node lacks some attributes such as uid/gid, it can 888 * affect system utilities (e.g. "ls") that rely on the information 889 * to be there. This can lead to anything from system crashes to 890 * corrupted information processed by user apps. 891 * So to ensure that all bases are covered, request at least 892 * the AT_ALL attribute mask. 893 */ 894 argop[1].argop = OP_GETATTR; 895 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 896 if (get_acl) 897 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK; 898 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 899 900 doqueue = 1; 901 902 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep); 903 904 if (ep->error) 905 return; 906 907 if (res.status != NFS4_OK) { 908 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 909 return; 910 } 911 912 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res; 913 914 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 915 } 916 917 /* 918 * Return either cached or remote attributes. If get remote attr 919 * use them to check and invalidate caches, then cache the new attributes. 
920 */ 921 int 922 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr) 923 { 924 int error; 925 rnode4_t *rp; 926 nfs4_ga_res_t gar; 927 928 ASSERT(nfs4_consistent_type(vp)); 929 930 /* 931 * If we've got cached attributes, we're done, otherwise go 932 * to the server to get attributes, which will update the cache 933 * in the process. Either way, use the cached attributes for 934 * the caller's vattr_t. 935 * 936 * Note that we ignore the gar set by the OTW call: the attr caching 937 * code may make adjustments when storing to the rnode, and we want 938 * to see those changes here. 939 */ 940 rp = VTOR4(vp); 941 error = 0; 942 mutex_enter(&rp->r_statelock); 943 if (!ATTRCACHE4_VALID(vp)) { 944 mutex_exit(&rp->r_statelock); 945 error = nfs4_getattr_otw(vp, &gar, cr, 0); 946 mutex_enter(&rp->r_statelock); 947 } 948 949 if (!error) 950 *vap = rp->r_attr; 951 952 /* Return the client's view of file size */ 953 vap->va_size = rp->r_size; 954 955 mutex_exit(&rp->r_statelock); 956 957 ASSERT(nfs4_consistent_type(vp)); 958 959 return (error); 960 } 961 962 int 963 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type, 964 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr) 965 { 966 COMPOUND4args_clnt args; 967 COMPOUND4res_clnt res; 968 int doqueue; 969 nfs_argop4 argop[2]; 970 mntinfo4_t *mi = VTOMI4(vp); 971 bool_t needrecov = FALSE; 972 nfs4_recov_state_t recov_state; 973 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 974 nfs4_ga_ext_res_t *gerp; 975 976 recov_state.rs_flags = 0; 977 recov_state.rs_num_retry_despite_err = 0; 978 979 recov_retry: 980 args.ctag = tag_type; 981 982 args.array_len = 2; 983 args.array = argop; 984 985 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL); 986 if (e.error) 987 return (e.error); 988 989 /* putfh */ 990 argop[0].argop = OP_CPUTFH; 991 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 992 993 /* getattr */ 994 argop[1].argop = OP_GETATTR; 995 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap; 996 
argop[1].nfs_argop4_u.opgetattr.mi = mi; 997 998 doqueue = 1; 999 1000 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1001 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first", 1002 rnode4info(VTOR4(vp)))); 1003 1004 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 1005 1006 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 1007 if (!needrecov && e.error) { 1008 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1009 needrecov); 1010 return (e.error); 1011 } 1012 1013 if (needrecov) { 1014 bool_t abort; 1015 1016 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1017 "nfs4_attr_otw: initiating recovery\n")); 1018 1019 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 1020 NULL, OP_GETATTR, NULL, NULL, NULL); 1021 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1022 needrecov); 1023 if (!e.error) { 1024 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1025 e.error = geterrno4(res.status); 1026 } 1027 if (abort == FALSE) 1028 goto recov_retry; 1029 return (e.error); 1030 } 1031 1032 if (res.status) { 1033 e.error = geterrno4(res.status); 1034 } else { 1035 gerp = garp->n4g_ext_res; 1036 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res, 1037 garp, sizeof (nfs4_ga_res_t)); 1038 garp->n4g_ext_res = gerp; 1039 if (garp->n4g_ext_res && 1040 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res) 1041 bcopy(res.array[1].nfs_resop4_u.opgetattr. 1042 ga_res.n4g_ext_res, 1043 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t)); 1044 } 1045 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1046 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1047 needrecov); 1048 return (e.error); 1049 } 1050 1051 /* 1052 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1053 * for the demand-based allocation of async threads per-mount. The 1054 * nfs_async_timeout is the amount of time a thread will live after it 1055 * becomes idle, unless new I/O requests are received before the thread 1056 * dies. 
See nfs4_async_putpage and nfs4_async_start. 1057 */ 1058 1059 static void nfs4_async_start(struct vfs *); 1060 static void nfs4_async_pgops_start(struct vfs *); 1061 static void nfs4_async_common_start(struct vfs *, int); 1062 1063 static void 1064 free_async_args4(struct nfs4_async_reqs *args) 1065 { 1066 rnode4_t *rp; 1067 1068 if (args->a_io != NFS4_INACTIVE) { 1069 rp = VTOR4(args->a_vp); 1070 mutex_enter(&rp->r_statelock); 1071 rp->r_count--; 1072 if (args->a_io == NFS4_PUTAPAGE || 1073 args->a_io == NFS4_PAGEIO) 1074 rp->r_awcount--; 1075 cv_broadcast(&rp->r_cv); 1076 mutex_exit(&rp->r_statelock); 1077 VN_RELE(args->a_vp); 1078 } 1079 crfree(args->a_cred); 1080 kmem_free(args, sizeof (*args)); 1081 } 1082 1083 /* 1084 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and 1085 * pageout(), running in the global zone, have legitimate reasons to do 1086 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by 1087 * use of a a per-mount "asynchronous requests manager thread" which is 1088 * signaled by the various asynchronous work routines when there is 1089 * asynchronous work to be done. It is responsible for creating new 1090 * worker threads if necessary, and notifying existing worker threads 1091 * that there is work to be done. 1092 * 1093 * In other words, it will "take the specifications from the customers and 1094 * give them to the engineers." 1095 * 1096 * Worker threads die off of their own accord if they are no longer 1097 * needed. 1098 * 1099 * This thread is killed when the zone is going away or the filesystem 1100 * is being unmounted. 
1101 */ 1102 void 1103 nfs4_async_manager(vfs_t *vfsp) 1104 { 1105 callb_cpr_t cprinfo; 1106 mntinfo4_t *mi; 1107 uint_t max_threads; 1108 1109 mi = VFTOMI4(vfsp); 1110 1111 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1112 "nfs4_async_manager"); 1113 1114 mutex_enter(&mi->mi_async_lock); 1115 /* 1116 * We want to stash the max number of threads that this mount was 1117 * allowed so we can use it later when the variable is set to zero as 1118 * part of the zone/mount going away. 1119 * 1120 * We want to be able to create at least one thread to handle 1121 * asynchronous inactive calls. 1122 */ 1123 max_threads = MAX(mi->mi_max_threads, 1); 1124 /* 1125 * We don't want to wait for mi_max_threads to go to zero, since that 1126 * happens as part of a failed unmount, but this thread should only 1127 * exit when the mount is really going away. 1128 * 1129 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be 1130 * attempted: the various _async_*() functions know to do things 1131 * inline if mi_max_threads == 0. Henceforth we just drain out the 1132 * outstanding requests. 1133 * 1134 * Note that we still create zthreads even if we notice the zone is 1135 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone 1136 * shutdown sequence to take slightly longer in some cases, but 1137 * doesn't violate the protocol, as all threads will exit as soon as 1138 * they're done processing the remaining requests. 1139 */ 1140 for (;;) { 1141 while (mi->mi_async_req_count > 0) { 1142 /* 1143 * Paranoia: If the mount started out having 1144 * (mi->mi_max_threads == 0), and the value was 1145 * later changed (via a debugger or somesuch), 1146 * we could be confused since we will think we 1147 * can't create any threads, and the calling 1148 * code (which looks at the current value of 1149 * mi->mi_max_threads, now non-zero) thinks we 1150 * can. 
1151 * 1152 * So, because we're paranoid, we create threads 1153 * up to the maximum of the original and the 1154 * current value. This means that future 1155 * (debugger-induced) alterations of 1156 * mi->mi_max_threads are ignored for our 1157 * purposes, but who told them they could change 1158 * random values on a live kernel anyhow? 1159 */ 1160 if (mi->mi_threads[NFS4_ASYNC_QUEUE] < 1161 MAX(mi->mi_max_threads, max_threads)) { 1162 mi->mi_threads[NFS4_ASYNC_QUEUE]++; 1163 mutex_exit(&mi->mi_async_lock); 1164 MI4_HOLD(mi); 1165 VFS_HOLD(vfsp); /* hold for new thread */ 1166 (void) zthread_create(NULL, 0, nfs4_async_start, 1167 vfsp, 0, minclsyspri); 1168 mutex_enter(&mi->mi_async_lock); 1169 } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] < 1170 NUM_ASYNC_PGOPS_THREADS) { 1171 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++; 1172 mutex_exit(&mi->mi_async_lock); 1173 MI4_HOLD(mi); 1174 VFS_HOLD(vfsp); /* hold for new thread */ 1175 (void) zthread_create(NULL, 0, 1176 nfs4_async_pgops_start, vfsp, 0, 1177 minclsyspri); 1178 mutex_enter(&mi->mi_async_lock); 1179 } 1180 NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv); 1181 ASSERT(mi->mi_async_req_count != 0); 1182 mi->mi_async_req_count--; 1183 } 1184 1185 mutex_enter(&mi->mi_lock); 1186 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) { 1187 mutex_exit(&mi->mi_lock); 1188 break; 1189 } 1190 mutex_exit(&mi->mi_lock); 1191 1192 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1193 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock); 1194 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1195 } 1196 1197 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 1198 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp)); 1199 /* 1200 * Let everyone know we're done. 1201 */ 1202 mi->mi_manager_thread = NULL; 1203 /* 1204 * Wake up the inactive thread. 
1205 */ 1206 cv_broadcast(&mi->mi_inact_req_cv); 1207 /* 1208 * Wake up anyone sitting in nfs4_async_manager_stop() 1209 */ 1210 cv_broadcast(&mi->mi_async_cv); 1211 /* 1212 * There is no explicit call to mutex_exit(&mi->mi_async_lock) 1213 * since CALLB_CPR_EXIT is actually responsible for releasing 1214 * 'mi_async_lock'. 1215 */ 1216 CALLB_CPR_EXIT(&cprinfo); 1217 VFS_RELE(vfsp); /* release thread's hold */ 1218 MI4_RELE(mi); 1219 zthread_exit(); 1220 } 1221 1222 /* 1223 * Signal (and wait for) the async manager thread to clean up and go away. 1224 */ 1225 void 1226 nfs4_async_manager_stop(vfs_t *vfsp) 1227 { 1228 mntinfo4_t *mi = VFTOMI4(vfsp); 1229 1230 mutex_enter(&mi->mi_async_lock); 1231 mutex_enter(&mi->mi_lock); 1232 mi->mi_flags |= MI4_ASYNC_MGR_STOP; 1233 mutex_exit(&mi->mi_lock); 1234 cv_broadcast(&mi->mi_async_reqs_cv); 1235 /* 1236 * Wait for the async manager thread to die. 1237 */ 1238 while (mi->mi_manager_thread != NULL) 1239 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1240 mutex_exit(&mi->mi_async_lock); 1241 } 1242 1243 int 1244 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1245 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1246 u_offset_t, caddr_t, struct seg *, cred_t *)) 1247 { 1248 rnode4_t *rp; 1249 mntinfo4_t *mi; 1250 struct nfs4_async_reqs *args; 1251 1252 rp = VTOR4(vp); 1253 ASSERT(rp->r_freef == NULL); 1254 1255 mi = VTOMI4(vp); 1256 1257 /* 1258 * If addr falls in a different segment, don't bother doing readahead. 1259 */ 1260 if (addr >= seg->s_base + seg->s_size) 1261 return (-1); 1262 1263 /* 1264 * If we can't allocate a request structure, punt on the readahead. 1265 */ 1266 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1267 return (-1); 1268 1269 /* 1270 * If a lock operation is pending, don't initiate any new 1271 * readaheads. Otherwise, bump r_count to indicate the new 1272 * asynchronous I/O. 
1273 */ 1274 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1275 kmem_free(args, sizeof (*args)); 1276 return (-1); 1277 } 1278 mutex_enter(&rp->r_statelock); 1279 rp->r_count++; 1280 mutex_exit(&rp->r_statelock); 1281 nfs_rw_exit(&rp->r_lkserlock); 1282 1283 args->a_next = NULL; 1284 #ifdef DEBUG 1285 args->a_queuer = curthread; 1286 #endif 1287 VN_HOLD(vp); 1288 args->a_vp = vp; 1289 ASSERT(cr != NULL); 1290 crhold(cr); 1291 args->a_cred = cr; 1292 args->a_io = NFS4_READ_AHEAD; 1293 args->a_nfs4_readahead = readahead; 1294 args->a_nfs4_blkoff = blkoff; 1295 args->a_nfs4_seg = seg; 1296 args->a_nfs4_addr = addr; 1297 1298 mutex_enter(&mi->mi_async_lock); 1299 1300 /* 1301 * If asyncio has been disabled, don't bother readahead. 1302 */ 1303 if (mi->mi_max_threads == 0) { 1304 mutex_exit(&mi->mi_async_lock); 1305 goto noasync; 1306 } 1307 1308 /* 1309 * Link request structure into the async list and 1310 * wakeup async thread to do the i/o. 1311 */ 1312 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) { 1313 mi->mi_async_reqs[NFS4_READ_AHEAD] = args; 1314 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1315 } else { 1316 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args; 1317 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1318 } 1319 1320 if (mi->mi_io_kstats) { 1321 mutex_enter(&mi->mi_lock); 1322 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1323 mutex_exit(&mi->mi_lock); 1324 } 1325 1326 mi->mi_async_req_count++; 1327 ASSERT(mi->mi_async_req_count != 0); 1328 cv_signal(&mi->mi_async_reqs_cv); 1329 mutex_exit(&mi->mi_async_lock); 1330 return (0); 1331 1332 noasync: 1333 mutex_enter(&rp->r_statelock); 1334 rp->r_count--; 1335 cv_broadcast(&rp->r_cv); 1336 mutex_exit(&rp->r_statelock); 1337 VN_RELE(vp); 1338 crfree(cr); 1339 kmem_free(args, sizeof (*args)); 1340 return (-1); 1341 } 1342 1343 static void 1344 nfs4_async_start(struct vfs *vfsp) 1345 { 1346 nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE); 1347 } 1348 1349 static void 1350 nfs4_async_pgops_start(struct 
vfs *vfsp) 1351 { 1352 nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE); 1353 } 1354 1355 /* 1356 * The async queues for each mounted file system are arranged as a 1357 * set of queues, one for each async i/o type. Requests are taken 1358 * from the queues in a round-robin fashion. A number of consecutive 1359 * requests are taken from each queue before moving on to the next 1360 * queue. This functionality may allow the NFS Version 2 server to do 1361 * write clustering, even if the client is mixing writes and reads 1362 * because it will take multiple write requests from the queue 1363 * before processing any of the other async i/o types. 1364 * 1365 * XXX The nfs4_async_common_start thread is unsafe in the light of the present 1366 * model defined by cpr to suspend the system. Specifically over the 1367 * wire calls are cpr-unsafe. The thread should be reevaluated in 1368 * case of future updates to the cpr model. 1369 */ 1370 static void 1371 nfs4_async_common_start(struct vfs *vfsp, int async_queue) 1372 { 1373 struct nfs4_async_reqs *args; 1374 mntinfo4_t *mi = VFTOMI4(vfsp); 1375 clock_t time_left = 1; 1376 callb_cpr_t cprinfo; 1377 int i; 1378 extern int nfs_async_timeout; 1379 int async_types; 1380 kcondvar_t *async_work_cv; 1381 1382 if (async_queue == NFS4_ASYNC_QUEUE) { 1383 async_types = NFS4_ASYNC_TYPES; 1384 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]; 1385 } else { 1386 async_types = NFS4_ASYNC_PGOPS_TYPES; 1387 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]; 1388 } 1389 1390 /* 1391 * Dynamic initialization of nfs_async_timeout to allow nfs to be 1392 * built in an implementation independent manner. 1393 */ 1394 if (nfs_async_timeout == -1) 1395 nfs_async_timeout = NFS_ASYNC_TIMEOUT; 1396 1397 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas"); 1398 1399 mutex_enter(&mi->mi_async_lock); 1400 for (;;) { 1401 /* 1402 * Find the next queue containing an entry. 
We start 1403 * at the current queue pointer and then round robin 1404 * through all of them until we either find a non-empty 1405 * queue or have looked through all of them. 1406 */ 1407 for (i = 0; i < async_types; i++) { 1408 args = *mi->mi_async_curr[async_queue]; 1409 if (args != NULL) 1410 break; 1411 mi->mi_async_curr[async_queue]++; 1412 if (mi->mi_async_curr[async_queue] == 1413 &mi->mi_async_reqs[async_types]) { 1414 mi->mi_async_curr[async_queue] = 1415 &mi->mi_async_reqs[0]; 1416 } 1417 } 1418 /* 1419 * If we didn't find a entry, then block until woken up 1420 * again and then look through the queues again. 1421 */ 1422 if (args == NULL) { 1423 /* 1424 * Exiting is considered to be safe for CPR as well 1425 */ 1426 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1427 1428 /* 1429 * Wakeup thread waiting to unmount the file 1430 * system only if all async threads are inactive. 1431 * 1432 * If we've timed-out and there's nothing to do, 1433 * then get rid of this thread. 1434 */ 1435 if (mi->mi_max_threads == 0 || time_left <= 0) { 1436 --mi->mi_threads[async_queue]; 1437 1438 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 && 1439 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0) 1440 cv_signal(&mi->mi_async_cv); 1441 CALLB_CPR_EXIT(&cprinfo); 1442 VFS_RELE(vfsp); /* release thread's hold */ 1443 MI4_RELE(mi); 1444 zthread_exit(); 1445 /* NOTREACHED */ 1446 } 1447 time_left = cv_reltimedwait(async_work_cv, 1448 &mi->mi_async_lock, nfs_async_timeout, 1449 TR_CLOCK_TICK); 1450 1451 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1452 1453 continue; 1454 } else { 1455 time_left = 1; 1456 } 1457 1458 /* 1459 * Remove the request from the async queue and then 1460 * update the current async request queue pointer. If 1461 * the current queue is empty or we have removed enough 1462 * consecutive entries from it, then reset the counter 1463 * for this queue and then move the current pointer to 1464 * the next queue. 
1465 */ 1466 *mi->mi_async_curr[async_queue] = args->a_next; 1467 if (*mi->mi_async_curr[async_queue] == NULL || 1468 --mi->mi_async_clusters[args->a_io] == 0) { 1469 mi->mi_async_clusters[args->a_io] = 1470 mi->mi_async_init_clusters; 1471 mi->mi_async_curr[async_queue]++; 1472 if (mi->mi_async_curr[async_queue] == 1473 &mi->mi_async_reqs[async_types]) { 1474 mi->mi_async_curr[async_queue] = 1475 &mi->mi_async_reqs[0]; 1476 } 1477 } 1478 1479 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) { 1480 mutex_enter(&mi->mi_lock); 1481 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 1482 mutex_exit(&mi->mi_lock); 1483 } 1484 1485 mutex_exit(&mi->mi_async_lock); 1486 1487 /* 1488 * Obtain arguments from the async request structure. 1489 */ 1490 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) { 1491 (*args->a_nfs4_readahead)(args->a_vp, 1492 args->a_nfs4_blkoff, args->a_nfs4_addr, 1493 args->a_nfs4_seg, args->a_cred); 1494 } else if (args->a_io == NFS4_PUTAPAGE) { 1495 (void) (*args->a_nfs4_putapage)(args->a_vp, 1496 args->a_nfs4_pp, args->a_nfs4_off, 1497 args->a_nfs4_len, args->a_nfs4_flags, 1498 args->a_cred); 1499 } else if (args->a_io == NFS4_PAGEIO) { 1500 (void) (*args->a_nfs4_pageio)(args->a_vp, 1501 args->a_nfs4_pp, args->a_nfs4_off, 1502 args->a_nfs4_len, args->a_nfs4_flags, 1503 args->a_cred); 1504 } else if (args->a_io == NFS4_READDIR) { 1505 (void) ((*args->a_nfs4_readdir)(args->a_vp, 1506 args->a_nfs4_rdc, args->a_cred)); 1507 } else if (args->a_io == NFS4_COMMIT) { 1508 (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist, 1509 args->a_nfs4_offset, args->a_nfs4_count, 1510 args->a_cred); 1511 } else if (args->a_io == NFS4_INACTIVE) { 1512 nfs4_inactive_otw(args->a_vp, args->a_cred); 1513 } 1514 1515 /* 1516 * Now, release the vnode and free the credentials 1517 * structure. 1518 */ 1519 free_async_args4(args); 1520 /* 1521 * Reacquire the mutex because it will be needed above. 
1522 */ 1523 mutex_enter(&mi->mi_async_lock); 1524 } 1525 } 1526 1527 /* 1528 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as 1529 * part of VOP_INACTIVE. 1530 */ 1531 1532 void 1533 nfs4_inactive_thread(mntinfo4_t *mi) 1534 { 1535 struct nfs4_async_reqs *args; 1536 callb_cpr_t cprinfo; 1537 vfs_t *vfsp = mi->mi_vfsp; 1538 1539 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1540 "nfs4_inactive_thread"); 1541 1542 for (;;) { 1543 mutex_enter(&mi->mi_async_lock); 1544 args = mi->mi_async_reqs[NFS4_INACTIVE]; 1545 if (args == NULL) { 1546 mutex_enter(&mi->mi_lock); 1547 /* 1548 * We don't want to exit until the async manager is done 1549 * with its work; hence the check for mi_manager_thread 1550 * being NULL. 1551 * 1552 * The async manager thread will cv_broadcast() on 1553 * mi_inact_req_cv when it's done, at which point we'll 1554 * wake up and exit. 1555 */ 1556 if (mi->mi_manager_thread == NULL) 1557 goto die; 1558 mi->mi_flags |= MI4_INACTIVE_IDLE; 1559 mutex_exit(&mi->mi_lock); 1560 cv_signal(&mi->mi_async_cv); 1561 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1562 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock); 1563 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1564 mutex_exit(&mi->mi_async_lock); 1565 } else { 1566 mutex_enter(&mi->mi_lock); 1567 mi->mi_flags &= ~MI4_INACTIVE_IDLE; 1568 mutex_exit(&mi->mi_lock); 1569 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next; 1570 mutex_exit(&mi->mi_async_lock); 1571 nfs4_inactive_otw(args->a_vp, args->a_cred); 1572 crfree(args->a_cred); 1573 kmem_free(args, sizeof (*args)); 1574 } 1575 } 1576 die: 1577 mutex_exit(&mi->mi_lock); 1578 mi->mi_inactive_thread = NULL; 1579 cv_signal(&mi->mi_async_cv); 1580 1581 /* 1582 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since 1583 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'. 
1584 */ 1585 CALLB_CPR_EXIT(&cprinfo); 1586 1587 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 1588 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp)); 1589 1590 MI4_RELE(mi); 1591 zthread_exit(); 1592 /* NOTREACHED */ 1593 } 1594 1595 /* 1596 * nfs_async_stop: 1597 * Wait for all outstanding putpage operations and the inactive thread to 1598 * complete; nfs4_async_stop_sig() without interruptibility. 1599 */ 1600 void 1601 nfs4_async_stop(struct vfs *vfsp) 1602 { 1603 mntinfo4_t *mi = VFTOMI4(vfsp); 1604 1605 /* 1606 * Wait for all outstanding async operations to complete and for 1607 * worker threads to exit. 1608 */ 1609 mutex_enter(&mi->mi_async_lock); 1610 mi->mi_max_threads = 0; 1611 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 1612 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 || 1613 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) 1614 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1615 1616 /* 1617 * Wait for the inactive thread to finish doing what it's doing. It 1618 * won't exit until the last reference to the vfs_t goes away. 1619 */ 1620 if (mi->mi_inactive_thread != NULL) { 1621 mutex_enter(&mi->mi_lock); 1622 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) || 1623 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) { 1624 mutex_exit(&mi->mi_lock); 1625 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1626 mutex_enter(&mi->mi_lock); 1627 } 1628 mutex_exit(&mi->mi_lock); 1629 } 1630 mutex_exit(&mi->mi_async_lock); 1631 } 1632 1633 /* 1634 * nfs_async_stop_sig: 1635 * Wait for all outstanding putpage operations and the inactive thread to 1636 * complete. If a signal is delivered we will abort and return non-zero; 1637 * otherwise return 0. Since this routine is called from nfs4_unmount, we 1638 * need to make it interruptible. 
1639 */ 1640 int 1641 nfs4_async_stop_sig(struct vfs *vfsp) 1642 { 1643 mntinfo4_t *mi = VFTOMI4(vfsp); 1644 ushort_t omax; 1645 bool_t intr = FALSE; 1646 1647 /* 1648 * Wait for all outstanding putpage operations to complete and for 1649 * worker threads to exit. 1650 */ 1651 mutex_enter(&mi->mi_async_lock); 1652 omax = mi->mi_max_threads; 1653 mi->mi_max_threads = 0; 1654 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 1655 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 || 1656 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) { 1657 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) { 1658 intr = TRUE; 1659 goto interrupted; 1660 } 1661 } 1662 1663 /* 1664 * Wait for the inactive thread to finish doing what it's doing. It 1665 * won't exit until the a last reference to the vfs_t goes away. 1666 */ 1667 if (mi->mi_inactive_thread != NULL) { 1668 mutex_enter(&mi->mi_lock); 1669 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) || 1670 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) { 1671 mutex_exit(&mi->mi_lock); 1672 if (!cv_wait_sig(&mi->mi_async_cv, 1673 &mi->mi_async_lock)) { 1674 intr = TRUE; 1675 goto interrupted; 1676 } 1677 mutex_enter(&mi->mi_lock); 1678 } 1679 mutex_exit(&mi->mi_lock); 1680 } 1681 interrupted: 1682 if (intr) 1683 mi->mi_max_threads = omax; 1684 mutex_exit(&mi->mi_async_lock); 1685 1686 return (intr); 1687 } 1688 1689 int 1690 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 1691 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *, 1692 u_offset_t, size_t, int, cred_t *)) 1693 { 1694 rnode4_t *rp; 1695 mntinfo4_t *mi; 1696 struct nfs4_async_reqs *args; 1697 1698 ASSERT(flags & B_ASYNC); 1699 ASSERT(vp->v_vfsp != NULL); 1700 1701 rp = VTOR4(vp); 1702 ASSERT(rp->r_count > 0); 1703 1704 mi = VTOMI4(vp); 1705 1706 /* 1707 * If we can't allocate a request structure, do the putpage 1708 * operation synchronously in this thread's context. 
1709 */ 1710 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1711 goto noasync; 1712 1713 args->a_next = NULL; 1714 #ifdef DEBUG 1715 args->a_queuer = curthread; 1716 #endif 1717 VN_HOLD(vp); 1718 args->a_vp = vp; 1719 ASSERT(cr != NULL); 1720 crhold(cr); 1721 args->a_cred = cr; 1722 args->a_io = NFS4_PUTAPAGE; 1723 args->a_nfs4_putapage = putapage; 1724 args->a_nfs4_pp = pp; 1725 args->a_nfs4_off = off; 1726 args->a_nfs4_len = (uint_t)len; 1727 args->a_nfs4_flags = flags; 1728 1729 mutex_enter(&mi->mi_async_lock); 1730 1731 /* 1732 * If asyncio has been disabled, then make a synchronous request. 1733 * This check is done a second time in case async io was diabled 1734 * while this thread was blocked waiting for memory pressure to 1735 * reduce or for the queue to drain. 1736 */ 1737 if (mi->mi_max_threads == 0) { 1738 mutex_exit(&mi->mi_async_lock); 1739 1740 VN_RELE(vp); 1741 crfree(cr); 1742 kmem_free(args, sizeof (*args)); 1743 goto noasync; 1744 } 1745 1746 /* 1747 * Link request structure into the async list and 1748 * wakeup async thread to do the i/o. 
1749 */ 1750 if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) { 1751 mi->mi_async_reqs[NFS4_PUTAPAGE] = args; 1752 mi->mi_async_tail[NFS4_PUTAPAGE] = args; 1753 } else { 1754 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args; 1755 mi->mi_async_tail[NFS4_PUTAPAGE] = args; 1756 } 1757 1758 mutex_enter(&rp->r_statelock); 1759 rp->r_count++; 1760 rp->r_awcount++; 1761 mutex_exit(&rp->r_statelock); 1762 1763 if (mi->mi_io_kstats) { 1764 mutex_enter(&mi->mi_lock); 1765 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1766 mutex_exit(&mi->mi_lock); 1767 } 1768 1769 mi->mi_async_req_count++; 1770 ASSERT(mi->mi_async_req_count != 0); 1771 cv_signal(&mi->mi_async_reqs_cv); 1772 mutex_exit(&mi->mi_async_lock); 1773 return (0); 1774 1775 noasync: 1776 1777 if (curproc == proc_pageout || curproc == proc_fsflush) { 1778 /* 1779 * If we get here in the context of the pageout/fsflush, 1780 * or we have run out of memory or we're attempting to 1781 * unmount we refuse to do a sync write, because this may 1782 * hang pageout/fsflush and the machine. In this case, 1783 * we just re-mark the page as dirty and punt on the page. 1784 * 1785 * Make sure B_FORCE isn't set. We can re-mark the 1786 * pages as dirty and unlock the pages in one swoop by 1787 * passing in B_ERROR to pvn_write_done(). However, 1788 * we should make sure B_FORCE isn't set - we don't 1789 * want the page tossed before it gets written out. 1790 */ 1791 if (flags & B_FORCE) 1792 flags &= ~(B_INVAL | B_FORCE); 1793 pvn_write_done(pp, flags | B_ERROR); 1794 return (0); 1795 } 1796 1797 if (nfs_zone() != mi->mi_zone) { 1798 /* 1799 * So this was a cross-zone sync putpage. 1800 * 1801 * We pass in B_ERROR to pvn_write_done() to re-mark the pages 1802 * as dirty and unlock them. 1803 * 1804 * We don't want to clear B_FORCE here as the caller presumably 1805 * knows what they're doing if they set it. 
1806 */ 1807 pvn_write_done(pp, flags | B_ERROR); 1808 return (EPERM); 1809 } 1810 return ((*putapage)(vp, pp, off, len, flags, cr)); 1811 } 1812 1813 int 1814 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 1815 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t, 1816 size_t, int, cred_t *)) 1817 { 1818 rnode4_t *rp; 1819 mntinfo4_t *mi; 1820 struct nfs4_async_reqs *args; 1821 1822 ASSERT(flags & B_ASYNC); 1823 ASSERT(vp->v_vfsp != NULL); 1824 1825 rp = VTOR4(vp); 1826 ASSERT(rp->r_count > 0); 1827 1828 mi = VTOMI4(vp); 1829 1830 /* 1831 * If we can't allocate a request structure, do the pageio 1832 * request synchronously in this thread's context. 1833 */ 1834 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1835 goto noasync; 1836 1837 args->a_next = NULL; 1838 #ifdef DEBUG 1839 args->a_queuer = curthread; 1840 #endif 1841 VN_HOLD(vp); 1842 args->a_vp = vp; 1843 ASSERT(cr != NULL); 1844 crhold(cr); 1845 args->a_cred = cr; 1846 args->a_io = NFS4_PAGEIO; 1847 args->a_nfs4_pageio = pageio; 1848 args->a_nfs4_pp = pp; 1849 args->a_nfs4_off = io_off; 1850 args->a_nfs4_len = (uint_t)io_len; 1851 args->a_nfs4_flags = flags; 1852 1853 mutex_enter(&mi->mi_async_lock); 1854 1855 /* 1856 * If asyncio has been disabled, then make a synchronous request. 1857 * This check is done a second time in case async io was diabled 1858 * while this thread was blocked waiting for memory pressure to 1859 * reduce or for the queue to drain. 1860 */ 1861 if (mi->mi_max_threads == 0) { 1862 mutex_exit(&mi->mi_async_lock); 1863 1864 VN_RELE(vp); 1865 crfree(cr); 1866 kmem_free(args, sizeof (*args)); 1867 goto noasync; 1868 } 1869 1870 /* 1871 * Link request structure into the async list and 1872 * wakeup async thread to do the i/o. 
1873 */ 1874 if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) { 1875 mi->mi_async_reqs[NFS4_PAGEIO] = args; 1876 mi->mi_async_tail[NFS4_PAGEIO] = args; 1877 } else { 1878 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args; 1879 mi->mi_async_tail[NFS4_PAGEIO] = args; 1880 } 1881 1882 mutex_enter(&rp->r_statelock); 1883 rp->r_count++; 1884 rp->r_awcount++; 1885 mutex_exit(&rp->r_statelock); 1886 1887 if (mi->mi_io_kstats) { 1888 mutex_enter(&mi->mi_lock); 1889 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1890 mutex_exit(&mi->mi_lock); 1891 } 1892 1893 mi->mi_async_req_count++; 1894 ASSERT(mi->mi_async_req_count != 0); 1895 cv_signal(&mi->mi_async_reqs_cv); 1896 mutex_exit(&mi->mi_async_lock); 1897 return (0); 1898 1899 noasync: 1900 /* 1901 * If we can't do it ASYNC, for reads we do nothing (but cleanup 1902 * the page list), for writes we do it synchronously, except for 1903 * proc_pageout/proc_fsflush as described below. 1904 */ 1905 if (flags & B_READ) { 1906 pvn_read_done(pp, flags | B_ERROR); 1907 return (0); 1908 } 1909 1910 if (curproc == proc_pageout || curproc == proc_fsflush) { 1911 /* 1912 * If we get here in the context of the pageout/fsflush, 1913 * we refuse to do a sync write, because this may hang 1914 * pageout/fsflush (and the machine). In this case, we just 1915 * re-mark the page as dirty and punt on the page. 1916 * 1917 * Make sure B_FORCE isn't set. We can re-mark the 1918 * pages as dirty and unlock the pages in one swoop by 1919 * passing in B_ERROR to pvn_write_done(). However, 1920 * we should make sure B_FORCE isn't set - we don't 1921 * want the page tossed before it gets written out. 1922 */ 1923 if (flags & B_FORCE) 1924 flags &= ~(B_INVAL | B_FORCE); 1925 pvn_write_done(pp, flags | B_ERROR); 1926 return (0); 1927 } 1928 1929 if (nfs_zone() != mi->mi_zone) { 1930 /* 1931 * So this was a cross-zone sync pageio. We pass in B_ERROR 1932 * to pvn_write_done() to re-mark the pages as dirty and unlock 1933 * them. 
1934 * 1935 * We don't want to clear B_FORCE here as the caller presumably 1936 * knows what they're doing if they set it. 1937 */ 1938 pvn_write_done(pp, flags | B_ERROR); 1939 return (EPERM); 1940 } 1941 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1942 } 1943 1944 void 1945 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr, 1946 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *)) 1947 { 1948 rnode4_t *rp; 1949 mntinfo4_t *mi; 1950 struct nfs4_async_reqs *args; 1951 1952 rp = VTOR4(vp); 1953 ASSERT(rp->r_freef == NULL); 1954 1955 mi = VTOMI4(vp); 1956 1957 /* 1958 * If we can't allocate a request structure, skip the readdir. 1959 */ 1960 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1961 goto noasync; 1962 1963 args->a_next = NULL; 1964 #ifdef DEBUG 1965 args->a_queuer = curthread; 1966 #endif 1967 VN_HOLD(vp); 1968 args->a_vp = vp; 1969 ASSERT(cr != NULL); 1970 crhold(cr); 1971 args->a_cred = cr; 1972 args->a_io = NFS4_READDIR; 1973 args->a_nfs4_readdir = readdir; 1974 args->a_nfs4_rdc = rdc; 1975 1976 mutex_enter(&mi->mi_async_lock); 1977 1978 /* 1979 * If asyncio has been disabled, then skip this request 1980 */ 1981 if (mi->mi_max_threads == 0) { 1982 mutex_exit(&mi->mi_async_lock); 1983 1984 VN_RELE(vp); 1985 crfree(cr); 1986 kmem_free(args, sizeof (*args)); 1987 goto noasync; 1988 } 1989 1990 /* 1991 * Link request structure into the async list and 1992 * wakeup async thread to do the i/o. 
1993 */ 1994 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) { 1995 mi->mi_async_reqs[NFS4_READDIR] = args; 1996 mi->mi_async_tail[NFS4_READDIR] = args; 1997 } else { 1998 mi->mi_async_tail[NFS4_READDIR]->a_next = args; 1999 mi->mi_async_tail[NFS4_READDIR] = args; 2000 } 2001 2002 mutex_enter(&rp->r_statelock); 2003 rp->r_count++; 2004 mutex_exit(&rp->r_statelock); 2005 2006 if (mi->mi_io_kstats) { 2007 mutex_enter(&mi->mi_lock); 2008 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2009 mutex_exit(&mi->mi_lock); 2010 } 2011 2012 mi->mi_async_req_count++; 2013 ASSERT(mi->mi_async_req_count != 0); 2014 cv_signal(&mi->mi_async_reqs_cv); 2015 mutex_exit(&mi->mi_async_lock); 2016 return; 2017 2018 noasync: 2019 mutex_enter(&rp->r_statelock); 2020 rdc->entries = NULL; 2021 /* 2022 * Indicate that no one is trying to fill this entry and 2023 * it still needs to be filled. 2024 */ 2025 rdc->flags &= ~RDDIR; 2026 rdc->flags |= RDDIRREQ; 2027 rddir4_cache_rele(rp, rdc); 2028 mutex_exit(&rp->r_statelock); 2029 } 2030 2031 void 2032 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 2033 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, 2034 cred_t *)) 2035 { 2036 rnode4_t *rp; 2037 mntinfo4_t *mi; 2038 struct nfs4_async_reqs *args; 2039 page_t *pp; 2040 2041 rp = VTOR4(vp); 2042 mi = VTOMI4(vp); 2043 2044 /* 2045 * If we can't allocate a request structure, do the commit 2046 * operation synchronously in this thread's context. 
2047 */ 2048 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 2049 goto noasync; 2050 2051 args->a_next = NULL; 2052 #ifdef DEBUG 2053 args->a_queuer = curthread; 2054 #endif 2055 VN_HOLD(vp); 2056 args->a_vp = vp; 2057 ASSERT(cr != NULL); 2058 crhold(cr); 2059 args->a_cred = cr; 2060 args->a_io = NFS4_COMMIT; 2061 args->a_nfs4_commit = commit; 2062 args->a_nfs4_plist = plist; 2063 args->a_nfs4_offset = offset; 2064 args->a_nfs4_count = count; 2065 2066 mutex_enter(&mi->mi_async_lock); 2067 2068 /* 2069 * If asyncio has been disabled, then make a synchronous request. 2070 * This check is done a second time in case async io was diabled 2071 * while this thread was blocked waiting for memory pressure to 2072 * reduce or for the queue to drain. 2073 */ 2074 if (mi->mi_max_threads == 0) { 2075 mutex_exit(&mi->mi_async_lock); 2076 2077 VN_RELE(vp); 2078 crfree(cr); 2079 kmem_free(args, sizeof (*args)); 2080 goto noasync; 2081 } 2082 2083 /* 2084 * Link request structure into the async list and 2085 * wakeup async thread to do the i/o. 
2086 */ 2087 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) { 2088 mi->mi_async_reqs[NFS4_COMMIT] = args; 2089 mi->mi_async_tail[NFS4_COMMIT] = args; 2090 } else { 2091 mi->mi_async_tail[NFS4_COMMIT]->a_next = args; 2092 mi->mi_async_tail[NFS4_COMMIT] = args; 2093 } 2094 2095 mutex_enter(&rp->r_statelock); 2096 rp->r_count++; 2097 mutex_exit(&rp->r_statelock); 2098 2099 if (mi->mi_io_kstats) { 2100 mutex_enter(&mi->mi_lock); 2101 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2102 mutex_exit(&mi->mi_lock); 2103 } 2104 2105 mi->mi_async_req_count++; 2106 ASSERT(mi->mi_async_req_count != 0); 2107 cv_signal(&mi->mi_async_reqs_cv); 2108 mutex_exit(&mi->mi_async_lock); 2109 return; 2110 2111 noasync: 2112 if (curproc == proc_pageout || curproc == proc_fsflush || 2113 nfs_zone() != mi->mi_zone) { 2114 while (plist != NULL) { 2115 pp = plist; 2116 page_sub(&plist, pp); 2117 pp->p_fsdata = C_COMMIT; 2118 page_unlock(pp); 2119 } 2120 return; 2121 } 2122 (*commit)(vp, plist, offset, count, cr); 2123 } 2124 2125 /* 2126 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The 2127 * reference to the vnode is handed over to the thread; the caller should 2128 * no longer refer to the vnode. 2129 * 2130 * Unlike most of the async routines, this handoff is needed for 2131 * correctness reasons, not just performance. So doing operations in the 2132 * context of the current thread is not an option. 
2133 */ 2134 void 2135 nfs4_async_inactive(vnode_t *vp, cred_t *cr) 2136 { 2137 mntinfo4_t *mi; 2138 struct nfs4_async_reqs *args; 2139 boolean_t signal_inactive_thread = B_FALSE; 2140 2141 mi = VTOMI4(vp); 2142 2143 args = kmem_alloc(sizeof (*args), KM_SLEEP); 2144 args->a_next = NULL; 2145 #ifdef DEBUG 2146 args->a_queuer = curthread; 2147 #endif 2148 args->a_vp = vp; 2149 ASSERT(cr != NULL); 2150 crhold(cr); 2151 args->a_cred = cr; 2152 args->a_io = NFS4_INACTIVE; 2153 2154 /* 2155 * Note that we don't check mi->mi_max_threads here, since we 2156 * *need* to get rid of this vnode regardless of whether someone 2157 * set nfs4_max_threads to zero in /etc/system. 2158 * 2159 * The manager thread knows about this and is willing to create 2160 * at least one thread to accommodate us. 2161 */ 2162 mutex_enter(&mi->mi_async_lock); 2163 if (mi->mi_inactive_thread == NULL) { 2164 rnode4_t *rp; 2165 vnode_t *unldvp = NULL; 2166 char *unlname; 2167 cred_t *unlcred; 2168 2169 mutex_exit(&mi->mi_async_lock); 2170 /* 2171 * We just need to free up the memory associated with the 2172 * vnode, which can be safely done from within the current 2173 * context. 2174 */ 2175 crfree(cr); /* drop our reference */ 2176 kmem_free(args, sizeof (*args)); 2177 rp = VTOR4(vp); 2178 mutex_enter(&rp->r_statelock); 2179 if (rp->r_unldvp != NULL) { 2180 unldvp = rp->r_unldvp; 2181 rp->r_unldvp = NULL; 2182 unlname = rp->r_unlname; 2183 rp->r_unlname = NULL; 2184 unlcred = rp->r_unlcred; 2185 rp->r_unlcred = NULL; 2186 } 2187 mutex_exit(&rp->r_statelock); 2188 /* 2189 * No need to explicitly throw away any cached pages. The 2190 * eventual r4inactive() will attempt a synchronous 2191 * VOP_PUTPAGE() which will immediately fail since the request 2192 * is coming from the wrong zone, and then will proceed to call 2193 * nfs4_invalidate_pages() which will clean things up for us. 
2194 * 2195 * Throw away the delegation here so rp4_addfree()'s attempt to 2196 * return any existing delegations becomes a no-op. 2197 */ 2198 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 2199 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 2200 FALSE); 2201 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 2202 nfs_rw_exit(&mi->mi_recovlock); 2203 } 2204 nfs4_clear_open_streams(rp); 2205 2206 rp4_addfree(rp, cr); 2207 if (unldvp != NULL) { 2208 kmem_free(unlname, MAXNAMELEN); 2209 VN_RELE(unldvp); 2210 crfree(unlcred); 2211 } 2212 return; 2213 } 2214 2215 if (mi->mi_manager_thread == NULL) { 2216 /* 2217 * We want to talk to the inactive thread. 2218 */ 2219 signal_inactive_thread = B_TRUE; 2220 } 2221 2222 /* 2223 * Enqueue the vnode and wake up either the special thread (empty 2224 * list) or an async thread. 2225 */ 2226 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) { 2227 mi->mi_async_reqs[NFS4_INACTIVE] = args; 2228 mi->mi_async_tail[NFS4_INACTIVE] = args; 2229 signal_inactive_thread = B_TRUE; 2230 } else { 2231 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args; 2232 mi->mi_async_tail[NFS4_INACTIVE] = args; 2233 } 2234 if (signal_inactive_thread) { 2235 cv_signal(&mi->mi_inact_req_cv); 2236 } else { 2237 mi->mi_async_req_count++; 2238 ASSERT(mi->mi_async_req_count != 0); 2239 cv_signal(&mi->mi_async_reqs_cv); 2240 } 2241 2242 mutex_exit(&mi->mi_async_lock); 2243 } 2244 2245 int 2246 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2247 { 2248 int pagecreate; 2249 int n; 2250 int saved_n; 2251 caddr_t saved_base; 2252 u_offset_t offset; 2253 int error; 2254 int sm_error; 2255 vnode_t *vp = RTOV(rp); 2256 2257 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2258 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2259 if (!vpm_enable) { 2260 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2261 } 2262 2263 /* 2264 * Move bytes in at most PAGESIZE chunks. 
We must avoid 2265 * spanning pages in uiomove() because page faults may cause 2266 * the cache to be invalidated out from under us. The r_size is not 2267 * updated until after the uiomove. If we push the last page of a 2268 * file before r_size is correct, we will lose the data written past 2269 * the current (and invalid) r_size. 2270 */ 2271 do { 2272 offset = uio->uio_loffset; 2273 pagecreate = 0; 2274 2275 /* 2276 * n is the number of bytes required to satisfy the request 2277 * or the number of bytes to fill out the page. 2278 */ 2279 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2280 2281 /* 2282 * Check to see if we can skip reading in the page 2283 * and just allocate the memory. We can do this 2284 * if we are going to rewrite the entire mapping 2285 * or if we are going to write to or beyond the current 2286 * end of file from the beginning of the mapping. 2287 * 2288 * The read of r_size is now protected by r_statelock. 2289 */ 2290 mutex_enter(&rp->r_statelock); 2291 /* 2292 * When pgcreated is nonzero the caller has already done 2293 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2294 * segkpm this means we already have at least one page 2295 * created and mapped at base. 2296 */ 2297 pagecreate = pgcreated || 2298 ((offset & PAGEOFFSET) == 0 && 2299 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2300 2301 mutex_exit(&rp->r_statelock); 2302 2303 if (!vpm_enable && pagecreate) { 2304 /* 2305 * The last argument tells segmap_pagecreate() to 2306 * always lock the page, as opposed to sometimes 2307 * returning with the page locked. This way we avoid a 2308 * fault on the ensuing uiomove(), but also 2309 * more importantly (to fix bug 1094402) we can 2310 * call segmap_fault() to unlock the page in all 2311 * cases. An alternative would be to modify 2312 * segmap_pagecreate() to tell us when it is 2313 * locking a page, but that's a fairly major 2314 * interface change. 
2315 */ 2316 if (pgcreated == 0) 2317 (void) segmap_pagecreate(segkmap, base, 2318 (uint_t)n, 1); 2319 saved_base = base; 2320 saved_n = n; 2321 } 2322 2323 /* 2324 * The number of bytes of data in the last page can not 2325 * be accurately be determined while page is being 2326 * uiomove'd to and the size of the file being updated. 2327 * Thus, inform threads which need to know accurately 2328 * how much data is in the last page of the file. They 2329 * will not do the i/o immediately, but will arrange for 2330 * the i/o to happen later when this modify operation 2331 * will have finished. 2332 */ 2333 ASSERT(!(rp->r_flags & R4MODINPROGRESS)); 2334 mutex_enter(&rp->r_statelock); 2335 rp->r_flags |= R4MODINPROGRESS; 2336 rp->r_modaddr = (offset & MAXBMASK); 2337 mutex_exit(&rp->r_statelock); 2338 2339 if (vpm_enable) { 2340 /* 2341 * Copy data. If new pages are created, part of 2342 * the page that is not written will be initizliazed 2343 * with zeros. 2344 */ 2345 error = vpm_data_copy(vp, offset, n, uio, 2346 !pagecreate, NULL, 0, S_WRITE); 2347 } else { 2348 error = uiomove(base, n, UIO_WRITE, uio); 2349 } 2350 2351 /* 2352 * r_size is the maximum number of 2353 * bytes known to be in the file. 2354 * Make sure it is at least as high as the 2355 * first unwritten byte pointed to by uio_loffset. 2356 */ 2357 mutex_enter(&rp->r_statelock); 2358 if (rp->r_size < uio->uio_loffset) 2359 rp->r_size = uio->uio_loffset; 2360 rp->r_flags &= ~R4MODINPROGRESS; 2361 rp->r_flags |= R4DIRTY; 2362 mutex_exit(&rp->r_statelock); 2363 2364 /* n = # of bytes written */ 2365 n = (int)(uio->uio_loffset - offset); 2366 2367 if (!vpm_enable) { 2368 base += n; 2369 } 2370 2371 tcount -= n; 2372 /* 2373 * If we created pages w/o initializing them completely, 2374 * we need to zero the part that wasn't set up. 2375 * This happens on a most EOF write cases and if 2376 * we had some sort of error during the uiomove. 
2377 */ 2378 if (!vpm_enable && pagecreate) { 2379 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2380 (void) kzero(base, PAGESIZE - n); 2381 2382 if (pgcreated) { 2383 /* 2384 * Caller is responsible for this page, 2385 * it was not created in this loop. 2386 */ 2387 pgcreated = 0; 2388 } else { 2389 /* 2390 * For bug 1094402: segmap_pagecreate locks 2391 * page. Unlock it. This also unlocks the 2392 * pages allocated by page_create_va() in 2393 * segmap_pagecreate(). 2394 */ 2395 sm_error = segmap_fault(kas.a_hat, segkmap, 2396 saved_base, saved_n, 2397 F_SOFTUNLOCK, S_WRITE); 2398 if (error == 0) 2399 error = sm_error; 2400 } 2401 } 2402 } while (tcount > 0 && error == 0); 2403 2404 return (error); 2405 } 2406 2407 int 2408 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2409 { 2410 rnode4_t *rp; 2411 page_t *pp; 2412 u_offset_t eoff; 2413 u_offset_t io_off; 2414 size_t io_len; 2415 int error; 2416 int rdirty; 2417 int err; 2418 2419 rp = VTOR4(vp); 2420 ASSERT(rp->r_count > 0); 2421 2422 if (!nfs4_has_pages(vp)) 2423 return (0); 2424 2425 ASSERT(vp->v_type != VCHR); 2426 2427 /* 2428 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2429 * writes. B_FORCE is set to force the VM system to actually 2430 * invalidate the pages, even if the i/o failed. The pages 2431 * need to get invalidated because they can't be written out 2432 * because there isn't any space left on either the server's 2433 * file system or in the user's disk quota. The B_FREE bit 2434 * is cleared to avoid confusion as to whether this is a 2435 * request to place the page on the freelist or to destroy 2436 * it. 2437 */ 2438 if ((rp->r_flags & R4OUTOFSPACE) || 2439 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2440 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2441 2442 if (len == 0) { 2443 /* 2444 * If doing a full file synchronous operation, then clear 2445 * the R4DIRTY bit. 
If a page gets dirtied while the flush 2446 * is happening, then R4DIRTY will get set again. The 2447 * R4DIRTY bit must get cleared before the flush so that 2448 * we don't lose this information. 2449 * 2450 * If there are no full file async write operations 2451 * pending and RDIRTY bit is set, clear it. 2452 */ 2453 if (off == (u_offset_t)0 && 2454 !(flags & B_ASYNC) && 2455 (rp->r_flags & R4DIRTY)) { 2456 mutex_enter(&rp->r_statelock); 2457 rdirty = (rp->r_flags & R4DIRTY); 2458 rp->r_flags &= ~R4DIRTY; 2459 mutex_exit(&rp->r_statelock); 2460 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2461 mutex_enter(&rp->r_statelock); 2462 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2463 rdirty = (rp->r_flags & R4DIRTY); 2464 rp->r_flags &= ~R4DIRTY; 2465 } 2466 mutex_exit(&rp->r_statelock); 2467 } else 2468 rdirty = 0; 2469 2470 /* 2471 * Search the entire vp list for pages >= off, and flush 2472 * the dirty pages. 2473 */ 2474 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2475 flags, cr); 2476 2477 /* 2478 * If an error occurred and the file was marked as dirty 2479 * before and we aren't forcibly invalidating pages, then 2480 * reset the R4DIRTY flag. 2481 */ 2482 if (error && rdirty && 2483 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2484 mutex_enter(&rp->r_statelock); 2485 rp->r_flags |= R4DIRTY; 2486 mutex_exit(&rp->r_statelock); 2487 } 2488 } else { 2489 /* 2490 * Do a range from [off...off + len) looking for pages 2491 * to deal with. 2492 */ 2493 error = 0; 2494 io_len = 0; 2495 eoff = off + len; 2496 mutex_enter(&rp->r_statelock); 2497 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2498 io_off += io_len) { 2499 mutex_exit(&rp->r_statelock); 2500 /* 2501 * If we are not invalidating, synchronously 2502 * freeing or writing pages use the routine 2503 * page_lookup_nowait() to prevent reclaiming 2504 * them from the free list. 
2505 */ 2506 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2507 pp = page_lookup(vp, io_off, 2508 (flags & (B_INVAL | B_FREE)) ? 2509 SE_EXCL : SE_SHARED); 2510 } else { 2511 pp = page_lookup_nowait(vp, io_off, 2512 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2513 } 2514 2515 if (pp == NULL || !pvn_getdirty(pp, flags)) 2516 io_len = PAGESIZE; 2517 else { 2518 err = (*rp->r_putapage)(vp, pp, &io_off, 2519 &io_len, flags, cr); 2520 if (!error) 2521 error = err; 2522 /* 2523 * "io_off" and "io_len" are returned as 2524 * the range of pages we actually wrote. 2525 * This allows us to skip ahead more quickly 2526 * since several pages may've been dealt 2527 * with by this iteration of the loop. 2528 */ 2529 } 2530 mutex_enter(&rp->r_statelock); 2531 } 2532 mutex_exit(&rp->r_statelock); 2533 } 2534 2535 return (error); 2536 } 2537 2538 void 2539 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2540 { 2541 rnode4_t *rp; 2542 2543 rp = VTOR4(vp); 2544 if (IS_SHADOW(vp, rp)) 2545 vp = RTOV4(rp); 2546 mutex_enter(&rp->r_statelock); 2547 while (rp->r_flags & R4TRUNCATE) 2548 cv_wait(&rp->r_cv, &rp->r_statelock); 2549 rp->r_flags |= R4TRUNCATE; 2550 if (off == (u_offset_t)0) { 2551 rp->r_flags &= ~R4DIRTY; 2552 if (!(rp->r_flags & R4STALE)) 2553 rp->r_error = 0; 2554 } 2555 rp->r_truncaddr = off; 2556 mutex_exit(&rp->r_statelock); 2557 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2558 B_INVAL | B_TRUNC, cr); 2559 mutex_enter(&rp->r_statelock); 2560 rp->r_flags &= ~R4TRUNCATE; 2561 cv_broadcast(&rp->r_cv); 2562 mutex_exit(&rp->r_statelock); 2563 } 2564 2565 static int 2566 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2567 { 2568 mntinfo4_t *mi; 2569 struct mntinfo_kstat *mik; 2570 vfs_t *vfsp; 2571 2572 /* this is a read-only kstat. Bail out on a write */ 2573 if (rw == KSTAT_WRITE) 2574 return (EACCES); 2575 2576 2577 /* 2578 * We don't want to wait here as kstat_chain_lock could be held by 2579 * dounmount(). 
dounmount() takes vfs_reflock before the chain lock 2580 * and thus could lead to a deadlock. 2581 */ 2582 vfsp = (struct vfs *)ksp->ks_private; 2583 2584 mi = VFTOMI4(vfsp); 2585 mik = (struct mntinfo_kstat *)ksp->ks_data; 2586 2587 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2588 2589 mik->mik_vers = (uint32_t)mi->mi_vers; 2590 mik->mik_flags = mi->mi_flags; 2591 /* 2592 * The sv_secdata holds the flavor the client specifies. 2593 * If the client uses default and a security negotiation 2594 * occurs, sv_currsec will point to the current flavor 2595 * selected from the server flavor list. 2596 * sv_currsec is NULL if no security negotiation takes place. 2597 */ 2598 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 2599 mi->mi_curr_serv->sv_currsec->secmod : 2600 mi->mi_curr_serv->sv_secdata->secmod; 2601 mik->mik_curread = (uint32_t)mi->mi_curread; 2602 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2603 mik->mik_retrans = mi->mi_retrans; 2604 mik->mik_timeo = mi->mi_timeo; 2605 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2606 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2607 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2608 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2609 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2610 mik->mik_failover = (uint32_t)mi->mi_failover; 2611 mik->mik_remap = (uint32_t)mi->mi_remap; 2612 2613 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2614 2615 return (0); 2616 } 2617 2618 void 2619 nfs4_mnt_kstat_init(struct vfs *vfsp) 2620 { 2621 mntinfo4_t *mi = VFTOMI4(vfsp); 2622 2623 /* 2624 * PSARC 2001/697 Contract Private Interface 2625 * All nfs kstats are under SunMC contract 2626 * Please refer to the PSARC listed above and contact 2627 * SunMC before making any changes! 
2628 * 2629 * Changes must be reviewed by Solaris File Sharing 2630 * Changes must be communicated to contract-2001-697@sun.com 2631 * 2632 */ 2633 2634 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2635 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2636 if (mi->mi_io_kstats) { 2637 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2638 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2639 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2640 kstat_install(mi->mi_io_kstats); 2641 } 2642 2643 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2644 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2645 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2646 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2647 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2648 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2649 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2650 kstat_install(mi->mi_ro_kstats); 2651 } 2652 2653 nfs4_mnt_recov_kstat_init(vfsp); 2654 } 2655 2656 void 2657 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2658 { 2659 mntinfo4_t *mi; 2660 clock_t now = ddi_get_lbolt(); 2661 2662 mi = VTOMI4(vp); 2663 /* 2664 * In case of forced unmount, do not print any messages 2665 * since it can flood the console with error messages. 2666 */ 2667 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2668 return; 2669 2670 /* 2671 * If the mount point is dead, not recoverable, do not 2672 * print error messages that can flood the console. 2673 */ 2674 if (mi->mi_flags & MI4_RECOV_FAIL) 2675 return; 2676 2677 /* 2678 * No use in flooding the console with ENOSPC 2679 * messages from the same file system. 
2680 */ 2681 if ((error != ENOSPC && error != EDQUOT) || 2682 now - mi->mi_printftime > 0) { 2683 zoneid_t zoneid = mi->mi_zone->zone_id; 2684 2685 #ifdef DEBUG 2686 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2687 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2688 #else 2689 nfs_perror(error, "NFS write error on host %s: %m.\n", 2690 VTOR4(vp)->r_server->sv_hostname, NULL); 2691 #endif 2692 if (error == ENOSPC || error == EDQUOT) { 2693 zcmn_err(zoneid, CE_CONT, 2694 "^File: userid=%d, groupid=%d\n", 2695 crgetuid(cr), crgetgid(cr)); 2696 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2697 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2698 zcmn_err(zoneid, CE_CONT, 2699 "^User: userid=%d, groupid=%d\n", 2700 crgetuid(curthread->t_cred), 2701 crgetgid(curthread->t_cred)); 2702 } 2703 mi->mi_printftime = now + 2704 nfs_write_error_interval * hz; 2705 } 2706 sfh4_printfhandle(VTOR4(vp)->r_fh); 2707 #ifdef DEBUG 2708 if (error == EACCES) { 2709 zcmn_err(zoneid, CE_CONT, 2710 "nfs_bio: cred is%s kcred\n", 2711 cr == kcred ? "" : " not"); 2712 } 2713 #endif 2714 } 2715 } 2716 2717 /* 2718 * Return non-zero if the given file can be safely memory mapped. Locks 2719 * are safe if whole-file (length and offset are both zero). 2720 */ 2721 2722 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2723 2724 static int 2725 nfs4_safemap(const vnode_t *vp) 2726 { 2727 locklist_t *llp, *next_llp; 2728 int safe = 1; 2729 rnode4_t *rp = VTOR4(vp); 2730 2731 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2732 2733 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2734 "vp = %p", (void *)vp)); 2735 2736 /* 2737 * Review all the locks for the vnode, both ones that have been 2738 * acquired and ones that are pending. We assume that 2739 * flk_active_locks_for_vp() has merged any locks that can be 2740 * merged (so that if a process has the entire file locked, it is 2741 * represented as a single lock). 
2742 * 2743 * Note that we can't bail out of the loop if we find a non-safe 2744 * lock, because we have to free all the elements in the llp list. 2745 * We might be able to speed up this code slightly by not looking 2746 * at each lock's l_start and l_len fields once we've found a 2747 * non-safe lock. 2748 */ 2749 2750 llp = flk_active_locks_for_vp(vp); 2751 while (llp) { 2752 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2753 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2754 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2755 if (!SAFE_LOCK(llp->ll_flock)) { 2756 safe = 0; 2757 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2758 "nfs4_safemap: unsafe active lock (%" PRId64 2759 ", %" PRId64 ")", llp->ll_flock.l_start, 2760 llp->ll_flock.l_len)); 2761 } 2762 next_llp = llp->ll_next; 2763 VN_RELE(llp->ll_vp); 2764 kmem_free(llp, sizeof (*llp)); 2765 llp = next_llp; 2766 } 2767 2768 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2769 safe ? "safe" : "unsafe")); 2770 return (safe); 2771 } 2772 2773 /* 2774 * Return whether there is a lost LOCK or LOCKU queued up for the given 2775 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 
2776 */ 2777 2778 bool_t 2779 nfs4_map_lost_lock_conflict(vnode_t *vp) 2780 { 2781 bool_t conflict = FALSE; 2782 nfs4_lost_rqst_t *lrp; 2783 mntinfo4_t *mi = VTOMI4(vp); 2784 2785 mutex_enter(&mi->mi_lock); 2786 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2787 lrp = list_next(&mi->mi_lost_state, lrp)) { 2788 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2789 continue; 2790 ASSERT(lrp->lr_vp != NULL); 2791 if (!VOP_CMP(lrp->lr_vp, vp, NULL)) 2792 continue; /* different file */ 2793 if (!SAFE_LOCK(*lrp->lr_flk)) { 2794 conflict = TRUE; 2795 break; 2796 } 2797 } 2798 2799 mutex_exit(&mi->mi_lock); 2800 return (conflict); 2801 } 2802 2803 /* 2804 * nfs_lockcompletion: 2805 * 2806 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2807 * as non cachable (set VNOCACHE bit). 2808 */ 2809 2810 void 2811 nfs4_lockcompletion(vnode_t *vp, int cmd) 2812 { 2813 rnode4_t *rp = VTOR4(vp); 2814 2815 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2816 ASSERT(!IS_SHADOW(vp, rp)); 2817 2818 if (cmd == F_SETLK || cmd == F_SETLKW) { 2819 2820 if (!nfs4_safemap(vp)) { 2821 mutex_enter(&vp->v_lock); 2822 vp->v_flag |= VNOCACHE; 2823 mutex_exit(&vp->v_lock); 2824 } else { 2825 mutex_enter(&vp->v_lock); 2826 vp->v_flag &= ~VNOCACHE; 2827 mutex_exit(&vp->v_lock); 2828 } 2829 } 2830 /* 2831 * The cached attributes of the file are stale after acquiring 2832 * the lock on the file. They were updated when the file was 2833 * opened, but not updated when the lock was acquired. Therefore the 2834 * cached attributes are invalidated after the lock is obtained. 
2835 */ 2836 PURGE_ATTRCACHE4(vp); 2837 } 2838 2839 /* ARGSUSED */ 2840 static void * 2841 nfs4_mi_init(zoneid_t zoneid) 2842 { 2843 struct mi4_globals *mig; 2844 2845 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2846 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2847 list_create(&mig->mig_list, sizeof (mntinfo4_t), 2848 offsetof(mntinfo4_t, mi_zone_node)); 2849 mig->mig_destructor_called = B_FALSE; 2850 return (mig); 2851 } 2852 2853 /* 2854 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down 2855 * state and killing off threads. 2856 */ 2857 /* ARGSUSED */ 2858 static void 2859 nfs4_mi_shutdown(zoneid_t zoneid, void *data) 2860 { 2861 struct mi4_globals *mig = data; 2862 mntinfo4_t *mi; 2863 nfs4_server_t *np; 2864 2865 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2866 "nfs4_mi_shutdown zone %d\n", zoneid)); 2867 ASSERT(mig != NULL); 2868 for (;;) { 2869 mutex_enter(&mig->mig_lock); 2870 mi = list_head(&mig->mig_list); 2871 if (mi == NULL) { 2872 mutex_exit(&mig->mig_lock); 2873 break; 2874 } 2875 2876 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2877 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp)); 2878 /* 2879 * purge the DNLC for this filesystem 2880 */ 2881 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2882 /* 2883 * Tell existing async worker threads to exit. 2884 */ 2885 mutex_enter(&mi->mi_async_lock); 2886 mi->mi_max_threads = 0; 2887 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2888 /* 2889 * Set the appropriate flags, signal and wait for both the 2890 * async manager and the inactive thread to exit when they're 2891 * done with their current work. 
2892 */ 2893 mutex_enter(&mi->mi_lock); 2894 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD); 2895 mutex_exit(&mi->mi_lock); 2896 mutex_exit(&mi->mi_async_lock); 2897 if (mi->mi_manager_thread) { 2898 nfs4_async_manager_stop(mi->mi_vfsp); 2899 } 2900 if (mi->mi_inactive_thread) { 2901 mutex_enter(&mi->mi_async_lock); 2902 cv_signal(&mi->mi_inact_req_cv); 2903 /* 2904 * Wait for the inactive thread to exit. 2905 */ 2906 while (mi->mi_inactive_thread != NULL) { 2907 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2908 } 2909 mutex_exit(&mi->mi_async_lock); 2910 } 2911 /* 2912 * Wait for the recovery thread to complete, that is, it will 2913 * signal when it is done using the "mi" structure and about 2914 * to exit 2915 */ 2916 mutex_enter(&mi->mi_lock); 2917 while (mi->mi_in_recovery > 0) 2918 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock); 2919 mutex_exit(&mi->mi_lock); 2920 /* 2921 * We're done when every mi has been done or the list is empty. 2922 * This one is done, remove it from the list. 2923 */ 2924 list_remove(&mig->mig_list, mi); 2925 mutex_exit(&mig->mig_lock); 2926 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4); 2927 2928 /* 2929 * Release hold on vfs and mi done to prevent race with zone 2930 * shutdown. This releases the hold in nfs4_mi_zonelist_add. 2931 */ 2932 VFS_RELE(mi->mi_vfsp); 2933 MI4_RELE(mi); 2934 } 2935 /* 2936 * Tell each renew thread in the zone to exit 2937 */ 2938 mutex_enter(&nfs4_server_lst_lock); 2939 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) { 2940 mutex_enter(&np->s_lock); 2941 if (np->zoneid == zoneid) { 2942 /* 2943 * We add another hold onto the nfs4_server_t 2944 * because this will make sure tha the nfs4_server_t 2945 * stays around until nfs4_callback_fini_zone destroys 2946 * the zone. This way, the renew thread can 2947 * unconditionally release its holds on the 2948 * nfs4_server_t. 
2949 */ 2950 np->s_refcnt++; 2951 nfs4_mark_srv_dead(np); 2952 } 2953 mutex_exit(&np->s_lock); 2954 } 2955 mutex_exit(&nfs4_server_lst_lock); 2956 } 2957 2958 static void 2959 nfs4_mi_free_globals(struct mi4_globals *mig) 2960 { 2961 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2962 mutex_destroy(&mig->mig_lock); 2963 kmem_free(mig, sizeof (*mig)); 2964 } 2965 2966 /* ARGSUSED */ 2967 static void 2968 nfs4_mi_destroy(zoneid_t zoneid, void *data) 2969 { 2970 struct mi4_globals *mig = data; 2971 2972 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2973 "nfs4_mi_destroy zone %d\n", zoneid)); 2974 ASSERT(mig != NULL); 2975 mutex_enter(&mig->mig_lock); 2976 if (list_head(&mig->mig_list) != NULL) { 2977 /* Still waiting for VFS_FREEVFS() */ 2978 mig->mig_destructor_called = B_TRUE; 2979 mutex_exit(&mig->mig_lock); 2980 return; 2981 } 2982 nfs4_mi_free_globals(mig); 2983 } 2984 2985 /* 2986 * Add an NFS mount to the per-zone list of NFS mounts. 2987 */ 2988 void 2989 nfs4_mi_zonelist_add(mntinfo4_t *mi) 2990 { 2991 struct mi4_globals *mig; 2992 2993 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2994 mutex_enter(&mig->mig_lock); 2995 list_insert_head(&mig->mig_list, mi); 2996 /* 2997 * hold added to eliminate race with zone shutdown -this will be 2998 * released in mi_shutdown 2999 */ 3000 MI4_HOLD(mi); 3001 VFS_HOLD(mi->mi_vfsp); 3002 mutex_exit(&mig->mig_lock); 3003 } 3004 3005 /* 3006 * Remove an NFS mount from the per-zone list of NFS mounts. 3007 */ 3008 int 3009 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 3010 { 3011 struct mi4_globals *mig; 3012 int ret = 0; 3013 3014 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 3015 mutex_enter(&mig->mig_lock); 3016 mutex_enter(&mi->mi_lock); 3017 /* if this mi is marked dead, then the zone already released it */ 3018 if (!(mi->mi_flags & MI4_DEAD)) { 3019 list_remove(&mig->mig_list, mi); 3020 mutex_exit(&mi->mi_lock); 3021 3022 /* release the holds put on in zonelist_add(). 
*/ 3023 VFS_RELE(mi->mi_vfsp); 3024 MI4_RELE(mi); 3025 ret = 1; 3026 } else { 3027 mutex_exit(&mi->mi_lock); 3028 } 3029 3030 /* 3031 * We can be called asynchronously by VFS_FREEVFS() after the zone 3032 * shutdown/destroy callbacks have executed; if so, clean up the zone's 3033 * mi globals. 3034 */ 3035 if (list_head(&mig->mig_list) == NULL && 3036 mig->mig_destructor_called == B_TRUE) { 3037 nfs4_mi_free_globals(mig); 3038 return (ret); 3039 } 3040 mutex_exit(&mig->mig_lock); 3041 return (ret); 3042 } 3043 3044 void 3045 nfs_free_mi4(mntinfo4_t *mi) 3046 { 3047 nfs4_open_owner_t *foop; 3048 nfs4_oo_hash_bucket_t *bucketp; 3049 nfs4_debug_msg_t *msgp; 3050 int i; 3051 servinfo4_t *svp; 3052 3053 /* 3054 * Code introduced here should be carefully evaluated to make 3055 * sure none of the freed resources are accessed either directly 3056 * or indirectly after freeing them. For eg: Introducing calls to 3057 * NFS4_DEBUG that use mntinfo4_t structure member after freeing 3058 * the structure members or other routines calling back into NFS 3059 * accessing freed mntinfo4_t structure member. 
3060 */ 3061 mutex_enter(&mi->mi_lock); 3062 ASSERT(mi->mi_recovthread == NULL); 3063 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 3064 mutex_exit(&mi->mi_lock); 3065 mutex_enter(&mi->mi_async_lock); 3066 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 && 3067 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0); 3068 ASSERT(mi->mi_manager_thread == NULL); 3069 mutex_exit(&mi->mi_async_lock); 3070 if (mi->mi_io_kstats) { 3071 kstat_delete(mi->mi_io_kstats); 3072 mi->mi_io_kstats = NULL; 3073 } 3074 if (mi->mi_ro_kstats) { 3075 kstat_delete(mi->mi_ro_kstats); 3076 mi->mi_ro_kstats = NULL; 3077 } 3078 if (mi->mi_recov_ksp) { 3079 kstat_delete(mi->mi_recov_ksp); 3080 mi->mi_recov_ksp = NULL; 3081 } 3082 mutex_enter(&mi->mi_msg_list_lock); 3083 while (msgp = list_head(&mi->mi_msg_list)) { 3084 list_remove(&mi->mi_msg_list, msgp); 3085 nfs4_free_msg(msgp); 3086 } 3087 mutex_exit(&mi->mi_msg_list_lock); 3088 list_destroy(&mi->mi_msg_list); 3089 if (mi->mi_fname != NULL) 3090 fn_rele(&mi->mi_fname); 3091 if (mi->mi_rootfh != NULL) 3092 sfh4_rele(&mi->mi_rootfh); 3093 if (mi->mi_srvparentfh != NULL) 3094 sfh4_rele(&mi->mi_srvparentfh); 3095 svp = mi->mi_servers; 3096 sv4_free(svp); 3097 mutex_destroy(&mi->mi_lock); 3098 mutex_destroy(&mi->mi_async_lock); 3099 mutex_destroy(&mi->mi_msg_list_lock); 3100 mutex_destroy(&mi->mi_rnodes_lock); 3101 nfs_rw_destroy(&mi->mi_recovlock); 3102 nfs_rw_destroy(&mi->mi_rename_lock); 3103 nfs_rw_destroy(&mi->mi_fh_lock); 3104 cv_destroy(&mi->mi_failover_cv); 3105 cv_destroy(&mi->mi_async_reqs_cv); 3106 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]); 3107 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]); 3108 cv_destroy(&mi->mi_async_cv); 3109 cv_destroy(&mi->mi_inact_req_cv); 3110 /* 3111 * Destroy the oo hash lists and mutexes for the cred hash table. 
3112 */ 3113 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) { 3114 bucketp = &(mi->mi_oo_list[i]); 3115 /* Destroy any remaining open owners on the list */ 3116 foop = list_head(&bucketp->b_oo_hash_list); 3117 while (foop != NULL) { 3118 list_remove(&bucketp->b_oo_hash_list, foop); 3119 nfs4_destroy_open_owner(foop); 3120 foop = list_head(&bucketp->b_oo_hash_list); 3121 } 3122 list_destroy(&bucketp->b_oo_hash_list); 3123 mutex_destroy(&bucketp->b_lock); 3124 } 3125 /* 3126 * Empty and destroy the freed open owner list. 3127 */ 3128 foop = list_head(&mi->mi_foo_list); 3129 while (foop != NULL) { 3130 list_remove(&mi->mi_foo_list, foop); 3131 nfs4_destroy_open_owner(foop); 3132 foop = list_head(&mi->mi_foo_list); 3133 } 3134 list_destroy(&mi->mi_foo_list); 3135 list_destroy(&mi->mi_bseqid_list); 3136 list_destroy(&mi->mi_lost_state); 3137 list_destroy(&mi->mi_rnodes); 3138 avl_destroy(&mi->mi_filehandles); 3139 kmem_free(mi, sizeof (*mi)); 3140 } 3141 void 3142 mi_hold(mntinfo4_t *mi) 3143 { 3144 atomic_inc_32(&mi->mi_count); 3145 ASSERT(mi->mi_count != 0); 3146 } 3147 3148 void 3149 mi_rele(mntinfo4_t *mi) 3150 { 3151 ASSERT(mi->mi_count != 0); 3152 if (atomic_dec_32_nv(&mi->mi_count) == 0) { 3153 nfs_free_mi4(mi); 3154 } 3155 } 3156 3157 vnode_t nfs4_xattr_notsupp_vnode; 3158 3159 void 3160 nfs4_clnt_init(void) 3161 { 3162 nfs4_vnops_init(); 3163 (void) nfs4_rnode_init(); 3164 (void) nfs4_shadow_init(); 3165 (void) nfs4_acache_init(); 3166 (void) nfs4_subr_init(); 3167 nfs4_acl_init(); 3168 nfs_idmap_init(); 3169 nfs4_callback_init(); 3170 nfs4_secinfo_init(); 3171 #ifdef DEBUG 3172 tsd_create(&nfs4_tsd_key, NULL); 3173 #endif 3174 3175 /* 3176 * Add a CPR callback so that we can update client 3177 * lease after a suspend and resume. 
3178 */ 3179 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3180 3181 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3182 nfs4_mi_destroy); 3183 3184 /* 3185 * Initialise the reference count of the notsupp xattr cache vnode to 1 3186 * so that it never goes away (VOP_INACTIVE isn't called on it). 3187 */ 3188 nfs4_xattr_notsupp_vnode.v_count = 1; 3189 } 3190 3191 void 3192 nfs4_clnt_fini(void) 3193 { 3194 (void) zone_key_delete(mi4_list_key); 3195 nfs4_vnops_fini(); 3196 (void) nfs4_rnode_fini(); 3197 (void) nfs4_shadow_fini(); 3198 (void) nfs4_acache_fini(); 3199 (void) nfs4_subr_fini(); 3200 nfs_idmap_fini(); 3201 nfs4_callback_fini(); 3202 nfs4_secinfo_fini(); 3203 #ifdef DEBUG 3204 tsd_destroy(&nfs4_tsd_key); 3205 #endif 3206 if (cid) 3207 (void) callb_delete(cid); 3208 } 3209 3210 /*ARGSUSED*/ 3211 static boolean_t 3212 nfs4_client_cpr_callb(void *arg, int code) 3213 { 3214 /* 3215 * We get called for Suspend and Resume events. 3216 * For the suspend case we simply don't care! 3217 */ 3218 if (code == CB_CODE_CPR_CHKPT) { 3219 return (B_TRUE); 3220 } 3221 3222 /* 3223 * When we get to here we are in the process of 3224 * resuming the system from a previous suspend. 
3225 */ 3226 nfs4_client_resumed = gethrestime_sec(); 3227 return (B_TRUE); 3228 } 3229 3230 void 3231 nfs4_renew_lease_thread(nfs4_server_t *sp) 3232 { 3233 int error = 0; 3234 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs; 3235 clock_t tick_delay = 0; 3236 clock_t time_left = 0; 3237 callb_cpr_t cpr_info; 3238 kmutex_t cpr_lock; 3239 3240 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3241 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp)); 3242 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 3243 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease"); 3244 3245 mutex_enter(&sp->s_lock); 3246 /* sp->s_lease_time is set via a GETATTR */ 3247 sp->last_renewal_time = gethrestime_sec(); 3248 sp->lease_valid = NFS4_LEASE_UNINITIALIZED; 3249 ASSERT(sp->s_refcnt >= 1); 3250 3251 for (;;) { 3252 if (!sp->state_ref_count || 3253 sp->lease_valid != NFS4_LEASE_VALID) { 3254 3255 kip_secs = MAX((sp->s_lease_time >> 1) - 3256 (3 * sp->propagation_delay.tv_sec), 1); 3257 3258 tick_delay = SEC_TO_TICK(kip_secs); 3259 3260 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3261 "nfs4_renew_lease_thread: no renew : thread " 3262 "wait %ld secs", kip_secs)); 3263 3264 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3265 "nfs4_renew_lease_thread: no renew : " 3266 "state_ref_count %d, lease_valid %d", 3267 sp->state_ref_count, sp->lease_valid)); 3268 3269 mutex_enter(&cpr_lock); 3270 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3271 mutex_exit(&cpr_lock); 3272 time_left = cv_reltimedwait(&sp->cv_thread_exit, 3273 &sp->s_lock, tick_delay, TR_CLOCK_TICK); 3274 mutex_enter(&cpr_lock); 3275 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3276 mutex_exit(&cpr_lock); 3277 3278 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3279 "nfs4_renew_lease_thread: no renew: " 3280 "time left %ld", time_left)); 3281 3282 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3283 goto die; 3284 continue; 3285 } 3286 3287 tmp_last_renewal_time = sp->last_renewal_time; 3288 3289 tmp_time = 
gethrestime_sec() - sp->last_renewal_time + 3290 (3 * sp->propagation_delay.tv_sec); 3291 3292 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3293 "nfs4_renew_lease_thread: tmp_time %ld, " 3294 "sp->last_renewal_time %ld", tmp_time, 3295 sp->last_renewal_time)); 3296 3297 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1); 3298 3299 tick_delay = SEC_TO_TICK(kip_secs); 3300 3301 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3302 "nfs4_renew_lease_thread: valid lease: sleep for %ld " 3303 "secs", kip_secs)); 3304 3305 mutex_enter(&cpr_lock); 3306 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3307 mutex_exit(&cpr_lock); 3308 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock, 3309 tick_delay, TR_CLOCK_TICK); 3310 mutex_enter(&cpr_lock); 3311 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3312 mutex_exit(&cpr_lock); 3313 3314 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3315 "nfs4_renew_lease_thread: valid lease: time left %ld :" 3316 "sp last_renewal_time %ld, nfs4_client_resumed %ld, " 3317 "tmp_last_renewal_time %ld", time_left, 3318 sp->last_renewal_time, nfs4_client_resumed, 3319 tmp_last_renewal_time)); 3320 3321 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3322 goto die; 3323 3324 if (tmp_last_renewal_time == sp->last_renewal_time || 3325 (nfs4_client_resumed != 0 && 3326 nfs4_client_resumed > sp->last_renewal_time)) { 3327 /* 3328 * Issue RENEW op since we haven't renewed the lease 3329 * since we slept. 3330 */ 3331 tmp_now_time = gethrestime_sec(); 3332 error = nfs4renew(sp); 3333 /* 3334 * Need to re-acquire sp's lock, nfs4renew() 3335 * relinqueshes it. 3336 */ 3337 mutex_enter(&sp->s_lock); 3338 3339 /* 3340 * See if someone changed s_thread_exit while we gave 3341 * up s_lock. 3342 */ 3343 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3344 goto die; 3345 3346 if (!error) { 3347 /* 3348 * check to see if we implicitly renewed while 3349 * we waited for a reply for our RENEW call. 
3350 */ 3351 if (tmp_last_renewal_time == 3352 sp->last_renewal_time) { 3353 /* no implicit renew came */ 3354 sp->last_renewal_time = tmp_now_time; 3355 } else { 3356 NFS4_DEBUG(nfs4_client_lease_debug, 3357 (CE_NOTE, "renew_thread: did " 3358 "implicit renewal before reply " 3359 "from server for RENEW")); 3360 } 3361 } else { 3362 /* figure out error */ 3363 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3364 "renew_thread: nfs4renew returned error" 3365 " %d", error)); 3366 } 3367 3368 } 3369 } 3370 3371 die: 3372 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3373 "nfs4_renew_lease_thread: thread exiting")); 3374 3375 while (sp->s_otw_call_count != 0) { 3376 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3377 "nfs4_renew_lease_thread: waiting for outstanding " 3378 "otw calls to finish for sp 0x%p, current " 3379 "s_otw_call_count %d", (void *)sp, 3380 sp->s_otw_call_count)); 3381 mutex_enter(&cpr_lock); 3382 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3383 mutex_exit(&cpr_lock); 3384 cv_wait(&sp->s_cv_otw_count, &sp->s_lock); 3385 mutex_enter(&cpr_lock); 3386 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3387 mutex_exit(&cpr_lock); 3388 } 3389 mutex_exit(&sp->s_lock); 3390 3391 nfs4_server_rele(sp); /* free the thread's reference */ 3392 nfs4_server_rele(sp); /* free the list's reference */ 3393 sp = NULL; 3394 3395 done: 3396 mutex_enter(&cpr_lock); 3397 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */ 3398 mutex_destroy(&cpr_lock); 3399 3400 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3401 "nfs4_renew_lease_thread: renew thread exit officially")); 3402 3403 zthread_exit(); 3404 /* NOT REACHED */ 3405 } 3406 3407 /* 3408 * Send out a RENEW op to the server. 3409 * Assumes sp is locked down. 
3410 */ 3411 static int 3412 nfs4renew(nfs4_server_t *sp) 3413 { 3414 COMPOUND4args_clnt args; 3415 COMPOUND4res_clnt res; 3416 nfs_argop4 argop[1]; 3417 int doqueue = 1; 3418 int rpc_error; 3419 cred_t *cr; 3420 mntinfo4_t *mi; 3421 timespec_t prop_time, after_time; 3422 int needrecov = FALSE; 3423 nfs4_recov_state_t recov_state; 3424 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3425 3426 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3427 3428 recov_state.rs_flags = 0; 3429 recov_state.rs_num_retry_despite_err = 0; 3430 3431 recov_retry: 3432 mi = sp->mntinfo4_list; 3433 VFS_HOLD(mi->mi_vfsp); 3434 mutex_exit(&sp->s_lock); 3435 ASSERT(mi != NULL); 3436 3437 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3438 if (e.error) { 3439 VFS_RELE(mi->mi_vfsp); 3440 return (e.error); 3441 } 3442 3443 /* Check to see if we're dealing with a marked-dead sp */ 3444 mutex_enter(&sp->s_lock); 3445 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3446 mutex_exit(&sp->s_lock); 3447 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3448 VFS_RELE(mi->mi_vfsp); 3449 return (0); 3450 } 3451 3452 /* Make sure mi hasn't changed on us */ 3453 if (mi != sp->mntinfo4_list) { 3454 /* Must drop sp's lock to avoid a recursive mutex enter */ 3455 mutex_exit(&sp->s_lock); 3456 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3457 VFS_RELE(mi->mi_vfsp); 3458 mutex_enter(&sp->s_lock); 3459 goto recov_retry; 3460 } 3461 mutex_exit(&sp->s_lock); 3462 3463 args.ctag = TAG_RENEW; 3464 3465 args.array_len = 1; 3466 args.array = argop; 3467 3468 argop[0].argop = OP_RENEW; 3469 3470 mutex_enter(&sp->s_lock); 3471 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3472 cr = sp->s_cred; 3473 crhold(cr); 3474 mutex_exit(&sp->s_lock); 3475 3476 ASSERT(cr != NULL); 3477 3478 /* used to figure out RTT for sp */ 3479 gethrestime(&prop_time); 3480 3481 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3482 "nfs4renew: %s call, sp 0x%p", needrecov ? 
"recov" : "first", 3483 (void*)sp)); 3484 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3485 prop_time.tv_sec, prop_time.tv_nsec)); 3486 3487 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3488 mntinfo4_t *, mi); 3489 3490 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3491 crfree(cr); 3492 3493 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3494 mntinfo4_t *, mi); 3495 3496 gethrestime(&after_time); 3497 3498 mutex_enter(&sp->s_lock); 3499 sp->propagation_delay.tv_sec = 3500 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3501 mutex_exit(&sp->s_lock); 3502 3503 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3504 after_time.tv_sec, after_time.tv_nsec)); 3505 3506 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3507 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3508 nfs4_delegreturn_all(sp); 3509 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3510 VFS_RELE(mi->mi_vfsp); 3511 /* 3512 * If the server returns CB_PATH_DOWN, it has renewed 3513 * the lease and informed us that the callback path is 3514 * down. Since the lease is renewed, just return 0 and 3515 * let the renew thread proceed as normal. 
3516 */ 3517 return (0); 3518 } 3519 3520 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3521 if (!needrecov && e.error) { 3522 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3523 VFS_RELE(mi->mi_vfsp); 3524 return (e.error); 3525 } 3526 3527 rpc_error = e.error; 3528 3529 if (needrecov) { 3530 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3531 "nfs4renew: initiating recovery\n")); 3532 3533 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3534 OP_RENEW, NULL, NULL, NULL) == FALSE) { 3535 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3536 VFS_RELE(mi->mi_vfsp); 3537 if (!e.error) 3538 (void) xdr_free(xdr_COMPOUND4res_clnt, 3539 (caddr_t)&res); 3540 mutex_enter(&sp->s_lock); 3541 goto recov_retry; 3542 } 3543 /* fall through for res.status case */ 3544 } 3545 3546 if (res.status) { 3547 if (res.status == NFS4ERR_LEASE_MOVED) { 3548 /*EMPTY*/ 3549 /* 3550 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3551 * to renew the lease on that server 3552 */ 3553 } 3554 e.error = geterrno4(res.status); 3555 } 3556 3557 if (!rpc_error) 3558 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3559 3560 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3561 3562 VFS_RELE(mi->mi_vfsp); 3563 3564 return (e.error); 3565 } 3566 3567 void 3568 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3569 { 3570 nfs4_server_t *sp; 3571 3572 /* this locks down sp if it is found */ 3573 sp = find_nfs4_server(mi); 3574 3575 if (sp != NULL) { 3576 nfs4_inc_state_ref_count_nolock(sp, mi); 3577 mutex_exit(&sp->s_lock); 3578 nfs4_server_rele(sp); 3579 } 3580 } 3581 3582 /* 3583 * Bump the number of OPEN files (ie: those with state) so we know if this 3584 * nfs4_server has any state to maintain a lease for or not. 3585 * 3586 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 
3587 */ 3588 void 3589 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3590 { 3591 ASSERT(mutex_owned(&sp->s_lock)); 3592 3593 sp->state_ref_count++; 3594 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3595 "nfs4_inc_state_ref_count: state_ref_count now %d", 3596 sp->state_ref_count)); 3597 3598 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3599 sp->lease_valid = NFS4_LEASE_VALID; 3600 3601 /* 3602 * If this call caused the lease to be marked valid and/or 3603 * took the state_ref_count from 0 to 1, then start the time 3604 * on lease renewal. 3605 */ 3606 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3607 sp->last_renewal_time = gethrestime_sec(); 3608 3609 /* update the number of open files for mi */ 3610 mi->mi_open_files++; 3611 } 3612 3613 void 3614 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3615 { 3616 nfs4_server_t *sp; 3617 3618 /* this locks down sp if it is found */ 3619 sp = find_nfs4_server_all(mi, 1); 3620 3621 if (sp != NULL) { 3622 nfs4_dec_state_ref_count_nolock(sp, mi); 3623 mutex_exit(&sp->s_lock); 3624 nfs4_server_rele(sp); 3625 } 3626 } 3627 3628 /* 3629 * Decrement the number of OPEN files (ie: those with state) so we know if 3630 * this nfs4_server has any state to maintain a lease for or not. 
3631 */ 3632 void 3633 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3634 { 3635 ASSERT(mutex_owned(&sp->s_lock)); 3636 ASSERT(sp->state_ref_count != 0); 3637 sp->state_ref_count--; 3638 3639 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3640 "nfs4_dec_state_ref_count: state ref count now %d", 3641 sp->state_ref_count)); 3642 3643 mi->mi_open_files--; 3644 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3645 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3646 mi->mi_open_files, mi->mi_flags)); 3647 3648 /* We don't have to hold the mi_lock to test mi_flags */ 3649 if (mi->mi_open_files == 0 && 3650 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3651 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3652 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3653 "we have closed the last open file", (void*)mi)); 3654 nfs4_remove_mi_from_server(mi, sp); 3655 } 3656 } 3657 3658 bool_t 3659 inlease(nfs4_server_t *sp) 3660 { 3661 bool_t result; 3662 3663 ASSERT(mutex_owned(&sp->s_lock)); 3664 3665 if (sp->lease_valid == NFS4_LEASE_VALID && 3666 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3667 result = TRUE; 3668 else 3669 result = FALSE; 3670 3671 return (result); 3672 } 3673 3674 3675 /* 3676 * Return non-zero if the given nfs4_server_t is going through recovery. 3677 */ 3678 3679 int 3680 nfs4_server_in_recovery(nfs4_server_t *sp) 3681 { 3682 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3683 } 3684 3685 /* 3686 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3687 * first is less than, equal to, or greater than the second. 3688 */ 3689 3690 int 3691 sfh4cmp(const void *p1, const void *p2) 3692 { 3693 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3694 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3695 3696 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3697 } 3698 3699 /* 3700 * Create a table for shared filehandle objects. 
3701 */ 3702 3703 void 3704 sfh4_createtab(avl_tree_t *tab) 3705 { 3706 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3707 offsetof(nfs4_sharedfh_t, sfh_tree)); 3708 } 3709 3710 /* 3711 * Return a shared filehandle object for the given filehandle. The caller 3712 * is responsible for eventually calling sfh4_rele(). 3713 */ 3714 3715 nfs4_sharedfh_t * 3716 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3717 { 3718 nfs4_sharedfh_t *sfh, *nsfh; 3719 avl_index_t where; 3720 nfs4_sharedfh_t skey; 3721 3722 if (!key) { 3723 skey.sfh_fh = *fh; 3724 key = &skey; 3725 } 3726 3727 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3728 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3729 /* 3730 * We allocate the largest possible filehandle size because it's 3731 * not that big, and it saves us from possibly having to resize the 3732 * buffer later. 3733 */ 3734 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3735 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3736 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3737 nsfh->sfh_refcnt = 1; 3738 nsfh->sfh_flags = SFH4_IN_TREE; 3739 nsfh->sfh_mi = mi; 3740 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3741 (void *)nsfh)); 3742 3743 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3744 sfh = avl_find(&mi->mi_filehandles, key, &where); 3745 if (sfh != NULL) { 3746 mutex_enter(&sfh->sfh_lock); 3747 sfh->sfh_refcnt++; 3748 mutex_exit(&sfh->sfh_lock); 3749 nfs_rw_exit(&mi->mi_fh_lock); 3750 /* free our speculative allocs */ 3751 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3752 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3753 return (sfh); 3754 } 3755 3756 avl_insert(&mi->mi_filehandles, nsfh, where); 3757 nfs_rw_exit(&mi->mi_fh_lock); 3758 3759 return (nsfh); 3760 } 3761 3762 /* 3763 * Return a shared filehandle object for the given filehandle. The caller 3764 * is responsible for eventually calling sfh4_rele(). 
3765 */ 3766 3767 nfs4_sharedfh_t * 3768 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3769 { 3770 nfs4_sharedfh_t *sfh; 3771 nfs4_sharedfh_t key; 3772 3773 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3774 3775 #ifdef DEBUG 3776 if (nfs4_sharedfh_debug) { 3777 nfs4_fhandle_t fhandle; 3778 3779 fhandle.fh_len = fh->nfs_fh4_len; 3780 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3781 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3782 nfs4_printfhandle(&fhandle); 3783 } 3784 #endif 3785 3786 /* 3787 * If there's already an object for the given filehandle, bump the 3788 * reference count and return it. Otherwise, create a new object 3789 * and add it to the AVL tree. 3790 */ 3791 3792 key.sfh_fh = *fh; 3793 3794 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3795 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3796 if (sfh != NULL) { 3797 mutex_enter(&sfh->sfh_lock); 3798 sfh->sfh_refcnt++; 3799 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3800 "sfh4_get: found existing %p, new refcnt=%d", 3801 (void *)sfh, sfh->sfh_refcnt)); 3802 mutex_exit(&sfh->sfh_lock); 3803 nfs_rw_exit(&mi->mi_fh_lock); 3804 return (sfh); 3805 } 3806 nfs_rw_exit(&mi->mi_fh_lock); 3807 3808 return (sfh4_put(fh, mi, &key)); 3809 } 3810 3811 /* 3812 * Get a reference to the given shared filehandle object. 3813 */ 3814 3815 void 3816 sfh4_hold(nfs4_sharedfh_t *sfh) 3817 { 3818 ASSERT(sfh->sfh_refcnt > 0); 3819 3820 mutex_enter(&sfh->sfh_lock); 3821 sfh->sfh_refcnt++; 3822 NFS4_DEBUG(nfs4_sharedfh_debug, 3823 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3824 (void *)sfh, sfh->sfh_refcnt)); 3825 mutex_exit(&sfh->sfh_lock); 3826 } 3827 3828 /* 3829 * Release a reference to the given shared filehandle object and null out 3830 * the given pointer. 
3831 */ 3832 3833 void 3834 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3835 { 3836 mntinfo4_t *mi; 3837 nfs4_sharedfh_t *sfh = *sfhpp; 3838 3839 ASSERT(sfh->sfh_refcnt > 0); 3840 3841 mutex_enter(&sfh->sfh_lock); 3842 if (sfh->sfh_refcnt > 1) { 3843 sfh->sfh_refcnt--; 3844 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3845 "sfh4_rele %p, new refcnt=%d", 3846 (void *)sfh, sfh->sfh_refcnt)); 3847 mutex_exit(&sfh->sfh_lock); 3848 goto finish; 3849 } 3850 mutex_exit(&sfh->sfh_lock); 3851 3852 /* 3853 * Possibly the last reference, so get the lock for the table in 3854 * case it's time to remove the object from the table. 3855 */ 3856 mi = sfh->sfh_mi; 3857 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3858 mutex_enter(&sfh->sfh_lock); 3859 sfh->sfh_refcnt--; 3860 if (sfh->sfh_refcnt > 0) { 3861 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3862 "sfh4_rele %p, new refcnt=%d", 3863 (void *)sfh, sfh->sfh_refcnt)); 3864 mutex_exit(&sfh->sfh_lock); 3865 nfs_rw_exit(&mi->mi_fh_lock); 3866 goto finish; 3867 } 3868 3869 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3870 "sfh4_rele %p, last ref", (void *)sfh)); 3871 if (sfh->sfh_flags & SFH4_IN_TREE) { 3872 avl_remove(&mi->mi_filehandles, sfh); 3873 sfh->sfh_flags &= ~SFH4_IN_TREE; 3874 } 3875 mutex_exit(&sfh->sfh_lock); 3876 nfs_rw_exit(&mi->mi_fh_lock); 3877 mutex_destroy(&sfh->sfh_lock); 3878 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3879 kmem_free(sfh, sizeof (nfs4_sharedfh_t)); 3880 3881 finish: 3882 *sfhpp = NULL; 3883 } 3884 3885 /* 3886 * Update the filehandle for the given shared filehandle object. 
3887 */ 3888 3889 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */ 3890 3891 void 3892 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh) 3893 { 3894 mntinfo4_t *mi = sfh->sfh_mi; 3895 nfs4_sharedfh_t *dupsfh; 3896 avl_index_t where; 3897 nfs4_sharedfh_t key; 3898 3899 #ifdef DEBUG 3900 mutex_enter(&sfh->sfh_lock); 3901 ASSERT(sfh->sfh_refcnt > 0); 3902 mutex_exit(&sfh->sfh_lock); 3903 #endif 3904 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE); 3905 3906 /* 3907 * The basic plan is to remove the shared filehandle object from 3908 * the table, update it to have the new filehandle, then reinsert 3909 * it. 3910 */ 3911 3912 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3913 mutex_enter(&sfh->sfh_lock); 3914 if (sfh->sfh_flags & SFH4_IN_TREE) { 3915 avl_remove(&mi->mi_filehandles, sfh); 3916 sfh->sfh_flags &= ~SFH4_IN_TREE; 3917 } 3918 mutex_exit(&sfh->sfh_lock); 3919 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len; 3920 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val, 3921 sfh->sfh_fh.nfs_fh4_len); 3922 3923 /* 3924 * XXX If there is already a shared filehandle object with the new 3925 * filehandle, we're in trouble, because the rnode code assumes 3926 * that there is only one shared filehandle object for a given 3927 * filehandle. So issue a warning (for read-write mounts only) 3928 * and don't try to re-insert the given object into the table. 3929 * Hopefully the given object will quickly go away and everyone 3930 * will use the new object. 
3931 */ 3932 key.sfh_fh = *newfh; 3933 dupsfh = avl_find(&mi->mi_filehandles, &key, &where); 3934 if (dupsfh != NULL) { 3935 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) { 3936 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: " 3937 "duplicate filehandle detected"); 3938 sfh4_printfhandle(dupsfh); 3939 } 3940 } else { 3941 avl_insert(&mi->mi_filehandles, sfh, where); 3942 mutex_enter(&sfh->sfh_lock); 3943 sfh->sfh_flags |= SFH4_IN_TREE; 3944 mutex_exit(&sfh->sfh_lock); 3945 } 3946 nfs_rw_exit(&mi->mi_fh_lock); 3947 } 3948 3949 /* 3950 * Copy out the current filehandle for the given shared filehandle object. 3951 */ 3952 3953 void 3954 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp) 3955 { 3956 mntinfo4_t *mi = sfh->sfh_mi; 3957 3958 ASSERT(sfh->sfh_refcnt > 0); 3959 3960 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3961 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len; 3962 ASSERT(fhp->fh_len <= NFS4_FHSIZE); 3963 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len); 3964 nfs_rw_exit(&mi->mi_fh_lock); 3965 } 3966 3967 /* 3968 * Print out the filehandle for the given shared filehandle object. 3969 */ 3970 3971 void 3972 sfh4_printfhandle(const nfs4_sharedfh_t *sfh) 3973 { 3974 nfs4_fhandle_t fhandle; 3975 3976 sfh4_copyval(sfh, &fhandle); 3977 nfs4_printfhandle(&fhandle); 3978 } 3979 3980 /* 3981 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0 3982 * if they're the same, +1 if the first is "greater" than the second. The 3983 * caller (or whoever's calling the AVL package) is responsible for 3984 * handling locking issues. 3985 */ 3986 3987 static int 3988 fncmp(const void *p1, const void *p2) 3989 { 3990 const nfs4_fname_t *f1 = p1; 3991 const nfs4_fname_t *f2 = p2; 3992 int res; 3993 3994 res = strcmp(f1->fn_name, f2->fn_name); 3995 /* 3996 * The AVL package wants +/-1, not arbitrary positive or negative 3997 * integers. 
3998 */ 3999 if (res > 0) 4000 res = 1; 4001 else if (res < 0) 4002 res = -1; 4003 return (res); 4004 } 4005 4006 /* 4007 * Get or create an fname with the given name, as a child of the given 4008 * fname. The caller is responsible for eventually releasing the reference 4009 * (fn_rele()). parent may be NULL. 4010 */ 4011 4012 nfs4_fname_t * 4013 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh) 4014 { 4015 nfs4_fname_t key; 4016 nfs4_fname_t *fnp; 4017 avl_index_t where; 4018 4019 key.fn_name = name; 4020 4021 /* 4022 * If there's already an fname registered with the given name, bump 4023 * its reference count and return it. Otherwise, create a new one 4024 * and add it to the parent's AVL tree. 4025 * 4026 * fname entries we are looking for should match both name 4027 * and sfh stored in the fname. 4028 */ 4029 again: 4030 if (parent != NULL) { 4031 mutex_enter(&parent->fn_lock); 4032 fnp = avl_find(&parent->fn_children, &key, &where); 4033 if (fnp != NULL) { 4034 /* 4035 * This hold on fnp is released below later, 4036 * in case this is not the fnp we want. 4037 */ 4038 fn_hold(fnp); 4039 4040 if (fnp->fn_sfh == sfh) { 4041 /* 4042 * We have found our entry. 4043 * put an hold and return it. 4044 */ 4045 mutex_exit(&parent->fn_lock); 4046 return (fnp); 4047 } 4048 4049 /* 4050 * We have found an entry that has a mismatching 4051 * fn_sfh. This could be a stale entry due to 4052 * server side rename. We will remove this entry 4053 * and make sure no such entries exist. 4054 */ 4055 mutex_exit(&parent->fn_lock); 4056 mutex_enter(&fnp->fn_lock); 4057 if (fnp->fn_parent == parent) { 4058 /* 4059 * Remove ourselves from parent's 4060 * fn_children tree. 
4061 */ 4062 mutex_enter(&parent->fn_lock); 4063 avl_remove(&parent->fn_children, fnp); 4064 mutex_exit(&parent->fn_lock); 4065 fn_rele(&fnp->fn_parent); 4066 } 4067 mutex_exit(&fnp->fn_lock); 4068 fn_rele(&fnp); 4069 goto again; 4070 } 4071 } 4072 4073 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP); 4074 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL); 4075 fnp->fn_parent = parent; 4076 if (parent != NULL) 4077 fn_hold(parent); 4078 fnp->fn_len = strlen(name); 4079 ASSERT(fnp->fn_len < MAXNAMELEN); 4080 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP); 4081 (void) strcpy(fnp->fn_name, name); 4082 fnp->fn_refcnt = 1; 4083 4084 /* 4085 * This hold on sfh is later released 4086 * when we do the final fn_rele() on this fname. 4087 */ 4088 sfh4_hold(sfh); 4089 fnp->fn_sfh = sfh; 4090 4091 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t), 4092 offsetof(nfs4_fname_t, fn_tree)); 4093 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4094 "fn_get %p:%s, a new nfs4_fname_t!", 4095 (void *)fnp, fnp->fn_name)); 4096 if (parent != NULL) { 4097 avl_insert(&parent->fn_children, fnp, where); 4098 mutex_exit(&parent->fn_lock); 4099 } 4100 4101 return (fnp); 4102 } 4103 4104 void 4105 fn_hold(nfs4_fname_t *fnp) 4106 { 4107 atomic_inc_32(&fnp->fn_refcnt); 4108 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4109 "fn_hold %p:%s, new refcnt=%d", 4110 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 4111 } 4112 4113 /* 4114 * Decrement the reference count of the given fname, and destroy it if its 4115 * reference count goes to zero. Nulls out the given pointer. 
4116 */ 4117 4118 void 4119 fn_rele(nfs4_fname_t **fnpp) 4120 { 4121 nfs4_fname_t *parent; 4122 uint32_t newref; 4123 nfs4_fname_t *fnp; 4124 4125 recur: 4126 fnp = *fnpp; 4127 *fnpp = NULL; 4128 4129 mutex_enter(&fnp->fn_lock); 4130 parent = fnp->fn_parent; 4131 if (parent != NULL) 4132 mutex_enter(&parent->fn_lock); /* prevent new references */ 4133 newref = atomic_dec_32_nv(&fnp->fn_refcnt); 4134 if (newref > 0) { 4135 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4136 "fn_rele %p:%s, new refcnt=%d", 4137 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 4138 if (parent != NULL) 4139 mutex_exit(&parent->fn_lock); 4140 mutex_exit(&fnp->fn_lock); 4141 return; 4142 } 4143 4144 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4145 "fn_rele %p:%s, last reference, deleting...", 4146 (void *)fnp, fnp->fn_name)); 4147 if (parent != NULL) { 4148 avl_remove(&parent->fn_children, fnp); 4149 mutex_exit(&parent->fn_lock); 4150 } 4151 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4152 sfh4_rele(&fnp->fn_sfh); 4153 mutex_destroy(&fnp->fn_lock); 4154 avl_destroy(&fnp->fn_children); 4155 kmem_free(fnp, sizeof (nfs4_fname_t)); 4156 /* 4157 * Recursivly fn_rele the parent. 4158 * Use goto instead of a recursive call to avoid stack overflow. 4159 */ 4160 if (parent != NULL) { 4161 fnpp = &parent; 4162 goto recur; 4163 } 4164 } 4165 4166 /* 4167 * Returns the single component name of the given fname, in a MAXNAMELEN 4168 * string buffer, which the caller is responsible for freeing. Note that 4169 * the name may become invalid as a result of fn_move(). 
4170 */ 4171 4172 char * 4173 fn_name(nfs4_fname_t *fnp) 4174 { 4175 char *name; 4176 4177 ASSERT(fnp->fn_len < MAXNAMELEN); 4178 name = kmem_alloc(MAXNAMELEN, KM_SLEEP); 4179 mutex_enter(&fnp->fn_lock); 4180 (void) strcpy(name, fnp->fn_name); 4181 mutex_exit(&fnp->fn_lock); 4182 4183 return (name); 4184 } 4185 4186 4187 /* 4188 * fn_path_realloc 4189 * 4190 * This function, used only by fn_path, constructs 4191 * a new string which looks like "prepend" + "/" + "current". 4192 * by allocating a new string and freeing the old one. 4193 */ 4194 static void 4195 fn_path_realloc(char **curses, char *prepend) 4196 { 4197 int len, curlen = 0; 4198 char *news; 4199 4200 if (*curses == NULL) { 4201 /* 4202 * Prime the pump, allocate just the 4203 * space for prepend and return that. 4204 */ 4205 len = strlen(prepend) + 1; 4206 news = kmem_alloc(len, KM_SLEEP); 4207 (void) strncpy(news, prepend, len); 4208 } else { 4209 /* 4210 * Allocate the space for a new string 4211 * +1 +1 is for the "/" and the NULL 4212 * byte at the end of it all. 4213 */ 4214 curlen = strlen(*curses); 4215 len = curlen + strlen(prepend) + 1 + 1; 4216 news = kmem_alloc(len, KM_SLEEP); 4217 (void) strncpy(news, prepend, len); 4218 (void) strcat(news, "/"); 4219 (void) strcat(news, *curses); 4220 kmem_free(*curses, curlen + 1); 4221 } 4222 *curses = news; 4223 } 4224 4225 /* 4226 * Returns the path name (starting from the fs root) for the given fname. 4227 * The caller is responsible for freeing. Note that the path may be or 4228 * become invalid as a result of fn_move(). 4229 */ 4230 4231 char * 4232 fn_path(nfs4_fname_t *fnp) 4233 { 4234 char *path; 4235 nfs4_fname_t *nextfnp; 4236 4237 if (fnp == NULL) 4238 return (NULL); 4239 4240 path = NULL; 4241 4242 /* walk up the tree constructing the pathname. 
*/ 4243 4244 fn_hold(fnp); /* adjust for later rele */ 4245 do { 4246 mutex_enter(&fnp->fn_lock); 4247 /* 4248 * Add fn_name in front of the current path 4249 */ 4250 fn_path_realloc(&path, fnp->fn_name); 4251 nextfnp = fnp->fn_parent; 4252 if (nextfnp != NULL) 4253 fn_hold(nextfnp); 4254 mutex_exit(&fnp->fn_lock); 4255 fn_rele(&fnp); 4256 fnp = nextfnp; 4257 } while (fnp != NULL); 4258 4259 return (path); 4260 } 4261 4262 /* 4263 * Return a reference to the parent of the given fname, which the caller is 4264 * responsible for eventually releasing. 4265 */ 4266 4267 nfs4_fname_t * 4268 fn_parent(nfs4_fname_t *fnp) 4269 { 4270 nfs4_fname_t *parent; 4271 4272 mutex_enter(&fnp->fn_lock); 4273 parent = fnp->fn_parent; 4274 if (parent != NULL) 4275 fn_hold(parent); 4276 mutex_exit(&fnp->fn_lock); 4277 4278 return (parent); 4279 } 4280 4281 /* 4282 * Update fnp so that its parent is newparent and its name is newname. 4283 */ 4284 4285 void 4286 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname) 4287 { 4288 nfs4_fname_t *parent, *tmpfnp; 4289 ssize_t newlen; 4290 nfs4_fname_t key; 4291 avl_index_t where; 4292 4293 /* 4294 * This assert exists to catch the client trying to rename 4295 * a dir to be a child of itself. This happened at a recent 4296 * bakeoff against a 3rd party (broken) server which allowed 4297 * the rename to succeed. If it trips it means that: 4298 * a) the code in nfs4rename that detects this case is broken 4299 * b) the server is broken (since it allowed the bogus rename) 4300 * 4301 * For non-DEBUG kernels, prepare for a recursive mutex_enter 4302 * panic below from: mutex_enter(&newparent->fn_lock); 4303 */ 4304 ASSERT(fnp != newparent); 4305 4306 /* 4307 * Remove fnp from its current parent, change its name, then add it 4308 * to newparent. It might happen that fnp was replaced by another 4309 * nfs4_fname_t with the same fn_name in parent->fn_children. 
4310 * In such case, fnp->fn_parent is NULL and we skip the removal 4311 * of fnp from its current parent. 4312 */ 4313 mutex_enter(&fnp->fn_lock); 4314 parent = fnp->fn_parent; 4315 if (parent != NULL) { 4316 mutex_enter(&parent->fn_lock); 4317 avl_remove(&parent->fn_children, fnp); 4318 mutex_exit(&parent->fn_lock); 4319 fn_rele(&fnp->fn_parent); 4320 } 4321 4322 newlen = strlen(newname); 4323 if (newlen != fnp->fn_len) { 4324 ASSERT(newlen < MAXNAMELEN); 4325 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4326 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP); 4327 fnp->fn_len = newlen; 4328 } 4329 (void) strcpy(fnp->fn_name, newname); 4330 4331 again: 4332 mutex_enter(&newparent->fn_lock); 4333 key.fn_name = fnp->fn_name; 4334 tmpfnp = avl_find(&newparent->fn_children, &key, &where); 4335 if (tmpfnp != NULL) { 4336 /* 4337 * This could be due to a file that was unlinked while 4338 * open, or perhaps the rnode is in the free list. Remove 4339 * it from newparent and let it go away on its own. The 4340 * contorted code is to deal with lock order issues and 4341 * race conditions. 4342 */ 4343 fn_hold(tmpfnp); 4344 mutex_exit(&newparent->fn_lock); 4345 mutex_enter(&tmpfnp->fn_lock); 4346 if (tmpfnp->fn_parent == newparent) { 4347 mutex_enter(&newparent->fn_lock); 4348 avl_remove(&newparent->fn_children, tmpfnp); 4349 mutex_exit(&newparent->fn_lock); 4350 fn_rele(&tmpfnp->fn_parent); 4351 } 4352 mutex_exit(&tmpfnp->fn_lock); 4353 fn_rele(&tmpfnp); 4354 goto again; 4355 } 4356 fnp->fn_parent = newparent; 4357 fn_hold(newparent); 4358 avl_insert(&newparent->fn_children, fnp, where); 4359 mutex_exit(&newparent->fn_lock); 4360 mutex_exit(&fnp->fn_lock); 4361 } 4362 4363 #ifdef DEBUG 4364 /* 4365 * Return non-zero if the type information makes sense for the given vnode. 4366 * Otherwise panic. 
4367 */ 4368 int 4369 nfs4_consistent_type(vnode_t *vp) 4370 { 4371 rnode4_t *rp = VTOR4(vp); 4372 4373 if (nfs4_vtype_debug && vp->v_type != VNON && 4374 rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) { 4375 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, " 4376 "rnode attr type=%d", (void *)vp, vp->v_type, 4377 rp->r_attr.va_type); 4378 } 4379 4380 return (1); 4381 } 4382 #endif /* DEBUG */ 4383