/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All Rights Reserved
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/dnlc.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/acl.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <sys/disp.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>

#include <sys/ddi.h>

/*
 * Arguments to page-flush thread.
 */
typedef struct {
	vnode_t *vp;
	cred_t *cr;
} pgflush_t;

#ifdef DEBUG
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

uint_t nfs4_tsd_key;
#endif

static time_t	nfs4_client_resumed = 0;
static callb_id_t cid = 0;

static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);

struct mi4_globals {
	kmutex_t	mig_lock;	/* lock protecting mig_list */
	list_t		mig_list;	/* list of NFS v4 mounts in zone */
	boolean_t	mig_destructor_called;
};

static zone_key_t mi4_list_key;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_time_attr_inval)
 * which tells whether the attributes are valid. The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached.
 * This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time. There are minimum and maximum
 * timeout values that can be set per mount point.
 */

/*
 * If a cache purge is in progress, wait for it to finish.
 *
 * The current thread must not be in the middle of an
 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock
 * between this thread, a recovery thread, and the page flush thread.
 */
int
nfs4_waitfor_purge_complete(vnode_t *vp)
{
	rnode4_t *rp;
	k_sigset_t smask;

	rp = VTOR4(vp);
	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
		    ((rp->r_flags & R4PGFLUSH) &&
		    rp->r_pgflush != curthread)) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				sigunintr(&smask);
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		sigunintr(&smask);
		mutex_exit(&rp->r_statelock);
	}
	return (0);
}

/*
 * Validate caches by checking cached attributes. If they have timed out,
 * then get new attributes from the server. As a side effect, cache
 * invalidation is done if the attributes have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs4_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	nfs4_ga_res_t gar;

	if (ATTRCACHE4_VALID(vp)) {
		error = nfs4_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	return (nfs4_getattr_otw(vp, &gar, cr, 0));
}

/*
 * Fill in attribute from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
{
	rnode4_t *rp;

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	mutex_enter(&rp->r_statev4_lock);
	if (ATTRCACHE4_VALID(vp)) {
		mutex_exit(&rp->r_statev4_lock);
		/*
		 * Cached attributes are valid
		 */
		*vap = rp->r_attr;
		mutex_exit(&rp->r_statelock);
		return (0);
	}
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);
	return (1);
}


/*
 * If returned error is ESTALE flush all caches. The nfs4_purge_caches()
 * call is synchronous because all the pages were invalidated by the
 * nfs4_invalidate_pages() call.
 */
void
nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
{
	struct rnode4 *rp = VTOR4(vp);

	/* Ensure that the ..._end_op() call has been done */
	ASSERT(tsd_get(nfs4_tsd_key) == NULL);

	if (errno != ESTALE)
		return;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= R4STALE;
	if (!rp->r_error)
		rp->r_error = errno;
	mutex_exit(&rp->r_statelock);
	if (nfs4_has_pages(vp))
		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
}

/*
 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the
 * page purge is done asynchronously.
 */
void
nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
{
	rnode4_t *rp;
	char *contents;
	vnode_t *xattr;
	int size;
	int pgflush;		/* are we the page flush thread? */

	/*
	 * Purge the DNLC for any entries which refer to this file.
	 */
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
		dnlc_purge_vp(vp);

	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4LOOKUP;
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;

	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * Purge pathconf cache too.
	 */
	rp->r_pathconf.pc4_xattr_valid = 0;
	rp->r_pathconf.pc4_cache_valid = 0;

	pgflush = (curthread == rp->r_pgflush);
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	if (xattr != NULL)
		VN_RELE(xattr);

	/*
	 * Flush the page cache. If the current thread is the page flush
	 * thread, don't initiate a new page flush. There's no need for
	 * it, and doing it correctly is hard.
	 */
	if (nfs4_has_pages(vp) && !pgflush) {
		if (!asyncpg) {
			(void) nfs4_waitfor_purge_complete(vp);
			nfs4_flush_pages(vp, cr);
		} else {
			pgflush_t *args;

			/*
			 * We don't hold r_statelock while creating the
			 * thread, in case the call blocks. So we use a
			 * flag to indicate that a page flush thread is
			 * active.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & R4PGFLUSH) {
				mutex_exit(&rp->r_statelock);
			} else {
				rp->r_flags |= R4PGFLUSH;
				mutex_exit(&rp->r_statelock);

				args = kmem_alloc(sizeof (pgflush_t),
				    KM_SLEEP);
				args->vp = vp;
				VN_HOLD(args->vp);
				args->cr = cr;
				crhold(args->cr);
				(void) zthread_create(NULL, 0,
				    nfs4_pgflush_thread, args, 0,
				    minclsyspri);
			}
		}
	}

	/*
	 * Flush the readdir response cache.
	 */
	nfs4_purge_rddir_cache(vp);
}

/*
 * Invalidate all pages for the given file, after writing back the dirty
 * ones.
 */

void
nfs4_flush_pages(vnode_t *vp, cred_t *cr)
{
	int error;
	rnode4_t *rp = VTOR4(vp);

	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
	if (error == ENOSPC || error == EDQUOT) {
		mutex_enter(&rp->r_statelock);
		if (!rp->r_error)
			rp->r_error = error;
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Page flush thread.
 */

static void
nfs4_pgflush_thread(pgflush_t *args)
{
	rnode4_t *rp = VTOR4(args->vp);

	/* remember which thread we are, so we don't deadlock ourselves */
	mutex_enter(&rp->r_statelock);
	ASSERT(rp->r_pgflush == NULL);
	rp->r_pgflush = curthread;
	mutex_exit(&rp->r_statelock);

	nfs4_flush_pages(args->vp, args->cr);

	mutex_enter(&rp->r_statelock);
	rp->r_pgflush = NULL;
	rp->r_flags &= ~R4PGFLUSH;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);

	VN_RELE(args->vp);
	crfree(args->cr);
	kmem_free(args, sizeof (pgflush_t));
	zthread_exit();
}

/*
 * Purge the readdir cache of all entries which are not currently
 * being filled.
 */
void
nfs4_purge_rddir_cache(vnode_t *vp)
{
	rnode4_t *rp;

	rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	rp->r_direof = NULL;
	rp->r_flags &= ~R4LOOKUP;
	rp->r_flags |= R4READDIRWATTR;
	rddir4_cache_purge(rp);
	mutex_exit(&rp->r_statelock);
}

/*
 * Set attributes cache for given vnode using virtual attributes. There is
 * no cache validation, but if the attributes are deemed to be stale, they
 * are ignored. This corresponds to nfs3_attrcache().
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 */
void
nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
{
	rnode4_t *rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	if (rp->r_time_attr_saved <= t)
		nfs4_attrcache_va(vp, garp, FALSE);
	mutex_exit(&rp->r_statelock);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock. If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed = 0;
	int ctime_changed = 0;
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);
	len_t preattr_rsize;
	boolean_t writemodify_set = B_FALSE;
	boolean_t cachepurge_set = B_FALSE;

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread. Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * probably made. If there is an inconsistency in what is
		 * cached, mark them invalid. If not, don't act on them.
		 */
		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and dir didn't change on server before dirmod op
		 * and dir didn't change after dirmod op but before getattr
		 * then there's a chance that the client's cached data for
		 * this object is current (not stale). No immediate cache
		 * flush is required.
		 *
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted. For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout. When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do. Flush all caches for dir. Remove the
			 * attr timeout.
			 *
			 * a) timeout expired. flush all caches.
			 * b) r_change != cinfo.before. flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change). flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		/*
		 * The write thread, after writing data to the file on the
		 * remote server, will always set R4WRITEMODIFIED to indicate
		 * that the file on the remote server was modified with a
		 * WRITE operation and would have marked the attribute cache
		 * as timed out. If R4WRITEMODIFIED is set, then do not check
		 * for mtime and ctime change.
		 */
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;

			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;

			/*
			 * If the change attribute was not provided by server
			 * or it differs, then flush all caches.
			 */
			if (!garp->n4g_change_valid ||
			    rp->r_change != garp->n4g_change) {
				mtime_changed = 1;
				ctime_changed = 1;
			}
		} else {
			writemodify_set = B_TRUE;
		}
	}

	preattr_rsize = rp->r_size;

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	/*
	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
	 * drop statelock we will be in transition of purging all
	 * our caches and updating them. It is possible for another
	 * thread to pick up this new file size and read in zeroed data.
	 * Stall other threads till the cache purge is complete.
	 */
	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
		/*
		 * If R4WRITEMODIFIED was set and we have updated the file
		 * size, Server's returned file size need not necessarily
		 * be because of this Client's WRITE. We need to purge
		 * all caches.
		 */
		if (writemodify_set)
			mtime_changed = 1;

		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
			rp->r_flags |= R4INCACHEPURGE;
			cachepurge_set = B_TRUE;
		}
	}

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~R4INCACHEPURGE;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		cachepurge_set = B_FALSE;
	}

	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
static void
nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	hrtime_t delta;
	hrtime_t now;
	vattr_t *vap = &garp->n4g_va;

	rp = VTOR4(vp);

	ASSERT(MUTEX_HELD(&rp->r_statelock));
	ASSERT(vap->va_mask == AT_ALL);

	/* Switch to master before checking v_flag */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	now = gethrtime();

	mi = VTOMI4(vp);

	/*
	 * Only establish a new cache timeout (if requested). Never
	 * extend a timeout. Never clear a timeout. Clearing a timeout
	 * is done by nfs4_update_dircaches (ancestor in our call chain)
	 */
	if (set_cache_timeout && ! rp->r_time_cache_inval)
		rp->r_time_cache_inval = now + mi->mi_acdirmax;

	/*
	 * Delta is the number of nanoseconds that we will
	 * cache the attributes of the file. It is based on
	 * the number of nanoseconds since the last time that
	 * we detected a change. The assumption is that files
	 * that changed recently are likely to change again.
	 * There is a minimum and a maximum for regular files
	 * and for directories which is enforced though.
	 *
	 * Using the time since last change was detected
	 * eliminates direct comparison or calculation
	 * using mixed client and server times. NFS does
	 * not make any assumptions regarding the client
	 * and server clocks being synchronized.
	 */
	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    vap->va_size != rp->r_attr.va_size) {
		rp->r_time_attr_saved = now;
	}

	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_time_attr_saved;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_time_attr_inval = now + delta;

	rp->r_attr = *vap;
	if (garp->n4g_change_valid)
		rp->r_change = garp->n4g_change;

	/*
	 * The attributes that were returned may be valid and can
	 * be used, but they may not be allowed to be cached.
	 * Reset the timers to cause immediate invalidation and
	 * clear r_change so no VERIFY operations will succeed.
	 */
	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
		rp->r_time_attr_inval = now;
		rp->r_time_attr_saved = now;
		rp->r_change = 0;
	}

	/*
	 * If mounted_on_fileid returned AND the object is a stub,
	 * then set object's va_nodeid to the mounted over fid
	 * returned by server.
	 *
	 * If mounted_on_fileid not provided/supported, then
	 * just set it to 0 for now. Eventually it would be
	 * better to set it to a hashed version of FH. This
	 * would probably be good enough to provide a unique
	 * fid/d_ino within a dir.
	 *
	 * We don't need to carry mounted_on_fileid in the
	 * rnode as long as the client never requests fileid
	 * without also requesting mounted_on_fileid. For
	 * now, it stays.
	 */
	if (garp->n4g_mon_fid_valid) {
		rp->r_mntd_fid = garp->n4g_mon_fid;

		if (RP_ISSTUB(rp))
			rp->r_attr.va_nodeid = rp->r_mntd_fid;
	}

	/*
	 * Check to see if there are valid pathconf bits to
	 * cache in the rnode.
	 */
	if (garp->n4g_ext_res) {
		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
		} else {
			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
				rp->r_pathconf.pc4_xattr_valid = TRUE;
				rp->r_pathconf.pc4_xattr_exists =
				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
			}
		}
	}
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != vap->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
		rp->r_size = vap->va_size;
	}
	nfs_setswaplike(vp, vap);
	rp->r_flags &= ~R4WRITEMODIFIED;
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
{
	mntinfo4_t *mi = VTOMI4(vp);
	hrtime_t t;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:

	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
	    &recov_state, NULL))) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		return (e.error);
	}

	t = gethrtime();

	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);

	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
			    &recov_state, 1);
			goto recov_retry;
		}
	}

	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);

	if (!e.error) {
		if (e.stat == NFS4_OK) {
			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
		} else {
			e.error = geterrno4(e.stat);

			nfs4_purge_stale_fh(e.error, vp, cr);
		}
	}

	/*
	 * If this was a getattr on a node that is a stub for a crossed
	 * mount point, keep the original secinfo flavor for
	 * the current file system, not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	return (e.error);
}

/*
 * Generate a compound to get attributes over-the-wire.
 */
void
nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
    nfs4_error_t *ep, cred_t *cr, int get_acl)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	rnode4_t *rp = VTOR4(vp);
	nfs_argop4 argop[2];

	args.ctag = TAG_GETATTR;

	args.array_len = 2;
	args.array = argop;

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr */
	/*
	 * Unlike nfs version 2 and 3, where getattr returns all the
	 * attributes, nfs version 4 returns only the ones explicitly
	 * asked for. This creates problems, as some system functions
	 * (e.g. cache check) require certain attributes and if the
	 * cached node lacks some attributes such as uid/gid, it can
	 * affect system utilities (e.g. "ls") that rely on the information
	 * to be there. This can lead to anything from system crashes to
	 * corrupted information processed by user apps.
	 * So to ensure that all bases are covered, request at least
	 * the AT_ALL attribute mask.
	 */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	if (get_acl)
		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	doqueue = 1;

	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);

	if (ep->error)
		return;

	if (res.status != NFS4_OK) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}

/*
 * Return either cached or remote attributes. If we get remote attributes,
 * use them to check and invalidate caches, then cache the new attributes.
 */
int
nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
{
	int error;
	rnode4_t *rp;
	nfs4_ga_res_t gar;

	ASSERT(nfs4_consistent_type(vp));

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process. Either way, use the cached attributes for
	 * the caller's vattr_t.
	 *
	 * Note that we ignore the gar set by the OTW call: the attr caching
	 * code may make adjustments when storing to the rnode, and we want
	 * to see those changes here.
	 */
	rp = VTOR4(vp);
	error = 0;
	mutex_enter(&rp->r_statelock);
	if (!ATTRCACHE4_VALID(vp)) {
		mutex_exit(&rp->r_statelock);
		error = nfs4_getattr_otw(vp, &gar, cr, 0);
		mutex_enter(&rp->r_statelock);
	}

	if (!error)
		*vap = rp->r_attr;

	/* Return the client's view of file size */
	vap->va_size = rp->r_size;

	mutex_exit(&rp->r_statelock);

	ASSERT(nfs4_consistent_type(vp));

	return (error);
}

int
nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
    nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	nfs_argop4 argop[2];
	mntinfo4_t *mi = VTOMI4(vp);
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_ext_res_t *gerp;

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = tag_type;

	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* getattr */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	doqueue = 1;

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (!needrecov && e.error) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
		    needrecov);
		return (e.error);
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_attr_otw: initiating recovery\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL, NULL, NULL);
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
		    needrecov);
		if (!e.error) {
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			e.error = geterrno4(res.status);
		}
		if (abort == FALSE)
			goto recov_retry;
		return (e.error);
	}

	if (res.status) {
		e.error = geterrno4(res.status);
	} else {
		gerp = garp->n4g_ext_res;
		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    garp, sizeof (nfs4_ga_res_t));
		garp->n4g_ext_res = gerp;
		if (garp->n4g_ext_res &&
		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
			bcopy(res.array[1].nfs_resop4_u.opgetattr.
			    ga_res.n4g_ext_res,
			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
	    needrecov);
	return (e.error);
}

/*
 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount. The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies. See nfs4_async_putpage and nfs4_async_start.
 */

static void	nfs4_async_start(struct vfs *);
static void	nfs4_async_pgops_start(struct vfs *);
static void	nfs4_async_common_start(struct vfs *, int);

static void
free_async_args4(struct nfs4_async_reqs *args)
{
	rnode4_t *rp;

	if (args->a_io != NFS4_INACTIVE) {
		rp = VTOR4(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS4_PUTAPAGE ||
		    args->a_io == NFS4_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done. It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 */
void
nfs4_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo4_t *mi;
	uint_t max_threads;

	mi = VFTOMI4(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount is really going away.
	 *
	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0. Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	for (;;) {
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value. This means that future
			 * (debugger-induced) alterations of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
			    MAX(mi->mi_max_threads, max_threads)) {
				mi->mi_threads[NFS4_ASYNC_QUEUE]++;
				mutex_exit(&mi->mi_async_lock);
				MI4_HOLD(mi);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0, nfs4_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			} else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
			    NUM_ASYNC_PGOPS_THREADS) {
				mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
				mutex_exit(&mi->mi_async_lock);
				MI4_HOLD(mi);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0,
				    nfs4_async_pgops_start, vfsp, 0,
				    minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}

		mutex_enter(&mi->mi_lock);
		if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
			mutex_exit(&mi->mi_lock);
			break;
		}
		mutex_exit(&mi->mi_lock);

		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
	}

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	/*
	 * Wake up the inactive thread.
	 */
	cv_broadcast(&mi->mi_inact_req_cv);
	/*
	 * Wake up anyone sitting in nfs4_async_manager_stop()
	 */
	cv_broadcast(&mi->mi_async_cv);
	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);	/* release thread's hold */
	MI4_RELE(mi);
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 */
void
nfs4_async_manager_stop(vfs_t *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	mutex_enter(&mi->mi_async_lock);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
	mutex_exit(&mi->mi_lock);
	cv_broadcast(&mi->mi_async_reqs_cv);
	/*
	 * Wait for the async manager thread to die.
	 */
	while (mi->mi_manager_thread != NULL)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
}

int
nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
    struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
    u_offset_t, caddr_t, struct seg *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If addr falls in a different segment, don't bother doing readahead.
	 */
	if (addr >= seg->s_base + seg->s_size)
		return (-1);

	/*
	 * If we can't allocate a request structure, punt on the readahead.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		return (-1);

	/*
	 * If a lock operation is pending, don't initiate any new
	 * readaheads. Otherwise, bump r_count to indicate the new
	 * asynchronous I/O.
	 */
	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
		kmem_free(args, sizeof (*args));
		return (-1);
	}
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	nfs_rw_exit(&rp->r_lkserlock);

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READ_AHEAD;
	args->a_nfs4_readahead = readahead;
	args->a_nfs4_blkoff = blkoff;
	args->a_nfs4_seg = seg;
	args->a_nfs4_addr = addr;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, don't bother readahead.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	VN_RELE(vp);
	crfree(cr);
	kmem_free(args, sizeof (*args));
	return (-1);
}

static void
nfs4_async_start(struct vfs *vfsp)
{
	nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
}

static void
nfs4_async_pgops_start(struct vfs *vfsp)
{
	nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
}

/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type. Requests are taken
 * from the queues in a round-robin fashion. A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.
 * This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs4_async_common_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system. Specifically over the
 * wire calls are cpr-unsafe. The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
static void
nfs4_async_common_start(struct vfs *vfsp, int async_queue)
{
	struct nfs4_async_reqs *args;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	extern int nfs_async_timeout;
	int async_types;
	kcondvar_t *async_work_cv;

	if (async_queue == NFS4_ASYNC_QUEUE) {
		async_types = NFS4_ASYNC_TYPES;
		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
	} else {
		async_types = NFS4_ASYNC_PGOPS_TYPES;
		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
	}

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry. We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < async_types; i++) {
			args = *mi->mi_async_curr[async_queue];
			if (args != NULL)
				break;
			mi->mi_async_curr[async_queue]++;
			if (mi->mi_async_curr[async_queue] ==
			    &mi->mi_async_reqs[async_types]) {
				mi->mi_async_curr[async_queue] =
				    &mi->mi_async_reqs[0];
			}
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				--mi->mi_threads[async_queue];

				if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
				    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				MI4_RELE(mi);
				zthread_exit();
				/* NOTREACHED */
			}
			time_left = cv_reltimedwait(async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout,
			    TR_CLOCK_TICK);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		} else {
			time_left = 1;
		}

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer. If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr[async_queue] = args->a_next;
		if (*mi->mi_async_curr[async_queue] == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr[async_queue]++;
			if (mi->mi_async_curr[async_queue] ==
			    &mi->mi_async_reqs[async_types]) {
				mi->mi_async_curr[async_queue] =
				    &mi->mi_async_reqs[0];
			}
		}

		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 */
		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs4_readahead)(args->a_vp,
			    args->a_nfs4_blkoff, args->a_nfs4_addr,
			    args->a_nfs4_seg, args->a_cred);
		} else if (args->a_io == NFS4_PUTAPAGE) {
			(void) (*args->a_nfs4_putapage)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_PAGEIO) {
			(void) (*args->a_nfs4_pageio)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_READDIR) {
			(void) ((*args->a_nfs4_readdir)(args->a_vp,
			    args->a_nfs4_rdc, args->a_cred));
		} else if (args->a_io == NFS4_COMMIT) {
			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
			    args->a_nfs4_offset, args->a_nfs4_count,
			    args->a_cred);
		} else if (args->a_io == NFS4_INACTIVE) {
			nfs4_inactive_otw(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args4(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}

/*
 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
 * part of VOP_INACTIVE.
 */

void
nfs4_inactive_thread(mntinfo4_t *mi)
{
	struct nfs4_async_reqs *args;
	callb_cpr_t cprinfo;
	vfs_t *vfsp = mi->mi_vfsp;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_inactive_thread");

	for (;;) {
		mutex_enter(&mi->mi_async_lock);
		args = mi->mi_async_reqs[NFS4_INACTIVE];
		if (args == NULL) {
			mutex_enter(&mi->mi_lock);
			/*
			 * We don't want to exit until the async manager is done
			 * with its work; hence the check for mi_manager_thread
			 * being NULL.
			 *
			 * The async manager thread will cv_broadcast() on
			 * mi_inact_req_cv when it's done, at which point we'll
			 * wake up and exit.
			 */
			if (mi->mi_manager_thread == NULL)
				goto die;
			mi->mi_flags |= MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			cv_signal(&mi->mi_async_cv);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
			mutex_exit(&mi->mi_async_lock);
		} else {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
			mutex_exit(&mi->mi_async_lock);
			nfs4_inactive_otw(args->a_vp, args->a_cred);
			crfree(args->a_cred);
			kmem_free(args, sizeof (*args));
		}
	}
die:
	mutex_exit(&mi->mi_lock);
	mi->mi_inactive_thread = NULL;
	cv_signal(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));

	MI4_RELE(mi);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * nfs4_async_stop:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete; nfs4_async_stop_sig() without interruptibility.
 */
void
nfs4_async_stop(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);

	/*
	 * Wait for the inactive thread to finish doing what it's doing. It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs4_async_stop_sig:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete. If a signal is delivered we will abort and return non-zero;
 * otherwise return 0. Since this routine is called from nfs4_unmount, we
 * need to make it interruptible.
 */
int
nfs4_async_stop_sig(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	ushort_t omax;
	bool_t intr = FALSE;

	/*
	 * Wait for all outstanding putpage operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
			intr = TRUE;
			goto interrupted;
		}
	}

	/*
	 * Wait for the inactive thread to finish doing what it's doing. It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			if (!cv_wait_sig(&mi->mi_async_cv,
			    &mi->mi_async_lock)) {
				intr = TRUE;
				goto interrupted;
			}
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
interrupted:
	if (intr)
		mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);

	return (intr);
}

int
nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PUTAPAGE;
	args->a_nfs4_putapage = putapage;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = off;
	args->a_nfs4_len = (uint_t)len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * or we have run out of memory or we're attempting to
		 * unmount, we refuse to do a sync write, because this may
		 * hang pageout/fsflush and the machine. In this case,
		 * we just re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set. We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().
		 * However, we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync putpage.
		 *
		 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
		 * as dirty and unlock them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*putapage)(vp, pp, off, len, flags, cr));
}

int
nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PAGEIO;
	args->a_nfs4_pageio = pageio;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = io_off;
	args->a_nfs4_len = (uint_t)io_len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS4_PAGEIO] = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
1922 */ 1923 if (flags & B_READ) { 1924 pvn_read_done(pp, flags | B_ERROR); 1925 return (0); 1926 } 1927 1928 if (curproc == proc_pageout || curproc == proc_fsflush) { 1929 /* 1930 * If we get here in the context of the pageout/fsflush, 1931 * we refuse to do a sync write, because this may hang 1932 * pageout/fsflush (and the machine). In this case, we just 1933 * re-mark the page as dirty and punt on the page. 1934 * 1935 * Make sure B_FORCE isn't set. We can re-mark the 1936 * pages as dirty and unlock the pages in one swoop by 1937 * passing in B_ERROR to pvn_write_done(). However, 1938 * we should make sure B_FORCE isn't set - we don't 1939 * want the page tossed before it gets written out. 1940 */ 1941 if (flags & B_FORCE) 1942 flags &= ~(B_INVAL | B_FORCE); 1943 pvn_write_done(pp, flags | B_ERROR); 1944 return (0); 1945 } 1946 1947 if (nfs_zone() != mi->mi_zone) { 1948 /* 1949 * So this was a cross-zone sync pageio. We pass in B_ERROR 1950 * to pvn_write_done() to re-mark the pages as dirty and unlock 1951 * them. 1952 * 1953 * We don't want to clear B_FORCE here as the caller presumably 1954 * knows what they're doing if they set it. 1955 */ 1956 pvn_write_done(pp, flags | B_ERROR); 1957 return (EPERM); 1958 } 1959 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1960 } 1961 1962 void 1963 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr, 1964 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *)) 1965 { 1966 rnode4_t *rp; 1967 mntinfo4_t *mi; 1968 struct nfs4_async_reqs *args; 1969 1970 rp = VTOR4(vp); 1971 ASSERT(rp->r_freef == NULL); 1972 1973 mi = VTOMI4(vp); 1974 1975 /* 1976 * If we can't allocate a request structure, skip the readdir. 1977 */ 1978 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1979 goto noasync; 1980 1981 args->a_next = NULL; 1982 #ifdef DEBUG 1983 args->a_queuer = curthread; 1984 #endif 1985 VN_HOLD(vp); 1986 args->a_vp = vp; 1987 ASSERT(cr != NULL); 1988 crhold(cr); 1989 args->a_cred = cr; 1990 args->a_io = NFS4_READDIR; 1991 args->a_nfs4_readdir = readdir; 1992 args->a_nfs4_rdc = rdc; 1993 1994 mutex_enter(&mi->mi_async_lock); 1995 1996 /* 1997 * If asyncio has been disabled, then skip this request 1998 */ 1999 if (mi->mi_max_threads == 0) { 2000 mutex_exit(&mi->mi_async_lock); 2001 2002 VN_RELE(vp); 2003 crfree(cr); 2004 kmem_free(args, sizeof (*args)); 2005 goto noasync; 2006 } 2007 2008 /* 2009 * Link request structure into the async list and 2010 * wakeup async thread to do the i/o. 2011 */ 2012 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) { 2013 mi->mi_async_reqs[NFS4_READDIR] = args; 2014 mi->mi_async_tail[NFS4_READDIR] = args; 2015 } else { 2016 mi->mi_async_tail[NFS4_READDIR]->a_next = args; 2017 mi->mi_async_tail[NFS4_READDIR] = args; 2018 } 2019 2020 mutex_enter(&rp->r_statelock); 2021 rp->r_count++; 2022 mutex_exit(&rp->r_statelock); 2023 2024 if (mi->mi_io_kstats) { 2025 mutex_enter(&mi->mi_lock); 2026 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2027 mutex_exit(&mi->mi_lock); 2028 } 2029 2030 mi->mi_async_req_count++; 2031 ASSERT(mi->mi_async_req_count != 0); 2032 cv_signal(&mi->mi_async_reqs_cv); 2033 mutex_exit(&mi->mi_async_lock); 2034 return; 2035 2036 noasync: 2037 mutex_enter(&rp->r_statelock); 2038 rdc->entries = NULL; 2039 /* 2040 * Indicate that no one is trying to fill this entry and 2041 * it still needs to be filled. 
2042 */ 2043 rdc->flags &= ~RDDIR; 2044 rdc->flags |= RDDIRREQ; 2045 rddir4_cache_rele(rp, rdc); 2046 mutex_exit(&rp->r_statelock); 2047 } 2048 2049 void 2050 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 2051 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, 2052 cred_t *)) 2053 { 2054 rnode4_t *rp; 2055 mntinfo4_t *mi; 2056 struct nfs4_async_reqs *args; 2057 page_t *pp; 2058 2059 rp = VTOR4(vp); 2060 mi = VTOMI4(vp); 2061 2062 /* 2063 * If we can't allocate a request structure, do the commit 2064 * operation synchronously in this thread's context. 2065 */ 2066 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 2067 goto noasync; 2068 2069 args->a_next = NULL; 2070 #ifdef DEBUG 2071 args->a_queuer = curthread; 2072 #endif 2073 VN_HOLD(vp); 2074 args->a_vp = vp; 2075 ASSERT(cr != NULL); 2076 crhold(cr); 2077 args->a_cred = cr; 2078 args->a_io = NFS4_COMMIT; 2079 args->a_nfs4_commit = commit; 2080 args->a_nfs4_plist = plist; 2081 args->a_nfs4_offset = offset; 2082 args->a_nfs4_count = count; 2083 2084 mutex_enter(&mi->mi_async_lock); 2085 2086 /* 2087 * If asyncio has been disabled, then make a synchronous request. 2088 * This check is done a second time in case async io was disabled 2089 * while this thread was blocked waiting for memory pressure to 2090 * reduce or for the queue to drain. 2091 */ 2092 if (mi->mi_max_threads == 0) { 2093 mutex_exit(&mi->mi_async_lock); 2094 2095 VN_RELE(vp); 2096 crfree(cr); 2097 kmem_free(args, sizeof (*args)); 2098 goto noasync; 2099 } 2100 2101 /* 2102 * Link request structure into the async list and 2103 * wakeup async thread to do the i/o. 2104 */ 2105 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) { 2106 mi->mi_async_reqs[NFS4_COMMIT] = args; 2107 mi->mi_async_tail[NFS4_COMMIT] = args; 2108 } else { 2109 mi->mi_async_tail[NFS4_COMMIT]->a_next = args; 2110 mi->mi_async_tail[NFS4_COMMIT] = args; 2111 } 2112 2113 mutex_enter(&rp->r_statelock); 2114 rp->r_count++; 2115 mutex_exit(&rp->r_statelock); 2116 2117 if (mi->mi_io_kstats) { 2118 mutex_enter(&mi->mi_lock); 2119 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2120 mutex_exit(&mi->mi_lock); 2121 } 2122 2123 mi->mi_async_req_count++; 2124 ASSERT(mi->mi_async_req_count != 0); 2125 cv_signal(&mi->mi_async_reqs_cv); 2126 mutex_exit(&mi->mi_async_lock); 2127 return; 2128 2129 noasync: 2130 if (curproc == proc_pageout || curproc == proc_fsflush || 2131 nfs_zone() != mi->mi_zone) { 2132 while (plist != NULL) { 2133 pp = plist; 2134 page_sub(&plist, pp); 2135 pp->p_fsdata = C_COMMIT; 2136 page_unlock(pp); 2137 } 2138 return; 2139 } 2140 (*commit)(vp, plist, offset, count, cr); 2141 } 2142 2143 /* 2144 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The 2145 * reference to the vnode is handed over to the thread; the caller should 2146 * no longer refer to the vnode. 2147 * 2148 * Unlike most of the async routines, this handoff is needed for 2149 * correctness reasons, not just performance. So doing operations in the 2150 * context of the current thread is not an option. 
2151 */ 2152 void 2153 nfs4_async_inactive(vnode_t *vp, cred_t *cr) 2154 { 2155 mntinfo4_t *mi; 2156 struct nfs4_async_reqs *args; 2157 boolean_t signal_inactive_thread = B_FALSE; 2158 2159 mi = VTOMI4(vp); 2160 2161 args = kmem_alloc(sizeof (*args), KM_SLEEP); 2162 args->a_next = NULL; 2163 #ifdef DEBUG 2164 args->a_queuer = curthread; 2165 #endif 2166 args->a_vp = vp; 2167 ASSERT(cr != NULL); 2168 crhold(cr); 2169 args->a_cred = cr; 2170 args->a_io = NFS4_INACTIVE; 2171 2172 /* 2173 * Note that we don't check mi->mi_max_threads here, since we 2174 * *need* to get rid of this vnode regardless of whether someone 2175 * set nfs4_max_threads to zero in /etc/system. 2176 * 2177 * The manager thread knows about this and is willing to create 2178 * at least one thread to accommodate us. 2179 */ 2180 mutex_enter(&mi->mi_async_lock); 2181 if (mi->mi_inactive_thread == NULL) { 2182 rnode4_t *rp; 2183 vnode_t *unldvp = NULL; 2184 char *unlname; 2185 cred_t *unlcred; 2186 2187 mutex_exit(&mi->mi_async_lock); 2188 /* 2189 * We just need to free up the memory associated with the 2190 * vnode, which can be safely done from within the current 2191 * context. 2192 */ 2193 crfree(cr); /* drop our reference */ 2194 kmem_free(args, sizeof (*args)); 2195 rp = VTOR4(vp); 2196 mutex_enter(&rp->r_statelock); 2197 if (rp->r_unldvp != NULL) { 2198 unldvp = rp->r_unldvp; 2199 rp->r_unldvp = NULL; 2200 unlname = rp->r_unlname; 2201 rp->r_unlname = NULL; 2202 unlcred = rp->r_unlcred; 2203 rp->r_unlcred = NULL; 2204 } 2205 mutex_exit(&rp->r_statelock); 2206 /* 2207 * No need to explicitly throw away any cached pages. The 2208 * eventual r4inactive() will attempt a synchronous 2209 * VOP_PUTPAGE() which will immediately fail since the request 2210 * is coming from the wrong zone, and then will proceed to call 2211 * nfs4_invalidate_pages() which will clean things up for us. 2212 * 2213 * Throw away the delegation here so rp4_addfree()'s attempt to 2214 * return any existing delegations becomes a no-op. 2215 */ 2216 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 2217 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 2218 FALSE); 2219 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 2220 nfs_rw_exit(&mi->mi_recovlock); 2221 } 2222 nfs4_clear_open_streams(rp); 2223 2224 rp4_addfree(rp, cr); 2225 if (unldvp != NULL) { 2226 kmem_free(unlname, MAXNAMELEN); 2227 VN_RELE(unldvp); 2228 crfree(unlcred); 2229 } 2230 return; 2231 } 2232 2233 if (mi->mi_manager_thread == NULL) { 2234 /* 2235 * We want to talk to the inactive thread. 2236 */ 2237 signal_inactive_thread = B_TRUE; 2238 } 2239 2240 /* 2241 * Enqueue the vnode and wake up either the special thread (empty 2242 * list) or an async thread. 
2243 */ 2244 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) { 2245 mi->mi_async_reqs[NFS4_INACTIVE] = args; 2246 mi->mi_async_tail[NFS4_INACTIVE] = args; 2247 signal_inactive_thread = B_TRUE; 2248 } else { 2249 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args; 2250 mi->mi_async_tail[NFS4_INACTIVE] = args; 2251 } 2252 if (signal_inactive_thread) { 2253 cv_signal(&mi->mi_inact_req_cv); 2254 } else { 2255 mi->mi_async_req_count++; 2256 ASSERT(mi->mi_async_req_count != 0); 2257 cv_signal(&mi->mi_async_reqs_cv); 2258 } 2259 2260 mutex_exit(&mi->mi_async_lock); 2261 } 2262 2263 int 2264 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2265 { 2266 int pagecreate; 2267 int n; 2268 int saved_n; 2269 caddr_t saved_base; 2270 u_offset_t offset; 2271 int error; 2272 int sm_error; 2273 vnode_t *vp = RTOV(rp); 2274 2275 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2276 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2277 if (!vpm_enable) { 2278 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2279 } 2280 2281 /* 2282 * Move bytes in at most PAGESIZE chunks. We must avoid 2283 * spanning pages in uiomove() because page faults may cause 2284 * the cache to be invalidated out from under us. The r_size is not 2285 * updated until after the uiomove. If we push the last page of a 2286 * file before r_size is correct, we will lose the data written past 2287 * the current (and invalid) r_size. 2288 */ 2289 do { 2290 offset = uio->uio_loffset; 2291 pagecreate = 0; 2292 2293 /* 2294 * n is the number of bytes required to satisfy the request 2295 * or the number of bytes to fill out the page. 2296 */ 2297 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2298 2299 /* 2300 * Check to see if we can skip reading in the page 2301 * and just allocate the memory. We can do this 2302 * if we are going to rewrite the entire mapping 2303 * or if we are going to write to or beyond the current 2304 * end of file from the beginning of the mapping. 2305 * 2306 * The read of r_size is now protected by r_statelock. 2307 */ 2308 mutex_enter(&rp->r_statelock); 2309 /* 2310 * When pgcreated is nonzero the caller has already done 2311 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2312 * segkpm this means we already have at least one page 2313 * created and mapped at base. 2314 */ 2315 pagecreate = pgcreated || 2316 ((offset & PAGEOFFSET) == 0 && 2317 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2318 2319 mutex_exit(&rp->r_statelock); 2320 2321 if (!vpm_enable && pagecreate) { 2322 /* 2323 * The last argument tells segmap_pagecreate() to 2324 * always lock the page, as opposed to sometimes 2325 * returning with the page locked. This way we avoid a 2326 * fault on the ensuing uiomove(), but also 2327 * more importantly (to fix bug 1094402) we can 2328 * call segmap_fault() to unlock the page in all 2329 * cases. An alternative would be to modify 2330 * segmap_pagecreate() to tell us when it is 2331 * locking a page, but that's a fairly major 2332 * interface change. 2333 */ 2334 if (pgcreated == 0) 2335 (void) segmap_pagecreate(segkmap, base, 2336 (uint_t)n, 1); 2337 saved_base = base; 2338 saved_n = n; 2339 } 2340 2341 /* 2342 * The number of bytes of data in the last page cannot 2343 * be accurately determined while the page is being 2344 * uiomove'd to and the size of the file is being updated. 2345 * Thus, inform threads which need to know accurately 2346 * how much data is in the last page of the file. 
They 2347 * will not do the i/o immediately, but will arrange for 2348 * the i/o to happen later when this modify operation 2349 * has finished. 2350 */ 2351 ASSERT(!(rp->r_flags & R4MODINPROGRESS)); 2352 mutex_enter(&rp->r_statelock); 2353 rp->r_flags |= R4MODINPROGRESS; 2354 rp->r_modaddr = (offset & MAXBMASK); 2355 mutex_exit(&rp->r_statelock); 2356 2357 if (vpm_enable) { 2358 /* 2359 * Copy data. If new pages are created, part of 2360 * the page that is not written will be initialized 2361 * with zeros. 2362 */ 2363 error = vpm_data_copy(vp, offset, n, uio, 2364 !pagecreate, NULL, 0, S_WRITE); 2365 } else { 2366 error = uiomove(base, n, UIO_WRITE, uio); 2367 } 2368 2369 /* 2370 * r_size is the maximum number of 2371 * bytes known to be in the file. 2372 * Make sure it is at least as high as the 2373 * first unwritten byte pointed to by uio_loffset. 2374 */ 2375 mutex_enter(&rp->r_statelock); 2376 if (rp->r_size < uio->uio_loffset) 2377 rp->r_size = uio->uio_loffset; 2378 rp->r_flags &= ~R4MODINPROGRESS; 2379 rp->r_flags |= R4DIRTY; 2380 mutex_exit(&rp->r_statelock); 2381 2382 /* n = # of bytes written */ 2383 n = (int)(uio->uio_loffset - offset); 2384 2385 if (!vpm_enable) { 2386 base += n; 2387 } 2388 2389 tcount -= n; 2390 /* 2391 * If we created pages w/o initializing them completely, 2392 * we need to zero the part that wasn't set up. 2393 * This happens in most EOF write cases and if 2394 * we had some sort of error during the uiomove. 2395 */ 2396 if (!vpm_enable && pagecreate) { 2397 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2398 (void) kzero(base, PAGESIZE - n); 2399 2400 if (pgcreated) { 2401 /* 2402 * Caller is responsible for this page, 2403 * it was not created in this loop. 2404 */ 2405 pgcreated = 0; 2406 } else { 2407 /* 2408 * For bug 1094402: segmap_pagecreate locks 2409 * page. Unlock it. This also unlocks the 2410 * pages allocated by page_create_va() in 2411 * segmap_pagecreate(). 2412 */ 2413 sm_error = segmap_fault(kas.a_hat, segkmap, 2414 saved_base, saved_n, 2415 F_SOFTUNLOCK, S_WRITE); 2416 if (error == 0) 2417 error = sm_error; 2418 } 2419 } 2420 } while (tcount > 0 && error == 0); 2421 2422 return (error); 2423 } 2424 2425 int 2426 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2427 { 2428 rnode4_t *rp; 2429 page_t *pp; 2430 u_offset_t eoff; 2431 u_offset_t io_off; 2432 size_t io_len; 2433 int error; 2434 int rdirty; 2435 int err; 2436 2437 rp = VTOR4(vp); 2438 ASSERT(rp->r_count > 0); 2439 2440 if (!nfs4_has_pages(vp)) 2441 return (0); 2442 2443 ASSERT(vp->v_type != VCHR); 2444 2445 /* 2446 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2447 * writes. B_FORCE is set to force the VM system to actually 2448 * invalidate the pages, even if the i/o failed. The pages 2449 * need to get invalidated because they can't be written out 2450 * because there isn't any space left on either the server's 2451 * file system or in the user's disk quota. The B_FREE bit 2452 * is cleared to avoid confusion as to whether this is a 2453 * request to place the page on the freelist or to destroy 2454 * it. 2455 */ 2456 if ((rp->r_flags & R4OUTOFSPACE) || 2457 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2458 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2459 2460 if (len == 0) { 2461 /* 2462 * If doing a full file synchronous operation, then clear 2463 * the R4DIRTY bit. If a page gets dirtied while the flush 2464 * is happening, then R4DIRTY will get set again. 
The 2465 * R4DIRTY bit must get cleared before the flush so that 2466 * we don't lose this information. 2467 * 2468 * If there are no full file async write operations 2469 * pending and RDIRTY bit is set, clear it. 2470 */ 2471 if (off == (u_offset_t)0 && 2472 !(flags & B_ASYNC) && 2473 (rp->r_flags & R4DIRTY)) { 2474 mutex_enter(&rp->r_statelock); 2475 rdirty = (rp->r_flags & R4DIRTY); 2476 rp->r_flags &= ~R4DIRTY; 2477 mutex_exit(&rp->r_statelock); 2478 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2479 mutex_enter(&rp->r_statelock); 2480 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2481 rdirty = (rp->r_flags & R4DIRTY); 2482 rp->r_flags &= ~R4DIRTY; 2483 } 2484 mutex_exit(&rp->r_statelock); 2485 } else 2486 rdirty = 0; 2487 2488 /* 2489 * Search the entire vp list for pages >= off, and flush 2490 * the dirty pages. 2491 */ 2492 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2493 flags, cr); 2494 2495 /* 2496 * If an error occurred and the file was marked as dirty 2497 * before and we aren't forcibly invalidating pages, then 2498 * reset the R4DIRTY flag. 2499 */ 2500 if (error && rdirty && 2501 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2502 mutex_enter(&rp->r_statelock); 2503 rp->r_flags |= R4DIRTY; 2504 mutex_exit(&rp->r_statelock); 2505 } 2506 } else { 2507 /* 2508 * Do a range from [off...off + len) looking for pages 2509 * to deal with. 2510 */ 2511 error = 0; 2512 io_len = 0; 2513 eoff = off + len; 2514 mutex_enter(&rp->r_statelock); 2515 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2516 io_off += io_len) { 2517 mutex_exit(&rp->r_statelock); 2518 /* 2519 * If we are not invalidating, synchronously 2520 * freeing or writing pages use the routine 2521 * page_lookup_nowait() to prevent reclaiming 2522 * them from the free list. 2523 */ 2524 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2525 pp = page_lookup(vp, io_off, 2526 (flags & (B_INVAL | B_FREE)) ? 2527 SE_EXCL : SE_SHARED); 2528 } else { 2529 pp = page_lookup_nowait(vp, io_off, 2530 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2531 } 2532 2533 if (pp == NULL || !pvn_getdirty(pp, flags)) 2534 io_len = PAGESIZE; 2535 else { 2536 err = (*rp->r_putapage)(vp, pp, &io_off, 2537 &io_len, flags, cr); 2538 if (!error) 2539 error = err; 2540 /* 2541 * "io_off" and "io_len" are returned as 2542 * the range of pages we actually wrote. 2543 * This allows us to skip ahead more quickly 2544 * since several pages may've been dealt 2545 * with by this iteration of the loop. 
2546 */ 2547 } 2548 mutex_enter(&rp->r_statelock); 2549 } 2550 mutex_exit(&rp->r_statelock); 2551 } 2552 2553 return (error); 2554 } 2555 2556 void 2557 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2558 { 2559 rnode4_t *rp; 2560 2561 rp = VTOR4(vp); 2562 if (IS_SHADOW(vp, rp)) 2563 vp = RTOV4(rp); 2564 mutex_enter(&rp->r_statelock); 2565 while (rp->r_flags & R4TRUNCATE) 2566 cv_wait(&rp->r_cv, &rp->r_statelock); 2567 rp->r_flags |= R4TRUNCATE; 2568 if (off == (u_offset_t)0) { 2569 rp->r_flags &= ~R4DIRTY; 2570 if (!(rp->r_flags & R4STALE)) 2571 rp->r_error = 0; 2572 } 2573 rp->r_truncaddr = off; 2574 mutex_exit(&rp->r_statelock); 2575 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2576 B_INVAL | B_TRUNC, cr); 2577 mutex_enter(&rp->r_statelock); 2578 rp->r_flags &= ~R4TRUNCATE; 2579 cv_broadcast(&rp->r_cv); 2580 mutex_exit(&rp->r_statelock); 2581 } 2582 2583 static int 2584 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2585 { 2586 mntinfo4_t *mi; 2587 struct mntinfo_kstat *mik; 2588 vfs_t *vfsp; 2589 2590 /* this is a read-only kstat. Bail out on a write */ 2591 if (rw == KSTAT_WRITE) 2592 return (EACCES); 2593 2594 2595 /* 2596 * We don't want to wait here as kstat_chain_lock could be held by 2597 * dounmount(). dounmount() takes vfs_reflock before the chain lock 2598 * and thus could lead to a deadlock. 2599 */ 2600 vfsp = (struct vfs *)ksp->ks_private; 2601 2602 mi = VFTOMI4(vfsp); 2603 mik = (struct mntinfo_kstat *)ksp->ks_data; 2604 2605 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2606 2607 mik->mik_vers = (uint32_t)mi->mi_vers; 2608 mik->mik_flags = mi->mi_flags; 2609 /* 2610 * The sv_secdata holds the flavor the client specifies. 2611 * If the client uses default and a security negotiation 2612 * occurs, sv_currsec will point to the current flavor 2613 * selected from the server flavor list. 2614 * sv_currsec is NULL if no security negotiation takes place. 2615 */ 2616 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 2617 mi->mi_curr_serv->sv_currsec->secmod : 2618 mi->mi_curr_serv->sv_secdata->secmod; 2619 mik->mik_curread = (uint32_t)mi->mi_curread; 2620 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2621 mik->mik_retrans = mi->mi_retrans; 2622 mik->mik_timeo = mi->mi_timeo; 2623 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2624 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2625 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2626 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2627 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2628 mik->mik_failover = (uint32_t)mi->mi_failover; 2629 mik->mik_remap = (uint32_t)mi->mi_remap; 2630 2631 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2632 2633 return (0); 2634 } 2635 2636 void 2637 nfs4_mnt_kstat_init(struct vfs *vfsp) 2638 { 2639 mntinfo4_t *mi = VFTOMI4(vfsp); 2640 2641 /* 2642 * PSARC 2001/697 Contract Private Interface 2643 * All nfs kstats are under SunMC contract 2644 * Please refer to the PSARC listed above and contact 2645 * SunMC before making any changes! 
2646 * 2647 * Changes must be reviewed by Solaris File Sharing 2648 * Changes must be communicated to contract-2001-697@sun.com 2649 * 2650 */ 2651 2652 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2653 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2654 if (mi->mi_io_kstats) { 2655 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2656 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2657 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2658 kstat_install(mi->mi_io_kstats); 2659 } 2660 2661 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2662 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2663 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2664 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2665 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2666 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2667 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2668 kstat_install(mi->mi_ro_kstats); 2669 } 2670 2671 nfs4_mnt_recov_kstat_init(vfsp); 2672 } 2673 2674 void 2675 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2676 { 2677 mntinfo4_t *mi; 2678 clock_t now = ddi_get_lbolt(); 2679 2680 mi = VTOMI4(vp); 2681 /* 2682 * In case of forced unmount, do not print any messages 2683 * since it can flood the console with error messages. 2684 */ 2685 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2686 return; 2687 2688 /* 2689 * If the mount point is dead, not recoverable, do not 2690 * print error messages that can flood the console. 2691 */ 2692 if (mi->mi_flags & MI4_RECOV_FAIL) 2693 return; 2694 2695 /* 2696 * No use in flooding the console with ENOSPC 2697 * messages from the same file system. 2698 */ 2699 if ((error != ENOSPC && error != EDQUOT) || 2700 now - mi->mi_printftime > 0) { 2701 zoneid_t zoneid = mi->mi_zone->zone_id; 2702 2703 #ifdef DEBUG 2704 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2705 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2706 #else 2707 nfs_perror(error, "NFS write error on host %s: %m.\n", 2708 VTOR4(vp)->r_server->sv_hostname, NULL); 2709 #endif 2710 if (error == ENOSPC || error == EDQUOT) { 2711 zcmn_err(zoneid, CE_CONT, 2712 "^File: userid=%d, groupid=%d\n", 2713 crgetuid(cr), crgetgid(cr)); 2714 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2715 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2716 zcmn_err(zoneid, CE_CONT, 2717 "^User: userid=%d, groupid=%d\n", 2718 crgetuid(curthread->t_cred), 2719 crgetgid(curthread->t_cred)); 2720 } 2721 mi->mi_printftime = now + 2722 nfs_write_error_interval * hz; 2723 } 2724 sfh4_printfhandle(VTOR4(vp)->r_fh); 2725 #ifdef DEBUG 2726 if (error == EACCES) { 2727 zcmn_err(zoneid, CE_CONT, 2728 "nfs_bio: cred is%s kcred\n", 2729 cr == kcred ? "" : " not"); 2730 } 2731 #endif 2732 } 2733 } 2734 2735 /* 2736 * Return non-zero if the given file can be safely memory mapped. Locks 2737 * are safe if whole-file (length and offset are both zero). 2738 */ 2739 2740 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2741 2742 static int 2743 nfs4_safemap(const vnode_t *vp) 2744 { 2745 locklist_t *llp, *next_llp; 2746 int safe = 1; 2747 rnode4_t *rp = VTOR4(vp); 2748 2749 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2750 2751 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2752 "vp = %p", (void *)vp)); 2753 2754 /* 2755 * Review all the locks for the vnode, both ones that have been 2756 * acquired and ones that are pending. 
We assume that 2757 * flk_active_locks_for_vp() has merged any locks that can be 2758 * merged (so that if a process has the entire file locked, it is 2759 * represented as a single lock). 2760 * 2761 * Note that we can't bail out of the loop if we find a non-safe 2762 * lock, because we have to free all the elements in the llp list. 2763 * We might be able to speed up this code slightly by not looking 2764 * at each lock's l_start and l_len fields once we've found a 2765 * non-safe lock. 2766 */ 2767 2768 llp = flk_active_locks_for_vp(vp); 2769 while (llp) { 2770 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2771 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2772 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2773 if (!SAFE_LOCK(llp->ll_flock)) { 2774 safe = 0; 2775 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2776 "nfs4_safemap: unsafe active lock (%" PRId64 2777 ", %" PRId64 ")", llp->ll_flock.l_start, 2778 llp->ll_flock.l_len)); 2779 } 2780 next_llp = llp->ll_next; 2781 VN_RELE(llp->ll_vp); 2782 kmem_free(llp, sizeof (*llp)); 2783 llp = next_llp; 2784 } 2785 2786 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2787 safe ? "safe" : "unsafe")); 2788 return (safe); 2789 } 2790 2791 /* 2792 * Return whether there is a lost LOCK or LOCKU queued up for the given 2793 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 2794 */ 2795 2796 bool_t 2797 nfs4_map_lost_lock_conflict(vnode_t *vp) 2798 { 2799 bool_t conflict = FALSE; 2800 nfs4_lost_rqst_t *lrp; 2801 mntinfo4_t *mi = VTOMI4(vp); 2802 2803 mutex_enter(&mi->mi_lock); 2804 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2805 lrp = list_next(&mi->mi_lost_state, lrp)) { 2806 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2807 continue; 2808 ASSERT(lrp->lr_vp != NULL); 2809 if (!VOP_CMP(lrp->lr_vp, vp, NULL)) 2810 continue; /* different file */ 2811 if (!SAFE_LOCK(*lrp->lr_flk)) { 2812 conflict = TRUE; 2813 break; 2814 } 2815 } 2816 2817 mutex_exit(&mi->mi_lock); 2818 return (conflict); 2819 } 2820 2821 /* 2822 * nfs_lockcompletion: 2823 * 2824 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2825 * as non cachable (set VNOCACHE bit). 2826 */ 2827 2828 void 2829 nfs4_lockcompletion(vnode_t *vp, int cmd) 2830 { 2831 rnode4_t *rp = VTOR4(vp); 2832 2833 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2834 ASSERT(!IS_SHADOW(vp, rp)); 2835 2836 if (cmd == F_SETLK || cmd == F_SETLKW) { 2837 2838 if (!nfs4_safemap(vp)) { 2839 mutex_enter(&vp->v_lock); 2840 vp->v_flag |= VNOCACHE; 2841 mutex_exit(&vp->v_lock); 2842 } else { 2843 mutex_enter(&vp->v_lock); 2844 vp->v_flag &= ~VNOCACHE; 2845 mutex_exit(&vp->v_lock); 2846 } 2847 } 2848 /* 2849 * The cached attributes of the file are stale after acquiring 2850 * the lock on the file. They were updated when the file was 2851 * opened, but not updated when the lock was acquired. Therefore the 2852 * cached attributes are invalidated after the lock is obtained. 2853 */ 2854 PURGE_ATTRCACHE4(vp); 2855 } 2856 2857 /* ARGSUSED */ 2858 static void * 2859 nfs4_mi_init(zoneid_t zoneid) 2860 { 2861 struct mi4_globals *mig; 2862 2863 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2864 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2865 list_create(&mig->mig_list, sizeof (mntinfo4_t), 2866 offsetof(mntinfo4_t, mi_zone_node)); 2867 mig->mig_destructor_called = B_FALSE; 2868 return (mig); 2869 } 2870 2871 /* 2872 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down 2873 * state and killing off threads. 
2874 */ 2875 /* ARGSUSED */ 2876 static void 2877 nfs4_mi_shutdown(zoneid_t zoneid, void *data) 2878 { 2879 struct mi4_globals *mig = data; 2880 mntinfo4_t *mi; 2881 nfs4_server_t *np; 2882 2883 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2884 "nfs4_mi_shutdown zone %d\n", zoneid)); 2885 ASSERT(mig != NULL); 2886 for (;;) { 2887 mutex_enter(&mig->mig_lock); 2888 mi = list_head(&mig->mig_list); 2889 if (mi == NULL) { 2890 mutex_exit(&mig->mig_lock); 2891 break; 2892 } 2893 2894 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2895 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp)); 2896 /* 2897 * purge the DNLC for this filesystem 2898 */ 2899 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2900 /* 2901 * Tell existing async worker threads to exit. 2902 */ 2903 mutex_enter(&mi->mi_async_lock); 2904 mi->mi_max_threads = 0; 2905 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2906 /* 2907 * Set the appropriate flags, signal and wait for both the 2908 * async manager and the inactive thread to exit when they're 2909 * done with their current work. 2910 */ 2911 mutex_enter(&mi->mi_lock); 2912 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD); 2913 mutex_exit(&mi->mi_lock); 2914 mutex_exit(&mi->mi_async_lock); 2915 if (mi->mi_manager_thread) { 2916 nfs4_async_manager_stop(mi->mi_vfsp); 2917 } 2918 if (mi->mi_inactive_thread) { 2919 mutex_enter(&mi->mi_async_lock); 2920 cv_signal(&mi->mi_inact_req_cv); 2921 /* 2922 * Wait for the inactive thread to exit. 2923 */ 2924 while (mi->mi_inactive_thread != NULL) { 2925 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2926 } 2927 mutex_exit(&mi->mi_async_lock); 2928 } 2929 /* 2930 * Wait for the recovery thread to complete, that is, it will 2931 * signal when it is done using the "mi" structure and about 2932 * to exit 2933 */ 2934 mutex_enter(&mi->mi_lock); 2935 while (mi->mi_in_recovery > 0) 2936 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock); 2937 mutex_exit(&mi->mi_lock); 2938 /* 2939 * We're done when every mi has been done or the list is empty. 2940 * This one is done, remove it from the list. 2941 */ 2942 list_remove(&mig->mig_list, mi); 2943 mutex_exit(&mig->mig_lock); 2944 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4); 2945 2946 /* 2947 * Release hold on vfs and mi done to prevent race with zone 2948 * shutdown. This releases the hold in nfs4_mi_zonelist_add. 2949 */ 2950 VFS_RELE(mi->mi_vfsp); 2951 MI4_RELE(mi); 2952 } 2953 /* 2954 * Tell each renew thread in the zone to exit 2955 */ 2956 mutex_enter(&nfs4_server_lst_lock); 2957 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) { 2958 mutex_enter(&np->s_lock); 2959 if (np->zoneid == zoneid) { 2960 /* 2961 * We add another hold onto the nfs4_server_t 2962 * because this will make sure that the nfs4_server_t 2963 * stays around until nfs4_callback_fini_zone destroys 2964 * the zone. This way, the renew thread can 2965 * unconditionally release its holds on the 2966 * nfs4_server_t. 
2967 */ 2968 np->s_refcnt++; 2969 nfs4_mark_srv_dead(np); 2970 } 2971 mutex_exit(&np->s_lock); 2972 } 2973 mutex_exit(&nfs4_server_lst_lock); 2974 } 2975 2976 static void 2977 nfs4_mi_free_globals(struct mi4_globals *mig) 2978 { 2979 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2980 mutex_destroy(&mig->mig_lock); 2981 kmem_free(mig, sizeof (*mig)); 2982 } 2983 2984 /* ARGSUSED */ 2985 static void 2986 nfs4_mi_destroy(zoneid_t zoneid, void *data) 2987 { 2988 struct mi4_globals *mig = data; 2989 2990 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2991 "nfs4_mi_destroy zone %d\n", zoneid)); 2992 ASSERT(mig != NULL); 2993 mutex_enter(&mig->mig_lock); 2994 if (list_head(&mig->mig_list) != NULL) { 2995 /* Still waiting for VFS_FREEVFS() */ 2996 mig->mig_destructor_called = B_TRUE; 2997 mutex_exit(&mig->mig_lock); 2998 return; 2999 } 3000 nfs4_mi_free_globals(mig); 3001 } 3002 3003 /* 3004 * Add an NFS mount to the per-zone list of NFS mounts. 3005 */ 3006 void 3007 nfs4_mi_zonelist_add(mntinfo4_t *mi) 3008 { 3009 struct mi4_globals *mig; 3010 3011 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 3012 mutex_enter(&mig->mig_lock); 3013 list_insert_head(&mig->mig_list, mi); 3014 /* 3015 * hold added to eliminate race with zone shutdown -this will be 3016 * released in mi_shutdown 3017 */ 3018 MI4_HOLD(mi); 3019 VFS_HOLD(mi->mi_vfsp); 3020 mutex_exit(&mig->mig_lock); 3021 } 3022 3023 /* 3024 * Remove an NFS mount from the per-zone list of NFS mounts. 3025 */ 3026 int 3027 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 3028 { 3029 struct mi4_globals *mig; 3030 int ret = 0; 3031 3032 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 3033 mutex_enter(&mig->mig_lock); 3034 mutex_enter(&mi->mi_lock); 3035 /* if this mi is marked dead, then the zone already released it */ 3036 if (!(mi->mi_flags & MI4_DEAD)) { 3037 list_remove(&mig->mig_list, mi); 3038 mutex_exit(&mi->mi_lock); 3039 3040 /* release the holds put on in zonelist_add(). */ 3041 VFS_RELE(mi->mi_vfsp); 3042 MI4_RELE(mi); 3043 ret = 1; 3044 } else { 3045 mutex_exit(&mi->mi_lock); 3046 } 3047 3048 /* 3049 * We can be called asynchronously by VFS_FREEVFS() after the zone 3050 * shutdown/destroy callbacks have executed; if so, clean up the zone's 3051 * mi globals. 3052 */ 3053 if (list_head(&mig->mig_list) == NULL && 3054 mig->mig_destructor_called == B_TRUE) { 3055 nfs4_mi_free_globals(mig); 3056 return (ret); 3057 } 3058 mutex_exit(&mig->mig_lock); 3059 return (ret); 3060 } 3061 3062 void 3063 nfs_free_mi4(mntinfo4_t *mi) 3064 { 3065 nfs4_open_owner_t *foop; 3066 nfs4_oo_hash_bucket_t *bucketp; 3067 nfs4_debug_msg_t *msgp; 3068 int i; 3069 servinfo4_t *svp; 3070 3071 /* 3072 * Code introduced here should be carefully evaluated to make 3073 * sure none of the freed resources are accessed either directly 3074 * or indirectly after freeing them. For eg: Introducing calls to 3075 * NFS4_DEBUG that use mntinfo4_t structure member after freeing 3076 * the structure members or other routines calling back into NFS 3077 * accessing freed mntinfo4_t structure member. 
3078 */ 3079 mutex_enter(&mi->mi_lock); 3080 ASSERT(mi->mi_recovthread == NULL); 3081 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 3082 mutex_exit(&mi->mi_lock); 3083 mutex_enter(&mi->mi_async_lock); 3084 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 && 3085 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0); 3086 ASSERT(mi->mi_manager_thread == NULL); 3087 mutex_exit(&mi->mi_async_lock); 3088 if (mi->mi_io_kstats) { 3089 kstat_delete(mi->mi_io_kstats); 3090 mi->mi_io_kstats = NULL; 3091 } 3092 if (mi->mi_ro_kstats) { 3093 kstat_delete(mi->mi_ro_kstats); 3094 mi->mi_ro_kstats = NULL; 3095 } 3096 if (mi->mi_recov_ksp) { 3097 kstat_delete(mi->mi_recov_ksp); 3098 mi->mi_recov_ksp = NULL; 3099 } 3100 mutex_enter(&mi->mi_msg_list_lock); 3101 while (msgp = list_head(&mi->mi_msg_list)) { 3102 list_remove(&mi->mi_msg_list, msgp); 3103 nfs4_free_msg(msgp); 3104 } 3105 mutex_exit(&mi->mi_msg_list_lock); 3106 list_destroy(&mi->mi_msg_list); 3107 if (mi->mi_fname != NULL) 3108 fn_rele(&mi->mi_fname); 3109 if (mi->mi_rootfh != NULL) 3110 sfh4_rele(&mi->mi_rootfh); 3111 if (mi->mi_srvparentfh != NULL) 3112 sfh4_rele(&mi->mi_srvparentfh); 3113 svp = mi->mi_servers; 3114 sv4_free(svp); 3115 mutex_destroy(&mi->mi_lock); 3116 mutex_destroy(&mi->mi_async_lock); 3117 mutex_destroy(&mi->mi_msg_list_lock); 3118 nfs_rw_destroy(&mi->mi_recovlock); 3119 nfs_rw_destroy(&mi->mi_rename_lock); 3120 nfs_rw_destroy(&mi->mi_fh_lock); 3121 cv_destroy(&mi->mi_failover_cv); 3122 cv_destroy(&mi->mi_async_reqs_cv); 3123 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]); 3124 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]); 3125 cv_destroy(&mi->mi_async_cv); 3126 cv_destroy(&mi->mi_inact_req_cv); 3127 /* 3128 * Destroy the oo hash lists and mutexes for the cred hash table. 3129 */ 3130 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) { 3131 bucketp = &(mi->mi_oo_list[i]); 3132 /* Destroy any remaining open owners on the list */ 3133 foop = list_head(&bucketp->b_oo_hash_list); 3134 while (foop != NULL) { 3135 list_remove(&bucketp->b_oo_hash_list, foop); 3136 nfs4_destroy_open_owner(foop); 3137 foop = list_head(&bucketp->b_oo_hash_list); 3138 } 3139 list_destroy(&bucketp->b_oo_hash_list); 3140 mutex_destroy(&bucketp->b_lock); 3141 } 3142 /* 3143 * Empty and destroy the freed open owner list. 3144 */ 3145 foop = list_head(&mi->mi_foo_list); 3146 while (foop != NULL) { 3147 list_remove(&mi->mi_foo_list, foop); 3148 nfs4_destroy_open_owner(foop); 3149 foop = list_head(&mi->mi_foo_list); 3150 } 3151 list_destroy(&mi->mi_foo_list); 3152 list_destroy(&mi->mi_bseqid_list); 3153 list_destroy(&mi->mi_lost_state); 3154 avl_destroy(&mi->mi_filehandles); 3155 kmem_free(mi, sizeof (*mi)); 3156 } 3157 void 3158 mi_hold(mntinfo4_t *mi) 3159 { 3160 atomic_inc_32(&mi->mi_count); 3161 ASSERT(mi->mi_count != 0); 3162 } 3163 3164 void 3165 mi_rele(mntinfo4_t *mi) 3166 { 3167 ASSERT(mi->mi_count != 0); 3168 if (atomic_dec_32_nv(&mi->mi_count) == 0) { 3169 nfs_free_mi4(mi); 3170 } 3171 } 3172 3173 vnode_t nfs4_xattr_notsupp_vnode; 3174 3175 void 3176 nfs4_clnt_init(void) 3177 { 3178 nfs4_vnops_init(); 3179 (void) nfs4_rnode_init(); 3180 (void) nfs4_shadow_init(); 3181 (void) nfs4_acache_init(); 3182 (void) nfs4_subr_init(); 3183 nfs4_acl_init(); 3184 nfs_idmap_init(); 3185 nfs4_callback_init(); 3186 nfs4_secinfo_init(); 3187 #ifdef DEBUG 3188 tsd_create(&nfs4_tsd_key, NULL); 3189 #endif 3190 3191 /* 3192 * Add a CPR callback so that we can update client 3193 * lease after a suspend and resume. 
3194 */ 3195 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3196 3197 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3198 nfs4_mi_destroy); 3199 3200 /* 3201 * Initialise the reference count of the notsupp xattr cache vnode to 1 3202 * so that it never goes away (VOP_INACTIVE isn't called on it). 3203 */ 3204 nfs4_xattr_notsupp_vnode.v_count = 1; 3205 } 3206 3207 void 3208 nfs4_clnt_fini(void) 3209 { 3210 (void) zone_key_delete(mi4_list_key); 3211 nfs4_vnops_fini(); 3212 (void) nfs4_rnode_fini(); 3213 (void) nfs4_shadow_fini(); 3214 (void) nfs4_acache_fini(); 3215 (void) nfs4_subr_fini(); 3216 nfs_idmap_fini(); 3217 nfs4_callback_fini(); 3218 nfs4_secinfo_fini(); 3219 #ifdef DEBUG 3220 tsd_destroy(&nfs4_tsd_key); 3221 #endif 3222 if (cid) 3223 (void) callb_delete(cid); 3224 } 3225 3226 /*ARGSUSED*/ 3227 static boolean_t 3228 nfs4_client_cpr_callb(void *arg, int code) 3229 { 3230 /* 3231 * We get called for Suspend and Resume events. 3232 * For the suspend case we simply don't care! 3233 */ 3234 if (code == CB_CODE_CPR_CHKPT) { 3235 return (B_TRUE); 3236 } 3237 3238 /* 3239 * When we get to here we are in the process of 3240 * resuming the system from a previous suspend. 3241 */ 3242 nfs4_client_resumed = gethrestime_sec(); 3243 return (B_TRUE); 3244 } 3245 3246 void 3247 nfs4_renew_lease_thread(nfs4_server_t *sp) 3248 { 3249 int error = 0; 3250 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs; 3251 clock_t tick_delay = 0; 3252 clock_t time_left = 0; 3253 callb_cpr_t cpr_info; 3254 kmutex_t cpr_lock; 3255 3256 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3257 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp)); 3258 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 3259 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease"); 3260 3261 mutex_enter(&sp->s_lock); 3262 /* sp->s_lease_time is set via a GETATTR */ 3263 sp->last_renewal_time = gethrestime_sec(); 3264 sp->lease_valid = NFS4_LEASE_UNINITIALIZED; 3265 ASSERT(sp->s_refcnt >= 1); 3266 3267 for (;;) { 3268 if (!sp->state_ref_count || 3269 sp->lease_valid != NFS4_LEASE_VALID) { 3270 3271 kip_secs = MAX((sp->s_lease_time >> 1) - 3272 (3 * sp->propagation_delay.tv_sec), 1); 3273 3274 tick_delay = SEC_TO_TICK(kip_secs); 3275 3276 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3277 "nfs4_renew_lease_thread: no renew : thread " 3278 "wait %ld secs", kip_secs)); 3279 3280 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3281 "nfs4_renew_lease_thread: no renew : " 3282 "state_ref_count %d, lease_valid %d", 3283 sp->state_ref_count, sp->lease_valid)); 3284 3285 mutex_enter(&cpr_lock); 3286 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3287 mutex_exit(&cpr_lock); 3288 time_left = cv_reltimedwait(&sp->cv_thread_exit, 3289 &sp->s_lock, tick_delay, TR_CLOCK_TICK); 3290 mutex_enter(&cpr_lock); 3291 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3292 mutex_exit(&cpr_lock); 3293 3294 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3295 "nfs4_renew_lease_thread: no renew: " 3296 "time left %ld", time_left)); 3297 3298 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3299 goto die; 3300 continue; 3301 } 3302 3303 tmp_last_renewal_time = sp->last_renewal_time; 3304 3305 tmp_time = gethrestime_sec() - sp->last_renewal_time + 3306 (3 * sp->propagation_delay.tv_sec); 3307 3308 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3309 "nfs4_renew_lease_thread: tmp_time %ld, " 3310 "sp->last_renewal_time %ld", tmp_time, 3311 sp->last_renewal_time)); 3312 3313 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1); 3314 3315 
tick_delay = SEC_TO_TICK(kip_secs); 3316 3317 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3318 "nfs4_renew_lease_thread: valid lease: sleep for %ld " 3319 "secs", kip_secs)); 3320 3321 mutex_enter(&cpr_lock); 3322 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3323 mutex_exit(&cpr_lock); 3324 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock, 3325 tick_delay, TR_CLOCK_TICK); 3326 mutex_enter(&cpr_lock); 3327 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3328 mutex_exit(&cpr_lock); 3329 3330 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3331 "nfs4_renew_lease_thread: valid lease: time left %ld :" 3332 "sp last_renewal_time %ld, nfs4_client_resumed %ld, " 3333 "tmp_last_renewal_time %ld", time_left, 3334 sp->last_renewal_time, nfs4_client_resumed, 3335 tmp_last_renewal_time)); 3336 3337 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3338 goto die; 3339 3340 if (tmp_last_renewal_time == sp->last_renewal_time || 3341 (nfs4_client_resumed != 0 && 3342 nfs4_client_resumed > sp->last_renewal_time)) { 3343 /* 3344 * Issue RENEW op since we haven't renewed the lease 3345 * since we slept. 3346 */ 3347 tmp_now_time = gethrestime_sec(); 3348 error = nfs4renew(sp); 3349 /* 3350 * Need to re-acquire sp's lock, nfs4renew() 3351 * relinquishes it. 3352 */ 3353 mutex_enter(&sp->s_lock); 3354 3355 /* 3356 * See if someone changed s_thread_exit while we gave 3357 * up s_lock. 3358 */ 3359 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3360 goto die; 3361 3362 if (!error) { 3363 /* 3364 * check to see if we implicitly renewed while 3365 * we waited for a reply for our RENEW call. 3366 */ 3367 if (tmp_last_renewal_time == 3368 sp->last_renewal_time) { 3369 /* no implicit renew came */ 3370 sp->last_renewal_time = tmp_now_time; 3371 } else { 3372 NFS4_DEBUG(nfs4_client_lease_debug, 3373 (CE_NOTE, "renew_thread: did " 3374 "implicit renewal before reply " 3375 "from server for RENEW")); 3376 } 3377 } else { 3378 /* figure out error */ 3379 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3380 "renew_thread: nfs4renew returned error" 3381 " %d", error)); 3382 } 3383 3384 } 3385 } 3386 3387 die: 3388 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3389 "nfs4_renew_lease_thread: thread exiting")); 3390 3391 while (sp->s_otw_call_count != 0) { 3392 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3393 "nfs4_renew_lease_thread: waiting for outstanding " 3394 "otw calls to finish for sp 0x%p, current " 3395 "s_otw_call_count %d", (void *)sp, 3396 sp->s_otw_call_count)); 3397 mutex_enter(&cpr_lock); 3398 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3399 mutex_exit(&cpr_lock); 3400 cv_wait(&sp->s_cv_otw_count, &sp->s_lock); 3401 mutex_enter(&cpr_lock); 3402 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3403 mutex_exit(&cpr_lock); 3404 } 3405 mutex_exit(&sp->s_lock); 3406 3407 nfs4_server_rele(sp); /* free the thread's reference */ 3408 nfs4_server_rele(sp); /* free the list's reference */ 3409 sp = NULL; 3410 3411 done: 3412 mutex_enter(&cpr_lock); 3413 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */ 3414 mutex_destroy(&cpr_lock); 3415 3416 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3417 "nfs4_renew_lease_thread: renew thread exit officially")); 3418 3419 zthread_exit(); 3420 /* NOT REACHED */ 3421 } 3422 3423 /* 3424 * Send out a RENEW op to the server. 3425 * Assumes sp is locked down. 
3426 */ 3427 static int 3428 nfs4renew(nfs4_server_t *sp) 3429 { 3430 COMPOUND4args_clnt args; 3431 COMPOUND4res_clnt res; 3432 nfs_argop4 argop[1]; 3433 int doqueue = 1; 3434 int rpc_error; 3435 cred_t *cr; 3436 mntinfo4_t *mi; 3437 timespec_t prop_time, after_time; 3438 int needrecov = FALSE; 3439 nfs4_recov_state_t recov_state; 3440 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3441 3442 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3443 3444 recov_state.rs_flags = 0; 3445 recov_state.rs_num_retry_despite_err = 0; 3446 3447 recov_retry: 3448 mi = sp->mntinfo4_list; 3449 VFS_HOLD(mi->mi_vfsp); 3450 mutex_exit(&sp->s_lock); 3451 ASSERT(mi != NULL); 3452 3453 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3454 if (e.error) { 3455 VFS_RELE(mi->mi_vfsp); 3456 return (e.error); 3457 } 3458 3459 /* Check to see if we're dealing with a marked-dead sp */ 3460 mutex_enter(&sp->s_lock); 3461 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3462 mutex_exit(&sp->s_lock); 3463 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3464 VFS_RELE(mi->mi_vfsp); 3465 return (0); 3466 } 3467 3468 /* Make sure mi hasn't changed on us */ 3469 if (mi != sp->mntinfo4_list) { 3470 /* Must drop sp's lock to avoid a recursive mutex enter */ 3471 mutex_exit(&sp->s_lock); 3472 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3473 VFS_RELE(mi->mi_vfsp); 3474 mutex_enter(&sp->s_lock); 3475 goto recov_retry; 3476 } 3477 mutex_exit(&sp->s_lock); 3478 3479 args.ctag = TAG_RENEW; 3480 3481 args.array_len = 1; 3482 args.array = argop; 3483 3484 argop[0].argop = OP_RENEW; 3485 3486 mutex_enter(&sp->s_lock); 3487 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3488 cr = sp->s_cred; 3489 crhold(cr); 3490 mutex_exit(&sp->s_lock); 3491 3492 ASSERT(cr != NULL); 3493 3494 /* used to figure out RTT for sp */ 3495 gethrestime(&prop_time); 3496 3497 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3498 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first", 3499 (void*)sp)); 3500 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3501 prop_time.tv_sec, prop_time.tv_nsec)); 3502 3503 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3504 mntinfo4_t *, mi); 3505 3506 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3507 crfree(cr); 3508 3509 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3510 mntinfo4_t *, mi); 3511 3512 gethrestime(&after_time); 3513 3514 mutex_enter(&sp->s_lock); 3515 sp->propagation_delay.tv_sec = 3516 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3517 mutex_exit(&sp->s_lock); 3518 3519 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3520 after_time.tv_sec, after_time.tv_nsec)); 3521 3522 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3523 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3524 nfs4_delegreturn_all(sp); 3525 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3526 VFS_RELE(mi->mi_vfsp); 3527 /* 3528 * If the server returns CB_PATH_DOWN, it has renewed 3529 * the lease and informed us that the callback path is 3530 * down. Since the lease is renewed, just return 0 and 3531 * let the renew thread proceed as normal. 
3532 */ 3533 return (0); 3534 } 3535 3536 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3537 if (!needrecov && e.error) { 3538 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3539 VFS_RELE(mi->mi_vfsp); 3540 return (e.error); 3541 } 3542 3543 rpc_error = e.error; 3544 3545 if (needrecov) { 3546 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3547 "nfs4renew: initiating recovery\n")); 3548 3549 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3550 OP_RENEW, NULL, NULL, NULL) == FALSE) { 3551 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3552 VFS_RELE(mi->mi_vfsp); 3553 if (!e.error) 3554 (void) xdr_free(xdr_COMPOUND4res_clnt, 3555 (caddr_t)&res); 3556 mutex_enter(&sp->s_lock); 3557 goto recov_retry; 3558 } 3559 /* fall through for res.status case */ 3560 } 3561 3562 if (res.status) { 3563 if (res.status == NFS4ERR_LEASE_MOVED) { 3564 /*EMPTY*/ 3565 /* 3566 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3567 * to renew the lease on that server 3568 */ 3569 } 3570 e.error = geterrno4(res.status); 3571 } 3572 3573 if (!rpc_error) 3574 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3575 3576 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3577 3578 VFS_RELE(mi->mi_vfsp); 3579 3580 return (e.error); 3581 } 3582 3583 void 3584 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3585 { 3586 nfs4_server_t *sp; 3587 3588 /* this locks down sp if it is found */ 3589 sp = find_nfs4_server(mi); 3590 3591 if (sp != NULL) { 3592 nfs4_inc_state_ref_count_nolock(sp, mi); 3593 mutex_exit(&sp->s_lock); 3594 nfs4_server_rele(sp); 3595 } 3596 } 3597 3598 /* 3599 * Bump the number of OPEN files (ie: those with state) so we know if this 3600 * nfs4_server has any state to maintain a lease for or not. 3601 * 3602 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 3603 */ 3604 void 3605 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3606 { 3607 ASSERT(mutex_owned(&sp->s_lock)); 3608 3609 sp->state_ref_count++; 3610 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3611 "nfs4_inc_state_ref_count: state_ref_count now %d", 3612 sp->state_ref_count)); 3613 3614 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3615 sp->lease_valid = NFS4_LEASE_VALID; 3616 3617 /* 3618 * If this call caused the lease to be marked valid and/or 3619 * took the state_ref_count from 0 to 1, then start the time 3620 * on lease renewal. 3621 */ 3622 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3623 sp->last_renewal_time = gethrestime_sec(); 3624 3625 /* update the number of open files for mi */ 3626 mi->mi_open_files++; 3627 } 3628 3629 void 3630 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3631 { 3632 nfs4_server_t *sp; 3633 3634 /* this locks down sp if it is found */ 3635 sp = find_nfs4_server_all(mi, 1); 3636 3637 if (sp != NULL) { 3638 nfs4_dec_state_ref_count_nolock(sp, mi); 3639 mutex_exit(&sp->s_lock); 3640 nfs4_server_rele(sp); 3641 } 3642 } 3643 3644 /* 3645 * Decrement the number of OPEN files (ie: those with state) so we know if 3646 * this nfs4_server has any state to maintain a lease for or not. 
3647 */ 3648 void 3649 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3650 { 3651 ASSERT(mutex_owned(&sp->s_lock)); 3652 ASSERT(sp->state_ref_count != 0); 3653 sp->state_ref_count--; 3654 3655 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3656 "nfs4_dec_state_ref_count: state ref count now %d", 3657 sp->state_ref_count)); 3658 3659 mi->mi_open_files--; 3660 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3661 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3662 mi->mi_open_files, mi->mi_flags)); 3663 3664 /* We don't have to hold the mi_lock to test mi_flags */ 3665 if (mi->mi_open_files == 0 && 3666 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3667 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3668 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3669 "we have closed the last open file", (void*)mi)); 3670 nfs4_remove_mi_from_server(mi, sp); 3671 } 3672 } 3673 3674 bool_t 3675 inlease(nfs4_server_t *sp) 3676 { 3677 bool_t result; 3678 3679 ASSERT(mutex_owned(&sp->s_lock)); 3680 3681 if (sp->lease_valid == NFS4_LEASE_VALID && 3682 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3683 result = TRUE; 3684 else 3685 result = FALSE; 3686 3687 return (result); 3688 } 3689 3690 3691 /* 3692 * Return non-zero if the given nfs4_server_t is going through recovery. 3693 */ 3694 3695 int 3696 nfs4_server_in_recovery(nfs4_server_t *sp) 3697 { 3698 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3699 } 3700 3701 /* 3702 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3703 * first is less than, equal to, or greater than the second. 3704 */ 3705 3706 int 3707 sfh4cmp(const void *p1, const void *p2) 3708 { 3709 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3710 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3711 3712 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3713 } 3714 3715 /* 3716 * Create a table for shared filehandle objects. 3717 */ 3718 3719 void 3720 sfh4_createtab(avl_tree_t *tab) 3721 { 3722 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3723 offsetof(nfs4_sharedfh_t, sfh_tree)); 3724 } 3725 3726 /* 3727 * Return a shared filehandle object for the given filehandle. The caller 3728 * is responsible for eventually calling sfh4_rele(). 3729 */ 3730 3731 nfs4_sharedfh_t * 3732 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3733 { 3734 nfs4_sharedfh_t *sfh, *nsfh; 3735 avl_index_t where; 3736 nfs4_sharedfh_t skey; 3737 3738 if (!key) { 3739 skey.sfh_fh = *fh; 3740 key = &skey; 3741 } 3742 3743 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3744 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3745 /* 3746 * We allocate the largest possible filehandle size because it's 3747 * not that big, and it saves us from possibly having to resize the 3748 * buffer later. 
3749 */ 3750 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3751 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3752 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3753 nsfh->sfh_refcnt = 1; 3754 nsfh->sfh_flags = SFH4_IN_TREE; 3755 nsfh->sfh_mi = mi; 3756 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3757 (void *)nsfh)); 3758 3759 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3760 sfh = avl_find(&mi->mi_filehandles, key, &where); 3761 if (sfh != NULL) { 3762 mutex_enter(&sfh->sfh_lock); 3763 sfh->sfh_refcnt++; 3764 mutex_exit(&sfh->sfh_lock); 3765 nfs_rw_exit(&mi->mi_fh_lock); 3766 /* free our speculative allocs */ 3767 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3768 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3769 return (sfh); 3770 } 3771 3772 avl_insert(&mi->mi_filehandles, nsfh, where); 3773 nfs_rw_exit(&mi->mi_fh_lock); 3774 3775 return (nsfh); 3776 } 3777 3778 /* 3779 * Return a shared filehandle object for the given filehandle. The caller 3780 * is responsible for eventually calling sfh4_rele(). 3781 */ 3782 3783 nfs4_sharedfh_t * 3784 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3785 { 3786 nfs4_sharedfh_t *sfh; 3787 nfs4_sharedfh_t key; 3788 3789 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3790 3791 #ifdef DEBUG 3792 if (nfs4_sharedfh_debug) { 3793 nfs4_fhandle_t fhandle; 3794 3795 fhandle.fh_len = fh->nfs_fh4_len; 3796 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3797 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3798 nfs4_printfhandle(&fhandle); 3799 } 3800 #endif 3801 3802 /* 3803 * If there's already an object for the given filehandle, bump the 3804 * reference count and return it. Otherwise, create a new object 3805 * and add it to the AVL tree. 3806 */ 3807 3808 key.sfh_fh = *fh; 3809 3810 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3811 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3812 if (sfh != NULL) { 3813 mutex_enter(&sfh->sfh_lock); 3814 sfh->sfh_refcnt++; 3815 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3816 "sfh4_get: found existing %p, new refcnt=%d", 3817 (void *)sfh, sfh->sfh_refcnt)); 3818 mutex_exit(&sfh->sfh_lock); 3819 nfs_rw_exit(&mi->mi_fh_lock); 3820 return (sfh); 3821 } 3822 nfs_rw_exit(&mi->mi_fh_lock); 3823 3824 return (sfh4_put(fh, mi, &key)); 3825 } 3826 3827 /* 3828 * Get a reference to the given shared filehandle object. 3829 */ 3830 3831 void 3832 sfh4_hold(nfs4_sharedfh_t *sfh) 3833 { 3834 ASSERT(sfh->sfh_refcnt > 0); 3835 3836 mutex_enter(&sfh->sfh_lock); 3837 sfh->sfh_refcnt++; 3838 NFS4_DEBUG(nfs4_sharedfh_debug, 3839 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3840 (void *)sfh, sfh->sfh_refcnt)); 3841 mutex_exit(&sfh->sfh_lock); 3842 } 3843 3844 /* 3845 * Release a reference to the given shared filehandle object and null out 3846 * the given pointer. 3847 */ 3848 3849 void 3850 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3851 { 3852 mntinfo4_t *mi; 3853 nfs4_sharedfh_t *sfh = *sfhpp; 3854 3855 ASSERT(sfh->sfh_refcnt > 0); 3856 3857 mutex_enter(&sfh->sfh_lock); 3858 if (sfh->sfh_refcnt > 1) { 3859 sfh->sfh_refcnt--; 3860 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3861 "sfh4_rele %p, new refcnt=%d", 3862 (void *)sfh, sfh->sfh_refcnt)); 3863 mutex_exit(&sfh->sfh_lock); 3864 goto finish; 3865 } 3866 mutex_exit(&sfh->sfh_lock); 3867 3868 /* 3869 * Possibly the last reference, so get the lock for the table in 3870 * case it's time to remove the object from the table. 
/*
 * Release a reference to the given shared filehandle object and null out
 * the given pointer.
 */

void
sfh4_rele(nfs4_sharedfh_t **sfhpp)
{
	mntinfo4_t *mi;
	nfs4_sharedfh_t *sfh = *sfhpp;

	ASSERT(sfh->sfh_refcnt > 0);

	mutex_enter(&sfh->sfh_lock);
	if (sfh->sfh_refcnt > 1) {
		sfh->sfh_refcnt--;
		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
		    "sfh4_rele %p, new refcnt=%d",
		    (void *)sfh, sfh->sfh_refcnt));
		mutex_exit(&sfh->sfh_lock);
		goto finish;
	}
	mutex_exit(&sfh->sfh_lock);

	/*
	 * Possibly the last reference, so get the lock for the table in
	 * case it's time to remove the object from the table.
	 */
	mi = sfh->sfh_mi;
	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
	mutex_enter(&sfh->sfh_lock);
	sfh->sfh_refcnt--;
	if (sfh->sfh_refcnt > 0) {
		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
		    "sfh4_rele %p, new refcnt=%d",
		    (void *)sfh, sfh->sfh_refcnt));
		mutex_exit(&sfh->sfh_lock);
		nfs_rw_exit(&mi->mi_fh_lock);
		goto finish;
	}

	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
	    "sfh4_rele %p, last ref", (void *)sfh));
	if (sfh->sfh_flags & SFH4_IN_TREE) {
		avl_remove(&mi->mi_filehandles, sfh);
		sfh->sfh_flags &= ~SFH4_IN_TREE;
	}
	mutex_exit(&sfh->sfh_lock);
	nfs_rw_exit(&mi->mi_fh_lock);
	mutex_destroy(&sfh->sfh_lock);
	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
	kmem_free(sfh, sizeof (nfs4_sharedfh_t));

finish:
	*sfhpp = NULL;
}

/*
 * Update the filehandle for the given shared filehandle object.
 */

int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */

void
sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
{
	mntinfo4_t *mi = sfh->sfh_mi;
	nfs4_sharedfh_t *dupsfh;
	avl_index_t where;
	nfs4_sharedfh_t key;

#ifdef DEBUG
	mutex_enter(&sfh->sfh_lock);
	ASSERT(sfh->sfh_refcnt > 0);
	mutex_exit(&sfh->sfh_lock);
#endif
	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);

	/*
	 * The basic plan is to remove the shared filehandle object from
	 * the table, update it to have the new filehandle, then reinsert
	 * it.
	 */

	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
	mutex_enter(&sfh->sfh_lock);
	if (sfh->sfh_flags & SFH4_IN_TREE) {
		avl_remove(&mi->mi_filehandles, sfh);
		sfh->sfh_flags &= ~SFH4_IN_TREE;
	}
	mutex_exit(&sfh->sfh_lock);
	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
	    sfh->sfh_fh.nfs_fh4_len);

	/*
	 * XXX If there is already a shared filehandle object with the new
	 * filehandle, we're in trouble, because the rnode code assumes
	 * that there is only one shared filehandle object for a given
	 * filehandle.  So issue a warning (for read-write mounts only)
	 * and don't try to re-insert the given object into the table.
	 * Hopefully the given object will quickly go away and everyone
	 * will use the new object.
	 */
	key.sfh_fh = *newfh;
	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
	if (dupsfh != NULL) {
		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
			    "duplicate filehandle detected");
			sfh4_printfhandle(dupsfh);
		}
	} else {
		avl_insert(&mi->mi_filehandles, sfh, where);
		mutex_enter(&sfh->sfh_lock);
		sfh->sfh_flags |= SFH4_IN_TREE;
		mutex_exit(&sfh->sfh_lock);
	}
	nfs_rw_exit(&mi->mi_fh_lock);
}
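
/*
 * Illustrative sketch (hypothetical caller): sfh4_update() above rekeys
 * an existing shared object when the server hands back a different
 * filehandle for it.  A caller that has decoded a new filehandle new_fh
 * for an existing sfh might do:
 *
 *	if (nfs4cmpfh(&sfh->sfh_fh, &new_fh) != 0)
 *		sfh4_update(sfh, &new_fh);
 *
 * Existing holders are unaffected, since they reference the shared
 * object rather than caching the filehandle bytes themselves.
 */
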
/*
 * Copy out the current filehandle for the given shared filehandle object.
 */

void
sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
{
	mntinfo4_t *mi = sfh->sfh_mi;

	ASSERT(sfh->sfh_refcnt > 0);

	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
	nfs_rw_exit(&mi->mi_fh_lock);
}

/*
 * Print out the filehandle for the given shared filehandle object.
 */

void
sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
{
	nfs4_fhandle_t fhandle;

	sfh4_copyval(sfh, &fhandle);
	nfs4_printfhandle(&fhandle);
}

/*
 * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
 * if they're the same, +1 if the first is "greater" than the second.  The
 * caller (or whoever's calling the AVL package) is responsible for
 * handling locking issues.
 */

static int
fncmp(const void *p1, const void *p2)
{
	const nfs4_fname_t *f1 = p1;
	const nfs4_fname_t *f2 = p2;
	int res;

	res = strcmp(f1->fn_name, f2->fn_name);
	/*
	 * The AVL package wants +/-1, not arbitrary positive or negative
	 * integers.
	 */
	if (res > 0)
		res = 1;
	else if (res < 0)
		res = -1;
	return (res);
}
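
/*
 * Illustrative note: the AVL package expects comparators to return
 * exactly -1, 0, or +1, which is why fncmp() clamps the strcmp() result
 * rather than returning it directly:
 *
 *	res = strcmp(f1->fn_name, f2->fn_name);		any +/- value
 *	res = (res > 0) ? 1 : ((res < 0) ? -1 : 0);	what AVL expects
 *
 * sfh4cmp() above relies on nfs4cmpfh() already following this
 * convention.
 */
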
/*
 * Get or create an fname with the given name, as a child of the given
 * fname.  The caller is responsible for eventually releasing the reference
 * (fn_rele()).  parent may be NULL.
 */

nfs4_fname_t *
fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
{
	nfs4_fname_t key;
	nfs4_fname_t *fnp;
	avl_index_t where;

	key.fn_name = name;

	/*
	 * If there's already an fname registered with the given name, bump
	 * its reference count and return it.  Otherwise, create a new one
	 * and add it to the parent's AVL tree.
	 *
	 * The fname entry we are looking for must match both the name
	 * and the sfh stored in the fname.
	 */
again:
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		fnp = avl_find(&parent->fn_children, &key, &where);
		if (fnp != NULL) {
			/*
			 * This hold on fnp is released below, in case
			 * this is not the fnp we want.
			 */
			fn_hold(fnp);

			if (fnp->fn_sfh == sfh) {
				/*
				 * We have found our entry; return it
				 * with the hold taken above.
				 */
				mutex_exit(&parent->fn_lock);
				return (fnp);
			}

			/*
			 * We have found an entry that has a mismatching
			 * fn_sfh.  This could be a stale entry due to a
			 * server-side rename.  Remove this entry and
			 * retry to make sure no such entries remain.
			 */
			mutex_exit(&parent->fn_lock);
			mutex_enter(&fnp->fn_lock);
			if (fnp->fn_parent == parent) {
				/*
				 * Remove ourselves from parent's
				 * fn_children tree.
				 */
				mutex_enter(&parent->fn_lock);
				avl_remove(&parent->fn_children, fnp);
				mutex_exit(&parent->fn_lock);
				fn_rele(&fnp->fn_parent);
			}
			mutex_exit(&fnp->fn_lock);
			fn_rele(&fnp);
			goto again;
		}
	}

	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
	fnp->fn_parent = parent;
	if (parent != NULL)
		fn_hold(parent);
	fnp->fn_len = strlen(name);
	ASSERT(fnp->fn_len < MAXNAMELEN);
	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
	(void) strcpy(fnp->fn_name, name);
	fnp->fn_refcnt = 1;

	/*
	 * This hold on sfh is released later, when we do the final
	 * fn_rele() on this fname.
	 */
	sfh4_hold(sfh);
	fnp->fn_sfh = sfh;

	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
	    offsetof(nfs4_fname_t, fn_tree));
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_get %p:%s, a new nfs4_fname_t!",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_insert(&parent->fn_children, fnp, where);
		mutex_exit(&parent->fn_lock);
	}

	return (fnp);
}

void
fn_hold(nfs4_fname_t *fnp)
{
	atomic_inc_32(&fnp->fn_refcnt);
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_hold %p:%s, new refcnt=%d",
	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
}

/*
 * Decrement the reference count of the given fname, and destroy it if its
 * reference count goes to zero.  Nulls out the given pointer.
 */

void
fn_rele(nfs4_fname_t **fnpp)
{
	nfs4_fname_t *parent;
	uint32_t newref;
	nfs4_fname_t *fnp;

recur:
	fnp = *fnpp;
	*fnpp = NULL;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		mutex_enter(&parent->fn_lock);	/* prevent new references */
	newref = atomic_dec_32_nv(&fnp->fn_refcnt);
	if (newref > 0) {
		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
		    "fn_rele %p:%s, new refcnt=%d",
		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
		if (parent != NULL)
			mutex_exit(&parent->fn_lock);
		mutex_exit(&fnp->fn_lock);
		return;
	}

	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_rele %p:%s, last reference, deleting...",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
	}
	kmem_free(fnp->fn_name, fnp->fn_len + 1);
	sfh4_rele(&fnp->fn_sfh);
	mutex_destroy(&fnp->fn_lock);
	avl_destroy(&fnp->fn_children);
	kmem_free(fnp, sizeof (nfs4_fname_t));
	/*
	 * Recursively fn_rele the parent.
	 * Use goto instead of a recursive call to avoid stack overflow.
	 */
	if (parent != NULL) {
		fnpp = &parent;
		goto recur;
	}
}
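
/*
 * Illustrative sketch (hypothetical caller): fn_get() and fn_rele()
 * bracket the life of a name reference much like the sfh4_* routines
 * above:
 *
 *	nfs4_fname_t *nm;
 *
 *	nm = fn_get(parent_fname, "foo", sfh);	reference on "foo"
 *	... use or store nm ...
 *	fn_rele(&nm);				nm is now NULL
 *
 * Here parent_fname and sfh stand for the directory's fname and the
 * child's shared filehandle.  Dropping the last reference also drops
 * the holds the fname had on its parent and on its shared filehandle.
 */
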
/*
 * Returns the single component name of the given fname, in a MAXNAMELEN
 * string buffer, which the caller is responsible for freeing.  Note that
 * the name may become invalid as a result of fn_move().
 */

char *
fn_name(nfs4_fname_t *fnp)
{
	char *name;

	ASSERT(fnp->fn_len < MAXNAMELEN);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	mutex_enter(&fnp->fn_lock);
	(void) strcpy(name, fnp->fn_name);
	mutex_exit(&fnp->fn_lock);

	return (name);
}


/*
 * fn_path_realloc
 *
 * This function, used only by fn_path, constructs a new string which
 * looks like "prepend" + "/" + "current", by allocating a new string
 * and freeing the old one.
 */
static void
fn_path_realloc(char **curses, char *prepend)
{
	int len, curlen = 0;
	char *news;

	if (*curses == NULL) {
		/*
		 * Prime the pump, allocate just the
		 * space for prepend and return that.
		 */
		len = strlen(prepend) + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
	} else {
		/*
		 * Allocate the space for the new string; the
		 * "+ 1 + 1" is for the "/" separator and the
		 * terminating NUL byte.
		 */
		curlen = strlen(*curses);
		len = curlen + strlen(prepend) + 1 + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
		(void) strcat(news, "/");
		(void) strcat(news, *curses);
		kmem_free(*curses, curlen + 1);
	}
	*curses = news;
}

/*
 * Returns the path name (starting from the fs root) for the given fname.
 * The caller is responsible for freeing.  Note that the path may be or
 * become invalid as a result of fn_move().
 */

char *
fn_path(nfs4_fname_t *fnp)
{
	char *path;
	nfs4_fname_t *nextfnp;

	if (fnp == NULL)
		return (NULL);

	path = NULL;

	/* walk up the tree constructing the pathname. */

	fn_hold(fnp);			/* adjust for later rele */
	do {
		mutex_enter(&fnp->fn_lock);
		/*
		 * Add fn_name in front of the current path
		 */
		fn_path_realloc(&path, fnp->fn_name);
		nextfnp = fnp->fn_parent;
		if (nextfnp != NULL)
			fn_hold(nextfnp);
		mutex_exit(&fnp->fn_lock);
		fn_rele(&fnp);
		fnp = nextfnp;
	} while (fnp != NULL);

	return (path);
}

/*
 * Return a reference to the parent of the given fname, which the caller is
 * responsible for eventually releasing.
 */

nfs4_fname_t *
fn_parent(nfs4_fname_t *fnp)
{
	nfs4_fname_t *parent;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		fn_hold(parent);
	mutex_exit(&fnp->fn_lock);

	return (parent);
}
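
/*
 * Illustrative sketch (hypothetical caller): fn_path() above returns an
 * exactly-sized string, so its result is freed by length, while
 * fn_name() hands back a MAXNAMELEN buffer:
 *
 *	char *path = fn_path(fnp);
 *	if (path != NULL) {
 *		... log or display path ...
 *		kmem_free(path, strlen(path) + 1);
 *	}
 *
 *	char *name = fn_name(fnp);
 *	... use name ...
 *	kmem_free(name, MAXNAMELEN);
 */
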
/*
 * Update fnp so that its parent is newparent and its name is newname.
 */

void
fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
{
	nfs4_fname_t *parent, *tmpfnp;
	ssize_t newlen;
	nfs4_fname_t key;
	avl_index_t where;

	/*
	 * This assert exists to catch the client trying to rename
	 * a dir to be a child of itself.  This happened at a recent
	 * bakeoff against a 3rd party (broken) server which allowed
	 * the rename to succeed.  If it trips, it means that either:
	 * a) the code in nfs4rename that detects this case is broken, or
	 * b) the server is broken (since it allowed the bogus rename).
	 *
	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
	 * panic below from: mutex_enter(&newparent->fn_lock);
	 */
	ASSERT(fnp != newparent);

	/*
	 * Remove fnp from its current parent, change its name, then add it
	 * to newparent.  It might happen that fnp was replaced by another
	 * nfs4_fname_t with the same fn_name in parent->fn_children.
	 * In such a case, fnp->fn_parent is NULL and we skip the removal
	 * of fnp from its current parent.
	 */
	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
		fn_rele(&fnp->fn_parent);
	}

	newlen = strlen(newname);
	if (newlen != fnp->fn_len) {
		ASSERT(newlen < MAXNAMELEN);
		kmem_free(fnp->fn_name, fnp->fn_len + 1);
		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
		fnp->fn_len = newlen;
	}
	(void) strcpy(fnp->fn_name, newname);

again:
	mutex_enter(&newparent->fn_lock);
	key.fn_name = fnp->fn_name;
	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
	if (tmpfnp != NULL) {
		/*
		 * This could be due to a file that was unlinked while
		 * open, or perhaps the rnode is in the free list.  Remove
		 * it from newparent and let it go away on its own.  The
		 * contorted code is to deal with lock order issues and
		 * race conditions.
		 */
		fn_hold(tmpfnp);
		mutex_exit(&newparent->fn_lock);
		mutex_enter(&tmpfnp->fn_lock);
		if (tmpfnp->fn_parent == newparent) {
			mutex_enter(&newparent->fn_lock);
			avl_remove(&newparent->fn_children, tmpfnp);
			mutex_exit(&newparent->fn_lock);
			fn_rele(&tmpfnp->fn_parent);
		}
		mutex_exit(&tmpfnp->fn_lock);
		fn_rele(&tmpfnp);
		goto again;
	}
	fnp->fn_parent = newparent;
	fn_hold(newparent);
	avl_insert(&newparent->fn_children, fnp, where);
	mutex_exit(&newparent->fn_lock);
	mutex_exit(&fnp->fn_lock);
}

#ifdef DEBUG
/*
 * Return non-zero if the type information makes sense for the given vnode.
 * Otherwise panic.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	if (nfs4_vtype_debug && vp->v_type != VNON &&
	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
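
/*
 * Illustrative note: nfs4_consistent_type() always returns 1 (when
 * nfs4_vtype_debug is set and the types disagree, it panics instead of
 * returning), and it is only compiled into DEBUG kernels, so a
 * hypothetical caller wraps it in an ASSERT that compiles away on
 * non-DEBUG kernels:
 *
 *	ASSERT(nfs4_consistent_type(vp));
 */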