/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All Rights Reserved
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/dnlc.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/acl.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <sys/disp.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>

#include <sys/ddi.h>

/*
 * Arguments to page-flush thread.
 */
typedef struct {
	vnode_t *vp;
	cred_t *cr;
} pgflush_t;

#ifdef DEBUG
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

uint_t nfs4_tsd_key;
#endif

static time_t	nfs4_client_resumed = 0;
static callb_id_t cid = 0;

static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);
static void	flush_pages(vnode_t *, cred_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);

struct mi4_globals {
	kmutex_t	mig_lock;	/* lock protecting mig_list */
	list_t		mig_list;	/* list of NFS v4 mounts in zone */
	boolean_t	mig_destructor_called;
};

static zone_key_t mi4_list_key;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_time_attr_inval)
 * which tells whether the attributes are valid.
 * The time is initialized to the difference between current time and
 * the modify time of the vnode when new attributes are cached.  This
 * allows the attributes for files that have changed recently to be
 * timed out sooner than for files that have not changed for a long
 * time.  There are minimum and maximum timeout values that can be set
 * per mount point.
 */

/*
 * If a cache purge is in progress, wait for it to finish.
 *
 * The current thread must not be in the middle of an
 * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
 * between this thread, a recovery thread, and the page flush thread.
 */
int
nfs4_waitfor_purge_complete(vnode_t *vp)
{
	rnode4_t *rp;
	k_sigset_t smask;

	rp = VTOR4(vp);
	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
		    ((rp->r_flags & R4PGFLUSH) &&
		    rp->r_pgflush != curthread)) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				sigunintr(&smask);
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		sigunintr(&smask);
		mutex_exit(&rp->r_statelock);
	}
	return (0);
}

/*
 * Validate caches by checking cached attributes.  If they have timed out,
 * then get new attributes from the server.  As a side effect, cache
 * invalidation is done if the attributes have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs4_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	nfs4_ga_res_t gar;

	if (ATTRCACHE4_VALID(vp)) {
		error = nfs4_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	gar.n4g_va.va_mask = AT_ALL;
	return (nfs4_getattr_otw(vp, &gar, cr, 0));
}

/*
 * Fill in attributes from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
{
	rnode4_t *rp;

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	mutex_enter(&rp->r_statev4_lock);
	if (ATTRCACHE4_VALID(vp)) {
		mutex_exit(&rp->r_statev4_lock);
		/*
		 * Cached attributes are valid
		 */
		*vap = rp->r_attr;
		mutex_exit(&rp->r_statelock);
		return (0);
	}
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);
	return (1);
}


/*
 * If the returned error is ESTALE, flush all caches.  The nfs4_purge_caches()
 * call is synchronous because all the pages were invalidated by the
 * nfs4_invalidate_pages() call.
 */
void
nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
{
	struct rnode4 *rp = VTOR4(vp);

	/* Ensure that the ..._end_op() call has been done */
	ASSERT(tsd_get(nfs4_tsd_key) == NULL);

	if (errno != ESTALE)
		return;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= R4STALE;
	if (!rp->r_error)
		rp->r_error = errno;
	mutex_exit(&rp->r_statelock);
	if (nfs4_has_pages(vp))
		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
}

/*
 * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
 * page purge is done asynchronously.
 */
void
nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
{
	rnode4_t *rp;
	char *contents;
	vnode_t *xattr;
	int size;
	int pgflush;		/* are we the page flush thread? */

	/*
	 * Purge the DNLC for any entries which refer to this file.
	 */
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
		dnlc_purge_vp(vp);

	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4LOOKUP;
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;

	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * Purge pathconf cache too.
	 */
	rp->r_pathconf.pc4_xattr_valid = 0;
	rp->r_pathconf.pc4_cache_valid = 0;

	pgflush = (curthread == rp->r_pgflush);
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}

	if (xattr != NULL)
		VN_RELE(xattr);

	/*
	 * Flush the page cache.  If the current thread is the page flush
	 * thread, don't initiate a new page flush.  There's no need for
	 * it, and doing it correctly is hard.
	 */
	if (nfs4_has_pages(vp) && !pgflush) {
		if (!asyncpg) {
			(void) nfs4_waitfor_purge_complete(vp);
			flush_pages(vp, cr);
		} else {
			pgflush_t *args;

			/*
			 * We don't hold r_statelock while creating the
			 * thread, in case the call blocks.  So we use a
			 * flag to indicate that a page flush thread is
			 * active.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & R4PGFLUSH) {
				mutex_exit(&rp->r_statelock);
			} else {
				rp->r_flags |= R4PGFLUSH;
				mutex_exit(&rp->r_statelock);

				args = kmem_alloc(sizeof (pgflush_t),
				    KM_SLEEP);
				args->vp = vp;
				VN_HOLD(args->vp);
				args->cr = cr;
				crhold(args->cr);
				(void) zthread_create(NULL, 0,
				    nfs4_pgflush_thread, args, 0,
				    minclsyspri);
			}
		}
	}

	/*
	 * Flush the readdir response cache.
	 */
	nfs4_purge_rddir_cache(vp);
}

/*
 * Invalidate all pages for the given file, after writing back the dirty
 * ones.
 */

static void
flush_pages(vnode_t *vp, cred_t *cr)
{
	int error;
	rnode4_t *rp = VTOR4(vp);

	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr);
	if (error == ENOSPC || error == EDQUOT) {
		mutex_enter(&rp->r_statelock);
		if (!rp->r_error)
			rp->r_error = error;
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Page flush thread.
 */

static void
nfs4_pgflush_thread(pgflush_t *args)
{
	rnode4_t *rp = VTOR4(args->vp);

	/* remember which thread we are, so we don't deadlock ourselves */
	mutex_enter(&rp->r_statelock);
	ASSERT(rp->r_pgflush == NULL);
	rp->r_pgflush = curthread;
	mutex_exit(&rp->r_statelock);

	flush_pages(args->vp, args->cr);

	mutex_enter(&rp->r_statelock);
	rp->r_pgflush = NULL;
	rp->r_flags &= ~R4PGFLUSH;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);

	VN_RELE(args->vp);
	crfree(args->cr);
	kmem_free(args, sizeof (pgflush_t));
	zthread_exit();
}

/*
 * Purge the readdir cache of all entries which are not currently
 * being filled.
 */
void
nfs4_purge_rddir_cache(vnode_t *vp)
{
	rnode4_t *rp;

	rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	rp->r_direof = NULL;
	rp->r_flags &= ~R4LOOKUP;
	rp->r_flags |= R4READDIRWATTR;
	rddir4_cache_purge(rp);
	mutex_exit(&rp->r_statelock);
}

/*
 * Set attributes cache for given vnode using virtual attributes.  There is
 * no cache validation, but if the attributes are deemed to be stale, they
 * are ignored.  This corresponds to nfs3_attrcache().
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 */
void
nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
{
	rnode4_t *rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	if (rp->r_time_attr_saved <= t)
		nfs4_attrcache_va(vp, garp, FALSE);
	mutex_exit(&rp->r_statelock);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed;
	int ctime_changed;
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * probably made.  If there is an inconsistency in what is
		 * cached, mark them invalid.  If not, don't act on them.
		 */
		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and the dir didn't change on the server before the dirmod op
		 * and the dir didn't change after the dirmod op but before the
		 * getattr, then there's a chance that the client's cached data
		 * for this object is current (not stale).  No immediate cache
		 * flush is required.
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;

			mtime_changed = 0;
			ctime_changed = 0;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do.  Flush all caches for the dir and
			 * remove the attr timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;
			else
				mtime_changed = 0;
			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;
			else
				ctime_changed = 0;
		} else {
			mtime_changed = 0;
			ctime_changed = 0;
		}
	}

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
static void
nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	hrtime_t delta;
	hrtime_t now;
	vattr_t *vap = &garp->n4g_va;

	rp = VTOR4(vp);

	ASSERT(MUTEX_HELD(&rp->r_statelock));
	ASSERT(vap->va_mask == AT_ALL);

	/* Switch to master before checking v_flag */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	now = gethrtime();

	mi = VTOMI4(vp);

	/*
	 * Only establish a new cache timeout (if requested).  Never
	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
	 * is done by nfs4_update_dircaches (ancestor in our call chain)
	 */
	if (set_cache_timeout && ! rp->r_time_cache_inval)
		rp->r_time_cache_inval = now + mi->mi_acdirmax;

	/*
	 * Delta is the number of nanoseconds that we will cache the
	 * attributes of the file.  It is based on the number of
	 * nanoseconds since the last time that we detected a change.
	 * The assumption is that files that changed recently are
	 * likely to change again.  Minimum and maximum timeouts are
	 * enforced, separately for regular files and for directories.
	 *
	 * Using the time since the last change was detected eliminates
	 * direct comparison or calculation using mixed client and
	 * server times.  NFS does not make any assumptions regarding
	 * the client and server clocks being synchronized.
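	 *
	 * For example, if a regular file was last observed to change ten
	 * seconds ago, delta starts out at roughly ten seconds and is
	 * then clamped to the mount's [acregmin, acregmax] range below
	 * (acdirmin/acdirmax for directories).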
	 */
	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    vap->va_size != rp->r_attr.va_size) {
		rp->r_time_attr_saved = now;
	}

	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_time_attr_saved;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_time_attr_inval = now + delta;

	rp->r_attr = *vap;
	if (garp->n4g_change_valid)
		rp->r_change = garp->n4g_change;

	/*
	 * The attributes that were returned may be valid and can
	 * be used, but they may not be allowed to be cached.
	 * Reset the timers to cause immediate invalidation and
	 * clear r_change so no VERIFY operations will succeed.
	 */
	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
		rp->r_time_attr_inval = now;
		rp->r_time_attr_saved = now;
		rp->r_change = 0;
	}

	/*
	 * If mounted_on_fileid returned AND the object is a stub,
	 * then set object's va_nodeid to the mounted over fid
	 * returned by server.
	 *
	 * If mounted_on_fileid not provided/supported, then
	 * just set it to 0 for now.  Eventually it would be
	 * better to set it to a hashed version of FH.  This
	 * would probably be good enough to provide a unique
	 * fid/d_ino within a dir.
	 *
	 * We don't need to carry mounted_on_fileid in the
	 * rnode as long as the client never requests fileid
	 * without also requesting mounted_on_fileid.  For
	 * now, it stays.
	 */
	if (garp->n4g_mon_fid_valid) {
		rp->r_mntd_fid = garp->n4g_mon_fid;

		if (rp->r_flags & R4SRVSTUB)
			rp->r_attr.va_nodeid = rp->r_mntd_fid;
	}

	/*
	 * Check to see if there are valid pathconf bits to
	 * cache in the rnode.
	 */
	if (garp->n4g_ext_res) {
		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
		} else {
			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
				rp->r_pathconf.pc4_xattr_valid = TRUE;
				rp->r_pathconf.pc4_xattr_exists =
				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
			}
		}
	}
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != vap->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
		rp->r_size = vap->va_size;
	}
	nfs_setswaplike(vp, vap);
	rp->r_flags &= ~R4WRITEMODIFIED;
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
{
	mntinfo4_t *mi = VTOMI4(vp);
	hrtime_t t;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
	    &recov_state, NULL))) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		return (e.error);
	}

	t = gethrtime();

	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);

	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
			    &recov_state, 1);
			goto recov_retry;
		}
	}

	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);

	if (!e.error) {
		if (e.stat == NFS4_OK) {
			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
		} else {
			e.error = geterrno4(e.stat);

			nfs4_purge_stale_fh(e.error, vp, cr);
		}
	}

	/*
	 * If doing a getattr on a node that is a stub for a crossed
	 * mount point, keep the original secinfo flavor for
	 * the current file system, not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	return (e.error);
}

/*
 * Generate a compound to get attributes over-the-wire.
 */
void
nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
    nfs4_error_t *ep, cred_t *cr, int get_acl)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	rnode4_t *rp = VTOR4(vp);
	nfs_argop4 argop[2];

	args.ctag = TAG_GETATTR;

	args.array_len = 2;
	args.array = argop;

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr */
	/*
	 * Unlike nfs version 2 and 3, where getattr returns all the
	 * attributes, nfs version 4 returns only the ones explicitly
	 * asked for.  This creates problems, as some system functions
	 * (e.g. cache check) require certain attributes and if the
	 * cached node lacks some attributes such as uid/gid, it can
	 * affect system utilities (e.g. "ls") that rely on the information
	 * to be there.  This can lead to anything from system crashes to
	 * corrupted information processed by user apps.
	 * So to ensure that all bases are covered, request at least
	 * the AT_ALL attribute mask.
	 */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	if (get_acl)
		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	doqueue = 1;

	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);

	if (ep->error)
		return;

	if (res.status != NFS4_OK) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}

/*
 * Return either cached or remote attributes.  If the attributes are
 * fetched from the server, use them to check and invalidate the caches,
 * then cache the new attributes.
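 *
 * In both cases the size returned to the caller reflects the client's
 * cached view of the file size (r_size) rather than the raw va_size
 * carried in the attributes.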
 */
int
nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
{
	int error;
	rnode4_t *rp;
	nfs4_ga_res_t gar;

	ASSERT(nfs4_consistent_type(vp));

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
	 */
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	mutex_enter(&rp->r_statev4_lock);
	if (ATTRCACHE4_VALID(vp)) {
		mutex_exit(&rp->r_statev4_lock);
		/*
		 * Cached attributes are valid
		 * Return the client's view of file size
		 */
		*vap = rp->r_attr;
		vap->va_size = rp->r_size;
		mutex_exit(&rp->r_statelock);

		ASSERT(nfs4_consistent_type(vp));

		return (0);
	}
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);

	error = nfs4_getattr_otw(vp, &gar, cr, 0);
	if (!error)
		*vap = gar.n4g_va;

	/* Return the client's view of file size */
	mutex_enter(&rp->r_statelock);
	vap->va_size = rp->r_size;
	mutex_exit(&rp->r_statelock);

	ASSERT(nfs4_consistent_type(vp));

	return (error);
}

int
nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
    nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	nfs_argop4 argop[2];
	mntinfo4_t *mi = VTOMI4(vp);
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_ext_res_t *gerp;

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = tag_type;

	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* getattr */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	doqueue = 1;

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (!needrecov && e.error) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
		    needrecov);
		return (e.error);
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_attr_otw: initiating recovery\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL);
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
		    needrecov);
		if (!e.error) {
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			e.error = geterrno4(res.status);
		}
		if (abort == FALSE)
			goto recov_retry;
		return (e.error);
	}

	if (res.status) {
		e.error = geterrno4(res.status);
	} else {
		gerp = garp->n4g_ext_res;
		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    garp, sizeof (nfs4_ga_res_t));
		garp->n4g_ext_res = gerp;
		if (garp->n4g_ext_res &&
		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
			bcopy(res.array[1].nfs_resop4_u.opgetattr.
			    ga_res.n4g_ext_res,
			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
	    needrecov);
	return (e.error);
}

/*
 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount.  The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs4_async_putpage and nfs4_async_start.
 */

static void	nfs4_async_start(struct vfs *);

static void
free_async_args4(struct nfs4_async_reqs *args)
{
	rnode4_t *rp;

	if (args->a_io != NFS4_INACTIVE) {
		rp = VTOR4(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS4_PUTAPAGE ||
		    args->a_io == NFS4_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done.  It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 */
void
nfs4_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo4_t *mi;
	uint_t max_threads;

	mi = VFTOMI4(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	mutex_enter(&mi->mi_lock);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount is really going away.
	 *
	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	while (!(mi->mi_flags & MI4_ASYNC_MGR_STOP) ||
	    mi->mi_async_req_count > 0) {
		mutex_exit(&mi->mi_lock);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value.  This means that future
			 * (debugger-induced) alterations of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads <
			    MAX(mi->mi_max_threads, max_threads)) {
				mi->mi_threads++;
				mutex_exit(&mi->mi_async_lock);
				MI4_HOLD(mi);
				VFS_HOLD(vfsp); /* hold for new thread */
				(void) zthread_create(NULL, 0, nfs4_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			cv_signal(&mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	/*
	 * Wake up the inactive thread.
	 */
	cv_broadcast(&mi->mi_inact_req_cv);
	/*
	 * Wake up anyone sitting in nfs4_async_manager_stop()
	 */
	cv_broadcast(&mi->mi_async_cv);
	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp); /* release thread's hold */
	MI4_RELE(mi);
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 */
void
nfs4_async_manager_stop(vfs_t *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	mutex_enter(&mi->mi_async_lock);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
	mutex_exit(&mi->mi_lock);
	cv_broadcast(&mi->mi_async_reqs_cv);
	/*
	 * Wait for the async manager thread to die.
	 */
	while (mi->mi_manager_thread != NULL)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
}

int
nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
    struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
    u_offset_t, caddr_t, struct seg *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If addr falls in a different segment, don't bother doing readahead.
	 */
	if (addr >= seg->s_base + seg->s_size)
		return (-1);

	/*
	 * If we can't allocate a request structure, punt on the readahead.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		return (-1);

	/*
	 * If a lock operation is pending, don't initiate any new
	 * readaheads.  Otherwise, bump r_count to indicate the new
	 * asynchronous I/O.
	 */
	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
		kmem_free(args, sizeof (*args));
		return (-1);
	}
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	nfs_rw_exit(&rp->r_lkserlock);

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READ_AHEAD;
	args->a_nfs4_readahead = readahead;
	args->a_nfs4_blkoff = blkoff;
	args->a_nfs4_seg = seg;
	args->a_nfs4_addr = addr;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, don't bother readahead.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	VN_RELE(vp);
	crfree(cr);
	kmem_free(args, sizeof (*args));
	return (-1);
}

/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type.  Requests are taken
 * from the queues in a round-robin fashion.  A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.  This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
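 *
 * For example, if mi_async_init_clusters were set to 4, as many as four
 * consecutive requests would be taken from one queue (say, NFS4_PUTAPAGE)
 * before the current queue pointer advances to the next i/o type.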
 *
 * XXX The nfs4_async_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system.  Specifically over the
 * wire calls are cpr-unsafe.  The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
static void
nfs4_async_start(struct vfs *vfsp)
{
	struct nfs4_async_reqs *args;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	extern int nfs_async_timeout;

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
			args = *mi->mi_async_curr;
			if (args != NULL)
				break;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				if (--mi->mi_threads == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp); /* release thread's hold */
				MI4_RELE(mi);
				zthread_exit();
				/* NOTREACHED */
			}
			time_left = cv_timedwait(&mi->mi_async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout + lbolt);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		} else {
			time_left = 1;
		}

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr = args->a_next;
		if (*mi->mi_async_curr == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}

		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 */
		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs4_readahead)(args->a_vp,
			    args->a_nfs4_blkoff,
			    args->a_nfs4_addr, args->a_nfs4_seg,
			    args->a_cred);
		} else if (args->a_io == NFS4_PUTAPAGE) {
			(void) (*args->a_nfs4_putapage)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_PAGEIO) {
			(void) (*args->a_nfs4_pageio)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_READDIR) {
			(void) ((*args->a_nfs4_readdir)(args->a_vp,
			    args->a_nfs4_rdc, args->a_cred));
		} else if (args->a_io == NFS4_COMMIT) {
			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
			    args->a_nfs4_offset, args->a_nfs4_count,
			    args->a_cred);
		} else if (args->a_io == NFS4_INACTIVE) {
			nfs4_inactive_otw(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args4(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}

/*
 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
 * part of VOP_INACTIVE.
 */

void
nfs4_inactive_thread(mntinfo4_t *mi)
{
	struct nfs4_async_reqs *args;
	callb_cpr_t cprinfo;
	vfs_t *vfsp = mi->mi_vfsp;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_inactive_thread");

	for (;;) {
		mutex_enter(&mi->mi_async_lock);
		args = mi->mi_async_reqs[NFS4_INACTIVE];
		if (args == NULL) {
			mutex_enter(&mi->mi_lock);
			/*
			 * We don't want to exit until the async manager is done
			 * with its work; hence the check for mi_manager_thread
			 * being NULL.
			 *
			 * The async manager thread will cv_broadcast() on
			 * mi_inact_req_cv when it's done, at which point we'll
			 * wake up and exit.
			 */
			if (mi->mi_manager_thread == NULL)
				goto die;
			mi->mi_flags |= MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			cv_signal(&mi->mi_async_cv);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
			mutex_exit(&mi->mi_async_lock);
		} else {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
			mutex_exit(&mi->mi_async_lock);
			nfs4_inactive_otw(args->a_vp, args->a_cred);
			crfree(args->a_cred);
			kmem_free(args, sizeof (*args));
		}
	}
die:
	mutex_exit(&mi->mi_lock);
	mi->mi_inactive_thread = NULL;
	cv_signal(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));

	MI4_RELE(mi);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * nfs4_async_stop:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete; nfs4_async_stop_sig() without interruptibility.
 */
void
nfs4_async_stop(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs4_async_stop_sig:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete.  If a signal is delivered we will abort and return non-zero;
 * otherwise return 0.  Since this routine is called from nfs4_unmount, we
 * need to make it interruptible.
 */
int
nfs4_async_stop_sig(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	ushort_t omax;
	bool_t intr = FALSE;

	/*
	 * Wait for all outstanding putpage operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
			intr = TRUE;
			goto interrupted;
		}
	}

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			if (!cv_wait_sig(&mi->mi_async_cv,
			    &mi->mi_async_lock)) {
				intr = TRUE;
				goto interrupted;
			}
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
interrupted:
	if (intr)
		mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);

	return (intr);
}

int
nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PUTAPAGE;
	args->a_nfs4_putapage = putapage;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = off;
	args->a_nfs4_len = (uint_t)len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() == mi->mi_zone) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * or we have run out of memory or we're attempting to
		 * unmount we refuse to do a sync write, because this may
		 * hang pageout/fsflush and the machine.  In this case,
		 * we just re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	/*
	 * We'll get here only if (nfs_zone() != mi->mi_zone)
	 * which means that this was a cross-zone sync putpage.
	 *
	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
	 * as dirty and unlock them.
	 *
	 * We don't want to clear B_FORCE here as the caller presumably
	 * knows what they're doing if they set it.
	 */
	pvn_write_done(pp, flags | B_ERROR);
	return (EPERM);
}

int
nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PAGEIO;
	args->a_nfs4_pageio = pageio;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = io_off;
	args->a_nfs4_len = (uint_t)io_len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS4_PAGEIO] = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine).  In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
}

void
nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
    int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, skip the readdir.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READDIR;
	args->a_nfs4_readdir = readdir;
	args->a_nfs4_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then skip this request
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
		mi->mi_async_reqs[NFS4_READDIR] = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	mutex_enter(&rp->r_statelock);
	rdc->entries = NULL;
	/*
	 * Indicate that no one is trying to fill this entry and
	 * it still needs to be filled.
	 */
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
}

void
nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
    cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;
	page_t *pp;

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
1983 */ 1984 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1985 goto noasync; 1986 1987 args->a_next = NULL; 1988 #ifdef DEBUG 1989 args->a_queuer = curthread; 1990 #endif 1991 VN_HOLD(vp); 1992 args->a_vp = vp; 1993 ASSERT(cr != NULL); 1994 crhold(cr); 1995 args->a_cred = cr; 1996 args->a_io = NFS4_COMMIT; 1997 args->a_nfs4_commit = commit; 1998 args->a_nfs4_plist = plist; 1999 args->a_nfs4_offset = offset; 2000 args->a_nfs4_count = count; 2001 2002 mutex_enter(&mi->mi_async_lock); 2003 2004 /* 2005 * If asyncio has been disabled, then make a synchronous request. 2006 * This check is done a second time in case async io was disabled 2007 * while this thread was blocked waiting for memory pressure to 2008 * reduce or for the queue to drain. 2009 */ 2010 if (mi->mi_max_threads == 0) { 2011 mutex_exit(&mi->mi_async_lock); 2012 2013 VN_RELE(vp); 2014 crfree(cr); 2015 kmem_free(args, sizeof (*args)); 2016 goto noasync; 2017 } 2018 2019 /* 2020 * Link request structure into the async list and 2021 * wakeup async thread to do the i/o. 2022 */ 2023 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) { 2024 mi->mi_async_reqs[NFS4_COMMIT] = args; 2025 mi->mi_async_tail[NFS4_COMMIT] = args; 2026 } else { 2027 mi->mi_async_tail[NFS4_COMMIT]->a_next = args; 2028 mi->mi_async_tail[NFS4_COMMIT] = args; 2029 } 2030 2031 mutex_enter(&rp->r_statelock); 2032 rp->r_count++; 2033 mutex_exit(&rp->r_statelock); 2034 2035 if (mi->mi_io_kstats) { 2036 mutex_enter(&mi->mi_lock); 2037 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2038 mutex_exit(&mi->mi_lock); 2039 } 2040 2041 mi->mi_async_req_count++; 2042 ASSERT(mi->mi_async_req_count != 0); 2043 cv_signal(&mi->mi_async_reqs_cv); 2044 mutex_exit(&mi->mi_async_lock); 2045 return; 2046 2047 noasync: 2048 if (curproc == proc_pageout || curproc == proc_fsflush || 2049 nfs_zone() != mi->mi_zone) { 2050 while (plist != NULL) { 2051 pp = plist; 2052 page_sub(&plist, pp); 2053 pp->p_fsdata = C_COMMIT; 2054 page_unlock(pp); 2055 } 2056 return; 2057 } 2058 (*commit)(vp, plist, offset, count, cr); 2059 } 2060 2061 /* 2062 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The 2063 * reference to the vnode is handed over to the thread; the caller should 2064 * no longer refer to the vnode. 2065 * 2066 * Unlike most of the async routines, this handoff is needed for 2067 * correctness reasons, not just performance. So doing operations in the 2068 * context of the current thread is not an option. 2069 */ 2070 void 2071 nfs4_async_inactive(vnode_t *vp, cred_t *cr) 2072 { 2073 mntinfo4_t *mi; 2074 struct nfs4_async_reqs *args; 2075 boolean_t signal_inactive_thread = B_FALSE; 2076 2077 mi = VTOMI4(vp); 2078 2079 args = kmem_alloc(sizeof (*args), KM_SLEEP); 2080 args->a_next = NULL; 2081 #ifdef DEBUG 2082 args->a_queuer = curthread; 2083 #endif 2084 args->a_vp = vp; 2085 ASSERT(cr != NULL); 2086 crhold(cr); 2087 args->a_cred = cr; 2088 args->a_io = NFS4_INACTIVE; 2089 2090 /* 2091 * Note that we don't check mi->mi_max_threads here, since we 2092 * *need* to get rid of this vnode regardless of whether someone 2093 * set nfs4_max_threads to zero in /etc/system. 2094 * 2095 * The manager thread knows about this and is willing to create 2096 * at least one thread to accommodate us.
2097 */ 2098 mutex_enter(&mi->mi_async_lock); 2099 if (mi->mi_inactive_thread == NULL) { 2100 rnode4_t *rp; 2101 vnode_t *unldvp = NULL; 2102 char *unlname; 2103 cred_t *unlcred; 2104 2105 mutex_exit(&mi->mi_async_lock); 2106 /* 2107 * We just need to free up the memory associated with the 2108 * vnode, which can be safely done from within the current 2109 * context. 2110 */ 2111 crfree(cr); /* drop our reference */ 2112 kmem_free(args, sizeof (*args)); 2113 rp = VTOR4(vp); 2114 mutex_enter(&rp->r_statelock); 2115 if (rp->r_unldvp != NULL) { 2116 unldvp = rp->r_unldvp; 2117 rp->r_unldvp = NULL; 2118 unlname = rp->r_unlname; 2119 rp->r_unlname = NULL; 2120 unlcred = rp->r_unlcred; 2121 rp->r_unlcred = NULL; 2122 } 2123 mutex_exit(&rp->r_statelock); 2124 /* 2125 * No need to explicitly throw away any cached pages. The 2126 * eventual r4inactive() will attempt a synchronous 2127 * VOP_PUTPAGE() which will immediately fail since the request 2128 * is coming from the wrong zone, and then will proceed to call 2129 * nfs4_invalidate_pages() which will clean things up for us. 2130 * 2131 * Throw away the delegation here so rp4_addfree()'s attempt to 2132 * return any existing delegations becomes a no-op. 2133 */ 2134 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 2135 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 2136 FALSE); 2137 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 2138 nfs_rw_exit(&mi->mi_recovlock); 2139 } 2140 nfs4_clear_open_streams(rp); 2141 2142 rp4_addfree(rp, cr); 2143 if (unldvp != NULL) { 2144 kmem_free(unlname, MAXNAMELEN); 2145 VN_RELE(unldvp); 2146 crfree(unlcred); 2147 } 2148 return; 2149 } 2150 2151 if (mi->mi_manager_thread == NULL) { 2152 /* 2153 * We want to talk to the inactive thread. 2154 */ 2155 signal_inactive_thread = B_TRUE; 2156 } 2157 2158 /* 2159 * Enqueue the vnode and wake up either the special thread (empty 2160 * list) or an async thread. 2161 */ 2162 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) { 2163 mi->mi_async_reqs[NFS4_INACTIVE] = args; 2164 mi->mi_async_tail[NFS4_INACTIVE] = args; 2165 signal_inactive_thread = B_TRUE; 2166 } else { 2167 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args; 2168 mi->mi_async_tail[NFS4_INACTIVE] = args; 2169 } 2170 if (signal_inactive_thread) { 2171 cv_signal(&mi->mi_inact_req_cv); 2172 } else { 2173 mi->mi_async_req_count++; 2174 ASSERT(mi->mi_async_req_count != 0); 2175 cv_signal(&mi->mi_async_reqs_cv); 2176 } 2177 2178 mutex_exit(&mi->mi_async_lock); 2179 } 2180 2181 int 2182 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2183 { 2184 int pagecreate; 2185 int n; 2186 int saved_n; 2187 caddr_t saved_base; 2188 u_offset_t offset; 2189 int error; 2190 int sm_error; 2191 vnode_t *vp = RTOV(rp); 2192 2193 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2194 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2195 if (!vpm_enable) { 2196 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2197 } 2198 2199 /* 2200 * Move bytes in at most PAGESIZE chunks. We must avoid 2201 * spanning pages in uiomove() because page faults may cause 2202 * the cache to be invalidated out from under us. The r_size is not 2203 * updated until after the uiomove. If we push the last page of a 2204 * file before r_size is correct, we will lose the data written past 2205 * the current (and invalid) r_size. 
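 *
 * For example, assuming an 8K page size, a 100 byte write starting
 * at offset 8190 is done in two passes through the loop below: the
 * first moves n = 2 bytes to finish out the current page, and the
 * second moves the remaining 98 bytes into the next page.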
2206 */ 2207 do { 2208 offset = uio->uio_loffset; 2209 pagecreate = 0; 2210 2211 /* 2212 * n is the number of bytes required to satisfy the request 2213 * or the number of bytes to fill out the page. 2214 */ 2215 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2216 2217 /* 2218 * Check to see if we can skip reading in the page 2219 * and just allocate the memory. We can do this 2220 * if we are going to rewrite the entire mapping 2221 * or if we are going to write to or beyond the current 2222 * end of file from the beginning of the mapping. 2223 * 2224 * The read of r_size is now protected by r_statelock. 2225 */ 2226 mutex_enter(&rp->r_statelock); 2227 /* 2228 * When pgcreated is nonzero the caller has already done 2229 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2230 * segkpm this means we already have at least one page 2231 * created and mapped at base. 2232 */ 2233 pagecreate = pgcreated || 2234 ((offset & PAGEOFFSET) == 0 && 2235 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2236 2237 mutex_exit(&rp->r_statelock); 2238 2239 if (!vpm_enable && pagecreate) { 2240 /* 2241 * The last argument tells segmap_pagecreate() to 2242 * always lock the page, as opposed to sometimes 2243 * returning with the page locked. This way we avoid a 2244 * fault on the ensuing uiomove(), but also 2245 * more importantly (to fix bug 1094402) we can 2246 * call segmap_fault() to unlock the page in all 2247 * cases. An alternative would be to modify 2248 * segmap_pagecreate() to tell us when it is 2249 * locking a page, but that's a fairly major 2250 * interface change. 2251 */ 2252 if (pgcreated == 0) 2253 (void) segmap_pagecreate(segkmap, base, 2254 (uint_t)n, 1); 2255 saved_base = base; 2256 saved_n = n; 2257 } 2258 2259 /* 2260 * The number of bytes of data in the last page cannot 2261 * be accurately determined while the page is being 2262 * uiomove'd to and the size of the file is being updated. 2263 * Thus, inform threads which need to know accurately 2264 * how much data is in the last page of the file. They 2265 * will not do the i/o immediately, but will arrange for 2266 * the i/o to happen later when this modify operation 2267 * has finished. 2268 */ 2269 ASSERT(!(rp->r_flags & R4MODINPROGRESS)); 2270 mutex_enter(&rp->r_statelock); 2271 rp->r_flags |= R4MODINPROGRESS; 2272 rp->r_modaddr = (offset & MAXBMASK); 2273 mutex_exit(&rp->r_statelock); 2274 2275 if (vpm_enable) { 2276 /* 2277 * Copy data. If new pages are created, part of 2278 * the page that is not written will be initialized 2279 * with zeros. 2280 */ 2281 error = vpm_data_copy(vp, offset, n, uio, 2282 !pagecreate, NULL, 0, S_WRITE); 2283 } else { 2284 error = uiomove(base, n, UIO_WRITE, uio); 2285 } 2286 2287 /* 2288 * r_size is the maximum number of 2289 * bytes known to be in the file. 2290 * Make sure it is at least as high as the 2291 * first unwritten byte pointed to by uio_loffset. 2292 */ 2293 mutex_enter(&rp->r_statelock); 2294 if (rp->r_size < uio->uio_loffset) 2295 rp->r_size = uio->uio_loffset; 2296 rp->r_flags &= ~R4MODINPROGRESS; 2297 rp->r_flags |= R4DIRTY; 2298 mutex_exit(&rp->r_statelock); 2299 2300 /* n = # of bytes written */ 2301 n = (int)(uio->uio_loffset - offset); 2302 2303 if (!vpm_enable) { 2304 base += n; 2305 } 2306 2307 tcount -= n; 2308 /* 2309 * If we created pages w/o initializing them completely, 2310 * we need to zero the part that wasn't set up. 2311 * This happens in most EOF write cases and if 2312 * we had some sort of error during the uiomove.
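 *
 * For example, a 10 byte write at the start of a freshly created
 * page leaves PAGESIZE - 10 bytes of that page uninitialized; the
 * kzero() below (in the non-vpm case) clears that tail so the rest
 * of the page is not left holding stale contents.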
2313 */ 2314 if (!vpm_enable && pagecreate) { 2315 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2316 (void) kzero(base, PAGESIZE - n); 2317 2318 if (pgcreated) { 2319 /* 2320 * Caller is responsible for this page, 2321 * it was not created in this loop. 2322 */ 2323 pgcreated = 0; 2324 } else { 2325 /* 2326 * For bug 1094402: segmap_pagecreate locks 2327 * page. Unlock it. This also unlocks the 2328 * pages allocated by page_create_va() in 2329 * segmap_pagecreate(). 2330 */ 2331 sm_error = segmap_fault(kas.a_hat, segkmap, 2332 saved_base, saved_n, 2333 F_SOFTUNLOCK, S_WRITE); 2334 if (error == 0) 2335 error = sm_error; 2336 } 2337 } 2338 } while (tcount > 0 && error == 0); 2339 2340 return (error); 2341 } 2342 2343 int 2344 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2345 { 2346 rnode4_t *rp; 2347 page_t *pp; 2348 u_offset_t eoff; 2349 u_offset_t io_off; 2350 size_t io_len; 2351 int error; 2352 int rdirty; 2353 int err; 2354 2355 rp = VTOR4(vp); 2356 ASSERT(rp->r_count > 0); 2357 2358 if (!nfs4_has_pages(vp)) 2359 return (0); 2360 2361 ASSERT(vp->v_type != VCHR); 2362 2363 /* 2364 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2365 * writes. B_FORCE is set to force the VM system to actually 2366 * invalidate the pages, even if the i/o failed. The pages 2367 * need to get invalidated because they can't be written out 2368 * because there isn't any space left on either the server's 2369 * file system or in the user's disk quota. The B_FREE bit 2370 * is cleared to avoid confusion as to whether this is a 2371 * request to place the page on the freelist or to destroy 2372 * it. 2373 */ 2374 if ((rp->r_flags & R4OUTOFSPACE) || 2375 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2376 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2377 2378 if (len == 0) { 2379 /* 2380 * If doing a full file synchronous operation, then clear 2381 * the R4DIRTY bit. If a page gets dirtied while the flush 2382 * is happening, then R4DIRTY will get set again. The 2383 * R4DIRTY bit must get cleared before the flush so that 2384 * we don't lose this information. 2385 * 2386 * If there are no full file async write operations 2387 * pending and the R4DIRTY bit is set, clear it. 2388 */ 2389 if (off == (u_offset_t)0 && 2390 !(flags & B_ASYNC) && 2391 (rp->r_flags & R4DIRTY)) { 2392 mutex_enter(&rp->r_statelock); 2393 rdirty = (rp->r_flags & R4DIRTY); 2394 rp->r_flags &= ~R4DIRTY; 2395 mutex_exit(&rp->r_statelock); 2396 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2397 mutex_enter(&rp->r_statelock); 2398 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2399 rdirty = (rp->r_flags & R4DIRTY); 2400 rp->r_flags &= ~R4DIRTY; 2401 } 2402 mutex_exit(&rp->r_statelock); 2403 } else 2404 rdirty = 0; 2405 2406 /* 2407 * Search the entire vp list for pages >= off, and flush 2408 * the dirty pages. 2409 */ 2410 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2411 flags, cr); 2412 2413 /* 2414 * If an error occurred and the file was marked as dirty 2415 * before and we aren't forcibly invalidating pages, then 2416 * reset the R4DIRTY flag. 2417 */ 2418 if (error && rdirty && 2419 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2420 mutex_enter(&rp->r_statelock); 2421 rp->r_flags |= R4DIRTY; 2422 mutex_exit(&rp->r_statelock); 2423 } 2424 } else { 2425 /* 2426 * Do a range from [off...off + len) looking for pages 2427 * to deal with.
2428 */ 2429 error = 0; 2430 io_len = 0; 2431 eoff = off + len; 2432 mutex_enter(&rp->r_statelock); 2433 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2434 io_off += io_len) { 2435 mutex_exit(&rp->r_statelock); 2436 /* 2437 * If we are not invalidating, synchronously 2438 * freeing or writing pages use the routine 2439 * page_lookup_nowait() to prevent reclaiming 2440 * them from the free list. 2441 */ 2442 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2443 pp = page_lookup(vp, io_off, 2444 (flags & (B_INVAL | B_FREE)) ? 2445 SE_EXCL : SE_SHARED); 2446 } else { 2447 pp = page_lookup_nowait(vp, io_off, 2448 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2449 } 2450 2451 if (pp == NULL || !pvn_getdirty(pp, flags)) 2452 io_len = PAGESIZE; 2453 else { 2454 err = (*rp->r_putapage)(vp, pp, &io_off, 2455 &io_len, flags, cr); 2456 if (!error) 2457 error = err; 2458 /* 2459 * "io_off" and "io_len" are returned as 2460 * the range of pages we actually wrote. 2461 * This allows us to skip ahead more quickly 2462 * since several pages may've been dealt 2463 * with by this iteration of the loop. 2464 */ 2465 } 2466 mutex_enter(&rp->r_statelock); 2467 } 2468 mutex_exit(&rp->r_statelock); 2469 } 2470 2471 return (error); 2472 } 2473 2474 void 2475 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2476 { 2477 rnode4_t *rp; 2478 2479 rp = VTOR4(vp); 2480 if (IS_SHADOW(vp, rp)) 2481 vp = RTOV4(rp); 2482 mutex_enter(&rp->r_statelock); 2483 while (rp->r_flags & R4TRUNCATE) 2484 cv_wait(&rp->r_cv, &rp->r_statelock); 2485 rp->r_flags |= R4TRUNCATE; 2486 if (off == (u_offset_t)0) { 2487 rp->r_flags &= ~R4DIRTY; 2488 if (!(rp->r_flags & R4STALE)) 2489 rp->r_error = 0; 2490 } 2491 rp->r_truncaddr = off; 2492 mutex_exit(&rp->r_statelock); 2493 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2494 B_INVAL | B_TRUNC, cr); 2495 mutex_enter(&rp->r_statelock); 2496 rp->r_flags &= ~R4TRUNCATE; 2497 cv_broadcast(&rp->r_cv); 2498 mutex_exit(&rp->r_statelock); 2499 } 2500 2501 static int 2502 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2503 { 2504 mntinfo4_t *mi; 2505 struct mntinfo_kstat *mik; 2506 vfs_t *vfsp; 2507 2508 /* this is a read-only kstat. Bail out on a write */ 2509 if (rw == KSTAT_WRITE) 2510 return (EACCES); 2511 2512 2513 /* 2514 * We don't want to wait here as kstat_chain_lock could be held by 2515 * dounmount(). dounmount() takes vfs_reflock before the chain lock 2516 * and thus could lead to a deadlock. 2517 */ 2518 vfsp = (struct vfs *)ksp->ks_private; 2519 2520 mi = VFTOMI4(vfsp); 2521 mik = (struct mntinfo_kstat *)ksp->ks_data; 2522 2523 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2524 2525 mik->mik_vers = (uint32_t)mi->mi_vers; 2526 mik->mik_flags = mi->mi_flags; 2527 /* 2528 * The sv_secdata holds the flavor the client specifies. 2529 * If the client uses default and a security negotiation 2530 * occurs, sv_currsec will point to the current flavor 2531 * selected from the server flavor list. 2532 * sv_currsec is NULL if no security negotiation takes place. 2533 */ 2534 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 
2535 mi->mi_curr_serv->sv_currsec->secmod : 2536 mi->mi_curr_serv->sv_secdata->secmod; 2537 mik->mik_curread = (uint32_t)mi->mi_curread; 2538 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2539 mik->mik_retrans = mi->mi_retrans; 2540 mik->mik_timeo = mi->mi_timeo; 2541 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2542 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2543 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2544 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2545 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2546 mik->mik_failover = (uint32_t)mi->mi_failover; 2547 mik->mik_remap = (uint32_t)mi->mi_remap; 2548 2549 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2550 2551 return (0); 2552 } 2553 2554 void 2555 nfs4_mnt_kstat_init(struct vfs *vfsp) 2556 { 2557 mntinfo4_t *mi = VFTOMI4(vfsp); 2558 2559 /* 2560 * PSARC 2001/697 Contract Private Interface 2561 * All nfs kstats are under SunMC contract 2562 * Please refer to the PSARC listed above and contact 2563 * SunMC before making any changes! 2564 * 2565 * Changes must be reviewed by Solaris File Sharing 2566 * Changes must be communicated to contract-2001-697@sun.com 2567 * 2568 */ 2569 2570 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2571 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2572 if (mi->mi_io_kstats) { 2573 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2574 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2575 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2576 kstat_install(mi->mi_io_kstats); 2577 } 2578 2579 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2580 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2581 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2582 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2583 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2584 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2585 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2586 kstat_install(mi->mi_ro_kstats); 2587 } 2588 2589 nfs4_mnt_recov_kstat_init(vfsp); 2590 } 2591 2592 void 2593 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2594 { 2595 mntinfo4_t *mi; 2596 2597 mi = VTOMI4(vp); 2598 /* 2599 * In case of forced unmount, do not print any messages 2600 * since it can flood the console with error messages. 2601 */ 2602 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2603 return; 2604 2605 /* 2606 * If the mount point is dead, not recoverable, do not 2607 * print error messages that can flood the console. 2608 */ 2609 if (mi->mi_flags & MI4_RECOV_FAIL) 2610 return; 2611 2612 /* 2613 * No use in flooding the console with ENOSPC 2614 * messages from the same file system. 
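 *
 * Once such a message is printed, mi_printftime is pushed
 * nfs_write_error_interval seconds into the future, and further
 * ENOSPC/EDQUOT reports for this mount are suppressed until that
 * time has passed.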
2615 */ 2616 if ((error != ENOSPC && error != EDQUOT) || 2617 lbolt - mi->mi_printftime > 0) { 2618 zoneid_t zoneid = mi->mi_zone->zone_id; 2619 2620 #ifdef DEBUG 2621 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2622 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2623 #else 2624 nfs_perror(error, "NFS write error on host %s: %m.\n", 2625 VTOR4(vp)->r_server->sv_hostname, NULL); 2626 #endif 2627 if (error == ENOSPC || error == EDQUOT) { 2628 zcmn_err(zoneid, CE_CONT, 2629 "^File: userid=%d, groupid=%d\n", 2630 crgetuid(cr), crgetgid(cr)); 2631 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2632 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2633 zcmn_err(zoneid, CE_CONT, 2634 "^User: userid=%d, groupid=%d\n", 2635 crgetuid(curthread->t_cred), 2636 crgetgid(curthread->t_cred)); 2637 } 2638 mi->mi_printftime = lbolt + 2639 nfs_write_error_interval * hz; 2640 } 2641 sfh4_printfhandle(VTOR4(vp)->r_fh); 2642 #ifdef DEBUG 2643 if (error == EACCES) { 2644 zcmn_err(zoneid, CE_CONT, 2645 "nfs_bio: cred is%s kcred\n", 2646 cr == kcred ? "" : " not"); 2647 } 2648 #endif 2649 } 2650 } 2651 2652 /* 2653 * Return non-zero if the given file can be safely memory mapped. Locks 2654 * are safe if whole-file (length and offset are both zero). 2655 */ 2656 2657 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2658 2659 static int 2660 nfs4_safemap(const vnode_t *vp) 2661 { 2662 locklist_t *llp, *next_llp; 2663 int safe = 1; 2664 rnode4_t *rp = VTOR4(vp); 2665 2666 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2667 2668 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2669 "vp = %p", (void *)vp)); 2670 2671 /* 2672 * Review all the locks for the vnode, both ones that have been 2673 * acquired and ones that are pending. We assume that 2674 * flk_active_locks_for_vp() has merged any locks that can be 2675 * merged (so that if a process has the entire file locked, it is 2676 * represented as a single lock). 2677 * 2678 * Note that we can't bail out of the loop if we find a non-safe 2679 * lock, because we have to free all the elements in the llp list. 2680 * We might be able to speed up this code slightly by not looking 2681 * at each lock's l_start and l_len fields once we've found a 2682 * non-safe lock. 2683 */ 2684 2685 llp = flk_active_locks_for_vp(vp); 2686 while (llp) { 2687 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2688 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2689 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2690 if (!SAFE_LOCK(llp->ll_flock)) { 2691 safe = 0; 2692 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2693 "nfs4_safemap: unsafe active lock (%" PRId64 2694 ", %" PRId64 ")", llp->ll_flock.l_start, 2695 llp->ll_flock.l_len)); 2696 } 2697 next_llp = llp->ll_next; 2698 VN_RELE(llp->ll_vp); 2699 kmem_free(llp, sizeof (*llp)); 2700 llp = next_llp; 2701 } 2702 2703 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2704 safe ? "safe" : "unsafe")); 2705 return (safe); 2706 } 2707 2708 /* 2709 * Return whether there is a lost LOCK or LOCKU queued up for the given 2710 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 
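 *
 * As in nfs4_safemap(), only whole-file locks are considered safe:
 * a lost request whose flock has both l_start and l_len equal to
 * zero does not conflict with the mmap; anything narrower does.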
2711 */ 2712 2713 bool_t 2714 nfs4_map_lost_lock_conflict(vnode_t *vp) 2715 { 2716 bool_t conflict = FALSE; 2717 nfs4_lost_rqst_t *lrp; 2718 mntinfo4_t *mi = VTOMI4(vp); 2719 2720 mutex_enter(&mi->mi_lock); 2721 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2722 lrp = list_next(&mi->mi_lost_state, lrp)) { 2723 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2724 continue; 2725 ASSERT(lrp->lr_vp != NULL); 2726 if (!VOP_CMP(lrp->lr_vp, vp)) 2727 continue; /* different file */ 2728 if (!SAFE_LOCK(*lrp->lr_flk)) { 2729 conflict = TRUE; 2730 break; 2731 } 2732 } 2733 2734 mutex_exit(&mi->mi_lock); 2735 return (conflict); 2736 } 2737 2738 /* 2739 * nfs_lockcompletion: 2740 * 2741 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2742 * as non cachable (set VNOCACHE bit). 2743 */ 2744 2745 void 2746 nfs4_lockcompletion(vnode_t *vp, int cmd) 2747 { 2748 rnode4_t *rp = VTOR4(vp); 2749 2750 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2751 ASSERT(!IS_SHADOW(vp, rp)); 2752 2753 if (cmd == F_SETLK || cmd == F_SETLKW) { 2754 2755 if (!nfs4_safemap(vp)) { 2756 mutex_enter(&vp->v_lock); 2757 vp->v_flag |= VNOCACHE; 2758 mutex_exit(&vp->v_lock); 2759 } else { 2760 mutex_enter(&vp->v_lock); 2761 vp->v_flag &= ~VNOCACHE; 2762 mutex_exit(&vp->v_lock); 2763 } 2764 } 2765 /* 2766 * The cached attributes of the file are stale after acquiring 2767 * the lock on the file. They were updated when the file was 2768 * opened, but not updated when the lock was acquired. Therefore the 2769 * cached attributes are invalidated after the lock is obtained. 2770 */ 2771 PURGE_ATTRCACHE4(vp); 2772 } 2773 2774 /* ARGSUSED */ 2775 static void * 2776 nfs4_mi_init(zoneid_t zoneid) 2777 { 2778 struct mi4_globals *mig; 2779 2780 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2781 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2782 list_create(&mig->mig_list, sizeof (mntinfo4_t), 2783 offsetof(mntinfo4_t, mi_zone_node)); 2784 mig->mig_destructor_called = B_FALSE; 2785 return (mig); 2786 } 2787 2788 /* 2789 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down 2790 * state and killing off threads. 2791 */ 2792 /* ARGSUSED */ 2793 static void 2794 nfs4_mi_shutdown(zoneid_t zoneid, void *data) 2795 { 2796 struct mi4_globals *mig = data; 2797 mntinfo4_t *mi; 2798 nfs4_server_t *np; 2799 2800 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2801 "nfs4_mi_shutdown zone %d\n", zoneid)); 2802 ASSERT(mig != NULL); 2803 for (;;) { 2804 mutex_enter(&mig->mig_lock); 2805 mi = list_head(&mig->mig_list); 2806 if (mi == NULL) { 2807 mutex_exit(&mig->mig_lock); 2808 break; 2809 } 2810 2811 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2812 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp)); 2813 /* 2814 * purge the DNLC for this filesystem 2815 */ 2816 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2817 /* 2818 * Tell existing async worker threads to exit. 2819 */ 2820 mutex_enter(&mi->mi_async_lock); 2821 mi->mi_max_threads = 0; 2822 cv_broadcast(&mi->mi_async_work_cv); 2823 /* 2824 * Set the appropriate flags, signal and wait for both the 2825 * async manager and the inactive thread to exit when they're 2826 * done with their current work. 
2827 */ 2828 mutex_enter(&mi->mi_lock); 2829 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD); 2830 mutex_exit(&mi->mi_lock); 2831 mutex_exit(&mi->mi_async_lock); 2832 if (mi->mi_manager_thread) { 2833 nfs4_async_manager_stop(mi->mi_vfsp); 2834 } 2835 if (mi->mi_inactive_thread) { 2836 mutex_enter(&mi->mi_async_lock); 2837 cv_signal(&mi->mi_inact_req_cv); 2838 /* 2839 * Wait for the inactive thread to exit. 2840 */ 2841 while (mi->mi_inactive_thread != NULL) { 2842 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2843 } 2844 mutex_exit(&mi->mi_async_lock); 2845 } 2846 /* 2847 * Wait for the recovery thread to complete, that is, it will 2848 * signal when it is done using the "mi" structure and about 2849 * to exit. 2850 */ 2851 mutex_enter(&mi->mi_lock); 2852 while (mi->mi_in_recovery > 0) 2853 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock); 2854 mutex_exit(&mi->mi_lock); 2855 /* 2856 * We're done when every mi has been done or the list is empty. 2857 * This one is done, remove it from the list. 2858 */ 2859 list_remove(&mig->mig_list, mi); 2860 mutex_exit(&mig->mig_lock); 2861 zone_rele(mi->mi_zone); 2862 /* 2863 * Release the holds on the vfs and mi that were taken to prevent 2864 * a race with zone shutdown. This releases the holds taken in 2865 * nfs4_mi_zonelist_add. 2866 */ 2866 VFS_RELE(mi->mi_vfsp); 2867 MI4_RELE(mi); 2868 } 2869 /* 2870 * Tell each renew thread in the zone to exit 2871 */ 2872 mutex_enter(&nfs4_server_lst_lock); 2873 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) { 2874 mutex_enter(&np->s_lock); 2875 if (np->zoneid == zoneid) { 2876 /* 2877 * We add another hold onto the nfs4_server_t 2878 * because this will make sure that the nfs4_server_t 2879 * stays around until nfs4_callback_fini_zone destroys 2880 * the zone. This way, the renew thread can 2881 * unconditionally release its holds on the 2882 * nfs4_server_t. 2883 */ 2884 np->s_refcnt++; 2885 nfs4_mark_srv_dead(np); 2886 } 2887 mutex_exit(&np->s_lock); 2888 } 2889 mutex_exit(&nfs4_server_lst_lock); 2890 } 2891 2892 static void 2893 nfs4_mi_free_globals(struct mi4_globals *mig) 2894 { 2895 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2896 mutex_destroy(&mig->mig_lock); 2897 kmem_free(mig, sizeof (*mig)); 2898 } 2899 2900 /* ARGSUSED */ 2901 static void 2902 nfs4_mi_destroy(zoneid_t zoneid, void *data) 2903 { 2904 struct mi4_globals *mig = data; 2905 2906 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2907 "nfs4_mi_destroy zone %d\n", zoneid)); 2908 ASSERT(mig != NULL); 2909 mutex_enter(&mig->mig_lock); 2910 if (list_head(&mig->mig_list) != NULL) { 2911 /* Still waiting for VFS_FREEVFS() */ 2912 mig->mig_destructor_called = B_TRUE; 2913 mutex_exit(&mig->mig_lock); 2914 return; 2915 } 2916 nfs4_mi_free_globals(mig); 2917 } 2918 2919 /* 2920 * Add an NFS mount to the per-zone list of NFS mounts. 2921 */ 2922 void 2923 nfs4_mi_zonelist_add(mntinfo4_t *mi) 2924 { 2925 struct mi4_globals *mig; 2926 2927 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2928 mutex_enter(&mig->mig_lock); 2929 list_insert_head(&mig->mig_list, mi); 2930 /* 2931 * hold added to eliminate race with zone shutdown - this will be 2932 * released in mi_shutdown 2933 */ 2934 MI4_HOLD(mi); 2935 VFS_HOLD(mi->mi_vfsp); 2936 mutex_exit(&mig->mig_lock); 2937 } 2938 2939 /* 2940 * Remove an NFS mount from the per-zone list of NFS mounts.
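 *
 * Returns 1 if this call removed the mount from the list (and dropped
 * the holds taken in nfs4_mi_zonelist_add()), or 0 if the zone
 * shutdown path had already marked the mi MI4_DEAD and released it.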
2941 */ 2942 int 2943 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 2944 { 2945 struct mi4_globals *mig; 2946 int ret = 0; 2947 2948 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2949 mutex_enter(&mig->mig_lock); 2950 mutex_enter(&mi->mi_lock); 2951 /* if this mi is marked dead, then the zone already released it */ 2952 if (!(mi->mi_flags & MI4_DEAD)) { 2953 list_remove(&mig->mig_list, mi); 2954 2955 /* release the holds put on in zonelist_add(). */ 2956 VFS_RELE(mi->mi_vfsp); 2957 MI4_RELE(mi); 2958 ret = 1; 2959 } 2960 mutex_exit(&mi->mi_lock); 2961 2962 /* 2963 * We can be called asynchronously by VFS_FREEVFS() after the zone 2964 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2965 * mi globals. 2966 */ 2967 if (list_head(&mig->mig_list) == NULL && 2968 mig->mig_destructor_called == B_TRUE) { 2969 nfs4_mi_free_globals(mig); 2970 return (ret); 2971 } 2972 mutex_exit(&mig->mig_lock); 2973 return (ret); 2974 } 2975 2976 void 2977 nfs_free_mi4(mntinfo4_t *mi) 2978 { 2979 nfs4_open_owner_t *foop; 2980 nfs4_oo_hash_bucket_t *bucketp; 2981 nfs4_debug_msg_t *msgp; 2982 int i; 2983 servinfo4_t *svp; 2984 2985 mutex_enter(&mi->mi_lock); 2986 ASSERT(mi->mi_recovthread == NULL); 2987 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 2988 mutex_exit(&mi->mi_lock); 2989 mutex_enter(&mi->mi_async_lock); 2990 ASSERT(mi->mi_threads == 0); 2991 ASSERT(mi->mi_manager_thread == NULL); 2992 mutex_exit(&mi->mi_async_lock); 2993 svp = mi->mi_servers; 2994 sv4_free(svp); 2995 if (mi->mi_io_kstats) { 2996 kstat_delete(mi->mi_io_kstats); 2997 mi->mi_io_kstats = NULL; 2998 } 2999 if (mi->mi_ro_kstats) { 3000 kstat_delete(mi->mi_ro_kstats); 3001 mi->mi_ro_kstats = NULL; 3002 } 3003 if (mi->mi_recov_ksp) { 3004 kstat_delete(mi->mi_recov_ksp); 3005 mi->mi_recov_ksp = NULL; 3006 } 3007 mutex_enter(&mi->mi_msg_list_lock); 3008 while (msgp = list_head(&mi->mi_msg_list)) { 3009 list_remove(&mi->mi_msg_list, msgp); 3010 nfs4_free_msg(msgp); 3011 } 3012 mutex_exit(&mi->mi_msg_list_lock); 3013 list_destroy(&mi->mi_msg_list); 3014 if (mi->mi_rootfh != NULL) 3015 sfh4_rele(&mi->mi_rootfh); 3016 if (mi->mi_srvparentfh != NULL) 3017 sfh4_rele(&mi->mi_srvparentfh); 3018 mutex_destroy(&mi->mi_lock); 3019 mutex_destroy(&mi->mi_async_lock); 3020 mutex_destroy(&mi->mi_msg_list_lock); 3021 nfs_rw_destroy(&mi->mi_recovlock); 3022 nfs_rw_destroy(&mi->mi_rename_lock); 3023 nfs_rw_destroy(&mi->mi_fh_lock); 3024 cv_destroy(&mi->mi_failover_cv); 3025 cv_destroy(&mi->mi_async_reqs_cv); 3026 cv_destroy(&mi->mi_async_work_cv); 3027 cv_destroy(&mi->mi_async_cv); 3028 cv_destroy(&mi->mi_inact_req_cv); 3029 /* 3030 * Destroy the oo hash lists and mutexes for the cred hash table. 3031 */ 3032 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) { 3033 bucketp = &(mi->mi_oo_list[i]); 3034 /* Destroy any remaining open owners on the list */ 3035 foop = list_head(&bucketp->b_oo_hash_list); 3036 while (foop != NULL) { 3037 list_remove(&bucketp->b_oo_hash_list, foop); 3038 nfs4_destroy_open_owner(foop); 3039 foop = list_head(&bucketp->b_oo_hash_list); 3040 } 3041 list_destroy(&bucketp->b_oo_hash_list); 3042 mutex_destroy(&bucketp->b_lock); 3043 } 3044 /* 3045 * Empty and destroy the freed open owner list. 
3046 */ 3047 foop = list_head(&mi->mi_foo_list); 3048 while (foop != NULL) { 3049 list_remove(&mi->mi_foo_list, foop); 3050 nfs4_destroy_open_owner(foop); 3051 foop = list_head(&mi->mi_foo_list); 3052 } 3053 list_destroy(&mi->mi_foo_list); 3054 list_destroy(&mi->mi_bseqid_list); 3055 list_destroy(&mi->mi_lost_state); 3056 avl_destroy(&mi->mi_filehandles); 3057 fn_rele(&mi->mi_fname); 3058 kmem_free(mi, sizeof (*mi)); 3059 } 3060 void 3061 mi_hold(mntinfo4_t *mi) 3062 { 3063 atomic_add_32(&mi->mi_count, 1); 3064 ASSERT(mi->mi_count != 0); 3065 } 3066 3067 void 3068 mi_rele(mntinfo4_t *mi) 3069 { 3070 ASSERT(mi->mi_count != 0); 3071 if (atomic_add_32_nv(&mi->mi_count, -1) == 0) { 3072 nfs_free_mi4(mi); 3073 } 3074 } 3075 3076 vnode_t nfs4_xattr_notsupp_vnode; 3077 3078 void 3079 nfs4_clnt_init(void) 3080 { 3081 nfs4_vnops_init(); 3082 (void) nfs4_rnode_init(); 3083 (void) nfs4_shadow_init(); 3084 (void) nfs4_acache_init(); 3085 (void) nfs4_subr_init(); 3086 nfs4_acl_init(); 3087 nfs_idmap_init(); 3088 nfs4_callback_init(); 3089 nfs4_secinfo_init(); 3090 #ifdef DEBUG 3091 tsd_create(&nfs4_tsd_key, NULL); 3092 #endif 3093 3094 /* 3095 * Add a CPR callback so that we can update client 3096 * lease after a suspend and resume. 3097 */ 3098 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3099 3100 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3101 nfs4_mi_destroy); 3102 3103 /* 3104 * Initialise the reference count of the notsupp xattr cache vnode to 1 3105 * so that it never goes away (VOP_INACTIVE isn't called on it). 3106 */ 3107 nfs4_xattr_notsupp_vnode.v_count = 1; 3108 } 3109 3110 void 3111 nfs4_clnt_fini(void) 3112 { 3113 (void) zone_key_delete(mi4_list_key); 3114 nfs4_vnops_fini(); 3115 (void) nfs4_rnode_fini(); 3116 (void) nfs4_shadow_fini(); 3117 (void) nfs4_acache_fini(); 3118 (void) nfs4_subr_fini(); 3119 nfs_idmap_fini(); 3120 nfs4_callback_fini(); 3121 nfs4_secinfo_fini(); 3122 #ifdef DEBUG 3123 tsd_destroy(&nfs4_tsd_key); 3124 #endif 3125 if (cid) 3126 (void) callb_delete(cid); 3127 } 3128 3129 /*ARGSUSED*/ 3130 static boolean_t 3131 nfs4_client_cpr_callb(void *arg, int code) 3132 { 3133 /* 3134 * We get called for Suspend and Resume events. 3135 * For the suspend case we simply don't care! 3136 */ 3137 if (code == CB_CODE_CPR_CHKPT) { 3138 return (B_TRUE); 3139 } 3140 3141 /* 3142 * When we get to here we are in the process of 3143 * resuming the system from a previous suspend. 
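 *
 * Record the time of the resume; the renew thread compares this
 * timestamp against last_renewal_time and issues an explicit RENEW
 * if no renewal has happened since the system came back up.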
3144 */ 3145 nfs4_client_resumed = gethrestime_sec(); 3146 return (B_TRUE); 3147 } 3148 3149 void 3150 nfs4_renew_lease_thread(nfs4_server_t *sp) 3151 { 3152 int error = 0; 3153 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs; 3154 clock_t tick_delay = 0; 3155 clock_t time_left = 0; 3156 callb_cpr_t cpr_info; 3157 kmutex_t cpr_lock; 3158 3159 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3160 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp)); 3161 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 3162 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease"); 3163 3164 mutex_enter(&sp->s_lock); 3165 /* sp->s_lease_time is set via a GETATTR */ 3166 sp->last_renewal_time = gethrestime_sec(); 3167 sp->lease_valid = NFS4_LEASE_UNINITIALIZED; 3168 ASSERT(sp->s_refcnt >= 1); 3169 3170 for (;;) { 3171 if (!sp->state_ref_count || 3172 sp->lease_valid != NFS4_LEASE_VALID) { 3173 3174 kip_secs = MAX((sp->s_lease_time >> 1) - 3175 (3 * sp->propagation_delay.tv_sec), 1); 3176 3177 tick_delay = SEC_TO_TICK(kip_secs); 3178 3179 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3180 "nfs4_renew_lease_thread: no renew : thread " 3181 "wait %ld secs", kip_secs)); 3182 3183 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3184 "nfs4_renew_lease_thread: no renew : " 3185 "state_ref_count %d, lease_valid %d", 3186 sp->state_ref_count, sp->lease_valid)); 3187 3188 mutex_enter(&cpr_lock); 3189 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3190 mutex_exit(&cpr_lock); 3191 time_left = cv_timedwait(&sp->cv_thread_exit, 3192 &sp->s_lock, tick_delay + lbolt); 3193 mutex_enter(&cpr_lock); 3194 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3195 mutex_exit(&cpr_lock); 3196 3197 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3198 "nfs4_renew_lease_thread: no renew: " 3199 "time left %ld", time_left)); 3200 3201 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3202 goto die; 3203 continue; 3204 } 3205 3206 tmp_last_renewal_time = sp->last_renewal_time; 3207 3208 tmp_time = gethrestime_sec() - sp->last_renewal_time + 3209 (3 * sp->propagation_delay.tv_sec); 3210 3211 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3212 "nfs4_renew_lease_thread: tmp_time %ld, " 3213 "sp->last_renewal_time %ld", tmp_time, 3214 sp->last_renewal_time)); 3215 3216 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1); 3217 3218 tick_delay = SEC_TO_TICK(kip_secs); 3219 3220 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3221 "nfs4_renew_lease_thread: valid lease: sleep for %ld " 3222 "secs", kip_secs)); 3223 3224 mutex_enter(&cpr_lock); 3225 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3226 mutex_exit(&cpr_lock); 3227 time_left = cv_timedwait(&sp->cv_thread_exit, &sp->s_lock, 3228 tick_delay + lbolt); 3229 mutex_enter(&cpr_lock); 3230 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3231 mutex_exit(&cpr_lock); 3232 3233 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3234 "nfs4_renew_lease_thread: valid lease: time left %ld :" 3235 "sp last_renewal_time %ld, nfs4_client_resumed %ld, " 3236 "tmp_last_renewal_time %ld", time_left, 3237 sp->last_renewal_time, nfs4_client_resumed, 3238 tmp_last_renewal_time)); 3239 3240 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3241 goto die; 3242 3243 if (tmp_last_renewal_time == sp->last_renewal_time || 3244 (nfs4_client_resumed != 0 && 3245 nfs4_client_resumed > sp->last_renewal_time)) { 3246 /* 3247 * Issue RENEW op since we haven't renewed the lease 3248 * since we slept. 3249 */ 3250 tmp_now_time = gethrestime_sec(); 3251 error = nfs4renew(sp); 3252 /* 3253 * Need to re-acquire sp's lock, nfs4renew() 3254 * relinquishes it.
3255 */ 3256 mutex_enter(&sp->s_lock); 3257 3258 /* 3259 * See if someone changed s_thread_exit while we gave 3260 * up s_lock. 3261 */ 3262 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3263 goto die; 3264 3265 if (!error) { 3266 /* 3267 * check to see if we implicitly renewed while 3268 * we waited for a reply for our RENEW call. 3269 */ 3270 if (tmp_last_renewal_time == 3271 sp->last_renewal_time) { 3272 /* no implicit renew came */ 3273 sp->last_renewal_time = tmp_now_time; 3274 } else { 3275 NFS4_DEBUG(nfs4_client_lease_debug, 3276 (CE_NOTE, "renew_thread: did " 3277 "implicit renewal before reply " 3278 "from server for RENEW")); 3279 } 3280 } else { 3281 /* figure out error */ 3282 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3283 "renew_thread: nfs4renew returned error" 3284 " %d", error)); 3285 } 3286 3287 } 3288 } 3289 3290 die: 3291 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3292 "nfs4_renew_lease_thread: thread exiting")); 3293 3294 while (sp->s_otw_call_count != 0) { 3295 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3296 "nfs4_renew_lease_thread: waiting for outstanding " 3297 "otw calls to finish for sp 0x%p, current " 3298 "s_otw_call_count %d", (void *)sp, 3299 sp->s_otw_call_count)); 3300 mutex_enter(&cpr_lock); 3301 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3302 mutex_exit(&cpr_lock); 3303 cv_wait(&sp->s_cv_otw_count, &sp->s_lock); 3304 mutex_enter(&cpr_lock); 3305 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3306 mutex_exit(&cpr_lock); 3307 } 3308 mutex_exit(&sp->s_lock); 3309 3310 nfs4_server_rele(sp); /* free the thread's reference */ 3311 nfs4_server_rele(sp); /* free the list's reference */ 3312 sp = NULL; 3313 3314 done: 3315 mutex_enter(&cpr_lock); 3316 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */ 3317 mutex_destroy(&cpr_lock); 3318 3319 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3320 "nfs4_renew_lease_thread: renew thread exit officially")); 3321 3322 zthread_exit(); 3323 /* NOT REACHED */ 3324 } 3325 3326 /* 3327 * Send out a RENEW op to the server. 3328 * Assumes sp is locked down. 
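 *
 * Note that sp->s_lock is dropped while the RENEW is outstanding, so
 * the caller must re-acquire it afterwards. A server reply of
 * NFS4ERR_CB_PATH_DOWN still counts as a successful renewal and is
 * reported as 0.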
3329 */ 3330 static int 3331 nfs4renew(nfs4_server_t *sp) 3332 { 3333 COMPOUND4args_clnt args; 3334 COMPOUND4res_clnt res; 3335 nfs_argop4 argop[1]; 3336 int doqueue = 1; 3337 int rpc_error; 3338 cred_t *cr; 3339 mntinfo4_t *mi; 3340 timespec_t prop_time, after_time; 3341 int needrecov = FALSE; 3342 nfs4_recov_state_t recov_state; 3343 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3344 3345 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3346 3347 recov_state.rs_flags = 0; 3348 recov_state.rs_num_retry_despite_err = 0; 3349 3350 recov_retry: 3351 mi = sp->mntinfo4_list; 3352 VFS_HOLD(mi->mi_vfsp); 3353 mutex_exit(&sp->s_lock); 3354 ASSERT(mi != NULL); 3355 3356 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3357 if (e.error) { 3358 VFS_RELE(mi->mi_vfsp); 3359 return (e.error); 3360 } 3361 3362 /* Check to see if we're dealing with a marked-dead sp */ 3363 mutex_enter(&sp->s_lock); 3364 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3365 mutex_exit(&sp->s_lock); 3366 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3367 VFS_RELE(mi->mi_vfsp); 3368 return (0); 3369 } 3370 3371 /* Make sure mi hasn't changed on us */ 3372 if (mi != sp->mntinfo4_list) { 3373 /* Must drop sp's lock to avoid a recursive mutex enter */ 3374 mutex_exit(&sp->s_lock); 3375 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3376 VFS_RELE(mi->mi_vfsp); 3377 mutex_enter(&sp->s_lock); 3378 goto recov_retry; 3379 } 3380 mutex_exit(&sp->s_lock); 3381 3382 args.ctag = TAG_RENEW; 3383 3384 args.array_len = 1; 3385 args.array = argop; 3386 3387 argop[0].argop = OP_RENEW; 3388 3389 mutex_enter(&sp->s_lock); 3390 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3391 cr = sp->s_cred; 3392 crhold(cr); 3393 mutex_exit(&sp->s_lock); 3394 3395 ASSERT(cr != NULL); 3396 3397 /* used to figure out RTT for sp */ 3398 gethrestime(&prop_time); 3399 3400 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3401 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first", 3402 (void*)sp)); 3403 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3404 prop_time.tv_sec, prop_time.tv_nsec)); 3405 3406 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3407 mntinfo4_t *, mi); 3408 3409 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3410 crfree(cr); 3411 3412 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3413 mntinfo4_t *, mi); 3414 3415 gethrestime(&after_time); 3416 3417 mutex_enter(&sp->s_lock); 3418 sp->propagation_delay.tv_sec = 3419 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3420 mutex_exit(&sp->s_lock); 3421 3422 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3423 after_time.tv_sec, after_time.tv_nsec)); 3424 3425 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3426 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3427 nfs4_delegreturn_all(sp); 3428 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3429 VFS_RELE(mi->mi_vfsp); 3430 /* 3431 * If the server returns CB_PATH_DOWN, it has renewed 3432 * the lease and informed us that the callback path is 3433 * down. Since the lease is renewed, just return 0 and 3434 * let the renew thread proceed as normal. 
3435 */ 3436 return (0); 3437 } 3438 3439 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3440 if (!needrecov && e.error) { 3441 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3442 VFS_RELE(mi->mi_vfsp); 3443 return (e.error); 3444 } 3445 3446 rpc_error = e.error; 3447 3448 if (needrecov) { 3449 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3450 "nfs4renew: initiating recovery\n")); 3451 3452 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3453 OP_RENEW, NULL) == FALSE) { 3454 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3455 VFS_RELE(mi->mi_vfsp); 3456 if (!e.error) 3457 (void) xdr_free(xdr_COMPOUND4res_clnt, 3458 (caddr_t)&res); 3459 mutex_enter(&sp->s_lock); 3460 goto recov_retry; 3461 } 3462 /* fall through for res.status case */ 3463 } 3464 3465 if (res.status) { 3466 if (res.status == NFS4ERR_LEASE_MOVED) { 3467 /*EMPTY*/ 3468 /* 3469 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3470 * to renew the lease on that server 3471 */ 3472 } 3473 e.error = geterrno4(res.status); 3474 } 3475 3476 if (!rpc_error) 3477 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3478 3479 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3480 3481 VFS_RELE(mi->mi_vfsp); 3482 3483 return (e.error); 3484 } 3485 3486 void 3487 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3488 { 3489 nfs4_server_t *sp; 3490 3491 /* this locks down sp if it is found */ 3492 sp = find_nfs4_server(mi); 3493 3494 if (sp != NULL) { 3495 nfs4_inc_state_ref_count_nolock(sp, mi); 3496 mutex_exit(&sp->s_lock); 3497 nfs4_server_rele(sp); 3498 } 3499 } 3500 3501 /* 3502 * Bump the number of OPEN files (ie: those with state) so we know if this 3503 * nfs4_server has any state to maintain a lease for or not. 3504 * 3505 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 3506 */ 3507 void 3508 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3509 { 3510 ASSERT(mutex_owned(&sp->s_lock)); 3511 3512 sp->state_ref_count++; 3513 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3514 "nfs4_inc_state_ref_count: state_ref_count now %d", 3515 sp->state_ref_count)); 3516 3517 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3518 sp->lease_valid = NFS4_LEASE_VALID; 3519 3520 /* 3521 * If this call caused the lease to be marked valid and/or 3522 * took the state_ref_count from 0 to 1, then start the time 3523 * on lease renewal. 3524 */ 3525 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3526 sp->last_renewal_time = gethrestime_sec(); 3527 3528 /* update the number of open files for mi */ 3529 mi->mi_open_files++; 3530 } 3531 3532 void 3533 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3534 { 3535 nfs4_server_t *sp; 3536 3537 /* this locks down sp if it is found */ 3538 sp = find_nfs4_server_all(mi, 1); 3539 3540 if (sp != NULL) { 3541 nfs4_dec_state_ref_count_nolock(sp, mi); 3542 mutex_exit(&sp->s_lock); 3543 nfs4_server_rele(sp); 3544 } 3545 } 3546 3547 /* 3548 * Decrement the number of OPEN files (ie: those with state) so we know if 3549 * this nfs4_server has any state to maintain a lease for or not. 
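 *
 * Each nfs4_inc_state_ref_count() is expected to be balanced by one
 * of these decrements; when the last open file for a mount goes away
 * and MI4_REMOVE_ON_LAST_CLOSE is set, the mntinfo4 is removed from
 * the server's list below.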
3550 */ 3551 void 3552 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3553 { 3554 ASSERT(mutex_owned(&sp->s_lock)); 3555 ASSERT(sp->state_ref_count != 0); 3556 sp->state_ref_count--; 3557 3558 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3559 "nfs4_dec_state_ref_count: state ref count now %d", 3560 sp->state_ref_count)); 3561 3562 mi->mi_open_files--; 3563 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3564 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3565 mi->mi_open_files, mi->mi_flags)); 3566 3567 /* We don't have to hold the mi_lock to test mi_flags */ 3568 if (mi->mi_open_files == 0 && 3569 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3570 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3571 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3572 "we have closed the last open file", (void*)mi)); 3573 nfs4_remove_mi_from_server(mi, sp); 3574 } 3575 } 3576 3577 bool_t 3578 inlease(nfs4_server_t *sp) 3579 { 3580 bool_t result; 3581 3582 ASSERT(mutex_owned(&sp->s_lock)); 3583 3584 if (sp->lease_valid == NFS4_LEASE_VALID && 3585 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3586 result = TRUE; 3587 else 3588 result = FALSE; 3589 3590 return (result); 3591 } 3592 3593 3594 /* 3595 * Return non-zero if the given nfs4_server_t is going through recovery. 3596 */ 3597 3598 int 3599 nfs4_server_in_recovery(nfs4_server_t *sp) 3600 { 3601 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3602 } 3603 3604 /* 3605 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3606 * first is less than, equal to, or greater than the second. 3607 */ 3608 3609 int 3610 sfh4cmp(const void *p1, const void *p2) 3611 { 3612 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3613 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3614 3615 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3616 } 3617 3618 /* 3619 * Create a table for shared filehandle objects. 3620 */ 3621 3622 void 3623 sfh4_createtab(avl_tree_t *tab) 3624 { 3625 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3626 offsetof(nfs4_sharedfh_t, sfh_tree)); 3627 } 3628 3629 /* 3630 * Return a shared filehandle object for the given filehandle. The caller 3631 * is responsible for eventually calling sfh4_rele(). 3632 */ 3633 3634 nfs4_sharedfh_t * 3635 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3636 { 3637 nfs4_sharedfh_t *sfh, *nsfh; 3638 avl_index_t where; 3639 nfs4_sharedfh_t skey; 3640 3641 if (!key) { 3642 skey.sfh_fh = *fh; 3643 key = &skey; 3644 } 3645 3646 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3647 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3648 /* 3649 * We allocate the largest possible filehandle size because it's 3650 * not that big, and it saves us from possibly having to resize the 3651 * buffer later. 
3652 */ 3653 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3654 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3655 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3656 nsfh->sfh_refcnt = 1; 3657 nsfh->sfh_flags = SFH4_IN_TREE; 3658 nsfh->sfh_mi = mi; 3659 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3660 (void *)nsfh)); 3661 3662 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3663 sfh = avl_find(&mi->mi_filehandles, key, &where); 3664 if (sfh != NULL) { 3665 mutex_enter(&sfh->sfh_lock); 3666 sfh->sfh_refcnt++; 3667 mutex_exit(&sfh->sfh_lock); 3668 nfs_rw_exit(&mi->mi_fh_lock); 3669 /* free our speculative allocs */ 3670 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3671 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3672 return (sfh); 3673 } 3674 3675 avl_insert(&mi->mi_filehandles, nsfh, where); 3676 nfs_rw_exit(&mi->mi_fh_lock); 3677 3678 return (nsfh); 3679 } 3680 3681 /* 3682 * Return a shared filehandle object for the given filehandle. The caller 3683 * is responsible for eventually calling sfh4_rele(). 3684 */ 3685 3686 nfs4_sharedfh_t * 3687 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3688 { 3689 nfs4_sharedfh_t *sfh; 3690 nfs4_sharedfh_t key; 3691 3692 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3693 3694 #ifdef DEBUG 3695 if (nfs4_sharedfh_debug) { 3696 nfs4_fhandle_t fhandle; 3697 3698 fhandle.fh_len = fh->nfs_fh4_len; 3699 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3700 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3701 nfs4_printfhandle(&fhandle); 3702 } 3703 #endif 3704 3705 /* 3706 * If there's already an object for the given filehandle, bump the 3707 * reference count and return it. Otherwise, create a new object 3708 * and add it to the AVL tree. 3709 */ 3710 3711 key.sfh_fh = *fh; 3712 3713 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3714 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3715 if (sfh != NULL) { 3716 mutex_enter(&sfh->sfh_lock); 3717 sfh->sfh_refcnt++; 3718 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3719 "sfh4_get: found existing %p, new refcnt=%d", 3720 (void *)sfh, sfh->sfh_refcnt)); 3721 mutex_exit(&sfh->sfh_lock); 3722 nfs_rw_exit(&mi->mi_fh_lock); 3723 return (sfh); 3724 } 3725 nfs_rw_exit(&mi->mi_fh_lock); 3726 3727 return (sfh4_put(fh, mi, &key)); 3728 } 3729 3730 /* 3731 * Get a reference to the given shared filehandle object. 3732 */ 3733 3734 void 3735 sfh4_hold(nfs4_sharedfh_t *sfh) 3736 { 3737 ASSERT(sfh->sfh_refcnt > 0); 3738 3739 mutex_enter(&sfh->sfh_lock); 3740 sfh->sfh_refcnt++; 3741 NFS4_DEBUG(nfs4_sharedfh_debug, 3742 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3743 (void *)sfh, sfh->sfh_refcnt)); 3744 mutex_exit(&sfh->sfh_lock); 3745 } 3746 3747 /* 3748 * Release a reference to the given shared filehandle object and null out 3749 * the given pointer. 3750 */ 3751 3752 void 3753 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3754 { 3755 mntinfo4_t *mi; 3756 nfs4_sharedfh_t *sfh = *sfhpp; 3757 3758 ASSERT(sfh->sfh_refcnt > 0); 3759 3760 mutex_enter(&sfh->sfh_lock); 3761 if (sfh->sfh_refcnt > 1) { 3762 sfh->sfh_refcnt--; 3763 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3764 "sfh4_rele %p, new refcnt=%d", 3765 (void *)sfh, sfh->sfh_refcnt)); 3766 mutex_exit(&sfh->sfh_lock); 3767 goto finish; 3768 } 3769 mutex_exit(&sfh->sfh_lock); 3770 3771 /* 3772 * Possibly the last reference, so get the lock for the table in 3773 * case it's time to remove the object from the table. 
3774 */ 3775 mi = sfh->sfh_mi; 3776 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3777 mutex_enter(&sfh->sfh_lock); 3778 sfh->sfh_refcnt--; 3779 if (sfh->sfh_refcnt > 0) { 3780 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3781 "sfh4_rele %p, new refcnt=%d", 3782 (void *)sfh, sfh->sfh_refcnt)); 3783 mutex_exit(&sfh->sfh_lock); 3784 nfs_rw_exit(&mi->mi_fh_lock); 3785 goto finish; 3786 } 3787 3788 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3789 "sfh4_rele %p, last ref", (void *)sfh)); 3790 if (sfh->sfh_flags & SFH4_IN_TREE) { 3791 avl_remove(&mi->mi_filehandles, sfh); 3792 sfh->sfh_flags &= ~SFH4_IN_TREE; 3793 } 3794 mutex_exit(&sfh->sfh_lock); 3795 nfs_rw_exit(&mi->mi_fh_lock); 3796 mutex_destroy(&sfh->sfh_lock); 3797 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3798 kmem_free(sfh, sizeof (nfs4_sharedfh_t)); 3799 3800 finish: 3801 *sfhpp = NULL; 3802 } 3803 3804 /* 3805 * Update the filehandle for the given shared filehandle object. 3806 */ 3807 3808 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */ 3809 3810 void 3811 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh) 3812 { 3813 mntinfo4_t *mi = sfh->sfh_mi; 3814 nfs4_sharedfh_t *dupsfh; 3815 avl_index_t where; 3816 nfs4_sharedfh_t key; 3817 3818 #ifdef DEBUG 3819 mutex_enter(&sfh->sfh_lock); 3820 ASSERT(sfh->sfh_refcnt > 0); 3821 mutex_exit(&sfh->sfh_lock); 3822 #endif 3823 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE); 3824 3825 /* 3826 * The basic plan is to remove the shared filehandle object from 3827 * the table, update it to have the new filehandle, then reinsert 3828 * it. 3829 */ 3830 3831 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3832 mutex_enter(&sfh->sfh_lock); 3833 if (sfh->sfh_flags & SFH4_IN_TREE) { 3834 avl_remove(&mi->mi_filehandles, sfh); 3835 sfh->sfh_flags &= ~SFH4_IN_TREE; 3836 } 3837 mutex_exit(&sfh->sfh_lock); 3838 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len; 3839 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val, 3840 sfh->sfh_fh.nfs_fh4_len); 3841 3842 /* 3843 * XXX If there is already a shared filehandle object with the new 3844 * filehandle, we're in trouble, because the rnode code assumes 3845 * that there is only one shared filehandle object for a given 3846 * filehandle. So issue a warning (for read-write mounts only) 3847 * and don't try to re-insert the given object into the table. 3848 * Hopefully the given object will quickly go away and everyone 3849 * will use the new object. 3850 */ 3851 key.sfh_fh = *newfh; 3852 dupsfh = avl_find(&mi->mi_filehandles, &key, &where); 3853 if (dupsfh != NULL) { 3854 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) { 3855 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: " 3856 "duplicate filehandle detected"); 3857 sfh4_printfhandle(dupsfh); 3858 } 3859 } else { 3860 avl_insert(&mi->mi_filehandles, sfh, where); 3861 mutex_enter(&sfh->sfh_lock); 3862 sfh->sfh_flags |= SFH4_IN_TREE; 3863 mutex_exit(&sfh->sfh_lock); 3864 } 3865 nfs_rw_exit(&mi->mi_fh_lock); 3866 } 3867 3868 /* 3869 * Copy out the current filehandle for the given shared filehandle object. 
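 *
 * Typical use, as in sfh4_printfhandle() below:
 *
 *	nfs4_fhandle_t fhandle;
 *
 *	sfh4_copyval(sfh, &fhandle);
 *	nfs4_printfhandle(&fhandle);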
3870 */ 3871 3872 void 3873 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp) 3874 { 3875 mntinfo4_t *mi = sfh->sfh_mi; 3876 3877 ASSERT(sfh->sfh_refcnt > 0); 3878 3879 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3880 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len; 3881 ASSERT(fhp->fh_len <= NFS4_FHSIZE); 3882 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len); 3883 nfs_rw_exit(&mi->mi_fh_lock); 3884 } 3885 3886 /* 3887 * Print out the filehandle for the given shared filehandle object. 3888 */ 3889 3890 void 3891 sfh4_printfhandle(const nfs4_sharedfh_t *sfh) 3892 { 3893 nfs4_fhandle_t fhandle; 3894 3895 sfh4_copyval(sfh, &fhandle); 3896 nfs4_printfhandle(&fhandle); 3897 } 3898 3899 /* 3900 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0 3901 * if they're the same, +1 if the first is "greater" than the second. The 3902 * caller (or whoever's calling the AVL package) is responsible for 3903 * handling locking issues. 3904 */ 3905 3906 static int 3907 fncmp(const void *p1, const void *p2) 3908 { 3909 const nfs4_fname_t *f1 = p1; 3910 const nfs4_fname_t *f2 = p2; 3911 int res; 3912 3913 res = strcmp(f1->fn_name, f2->fn_name); 3914 /* 3915 * The AVL package wants +/-1, not arbitrary positive or negative 3916 * integers. 3917 */ 3918 if (res > 0) 3919 res = 1; 3920 else if (res < 0) 3921 res = -1; 3922 return (res); 3923 } 3924 3925 /* 3926 * Get or create an fname with the given name, as a child of the given 3927 * fname. The caller is responsible for eventually releasing the reference 3928 * (fn_rele()). parent may be NULL. 3929 */ 3930 3931 nfs4_fname_t * 3932 fn_get(nfs4_fname_t *parent, char *name) 3933 { 3934 nfs4_fname_t key; 3935 nfs4_fname_t *fnp; 3936 avl_index_t where; 3937 3938 key.fn_name = name; 3939 3940 /* 3941 * If there's already an fname registered with the given name, bump 3942 * its reference count and return it. Otherwise, create a new one 3943 * and add it to the parent's AVL tree. 3944 */ 3945 3946 if (parent != NULL) { 3947 mutex_enter(&parent->fn_lock); 3948 fnp = avl_find(&parent->fn_children, &key, &where); 3949 if (fnp != NULL) { 3950 fn_hold(fnp); 3951 mutex_exit(&parent->fn_lock); 3952 return (fnp); 3953 } 3954 } 3955 3956 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP); 3957 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL); 3958 fnp->fn_parent = parent; 3959 if (parent != NULL) 3960 fn_hold(parent); 3961 fnp->fn_len = strlen(name); 3962 ASSERT(fnp->fn_len < MAXNAMELEN); 3963 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP); 3964 (void) strcpy(fnp->fn_name, name); 3965 fnp->fn_refcnt = 1; 3966 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t), 3967 offsetof(nfs4_fname_t, fn_tree)); 3968 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3969 "fn_get %p:%s, a new nfs4_fname_t!", 3970 (void *)fnp, fnp->fn_name)); 3971 if (parent != NULL) { 3972 avl_insert(&parent->fn_children, fnp, where); 3973 mutex_exit(&parent->fn_lock); 3974 } 3975 3976 return (fnp); 3977 } 3978 3979 void 3980 fn_hold(nfs4_fname_t *fnp) 3981 { 3982 atomic_add_32(&fnp->fn_refcnt, 1); 3983 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3984 "fn_hold %p:%s, new refcnt=%d", 3985 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 3986 } 3987 3988 /* 3989 * Decrement the reference count of the given fname, and destroy it if its 3990 * reference count goes to zero. Nulls out the given pointer. 
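 *
 * A minimal usage sketch (the name "foo" is purely illustrative):
 *
 *	nfs4_fname_t *fnp;
 *
 *	fnp = fn_get(parent, "foo");
 *	...
 *	fn_rele(&fnp);		fnp is NULL on return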
/*
 * Decrement the reference count of the given fname, and destroy it if its
 * reference count goes to zero.  Nulls out the given pointer.
 */

void
fn_rele(nfs4_fname_t **fnpp)
{
	nfs4_fname_t *parent;
	uint32_t newref;
	nfs4_fname_t *fnp;

recur:
	fnp = *fnpp;
	*fnpp = NULL;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		mutex_enter(&parent->fn_lock);	/* prevent new references */
	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
	if (newref > 0) {
		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
		    "fn_rele %p:%s, new refcnt=%d",
		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
		if (parent != NULL)
			mutex_exit(&parent->fn_lock);
		mutex_exit(&fnp->fn_lock);
		return;
	}

	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_rele %p:%s, last reference, deleting...",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
	}
	kmem_free(fnp->fn_name, fnp->fn_len + 1);
	mutex_destroy(&fnp->fn_lock);
	avl_destroy(&fnp->fn_children);
	kmem_free(fnp, sizeof (nfs4_fname_t));
	/*
	 * Recursively fn_rele the parent.
	 * Use goto instead of a recursive call to avoid stack overflow.
	 */
	if (parent != NULL) {
		fnpp = &parent;
		goto recur;
	}
}

/*
 * Returns the single component name of the given fname, in a MAXNAMELEN
 * string buffer, which the caller is responsible for freeing.  Note that
 * the name may become invalid as a result of fn_move().
 */

char *
fn_name(nfs4_fname_t *fnp)
{
	char *name;

	ASSERT(fnp->fn_len < MAXNAMELEN);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	mutex_enter(&fnp->fn_lock);
	(void) strcpy(name, fnp->fn_name);
	mutex_exit(&fnp->fn_lock);

	return (name);
}

/*
 * fn_path_realloc
 *
 * This function, used only by fn_path, constructs a new string which
 * looks like "prepend" + "/" + "current", by allocating a new string
 * and freeing the old one.
 */
static void
fn_path_realloc(char **curses, char *prepend)
{
	int len, curlen = 0;
	char *news;

	if (*curses == NULL) {
		/*
		 * Prime the pump, allocate just the
		 * space for prepend and return that.
		 */
		len = strlen(prepend) + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
	} else {
		/*
		 * Allocate the space for a new string; the +1 +1 is
		 * for the "/" and the NULL byte at the end of it all.
		 */
		curlen = strlen(*curses);
		len = curlen + strlen(prepend) + 1 + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
		(void) strcat(news, "/");
		(void) strcat(news, *curses);
		kmem_free(*curses, curlen + 1);
	}
	*curses = news;
}
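
/*
 * Illustrative sketch only (not compiled; see the #ifdef notdef guard):
 * how the prepend helper above composes a path.  Feeding it components
 * from leaf to root ("file", then "dir", then "export") leaves
 * "export/dir/file" in the buffer, which the caller eventually frees
 * with kmem_free(path, strlen(path) + 1).  The helper name
 * example_fn_path_realloc_demo() is an assumption made for illustration.
 */
#ifdef notdef
static char *
example_fn_path_realloc_demo(void)
{
	char *path = NULL;

	fn_path_realloc(&path, "file");		/* path is "file" */
	fn_path_realloc(&path, "dir");		/* path is "dir/file" */
	fn_path_realloc(&path, "export");	/* path is "export/dir/file" */

	return (path);
}
#endif
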
/*
 * Returns the path name (starting from the fs root) for the given fname.
 * The caller is responsible for freeing.  Note that the path may be or
 * become invalid as a result of fn_move().
 */

char *
fn_path(nfs4_fname_t *fnp)
{
	char *path;
	nfs4_fname_t *nextfnp;

	if (fnp == NULL)
		return (NULL);

	path = NULL;

	/* walk up the tree constructing the pathname. */

	fn_hold(fnp);			/* adjust for later rele */
	do {
		mutex_enter(&fnp->fn_lock);
		/*
		 * Add fn_name in front of the current path
		 */
		fn_path_realloc(&path, fnp->fn_name);
		nextfnp = fnp->fn_parent;
		if (nextfnp != NULL)
			fn_hold(nextfnp);
		mutex_exit(&fnp->fn_lock);
		fn_rele(&fnp);
		fnp = nextfnp;
	} while (fnp != NULL);

	return (path);
}

/*
 * Return a reference to the parent of the given fname, which the caller is
 * responsible for eventually releasing.
 */

nfs4_fname_t *
fn_parent(nfs4_fname_t *fnp)
{
	nfs4_fname_t *parent;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		fn_hold(parent);
	mutex_exit(&fnp->fn_lock);

	return (parent);
}

/*
 * Update fnp so that its parent is newparent and its name is newname.
 */

void
fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
{
	nfs4_fname_t *parent, *tmpfnp;
	ssize_t newlen;
	nfs4_fname_t key;
	avl_index_t where;

	/*
	 * This assert exists to catch the client trying to rename
	 * a dir to be a child of itself.  This happened at a recent
	 * bakeoff against a 3rd party (broken) server which allowed
	 * the rename to succeed.  If it trips it means that:
	 *	a) the code in nfs4rename that detects this case is broken
	 *	b) the server is broken (since it allowed the bogus rename)
	 *
	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
	 * panic below from:  mutex_enter(&newparent->fn_lock);
	 */
	ASSERT(fnp != newparent);

	/*
	 * Remove fnp from its current parent, change its name, then add it
	 * to newparent.
	 */
	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	mutex_enter(&parent->fn_lock);
	avl_remove(&parent->fn_children, fnp);
	mutex_exit(&parent->fn_lock);
	fn_rele(&fnp->fn_parent);

	newlen = strlen(newname);
	if (newlen != fnp->fn_len) {
		ASSERT(newlen < MAXNAMELEN);
		kmem_free(fnp->fn_name, fnp->fn_len + 1);
		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
		fnp->fn_len = newlen;
	}
	(void) strcpy(fnp->fn_name, newname);

again:
	mutex_enter(&newparent->fn_lock);
	key.fn_name = fnp->fn_name;
	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
	if (tmpfnp != NULL) {
		/*
		 * This could be due to a file that was unlinked while
		 * open, or perhaps the rnode is in the free list.  Remove
		 * it from newparent and let it go away on its own.  The
		 * contorted code is to deal with lock order issues and
		 * race conditions.
		 */
		fn_hold(tmpfnp);
		mutex_exit(&newparent->fn_lock);
		mutex_enter(&tmpfnp->fn_lock);
		if (tmpfnp->fn_parent == newparent) {
			mutex_enter(&newparent->fn_lock);
			avl_remove(&newparent->fn_children, tmpfnp);
			mutex_exit(&newparent->fn_lock);
			fn_rele(&tmpfnp->fn_parent);
		}
		mutex_exit(&tmpfnp->fn_lock);
		fn_rele(&tmpfnp);
		goto again;
	}
	fnp->fn_parent = newparent;
	fn_hold(newparent);
	avl_insert(&newparent->fn_children, fnp, where);
	mutex_exit(&newparent->fn_lock);
	mutex_exit(&fnp->fn_lock);
}
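
/*
 * Illustrative sketch only (not compiled; see the #ifdef notdef guard):
 * how a successful rename might be reflected in the fname tree.  The
 * renamed entry keeps its identity, so existing references remain valid,
 * although names and paths previously obtained via fn_name()/fn_path()
 * may become stale, as noted above.  The helper name example_fn_rename()
 * and the way the source entry is located are assumptions made for
 * illustration.
 */
#ifdef notdef
static void
example_fn_rename(nfs4_fname_t *srcdir, nfs4_fname_t *dstdir,
    char *oldname, char *newname)
{
	nfs4_fname_t *fnp;

	/* Get a held reference to the existing entry under srcdir. */
	fnp = fn_get(srcdir, oldname);

	/* Reparent it under dstdir and give it its new component name. */
	fn_move(fnp, dstdir, newname);

	/* Drop the reference taken above. */
	fn_rele(&fnp);
}
#endif
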
#ifdef DEBUG
/*
 * Return non-zero if the type information makes sense for the given vnode.
 * Otherwise panic.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	if (nfs4_vtype_debug && vp->v_type != VNON &&
	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
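
/*
 * Illustrative sketch only (not compiled; see the #ifdef notdef guard):
 * nfs4_consistent_type() always returns 1 (or panics when
 * nfs4_vtype_debug is set and the vnode and rnode types disagree), so it
 * is intended to be wrapped in ASSERT() and compiled away entirely in
 * non-DEBUG kernels.  The helper name example_check_vnode_type() is an
 * assumption made for illustration.
 */
#ifdef notdef
static void
example_check_vnode_type(vnode_t *vp)
{
	/* Cheap sanity check; only has effect in DEBUG kernels. */
	ASSERT(nfs4_consistent_type(vp));
}
#endif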