/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All Rights Reserved
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/dnlc.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/acl.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <sys/disp.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>

#include <sys/ddi.h>

/*
 * Arguments to page-flush thread.
 */
typedef struct {
	vnode_t *vp;
	cred_t *cr;
} pgflush_t;

#ifdef DEBUG
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

uint_t nfs4_tsd_key;
#endif

static time_t	nfs4_client_resumed = 0;
static callb_id_t cid = 0;

static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);
static void	flush_pages(vnode_t *, cred_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);

struct mi4_globals {
	kmutex_t	mig_lock;  /* lock protecting mig_list */
	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	boolean_t	mig_destructor_called;
};

static zone_key_t mi4_list_key;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_time_attr_inval)
 * which tells whether the attributes are valid.  The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached.  This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time.  There are minimum and maximum
 * timeout values that can be set per mount point.
 */
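
/*
 * Illustrative sketch only (not part of the original client code): a
 * self-contained version of the adaptive attribute-cache timeout
 * described above, using plain integer nanosecond values.  The names
 * acmin/acmax are hypothetical stand-ins for the per-mount
 * acregmin/acregmax (or acdirmin/acdirmax) limits.
 */
static long long
attrcache_timeout_sketch(long long now, long long time_change_detected,
    long long acmin, long long acmax)
{
	/* Cache for as long as the file has gone without changing... */
	long long delta = now - time_change_detected;

	/* ...but clamp the interval to the per-mount minimum and maximum. */
	if (delta < acmin)
		delta = acmin;
	else if (delta > acmax)
		delta = acmax;

	/* Absolute time at which the cached attributes become invalid. */
	return (now + delta);
}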

/*
 * If a cache purge is in progress, wait for it to finish.
 *
 * The current thread must not be in the middle of an
 * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
 * between this thread, a recovery thread, and the page flush thread.
 */
int
nfs4_waitfor_purge_complete(vnode_t *vp)
{
	rnode4_t *rp;
	k_sigset_t smask;

	rp = VTOR4(vp);
	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
		    ((rp->r_flags & R4PGFLUSH) &&
		    rp->r_pgflush != curthread)) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				sigunintr(&smask);
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		sigunintr(&smask);
		mutex_exit(&rp->r_statelock);
	}
	return (0);
}

/*
 * Validate caches by checking cached attributes.  If they have timed out,
 * then get new attributes from the server.  As a side effect, cache
 * invalidation is done if the attributes have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs4_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	nfs4_ga_res_t gar;

	if (ATTRCACHE4_VALID(vp)) {
		error = nfs4_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	gar.n4g_va.va_mask = AT_ALL;
	return (nfs4_getattr_otw(vp, &gar, cr, 0));
}

/*
 * Fill in attributes from the cache.
 * If valid, return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
{
	rnode4_t *rp;

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	mutex_enter(&rp->r_statev4_lock);
	if (ATTRCACHE4_VALID(vp)) {
		mutex_exit(&rp->r_statev4_lock);
		/*
		 * Cached attributes are valid
		 */
		*vap = rp->r_attr;
		mutex_exit(&rp->r_statelock);
		return (0);
	}
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);
	return (1);
}


/*
 * If the returned error is ESTALE, flush all caches.  The nfs4_purge_caches()
 * call is synchronous because all the pages were invalidated by the
 * nfs4_invalidate_pages() call.
 */
void
nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
{
	struct rnode4 *rp = VTOR4(vp);

	/* Ensure that the ..._end_op() call has been done */
	ASSERT(tsd_get(nfs4_tsd_key) == NULL);

	if (errno != ESTALE)
		return;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= R4STALE;
	if (!rp->r_error)
		rp->r_error = errno;
	mutex_exit(&rp->r_statelock);
	if (nfs4_has_pages(vp))
		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
}

/*
 * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
 * page purge is done asynchronously.
 */
void
nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
{
	rnode4_t *rp;
	char *contents;
	vnode_t *xattr;
	int size;
	int pgflush;			/* are we the page flush thread? */

	/*
	 * Purge the DNLC for any entries which refer to this file.
	 */
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
		dnlc_purge_vp(vp);

	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4LOOKUP;
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;

	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * Purge pathconf cache too.
	 */
	rp->r_pathconf.pc4_xattr_valid = 0;
	rp->r_pathconf.pc4_cache_valid = 0;

	pgflush = (curthread == rp->r_pgflush);
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	if (xattr != NULL)
		VN_RELE(xattr);

	/*
	 * Flush the page cache.  If the current thread is the page flush
	 * thread, don't initiate a new page flush.  There's no need for
	 * it, and doing it correctly is hard.
	 */
	if (nfs4_has_pages(vp) && !pgflush) {
		if (!asyncpg) {
			(void) nfs4_waitfor_purge_complete(vp);
			flush_pages(vp, cr);
		} else {
			pgflush_t *args;

			/*
			 * We don't hold r_statelock while creating the
			 * thread, in case the call blocks.  So we use a
			 * flag to indicate that a page flush thread is
			 * active.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & R4PGFLUSH) {
				mutex_exit(&rp->r_statelock);
			} else {
				rp->r_flags |= R4PGFLUSH;
				mutex_exit(&rp->r_statelock);

				args = kmem_alloc(sizeof (pgflush_t),
				    KM_SLEEP);
				args->vp = vp;
				VN_HOLD(args->vp);
				args->cr = cr;
				crhold(args->cr);
				(void) zthread_create(NULL, 0,
				    nfs4_pgflush_thread, args, 0,
				    minclsyspri);
			}
		}
	}

	/*
	 * Flush the readdir response cache.
	 */
	nfs4_purge_rddir_cache(vp);
}

/*
 * Invalidate all pages for the given file, after writing back the dirty
 * ones.
 */

static void
flush_pages(vnode_t *vp, cred_t *cr)
{
	int error;
	rnode4_t *rp = VTOR4(vp);

	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr);
	if (error == ENOSPC || error == EDQUOT) {
		mutex_enter(&rp->r_statelock);
		if (!rp->r_error)
			rp->r_error = error;
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Page flush thread.
 */

static void
nfs4_pgflush_thread(pgflush_t *args)
{
	rnode4_t *rp = VTOR4(args->vp);

	/* remember which thread we are, so we don't deadlock ourselves */
	mutex_enter(&rp->r_statelock);
	ASSERT(rp->r_pgflush == NULL);
	rp->r_pgflush = curthread;
	mutex_exit(&rp->r_statelock);

	flush_pages(args->vp, args->cr);

	mutex_enter(&rp->r_statelock);
	rp->r_pgflush = NULL;
	rp->r_flags &= ~R4PGFLUSH;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);

	VN_RELE(args->vp);
	crfree(args->cr);
	kmem_free(args, sizeof (pgflush_t));
	zthread_exit();
}

/*
 * Purge the readdir cache of all entries which are not currently
 * being filled.
 */
void
nfs4_purge_rddir_cache(vnode_t *vp)
{
	rnode4_t *rp;

	rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	rp->r_direof = NULL;
	rp->r_flags &= ~R4LOOKUP;
	rp->r_flags |= R4READDIRWATTR;
	rddir4_cache_purge(rp);
	mutex_exit(&rp->r_statelock);
}

/*
 * Set attributes cache for given vnode using virtual attributes.  There is
 * no cache validation, but if the attributes are deemed to be stale, they
 * are ignored.  This corresponds to nfs3_attrcache().
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 */
void
nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
{
	rnode4_t *rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	if (rp->r_time_attr_saved <= t)
		nfs4_attrcache_va(vp, garp, FALSE);
	mutex_exit(&rp->r_statelock);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed;
	int ctime_changed;
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * made, so don't act on them.
		 */
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and the dir didn't change on the server before the dirmod
		 * op, and the dir didn't change after the dirmod op but before
		 * the getattr, then there's a chance that the client's cached
		 * data for this object is current (not stale).  No immediate
		 * cache flush is required.
		 *
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;

			mtime_changed = 0;
			ctime_changed = 0;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do: flush all caches for the dir and remove
			 * the attr timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;
			else
				mtime_changed = 0;
			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;
			else
				ctime_changed = 0;
		} else {
			mtime_changed = 0;
			ctime_changed = 0;
		}
	}

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
static void
nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	hrtime_t delta;
	hrtime_t now;
	vattr_t *vap = &garp->n4g_va;

	rp = VTOR4(vp);

	ASSERT(MUTEX_HELD(&rp->r_statelock));
	ASSERT(vap->va_mask == AT_ALL);

	/* Switch to master before checking v_flag */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	now = gethrtime();

	mi = VTOMI4(vp);

	/*
	 * Only establish a new cache timeout (if requested).  Never
	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
	 * is done by nfs4_update_dircaches (an ancestor in our call chain).
	 */
	if (set_cache_timeout && ! rp->r_time_cache_inval)
		rp->r_time_cache_inval = now + mi->mi_acdirmax;

	/*
	 * Delta is the number of nanoseconds that we will
	 * cache the attributes of the file.  It is based on
	 * the number of nanoseconds since the last time that
	 * we detected a change.  The assumption is that files
	 * that changed recently are likely to change again.
	 * There are enforced minimum and maximum values, for
	 * both regular files and directories.
	 *
	 * Using the time since last change was detected
	 * eliminates direct comparison or calculation
	 * using mixed client and server times.  NFS does
	 * not make any assumptions regarding the client
	 * and server clocks being synchronized.
	 */
	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    vap->va_size != rp->r_attr.va_size) {
		rp->r_time_attr_saved = now;
	}

	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_time_attr_saved;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_time_attr_inval = now + delta;

	rp->r_attr = *vap;
	if (garp->n4g_change_valid)
		rp->r_change = garp->n4g_change;

	/*
	 * The attributes that were returned may be valid and can
	 * be used, but they may not be allowed to be cached.
	 * Reset the timers to cause immediate invalidation and
	 * clear r_change so no VERIFY operations will succeed.
	 */
	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
		rp->r_time_attr_inval = now;
		rp->r_time_attr_saved = now;
		rp->r_change = 0;
	}

	/*
	 * If mounted_on_fileid was returned AND the object is a stub,
	 * then set the object's va_nodeid to the mounted over fid
	 * returned by the server.
	 *
	 * If mounted_on_fileid not provided/supported, then
	 * just set it to 0 for now.  Eventually it would be
	 * better to set it to a hashed version of FH.  This
	 * would probably be good enough to provide a unique
	 * fid/d_ino within a dir.
	 *
	 * We don't need to carry mounted_on_fileid in the
	 * rnode as long as the client never requests fileid
	 * without also requesting mounted_on_fileid.  For
	 * now, it stays.
	 */
	if (garp->n4g_mon_fid_valid) {
		rp->r_mntd_fid = garp->n4g_mon_fid;

		if (rp->r_flags & R4SRVSTUB)
			rp->r_attr.va_nodeid = rp->r_mntd_fid;
	}

	/*
	 * Check to see if there are valid pathconf bits to
	 * cache in the rnode.
	 */
	if (garp->n4g_ext_res) {
		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
		} else {
			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
				rp->r_pathconf.pc4_xattr_valid = TRUE;
				rp->r_pathconf.pc4_xattr_exists =
				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
			}
		}
	}
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != vap->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
		rp->r_size = vap->va_size;
	}
	nfs_setswaplike(vp, vap);
	rp->r_flags &= ~R4WRITEMODIFIED;
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
{
	mntinfo4_t *mi = VTOMI4(vp);
	hrtime_t t;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
	    &recov_state, NULL))) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		return (e.error);
	}

	t = gethrtime();

	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);

	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
			    &recov_state, 1);
			goto recov_retry;
		}
	}

	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);

	if (!e.error) {
		if (e.stat == NFS4_OK) {
			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
		} else {
			e.error = geterrno4(e.stat);

			nfs4_purge_stale_fh(e.error, vp, cr);
		}
	}

	/*
	 * If we did a getattr on a node that is a stub for a crossed
	 * mount point, keep the original secinfo flavor for
	 * the current file system, not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	return (e.error);
}

/*
 * Generate a compound to get attributes over-the-wire.
 */
void
nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
    nfs4_error_t *ep, cred_t *cr, int get_acl)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	rnode4_t *rp = VTOR4(vp);
	nfs_argop4 argop[2];

	args.ctag = TAG_GETATTR;

	args.array_len = 2;
	args.array = argop;

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr */
	/*
	 * Unlike nfs version 2 and 3, where getattr returns all the
	 * attributes, nfs version 4 returns only the ones explicitly
	 * asked for.  This creates problems, as some system functions
	 * (e.g. cache check) require certain attributes and if the
	 * cached node lacks some attributes such as uid/gid, it can
	 * affect system utilities (e.g. "ls") that rely on the information
	 * to be there.  This can lead to anything from system crashes to
	 * corrupted information processed by user apps.
	 * So to ensure that all bases are covered, request at least
	 * the AT_ALL attribute mask.
	 */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	if (get_acl)
		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	doqueue = 1;

	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);

	if (ep->error)
		return;

	if (res.status != NFS4_OK) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}

/*
 * Return either cached or remote attributes.  If we get the remote
 * attributes, use them to check and invalidate the caches, then cache
 * the new attributes.
 */
int
nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
{
	int error;
	rnode4_t *rp;
	nfs4_ga_res_t gar;

	ASSERT(nfs4_consistent_type(vp));

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
	 */
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	mutex_enter(&rp->r_statev4_lock);
	if (ATTRCACHE4_VALID(vp)) {
		mutex_exit(&rp->r_statev4_lock);
		/*
		 * Cached attributes are valid
		 * Return the client's view of file size
		 */
		*vap = rp->r_attr;
		vap->va_size = rp->r_size;
		mutex_exit(&rp->r_statelock);

		ASSERT(nfs4_consistent_type(vp));

		return (0);
	}
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);

	error = nfs4_getattr_otw(vp, &gar, cr, 0);
	if (!error)
		*vap = gar.n4g_va;

	/* Return the client's view of file size */
	mutex_enter(&rp->r_statelock);
	vap->va_size = rp->r_size;
	mutex_exit(&rp->r_statelock);

	ASSERT(nfs4_consistent_type(vp));

	return (error);
}

int
nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
    nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	nfs_argop4 argop[2];
	mntinfo4_t *mi = VTOMI4(vp);
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_ext_res_t *gerp;

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = tag_type;

	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* getattr */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	doqueue = 1;

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (!needrecov && e.error) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
		    needrecov);
		return (e.error);
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_attr_otw: initiating recovery\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL);
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
		    needrecov);
		if (!e.error) {
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			e.error = geterrno4(res.status);
		}
		if (abort == FALSE)
			goto recov_retry;
		return (e.error);
	}

	if (res.status) {
		e.error = geterrno4(res.status);
	} else {
		gerp = garp->n4g_ext_res;
		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    garp, sizeof (nfs4_ga_res_t));
		garp->n4g_ext_res = gerp;
		if (garp->n4g_ext_res &&
		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
			bcopy(res.array[1].nfs_resop4_u.opgetattr.
			    ga_res.n4g_ext_res,
			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
	    needrecov);
	return (e.error);
}

/*
 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount.  The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs4_async_putpage and nfs4_async_start.
 */
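
/*
 * Illustrative sketch only (not part of the original code): the exit
 * policy an idle worker thread follows, per the comment above.  A worker
 * goes away when async I/O is disabled for the mount (max_threads == 0)
 * or when it has been idle longer than the async timeout without
 * receiving new requests.  The parameter names are hypothetical.
 */
static int
async_worker_should_exit_sketch(unsigned int max_threads,
    long long idle_ticks, long long async_timeout_ticks)
{
	if (max_threads == 0)
		return (1);	/* mount is being torn down or disabled */
	if (idle_ticks >= async_timeout_ticks)
		return (1);	/* idle too long; let the thread die */
	return (0);		/* keep waiting for more async requests */
}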

static void	nfs4_async_start(struct vfs *);

static void
free_async_args4(struct nfs4_async_reqs *args)
{
	rnode4_t *rp;

	if (args->a_io != NFS4_INACTIVE) {
		rp = VTOR4(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS4_PUTAPAGE ||
		    args->a_io == NFS4_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done.  It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 */
void
nfs4_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo4_t *mi;
	uint_t max_threads;

	mi = VFTOMI4(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	mutex_enter(&mi->mi_lock);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount is really going away.
	 *
	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	while (!(mi->mi_flags & MI4_ASYNC_MGR_STOP) ||
	    mi->mi_async_req_count > 0) {
		mutex_exit(&mi->mi_lock);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value.  This means that future
			 * (debugger-induced) alterations of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads <
			    MAX(mi->mi_max_threads, max_threads)) {
				mi->mi_threads++;
				mutex_exit(&mi->mi_async_lock);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0, nfs4_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			cv_signal(&mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	/*
	 * Wake up the inactive thread.
	 */
	cv_broadcast(&mi->mi_inact_req_cv);
	/*
	 * Wake up anyone sitting in nfs4_async_manager_stop()
	 */
	cv_broadcast(&mi->mi_async_cv);
	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);	/* release thread's hold */
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 */
void
nfs4_async_manager_stop(vfs_t *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	mutex_enter(&mi->mi_async_lock);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
	mutex_exit(&mi->mi_lock);
	cv_broadcast(&mi->mi_async_reqs_cv);
	/*
	 * Wait for the async manager thread to die.
	 */
	while (mi->mi_manager_thread != NULL)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
}

int
nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
    struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
    u_offset_t, caddr_t, struct seg *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If addr falls in a different segment, don't bother doing readahead.
	 */
	if (addr >= seg->s_base + seg->s_size)
		return (-1);

	/*
	 * If we can't allocate a request structure, punt on the readahead.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		return (-1);

	/*
	 * If a lock operation is pending, don't initiate any new
	 * readaheads.  Otherwise, bump r_count to indicate the new
	 * asynchronous I/O.
	 */
	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
		kmem_free(args, sizeof (*args));
		return (-1);
	}
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	nfs_rw_exit(&rp->r_lkserlock);

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READ_AHEAD;
	args->a_nfs4_readahead = readahead;
	args->a_nfs4_blkoff = blkoff;
	args->a_nfs4_seg = seg;
	args->a_nfs4_addr = addr;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, don't bother readahead.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	VN_RELE(vp);
	crfree(cr);
	kmem_free(args, sizeof (*args));
	return (-1);
}

/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type.  Requests are taken
 * from the queues in a round-robin fashion.  A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.  This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs4_async_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system.  Specifically over the
 * wire calls are cpr-unsafe.  The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
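
/*
 * Illustrative sketch only (not part of the original code): the
 * round-robin queue selection described above, over an array of
 * per-type queue heads.  The real worker additionally drains up to
 * mi_async_clusters[] consecutive requests from the chosen queue
 * before advancing; this sketch shows only the scan for the next
 * non-empty queue.  The queues/nqueues/curr names are hypothetical.
 */
static int
async_next_queue_sketch(void **queues, int nqueues, int curr)
{
	int i;

	/* Scan at most once around the ring, starting at curr. */
	for (i = 0; i < nqueues; i++) {
		if (queues[curr] != NULL)
			return (curr);		/* found a non-empty queue */
		curr = (curr + 1) % nqueues;
	}
	return (-1);				/* all queues are empty */
}
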
static void
nfs4_async_start(struct vfs *vfsp)
{
	struct nfs4_async_reqs *args;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	extern int nfs_async_timeout;

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
			args = *mi->mi_async_curr;
			if (args != NULL)
				break;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				if (--mi->mi_threads == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				zthread_exit();
				/* NOTREACHED */
			}
			time_left = cv_timedwait(&mi->mi_async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout + lbolt);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		} else {
			time_left = 1;
		}

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr = args->a_next;
		if (*mi->mi_async_curr == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}

		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 */
		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs4_readahead)(args->a_vp,
			    args->a_nfs4_blkoff,
			    args->a_nfs4_addr, args->a_nfs4_seg,
			    args->a_cred);
		} else if (args->a_io == NFS4_PUTAPAGE) {
			(void) (*args->a_nfs4_putapage)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_PAGEIO) {
			(void) (*args->a_nfs4_pageio)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_READDIR) {
			(void) ((*args->a_nfs4_readdir)(args->a_vp,
			    args->a_nfs4_rdc, args->a_cred));
		} else if (args->a_io == NFS4_COMMIT) {
			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
			    args->a_nfs4_offset, args->a_nfs4_count,
			    args->a_cred);
		} else if (args->a_io == NFS4_INACTIVE) {
			nfs4_inactive_otw(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args4(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}

/*
 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
 * part of VOP_INACTIVE.
 */

void
nfs4_inactive_thread(mntinfo4_t *mi)
{
	struct nfs4_async_reqs *args;
	callb_cpr_t cprinfo;
	int call_nfs_free_mi4 = 0;
	vfs_t *vfsp = mi->mi_vfsp;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_inactive_thread");

	for (;;) {
		mutex_enter(&mi->mi_async_lock);
		args = mi->mi_async_reqs[NFS4_INACTIVE];
		if (args == NULL) {
			mutex_enter(&mi->mi_lock);
			/*
			 * During regular operation (i.e., unmount
			 * or a failed mount), the async manager thread always
			 * exits before MI4_DEAD is set by nfs_free_mi4().
			 *
			 * When a zone is shutting down, however, we set
			 * MI4_DEAD before the async manager thread is done, and
			 * we don't want to exit until the async manager is done
			 * with its work; hence the check for mi_manager_thread
			 * being NULL.
			 *
			 * The async manager thread will cv_broadcast() on
			 * mi_inact_req_cv when it's done, at which point we'll
			 * wake up and exit.
			 */
			if (mi->mi_manager_thread == NULL &&
			    (mi->mi_flags & MI4_DEAD))
				goto die;
			mi->mi_flags |= MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			cv_signal(&mi->mi_async_cv);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
			mutex_exit(&mi->mi_async_lock);
		} else {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
			mutex_exit(&mi->mi_async_lock);
			nfs4_inactive_otw(args->a_vp, args->a_cred);
			crfree(args->a_cred);
			kmem_free(args, sizeof (*args));
		}
	}
die:
	mutex_exit(&mi->mi_lock);
	call_nfs_free_mi4 = (mi->mi_inactive_thread == NULL);
	mi->mi_inactive_thread = NULL;
	cv_signal(&mi->mi_async_cv);
	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	if (call_nfs_free_mi4) {
		if (mi->mi_io_kstats) {
			kstat_delete(mi->mi_io_kstats);
			mi->mi_io_kstats = NULL;
		}
		if (mi->mi_ro_kstats) {
			kstat_delete(mi->mi_ro_kstats);
			mi->mi_ro_kstats = NULL;
		}
		if (mi->mi_recov_ksp) {
			kstat_delete(mi->mi_recov_ksp);
			mi->mi_recov_ksp = NULL;
		}
		nfs_free_mi4(mi);
	}
	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
	zthread_exit();
	/* NOTREACHED */
}

/*
 * nfs4_async_stop:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete; nfs4_async_stop_sig() without interruptibility.
 */
void
nfs4_async_stop(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs4_async_stop_sig:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete.  If a signal is delivered we will abort and return non-zero;
 * otherwise return 0.  Since this routine is called from nfs4_unmount, we
 * need to make it interruptible.
 */
int
nfs4_async_stop_sig(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	ushort_t omax;
	bool_t intr = FALSE;

	/*
	 * Wait for all outstanding putpage operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
			intr = TRUE;
			goto interrupted;
		}
	}

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			if (!cv_wait_sig(&mi->mi_async_cv,
			    &mi->mi_async_lock)) {
				intr = TRUE;
				goto interrupted;
			}
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
interrupted:
	if (intr)
		mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);

	return (intr);
}

int
nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PUTAPAGE;
	args->a_nfs4_putapage = putapage;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = off;
	args->a_nfs4_len = (uint_t)len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() == mi->mi_zone) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * or we have run out of memory or we're attempting to
		 * unmount we refuse to do a sync write, because this may
		 * hang pageout/fsflush and the machine.  In this case,
		 * we just re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	/*
	 * We'll get here only if (nfs_zone() != mi->mi_zone)
	 * which means that this was a cross-zone sync putpage.
	 *
	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
	 * as dirty and unlock them.
	 *
	 * We don't want to clear B_FORCE here as the caller presumably
	 * knows what they're doing if they set it.
	 */
	pvn_write_done(pp, flags | B_ERROR);
	return (EPERM);
}

int
nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PAGEIO;
	args->a_nfs4_pageio = pageio;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = io_off;
	args->a_nfs4_len = (uint_t)io_len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS4_PAGEIO] = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine).  In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
}

void
nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
    int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, skip the readdir.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READDIR;
	args->a_nfs4_readdir = readdir;
	args->a_nfs4_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then skip this request.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
		mi->mi_async_reqs[NFS4_READDIR] = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	mutex_enter(&rp->r_statelock);
	rdc->entries = NULL;
	/*
	 * Indicate that no one is trying to fill this entry and
	 * it still needs to be filled.
1975 */ 1976 rdc->flags &= ~RDDIR; 1977 rdc->flags |= RDDIRREQ; 1978 rddir4_cache_rele(rp, rdc); 1979 mutex_exit(&rp->r_statelock); 1980 } 1981 1982 void 1983 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 1984 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, 1985 cred_t *)) 1986 { 1987 rnode4_t *rp; 1988 mntinfo4_t *mi; 1989 struct nfs4_async_reqs *args; 1990 page_t *pp; 1991 1992 rp = VTOR4(vp); 1993 mi = VTOMI4(vp); 1994 1995 /* 1996 * If we can't allocate a request structure, do the commit 1997 * operation synchronously in this thread's context. 1998 */ 1999 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 2000 goto noasync; 2001 2002 args->a_next = NULL; 2003 #ifdef DEBUG 2004 args->a_queuer = curthread; 2005 #endif 2006 VN_HOLD(vp); 2007 args->a_vp = vp; 2008 ASSERT(cr != NULL); 2009 crhold(cr); 2010 args->a_cred = cr; 2011 args->a_io = NFS4_COMMIT; 2012 args->a_nfs4_commit = commit; 2013 args->a_nfs4_plist = plist; 2014 args->a_nfs4_offset = offset; 2015 args->a_nfs4_count = count; 2016 2017 mutex_enter(&mi->mi_async_lock); 2018 2019 /* 2020 * If asyncio has been disabled, then make a synchronous request. 2021 * This check is done a second time in case async io was diabled 2022 * while this thread was blocked waiting for memory pressure to 2023 * reduce or for the queue to drain. 2024 */ 2025 if (mi->mi_max_threads == 0) { 2026 mutex_exit(&mi->mi_async_lock); 2027 2028 VN_RELE(vp); 2029 crfree(cr); 2030 kmem_free(args, sizeof (*args)); 2031 goto noasync; 2032 } 2033 2034 /* 2035 * Link request structure into the async list and 2036 * wakeup async thread to do the i/o. 2037 */ 2038 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) { 2039 mi->mi_async_reqs[NFS4_COMMIT] = args; 2040 mi->mi_async_tail[NFS4_COMMIT] = args; 2041 } else { 2042 mi->mi_async_tail[NFS4_COMMIT]->a_next = args; 2043 mi->mi_async_tail[NFS4_COMMIT] = args; 2044 } 2045 2046 mutex_enter(&rp->r_statelock); 2047 rp->r_count++; 2048 mutex_exit(&rp->r_statelock); 2049 2050 if (mi->mi_io_kstats) { 2051 mutex_enter(&mi->mi_lock); 2052 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2053 mutex_exit(&mi->mi_lock); 2054 } 2055 2056 mi->mi_async_req_count++; 2057 ASSERT(mi->mi_async_req_count != 0); 2058 cv_signal(&mi->mi_async_reqs_cv); 2059 mutex_exit(&mi->mi_async_lock); 2060 return; 2061 2062 noasync: 2063 if (curproc == proc_pageout || curproc == proc_fsflush || 2064 nfs_zone() != mi->mi_zone) { 2065 while (plist != NULL) { 2066 pp = plist; 2067 page_sub(&plist, pp); 2068 pp->p_fsdata = C_COMMIT; 2069 page_unlock(pp); 2070 } 2071 return; 2072 } 2073 (*commit)(vp, plist, offset, count, cr); 2074 } 2075 2076 /* 2077 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The 2078 * reference to the vnode is handed over to the thread; the caller should 2079 * no longer refer to the vnode. 2080 * 2081 * Unlike most of the async routines, this handoff is needed for 2082 * correctness reasons, not just performance. So doing operations in the 2083 * context of the current thread is not an option. 
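 *
 * Either way the vnode reference handed in here is consumed: the
 * request is queued for the inactive/async thread to process, or, if
 * the inactive thread is already gone, the cleanup (rp4_addfree() and
 * friends) is performed directly below in this function's context.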
2084 */ 2085 void 2086 nfs4_async_inactive(vnode_t *vp, cred_t *cr) 2087 { 2088 mntinfo4_t *mi; 2089 struct nfs4_async_reqs *args; 2090 boolean_t signal_inactive_thread = B_FALSE; 2091 2092 mi = VTOMI4(vp); 2093 2094 args = kmem_alloc(sizeof (*args), KM_SLEEP); 2095 args->a_next = NULL; 2096 #ifdef DEBUG 2097 args->a_queuer = curthread; 2098 #endif 2099 args->a_vp = vp; 2100 ASSERT(cr != NULL); 2101 crhold(cr); 2102 args->a_cred = cr; 2103 args->a_io = NFS4_INACTIVE; 2104 2105 /* 2106 * Note that we don't check mi->mi_max_threads here, since we 2107 * *need* to get rid of this vnode regardless of whether someone 2108 * set nfs4_max_threads to zero in /etc/system. 2109 * 2110 * The manager thread knows about this and is willing to create 2111 * at least one thread to accomodate us. 2112 */ 2113 mutex_enter(&mi->mi_async_lock); 2114 if (mi->mi_inactive_thread == NULL) { 2115 rnode4_t *rp; 2116 vnode_t *unldvp = NULL; 2117 char *unlname; 2118 cred_t *unlcred; 2119 2120 mutex_exit(&mi->mi_async_lock); 2121 /* 2122 * We just need to free up the memory associated with the 2123 * vnode, which can be safely done from within the current 2124 * context. 2125 */ 2126 crfree(cr); /* drop our reference */ 2127 kmem_free(args, sizeof (*args)); 2128 rp = VTOR4(vp); 2129 mutex_enter(&rp->r_statelock); 2130 if (rp->r_unldvp != NULL) { 2131 unldvp = rp->r_unldvp; 2132 rp->r_unldvp = NULL; 2133 unlname = rp->r_unlname; 2134 rp->r_unlname = NULL; 2135 unlcred = rp->r_unlcred; 2136 rp->r_unlcred = NULL; 2137 } 2138 mutex_exit(&rp->r_statelock); 2139 /* 2140 * No need to explicitly throw away any cached pages. The 2141 * eventual r4inactive() will attempt a synchronous 2142 * VOP_PUTPAGE() which will immediately fail since the request 2143 * is coming from the wrong zone, and then will proceed to call 2144 * nfs4_invalidate_pages() which will clean things up for us. 2145 * 2146 * Throw away the delegation here so rp4_addfree()'s attempt to 2147 * return any existing delegations becomes a no-op. 2148 */ 2149 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) 2150 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 2151 nfs4_clear_open_streams(rp); 2152 2153 rp4_addfree(rp, cr); 2154 if (unldvp != NULL) { 2155 kmem_free(unlname, MAXNAMELEN); 2156 VN_RELE(unldvp); 2157 crfree(unlcred); 2158 } 2159 return; 2160 } 2161 2162 if (mi->mi_manager_thread == NULL) { 2163 /* 2164 * We want to talk to the inactive thread. 2165 */ 2166 signal_inactive_thread = B_TRUE; 2167 } 2168 2169 /* 2170 * Enqueue the vnode and wake up either the special thread (empty 2171 * list) or an async thread. 
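 *
 * The "special thread" is mi_inactive_thread; signaling it directly
 * via mi_inact_req_cv (rather than the async work queue) ensures the
 * request is picked up even when no async worker threads are running,
 * per the note above about not checking mi_max_threads.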
2172 */ 2173 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) { 2174 mi->mi_async_reqs[NFS4_INACTIVE] = args; 2175 mi->mi_async_tail[NFS4_INACTIVE] = args; 2176 signal_inactive_thread = B_TRUE; 2177 } else { 2178 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args; 2179 mi->mi_async_tail[NFS4_INACTIVE] = args; 2180 } 2181 if (signal_inactive_thread) { 2182 cv_signal(&mi->mi_inact_req_cv); 2183 } else { 2184 mi->mi_async_req_count++; 2185 ASSERT(mi->mi_async_req_count != 0); 2186 cv_signal(&mi->mi_async_reqs_cv); 2187 } 2188 2189 mutex_exit(&mi->mi_async_lock); 2190 } 2191 2192 int 2193 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2194 { 2195 int pagecreate; 2196 int n; 2197 int saved_n; 2198 caddr_t saved_base; 2199 u_offset_t offset; 2200 int error; 2201 int sm_error; 2202 2203 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2204 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2205 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2206 2207 /* 2208 * Move bytes in at most PAGESIZE chunks. We must avoid 2209 * spanning pages in uiomove() because page faults may cause 2210 * the cache to be invalidated out from under us. The r_size is not 2211 * updated until after the uiomove. If we push the last page of a 2212 * file before r_size is correct, we will lose the data written past 2213 * the current (and invalid) r_size. 2214 */ 2215 do { 2216 offset = uio->uio_loffset; 2217 pagecreate = 0; 2218 2219 /* 2220 * n is the number of bytes required to satisfy the request 2221 * or the number of bytes to fill out the page. 2222 */ 2223 n = (int)MIN((PAGESIZE - ((uintptr_t)base & PAGEOFFSET)), 2224 tcount); 2225 2226 /* 2227 * Check to see if we can skip reading in the page 2228 * and just allocate the memory. We can do this 2229 * if we are going to rewrite the entire mapping 2230 * or if we are going to write to or beyond the current 2231 * end of file from the beginning of the mapping. 2232 * 2233 * The read of r_size is now protected by r_statelock. 2234 */ 2235 mutex_enter(&rp->r_statelock); 2236 /* 2237 * When pgcreated is nonzero the caller has already done 2238 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2239 * segkpm this means we already have at least one page 2240 * created and mapped at base. 2241 */ 2242 pagecreate = pgcreated || 2243 (((uintptr_t)base & PAGEOFFSET) == 0 && 2244 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2245 2246 mutex_exit(&rp->r_statelock); 2247 2248 if (pagecreate) { 2249 /* 2250 * The last argument tells segmap_pagecreate() to 2251 * always lock the page, as opposed to sometimes 2252 * returning with the page locked. This way we avoid a 2253 * fault on the ensuing uiomove(), but also 2254 * more importantly (to fix bug 1094402) we can 2255 * call segmap_fault() to unlock the page in all 2256 * cases. An alternative would be to modify 2257 * segmap_pagecreate() to tell us when it is 2258 * locking a page, but that's a fairly major 2259 * interface change. 2260 */ 2261 if (pgcreated == 0) 2262 (void) segmap_pagecreate(segkmap, base, 2263 (uint_t)n, 1); 2264 saved_base = base; 2265 saved_n = n; 2266 } 2267 2268 /* 2269 * The number of bytes of data in the last page can not 2270 * be accurately be determined while page is being 2271 * uiomove'd to and the size of the file being updated. 2272 * Thus, inform threads which need to know accurately 2273 * how much data is in the last page of the file. 
They 2274 * will not do the i/o immediately, but will arrange for 2275 * the i/o to happen later when this modify operation 2276 * will have finished. 2277 */ 2278 ASSERT(!(rp->r_flags & R4MODINPROGRESS)); 2279 mutex_enter(&rp->r_statelock); 2280 rp->r_flags |= R4MODINPROGRESS; 2281 rp->r_modaddr = (offset & MAXBMASK); 2282 mutex_exit(&rp->r_statelock); 2283 2284 error = uiomove(base, n, UIO_WRITE, uio); 2285 2286 /* 2287 * r_size is the maximum number of 2288 * bytes known to be in the file. 2289 * Make sure it is at least as high as the 2290 * first unwritten byte pointed to by uio_loffset. 2291 */ 2292 mutex_enter(&rp->r_statelock); 2293 if (rp->r_size < uio->uio_loffset) 2294 rp->r_size = uio->uio_loffset; 2295 rp->r_flags &= ~R4MODINPROGRESS; 2296 rp->r_flags |= R4DIRTY; 2297 mutex_exit(&rp->r_statelock); 2298 2299 /* n = # of bytes written */ 2300 n = (int)(uio->uio_loffset - offset); 2301 base += n; 2302 tcount -= n; 2303 /* 2304 * If we created pages w/o initializing them completely, 2305 * we need to zero the part that wasn't set up. 2306 * This happens on a most EOF write cases and if 2307 * we had some sort of error during the uiomove. 2308 */ 2309 if (pagecreate) { 2310 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2311 (void) kzero(base, PAGESIZE - n); 2312 2313 if (pgcreated) { 2314 /* 2315 * Caller is responsible for this page, 2316 * it was not created in this loop. 2317 */ 2318 pgcreated = 0; 2319 } else { 2320 /* 2321 * For bug 1094402: segmap_pagecreate locks 2322 * page. Unlock it. This also unlocks the 2323 * pages allocated by page_create_va() in 2324 * segmap_pagecreate(). 2325 */ 2326 sm_error = segmap_fault(kas.a_hat, segkmap, 2327 saved_base, saved_n, 2328 F_SOFTUNLOCK, S_WRITE); 2329 if (error == 0) 2330 error = sm_error; 2331 } 2332 } 2333 } while (tcount > 0 && error == 0); 2334 2335 return (error); 2336 } 2337 2338 int 2339 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2340 { 2341 rnode4_t *rp; 2342 page_t *pp; 2343 u_offset_t eoff; 2344 u_offset_t io_off; 2345 size_t io_len; 2346 int error; 2347 int rdirty; 2348 int err; 2349 2350 rp = VTOR4(vp); 2351 ASSERT(rp->r_count > 0); 2352 2353 if (!nfs4_has_pages(vp)) 2354 return (0); 2355 2356 ASSERT(vp->v_type != VCHR); 2357 2358 /* 2359 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2360 * writes. B_FORCE is set to force the VM system to actually 2361 * invalidate the pages, even if the i/o failed. The pages 2362 * need to get invalidated because they can't be written out 2363 * because there isn't any space left on either the server's 2364 * file system or in the user's disk quota. The B_FREE bit 2365 * is cleared to avoid confusion as to whether this is a 2366 * request to place the page on the freelist or to destroy 2367 * it. 2368 */ 2369 if ((rp->r_flags & R4OUTOFSPACE) || 2370 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2371 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2372 2373 if (len == 0) { 2374 /* 2375 * If doing a full file synchronous operation, then clear 2376 * the R4DIRTY bit. If a page gets dirtied while the flush 2377 * is happening, then R4DIRTY will get set again. The 2378 * R4DIRTY bit must get cleared before the flush so that 2379 * we don't lose this information. 
2380 */ 2381 if (off == (u_offset_t)0 && 2382 !(flags & B_ASYNC) && 2383 (rp->r_flags & R4DIRTY)) { 2384 mutex_enter(&rp->r_statelock); 2385 rdirty = (rp->r_flags & R4DIRTY); 2386 rp->r_flags &= ~R4DIRTY; 2387 mutex_exit(&rp->r_statelock); 2388 } else 2389 rdirty = 0; 2390 2391 /* 2392 * Search the entire vp list for pages >= off, and flush 2393 * the dirty pages. 2394 */ 2395 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2396 flags, cr); 2397 2398 /* 2399 * If an error occured and the file was marked as dirty 2400 * before and we aren't forcibly invalidating pages, then 2401 * reset the R4DIRTY flag. 2402 */ 2403 if (error && rdirty && 2404 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2405 mutex_enter(&rp->r_statelock); 2406 rp->r_flags |= R4DIRTY; 2407 mutex_exit(&rp->r_statelock); 2408 } 2409 } else { 2410 /* 2411 * Do a range from [off...off + len) looking for pages 2412 * to deal with. 2413 */ 2414 error = 0; 2415 io_len = 0; 2416 eoff = off + len; 2417 mutex_enter(&rp->r_statelock); 2418 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2419 io_off += io_len) { 2420 mutex_exit(&rp->r_statelock); 2421 /* 2422 * If we are not invalidating, synchronously 2423 * freeing or writing pages use the routine 2424 * page_lookup_nowait() to prevent reclaiming 2425 * them from the free list. 2426 */ 2427 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2428 pp = page_lookup(vp, io_off, 2429 (flags & (B_INVAL | B_FREE)) ? 2430 SE_EXCL : SE_SHARED); 2431 } else { 2432 pp = page_lookup_nowait(vp, io_off, 2433 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2434 } 2435 2436 if (pp == NULL || !pvn_getdirty(pp, flags)) 2437 io_len = PAGESIZE; 2438 else { 2439 err = (*rp->r_putapage)(vp, pp, &io_off, 2440 &io_len, flags, cr); 2441 if (!error) 2442 error = err; 2443 /* 2444 * "io_off" and "io_len" are returned as 2445 * the range of pages we actually wrote. 2446 * This allows us to skip ahead more quickly 2447 * since several pages may've been dealt 2448 * with by this iteration of the loop. 2449 */ 2450 } 2451 mutex_enter(&rp->r_statelock); 2452 } 2453 mutex_exit(&rp->r_statelock); 2454 } 2455 2456 return (error); 2457 } 2458 2459 void 2460 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2461 { 2462 rnode4_t *rp; 2463 2464 rp = VTOR4(vp); 2465 if (IS_SHADOW(vp, rp)) 2466 vp = RTOV4(rp); 2467 mutex_enter(&rp->r_statelock); 2468 while (rp->r_flags & R4TRUNCATE) 2469 cv_wait(&rp->r_cv, &rp->r_statelock); 2470 rp->r_flags |= R4TRUNCATE; 2471 if (off == (u_offset_t)0) { 2472 rp->r_flags &= ~R4DIRTY; 2473 if (!(rp->r_flags & R4STALE)) 2474 rp->r_error = 0; 2475 } 2476 rp->r_truncaddr = off; 2477 mutex_exit(&rp->r_statelock); 2478 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2479 B_INVAL | B_TRUNC, cr); 2480 mutex_enter(&rp->r_statelock); 2481 rp->r_flags &= ~R4TRUNCATE; 2482 cv_broadcast(&rp->r_cv); 2483 mutex_exit(&rp->r_statelock); 2484 } 2485 2486 static int 2487 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2488 { 2489 mntinfo4_t *mi; 2490 struct mntinfo_kstat *mik; 2491 vfs_t *vfsp; 2492 2493 /* this is a read-only kstat. Bail out on a write */ 2494 if (rw == KSTAT_WRITE) 2495 return (EACCES); 2496 2497 2498 /* 2499 * We don't want to wait here as kstat_chain_lock could be held by 2500 * dounmount(). dounmount() takes vfs_reflock before the chain lock 2501 * and thus could lead to a deadlock. 
2502 */ 2503 vfsp = (struct vfs *)ksp->ks_private; 2504 2505 mi = VFTOMI4(vfsp); 2506 mik = (struct mntinfo_kstat *)ksp->ks_data; 2507 2508 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2509 2510 mik->mik_vers = (uint32_t)mi->mi_vers; 2511 mik->mik_flags = mi->mi_flags; 2512 /* 2513 * The sv_secdata holds the flavor the client specifies. 2514 * If the client uses default and a security negotiation 2515 * occurs, sv_currsec will point to the current flavor 2516 * selected from the server flavor list. 2517 * sv_currsec is NULL if no security negotiation takes place. 2518 */ 2519 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 2520 mi->mi_curr_serv->sv_currsec->secmod : 2521 mi->mi_curr_serv->sv_secdata->secmod; 2522 mik->mik_curread = (uint32_t)mi->mi_curread; 2523 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2524 mik->mik_retrans = mi->mi_retrans; 2525 mik->mik_timeo = mi->mi_timeo; 2526 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2527 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2528 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2529 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2530 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2531 mik->mik_failover = (uint32_t)mi->mi_failover; 2532 mik->mik_remap = (uint32_t)mi->mi_remap; 2533 2534 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2535 2536 return (0); 2537 } 2538 2539 void 2540 nfs4_mnt_kstat_init(struct vfs *vfsp) 2541 { 2542 mntinfo4_t *mi = VFTOMI4(vfsp); 2543 2544 /* 2545 * PSARC 2001/697 Contract Private Interface 2546 * All nfs kstats are under SunMC contract 2547 * Please refer to the PSARC listed above and contact 2548 * SunMC before making any changes! 2549 * 2550 * Changes must be reviewed by Solaris File Sharing 2551 * Changes must be communicated to contract-2001-697@sun.com 2552 * 2553 */ 2554 2555 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2556 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2557 if (mi->mi_io_kstats) { 2558 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2559 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2560 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2561 kstat_install(mi->mi_io_kstats); 2562 } 2563 2564 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2565 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2566 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2567 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2568 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2569 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2570 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2571 kstat_install(mi->mi_ro_kstats); 2572 } 2573 2574 nfs4_mnt_recov_kstat_init(vfsp); 2575 } 2576 2577 void 2578 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2579 { 2580 mntinfo4_t *mi; 2581 2582 mi = VTOMI4(vp); 2583 /* 2584 * In case of forced unmount, do not print any messages 2585 * since it can flood the console with error messages. 2586 */ 2587 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2588 return; 2589 2590 /* 2591 * If the mount point is dead, not recoverable, do not 2592 * print error messages that can flood the console. 2593 */ 2594 if (mi->mi_flags & MI4_RECOV_FAIL) 2595 return; 2596 2597 /* 2598 * No use in flooding the console with ENOSPC 2599 * messages from the same file system. 
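 *
 * ENOSPC/EDQUOT messages are further rate limited to roughly one per
 * nfs_write_error_interval seconds via mi_printftime, which is
 * updated below.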
2600 */ 2601 if ((error != ENOSPC && error != EDQUOT) || 2602 lbolt - mi->mi_printftime > 0) { 2603 zoneid_t zoneid = mi->mi_zone->zone_id; 2604 2605 #ifdef DEBUG 2606 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2607 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2608 #else 2609 nfs_perror(error, "NFS write error on host %s: %m.\n", 2610 VTOR4(vp)->r_server->sv_hostname, NULL); 2611 #endif 2612 if (error == ENOSPC || error == EDQUOT) { 2613 zcmn_err(zoneid, CE_CONT, 2614 "^File: userid=%d, groupid=%d\n", 2615 crgetuid(cr), crgetgid(cr)); 2616 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2617 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2618 zcmn_err(zoneid, CE_CONT, 2619 "^User: userid=%d, groupid=%d\n", 2620 crgetuid(curthread->t_cred), 2621 crgetgid(curthread->t_cred)); 2622 } 2623 mi->mi_printftime = lbolt + 2624 nfs_write_error_interval * hz; 2625 } 2626 sfh4_printfhandle(VTOR4(vp)->r_fh); 2627 #ifdef DEBUG 2628 if (error == EACCES) { 2629 zcmn_err(zoneid, CE_CONT, 2630 "nfs_bio: cred is%s kcred\n", 2631 cr == kcred ? "" : " not"); 2632 } 2633 #endif 2634 } 2635 } 2636 2637 /* 2638 * Return non-zero if the given file can be safely memory mapped. Locks 2639 * are safe if whole-file (length and offset are both zero). 2640 */ 2641 2642 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2643 2644 static int 2645 nfs4_safemap(const vnode_t *vp) 2646 { 2647 locklist_t *llp, *next_llp; 2648 int safe = 1; 2649 rnode4_t *rp = VTOR4(vp); 2650 2651 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2652 2653 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2654 "vp = %p", (void *)vp)); 2655 2656 /* 2657 * Review all the locks for the vnode, both ones that have been 2658 * acquired and ones that are pending. We assume that 2659 * flk_active_locks_for_vp() has merged any locks that can be 2660 * merged (so that if a process has the entire file locked, it is 2661 * represented as a single lock). 2662 * 2663 * Note that we can't bail out of the loop if we find a non-safe 2664 * lock, because we have to free all the elements in the llp list. 2665 * We might be able to speed up this code slightly by not looking 2666 * at each lock's l_start and l_len fields once we've found a 2667 * non-safe lock. 2668 */ 2669 2670 llp = flk_active_locks_for_vp(vp); 2671 while (llp) { 2672 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2673 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2674 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2675 if (!SAFE_LOCK(llp->ll_flock)) { 2676 safe = 0; 2677 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2678 "nfs4_safemap: unsafe active lock (%" PRId64 2679 ", %" PRId64 ")", llp->ll_flock.l_start, 2680 llp->ll_flock.l_len)); 2681 } 2682 next_llp = llp->ll_next; 2683 VN_RELE(llp->ll_vp); 2684 kmem_free(llp, sizeof (*llp)); 2685 llp = next_llp; 2686 } 2687 2688 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2689 safe ? "safe" : "unsafe")); 2690 return (safe); 2691 } 2692 2693 /* 2694 * Return whether there is a lost LOCK or LOCKU queued up for the given 2695 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 
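 *
 * As with nfs4_safemap(), "safe" means a whole-file lock; an
 * illustrative request that SAFE_LOCK() accepts is one built with
 *
 *	l_start = 0 and l_len = 0 (offset 0 through end of file)
 *
 * while any partial-range lock is treated as a conflict for mmap.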
2696 */ 2697 2698 bool_t 2699 nfs4_map_lost_lock_conflict(vnode_t *vp) 2700 { 2701 bool_t conflict = FALSE; 2702 nfs4_lost_rqst_t *lrp; 2703 mntinfo4_t *mi = VTOMI4(vp); 2704 2705 mutex_enter(&mi->mi_lock); 2706 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2707 lrp = list_next(&mi->mi_lost_state, lrp)) { 2708 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2709 continue; 2710 ASSERT(lrp->lr_vp != NULL); 2711 if (!VOP_CMP(lrp->lr_vp, vp)) 2712 continue; /* different file */ 2713 if (!SAFE_LOCK(*lrp->lr_flk)) { 2714 conflict = TRUE; 2715 break; 2716 } 2717 } 2718 2719 mutex_exit(&mi->mi_lock); 2720 return (conflict); 2721 } 2722 2723 /* 2724 * nfs_lockcompletion: 2725 * 2726 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2727 * as non cachable (set VNOCACHE bit). 2728 */ 2729 2730 void 2731 nfs4_lockcompletion(vnode_t *vp, int cmd) 2732 { 2733 rnode4_t *rp = VTOR4(vp); 2734 2735 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2736 ASSERT(!IS_SHADOW(vp, rp)); 2737 2738 if (cmd == F_SETLK || cmd == F_SETLKW) { 2739 2740 if (!nfs4_safemap(vp)) { 2741 mutex_enter(&vp->v_lock); 2742 vp->v_flag |= VNOCACHE; 2743 mutex_exit(&vp->v_lock); 2744 } else { 2745 mutex_enter(&vp->v_lock); 2746 vp->v_flag &= ~VNOCACHE; 2747 mutex_exit(&vp->v_lock); 2748 } 2749 } 2750 /* 2751 * The cached attributes of the file are stale after acquiring 2752 * the lock on the file. They were updated when the file was 2753 * opened, but not updated when the lock was acquired. Therefore the 2754 * cached attributes are invalidated after the lock is obtained. 2755 */ 2756 PURGE_ATTRCACHE4(vp); 2757 } 2758 2759 /* ARGSUSED */ 2760 static void * 2761 nfs4_mi_init(zoneid_t zoneid) 2762 { 2763 struct mi4_globals *mig; 2764 2765 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2766 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2767 list_create(&mig->mig_list, sizeof (mntinfo4_t), 2768 offsetof(mntinfo4_t, mi_zone_node)); 2769 mig->mig_destructor_called = B_FALSE; 2770 return (mig); 2771 } 2772 2773 /* 2774 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down 2775 * state and killing off threads. 2776 */ 2777 /* ARGSUSED */ 2778 static void 2779 nfs4_mi_shutdown(zoneid_t zoneid, void *data) 2780 { 2781 struct mi4_globals *mig = data; 2782 mntinfo4_t *mi; 2783 nfs4_server_t *np; 2784 2785 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2786 "nfs4_mi_shutdown zone %d\n", zoneid)); 2787 ASSERT(mig != NULL); 2788 again: 2789 mutex_enter(&mig->mig_lock); 2790 for (mi = list_head(&mig->mig_list); mi != NULL; 2791 mi = list_next(&mig->mig_list, mi)) { 2792 /* 2793 * If we've done the shutdown work for this FS, skip. 2794 * Once we go off the end of the list, we're done. 2795 */ 2796 if (mi->mi_flags & MI4_DEAD) 2797 continue; 2798 2799 /* 2800 * We will do work, so not done. Get a hold on the FS. 2801 */ 2802 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2803 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp)); 2804 VFS_HOLD(mi->mi_vfsp); 2805 2806 /* 2807 * purge the DNLC for this filesystem 2808 */ 2809 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2810 2811 mutex_enter(&mi->mi_async_lock); 2812 /* 2813 * Tell existing async worker threads to exit. 2814 */ 2815 mi->mi_max_threads = 0; 2816 cv_broadcast(&mi->mi_async_work_cv); 2817 /* 2818 * Set the appropriate flags so both the async manager and the 2819 * inactive thread start getting ready to exit when they're done 2820 * with their current work. 
2821 */ 2822 mutex_enter(&mi->mi_lock); 2823 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD); 2824 mutex_exit(&mi->mi_lock); 2825 /* 2826 * Wake up async manager thread. When it is done it will wake 2827 * up the inactive thread which will then exit. 2828 */ 2829 cv_broadcast(&mi->mi_async_reqs_cv); 2830 mutex_exit(&mi->mi_async_lock); 2831 2832 /* 2833 * Drop lock and release FS, which may change list, then repeat. 2834 * We're done when every mi has been done or the list is empty. 2835 */ 2836 mutex_exit(&mig->mig_lock); 2837 VFS_RELE(mi->mi_vfsp); 2838 goto again; 2839 } 2840 mutex_exit(&mig->mig_lock); 2841 /* 2842 * Tell each renew thread in the zone to exit 2843 */ 2844 mutex_enter(&nfs4_server_lst_lock); 2845 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) { 2846 mutex_enter(&np->s_lock); 2847 if (np->zoneid == zoneid) { 2848 /* 2849 * We add another hold onto the nfs4_server_t 2850 * because this will make sure tha the nfs4_server_t 2851 * stays around until nfs4_callback_fini_zone destroys 2852 * the zone. This way, the renew thread can 2853 * unconditionally release its holds on the 2854 * nfs4_server_t. 2855 */ 2856 np->s_refcnt++; 2857 nfs4_mark_srv_dead(np); 2858 } 2859 mutex_exit(&np->s_lock); 2860 } 2861 mutex_exit(&nfs4_server_lst_lock); 2862 } 2863 2864 static void 2865 nfs4_mi_free_globals(struct mi4_globals *mig) 2866 { 2867 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2868 mutex_destroy(&mig->mig_lock); 2869 kmem_free(mig, sizeof (*mig)); 2870 2871 } 2872 2873 /* ARGSUSED */ 2874 static void 2875 nfs4_mi_destroy(zoneid_t zoneid, void *data) 2876 { 2877 struct mi4_globals *mig = data; 2878 2879 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2880 "nfs4_mi_destroy zone %d\n", zoneid)); 2881 ASSERT(mig != NULL); 2882 mutex_enter(&mig->mig_lock); 2883 if (list_head(&mig->mig_list) != NULL) { 2884 /* Still waiting for VFS_FREEVFS() */ 2885 mig->mig_destructor_called = B_TRUE; 2886 mutex_exit(&mig->mig_lock); 2887 return; 2888 } 2889 nfs4_mi_free_globals(mig); 2890 } 2891 2892 /* 2893 * Add an NFS mount to the per-zone list of NFS mounts. 2894 */ 2895 void 2896 nfs4_mi_zonelist_add(mntinfo4_t *mi) 2897 { 2898 struct mi4_globals *mig; 2899 2900 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2901 mutex_enter(&mig->mig_lock); 2902 list_insert_head(&mig->mig_list, mi); 2903 mutex_exit(&mig->mig_lock); 2904 } 2905 2906 /* 2907 * Remove an NFS mount from the per-zone list of NFS mounts. 2908 */ 2909 static void 2910 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 2911 { 2912 struct mi4_globals *mig; 2913 2914 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2915 mutex_enter(&mig->mig_lock); 2916 list_remove(&mig->mig_list, mi); 2917 /* 2918 * We can be called asynchronously by VFS_FREEVFS() after the zone 2919 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2920 * mi globals. 2921 */ 2922 if (list_head(&mig->mig_list) == NULL && 2923 mig->mig_destructor_called == B_TRUE) { 2924 nfs4_mi_free_globals(mig); 2925 return; 2926 } 2927 mutex_exit(&mig->mig_lock); 2928 } 2929 2930 void 2931 nfs_free_mi4(mntinfo4_t *mi) 2932 { 2933 nfs4_open_owner_t *foop; 2934 nfs4_oo_hash_bucket_t *bucketp; 2935 nfs4_debug_msg_t *msgp; 2936 int i; 2937 2938 /* 2939 * Tell the thread for over the wire inactive calls to exit. 2940 * 2941 * By the time we get here the last VFS_RELE() has already been called, 2942 * or this is an aborted mount; in either case the async manager thread 2943 * should be dead by now. 
The recovery thread has called recov_done(), 2944 * but may not have exited yet. 2945 */ 2946 mutex_enter(&mi->mi_lock); 2947 ASSERT(mi->mi_recovthread == NULL); 2948 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 2949 mi->mi_flags |= MI4_DEAD; 2950 mutex_exit(&mi->mi_lock); 2951 2952 mutex_enter(&mi->mi_async_lock); 2953 ASSERT(mi->mi_threads == 0); 2954 ASSERT(mi->mi_manager_thread == NULL); 2955 2956 /* 2957 * If we are the inactive thread NULL mi_inactive_thread 2958 * then return. The inactive thread will detect MI4_DEAD 2959 * and call nfs_free_mi4 directly so that the cleanup and 2960 * thread exit can occur. 2961 */ 2962 if (mi->mi_inactive_thread == curthread) { 2963 mi->mi_inactive_thread = NULL; 2964 mutex_exit(&mi->mi_async_lock); 2965 return; 2966 } 2967 2968 /* 2969 * Wake up the inactive thread. 2970 */ 2971 cv_signal(&mi->mi_inact_req_cv); 2972 2973 /* 2974 * Wait for the inactive thread to exit. 2975 */ 2976 while (mi->mi_inactive_thread != NULL) { 2977 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2978 } 2979 2980 mutex_exit(&mi->mi_async_lock); 2981 2982 /* 2983 * Wait for the recovery thread to complete, that is, it will signal 2984 * when it is done using the "mi" structure and about to exit. 2985 */ 2986 mutex_enter(&mi->mi_lock); 2987 while (mi->mi_in_recovery > 0) 2988 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock); 2989 mutex_exit(&mi->mi_lock); 2990 2991 mutex_enter(&mi->mi_msg_list_lock); 2992 while (msgp = list_head(&mi->mi_msg_list)) { 2993 list_remove(&mi->mi_msg_list, msgp); 2994 nfs4_free_msg(msgp); 2995 } 2996 mutex_exit(&mi->mi_msg_list_lock); 2997 list_destroy(&mi->mi_msg_list); 2998 2999 if (mi->mi_rootfh != NULL) 3000 sfh4_rele(&mi->mi_rootfh); 3001 if (mi->mi_srvparentfh != NULL) 3002 sfh4_rele(&mi->mi_srvparentfh); 3003 3004 mutex_destroy(&mi->mi_lock); 3005 mutex_destroy(&mi->mi_async_lock); 3006 mutex_destroy(&mi->mi_msg_list_lock); 3007 nfs_rw_destroy(&mi->mi_recovlock); 3008 nfs_rw_destroy(&mi->mi_rename_lock); 3009 nfs_rw_destroy(&mi->mi_fh_lock); 3010 cv_destroy(&mi->mi_failover_cv); 3011 cv_destroy(&mi->mi_async_reqs_cv); 3012 cv_destroy(&mi->mi_async_work_cv); 3013 cv_destroy(&mi->mi_async_cv); 3014 cv_destroy(&mi->mi_inact_req_cv); 3015 3016 /* 3017 * Destroy the oo hash lists and mutexes for the cred hash table. 3018 */ 3019 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) { 3020 bucketp = &(mi->mi_oo_list[i]); 3021 /* Destroy any remaining open owners on the list */ 3022 foop = list_head(&bucketp->b_oo_hash_list); 3023 while (foop != NULL) { 3024 list_remove(&bucketp->b_oo_hash_list, foop); 3025 nfs4_destroy_open_owner(foop); 3026 foop = list_head(&bucketp->b_oo_hash_list); 3027 } 3028 list_destroy(&bucketp->b_oo_hash_list); 3029 mutex_destroy(&bucketp->b_lock); 3030 } 3031 3032 /* 3033 * Empty and destroy the freed open owner list. 
3034 */ 3035 foop = list_head(&mi->mi_foo_list); 3036 while (foop != NULL) { 3037 list_remove(&mi->mi_foo_list, foop); 3038 nfs4_destroy_open_owner(foop); 3039 foop = list_head(&mi->mi_foo_list); 3040 } 3041 3042 list_destroy(&mi->mi_foo_list); 3043 list_destroy(&mi->mi_bseqid_list); 3044 list_destroy(&mi->mi_lost_state); 3045 avl_destroy(&mi->mi_filehandles); 3046 fn_rele(&mi->mi_fname); 3047 nfs4_mi_zonelist_remove(mi); 3048 zone_rele(mi->mi_zone); 3049 3050 kmem_free(mi, sizeof (*mi)); 3051 } 3052 3053 vnode_t nfs4_xattr_notsupp_vnode; 3054 3055 void 3056 nfs4_clnt_init(void) 3057 { 3058 nfs4_vnops_init(); 3059 (void) nfs4_rnode_init(); 3060 (void) nfs4_shadow_init(); 3061 (void) nfs4_acache_init(); 3062 (void) nfs4_subr_init(); 3063 nfs4_acl_init(); 3064 nfs_idmap_init(); 3065 nfs4_callback_init(); 3066 nfs4_secinfo_init(); 3067 #ifdef DEBUG 3068 tsd_create(&nfs4_tsd_key, NULL); 3069 #endif 3070 3071 /* 3072 * Add a CPR callback so that we can update client 3073 * lease after a suspend and resume. 3074 */ 3075 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3076 3077 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3078 nfs4_mi_destroy); 3079 3080 /* 3081 * Initialise the reference count of the notsupp xattr cache vnode to 1 3082 * so that it never goes away (VOP_INACTIVE isn't called on it). 3083 */ 3084 nfs4_xattr_notsupp_vnode.v_count = 1; 3085 } 3086 3087 void 3088 nfs4_clnt_fini(void) 3089 { 3090 (void) zone_key_delete(mi4_list_key); 3091 nfs4_vnops_fini(); 3092 (void) nfs4_rnode_fini(); 3093 (void) nfs4_shadow_fini(); 3094 (void) nfs4_acache_fini(); 3095 (void) nfs4_subr_fini(); 3096 nfs_idmap_fini(); 3097 nfs4_callback_fini(); 3098 nfs4_secinfo_fini(); 3099 #ifdef DEBUG 3100 tsd_destroy(&nfs4_tsd_key); 3101 #endif 3102 if (cid) 3103 (void) callb_delete(cid); 3104 } 3105 3106 /*ARGSUSED*/ 3107 static boolean_t 3108 nfs4_client_cpr_callb(void *arg, int code) 3109 { 3110 /* 3111 * We get called for Suspend and Resume events. 3112 * For the suspend case we simply don't care! 3113 */ 3114 if (code == CB_CODE_CPR_CHKPT) { 3115 return (B_TRUE); 3116 } 3117 3118 /* 3119 * When we get to here we are in the process of 3120 * resuming the system from a previous suspend. 
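 *
 * Record the resume time below; the renew thread compares it against
 * the last renewal time and forces a RENEW, since the lease may well
 * have expired while the system was suspended.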
3121 */ 3122 nfs4_client_resumed = gethrestime_sec(); 3123 return (B_TRUE); 3124 } 3125 3126 void 3127 nfs4_renew_lease_thread(nfs4_server_t *sp) 3128 { 3129 int error = 0; 3130 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs; 3131 clock_t tick_delay = 0; 3132 clock_t time_left = 0; 3133 callb_cpr_t cpr_info; 3134 kmutex_t cpr_lock; 3135 3136 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3137 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp)); 3138 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 3139 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease"); 3140 3141 mutex_enter(&sp->s_lock); 3142 /* sp->s_lease_time is set via a GETATTR */ 3143 sp->last_renewal_time = gethrestime_sec(); 3144 sp->lease_valid = NFS4_LEASE_UNINITIALIZED; 3145 ASSERT(sp->s_refcnt >= 1); 3146 3147 for (;;) { 3148 if (!sp->state_ref_count || 3149 sp->lease_valid != NFS4_LEASE_VALID) { 3150 3151 kip_secs = MAX((sp->s_lease_time >> 1) - 3152 (3 * sp->propagation_delay.tv_sec), 1); 3153 3154 tick_delay = SEC_TO_TICK(kip_secs); 3155 3156 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3157 "nfs4_renew_lease_thread: no renew : thread " 3158 "wait %ld secs", kip_secs)); 3159 3160 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3161 "nfs4_renew_lease_thread: no renew : " 3162 "state_ref_count %d, lease_valid %d", 3163 sp->state_ref_count, sp->lease_valid)); 3164 3165 mutex_enter(&cpr_lock); 3166 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3167 mutex_exit(&cpr_lock); 3168 time_left = cv_timedwait(&sp->cv_thread_exit, 3169 &sp->s_lock, tick_delay + lbolt); 3170 mutex_enter(&cpr_lock); 3171 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3172 mutex_exit(&cpr_lock); 3173 3174 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3175 "nfs4_renew_lease_thread: no renew: " 3176 "time left %ld", time_left)); 3177 3178 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3179 goto die; 3180 continue; 3181 } 3182 3183 tmp_last_renewal_time = sp->last_renewal_time; 3184 3185 tmp_time = gethrestime_sec() - sp->last_renewal_time + 3186 (3 * sp->propagation_delay.tv_sec); 3187 3188 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3189 "nfs4_renew_lease_thread: tmp_time %ld, " 3190 "sp->last_renewal_time %ld", tmp_time, 3191 sp->last_renewal_time)); 3192 3193 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1); 3194 3195 tick_delay = SEC_TO_TICK(kip_secs); 3196 3197 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3198 "nfs4_renew_lease_thread: valid lease: sleep for %ld " 3199 "secs", kip_secs)); 3200 3201 mutex_enter(&cpr_lock); 3202 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3203 mutex_exit(&cpr_lock); 3204 time_left = cv_timedwait(&sp->cv_thread_exit, &sp->s_lock, 3205 tick_delay + lbolt); 3206 mutex_enter(&cpr_lock); 3207 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3208 mutex_exit(&cpr_lock); 3209 3210 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3211 "nfs4_renew_lease_thread: valid lease: time left %ld :" 3212 "sp last_renewal_time %ld, nfs4_client_resumed %ld, " 3213 "tmp_last_renewal_time %ld", time_left, 3214 sp->last_renewal_time, nfs4_client_resumed, 3215 tmp_last_renewal_time)); 3216 3217 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3218 goto die; 3219 3220 if (tmp_last_renewal_time == sp->last_renewal_time || 3221 (nfs4_client_resumed != 0 && 3222 nfs4_client_resumed > sp->last_renewal_time)) { 3223 /* 3224 * Issue RENEW op since we haven't renewed the lease 3225 * since we slept. 3226 */ 3227 tmp_now_time = gethrestime_sec(); 3228 error = nfs4renew(sp); 3229 /* 3230 * Need to re-acquire sp's lock, nfs4renew() 3231 * relinqueshes it. 
3232 */ 3233 mutex_enter(&sp->s_lock); 3234 3235 /* 3236 * See if someone changed s_thread_exit while we gave 3237 * up s_lock. 3238 */ 3239 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3240 goto die; 3241 3242 if (!error) { 3243 /* 3244 * check to see if we implicitly renewed while 3245 * we waited for a reply for our RENEW call. 3246 */ 3247 if (tmp_last_renewal_time == 3248 sp->last_renewal_time) { 3249 /* no implicit renew came */ 3250 sp->last_renewal_time = tmp_now_time; 3251 } else { 3252 NFS4_DEBUG(nfs4_client_lease_debug, 3253 (CE_NOTE, "renew_thread: did " 3254 "implicit renewal before reply " 3255 "from server for RENEW")); 3256 } 3257 } else { 3258 /* figure out error */ 3259 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3260 "renew_thread: nfs4renew returned error" 3261 " %d", error)); 3262 } 3263 3264 } 3265 } 3266 3267 die: 3268 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3269 "nfs4_renew_lease_thread: thread exiting")); 3270 3271 while (sp->s_otw_call_count != 0) { 3272 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3273 "nfs4_renew_lease_thread: waiting for outstanding " 3274 "otw calls to finish for sp 0x%p, current " 3275 "s_otw_call_count %d", (void *)sp, 3276 sp->s_otw_call_count)); 3277 mutex_enter(&cpr_lock); 3278 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3279 mutex_exit(&cpr_lock); 3280 cv_wait(&sp->s_cv_otw_count, &sp->s_lock); 3281 mutex_enter(&cpr_lock); 3282 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3283 mutex_exit(&cpr_lock); 3284 } 3285 mutex_exit(&sp->s_lock); 3286 3287 nfs4_server_rele(sp); /* free the thread's reference */ 3288 nfs4_server_rele(sp); /* free the list's reference */ 3289 sp = NULL; 3290 3291 done: 3292 mutex_enter(&cpr_lock); 3293 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */ 3294 mutex_destroy(&cpr_lock); 3295 3296 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3297 "nfs4_renew_lease_thread: renew thread exit officially")); 3298 3299 zthread_exit(); 3300 /* NOT REACHED */ 3301 } 3302 3303 /* 3304 * Send out a RENEW op to the server. 3305 * Assumes sp is locked down. 
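 *
 * The renew thread above calls this roughly every
 * (s_lease_time / 2) - 3 * propagation_delay seconds; with
 * illustrative numbers only, a 90 second lease and a 1 second
 * propagation delay give a cadence of about 45 - 3 = 42 seconds.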
3306 */ 3307 static int 3308 nfs4renew(nfs4_server_t *sp) 3309 { 3310 COMPOUND4args_clnt args; 3311 COMPOUND4res_clnt res; 3312 nfs_argop4 argop[1]; 3313 int doqueue = 1; 3314 int rpc_error; 3315 cred_t *cr; 3316 mntinfo4_t *mi; 3317 timespec_t prop_time, after_time; 3318 int needrecov = FALSE; 3319 nfs4_recov_state_t recov_state; 3320 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3321 3322 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3323 3324 recov_state.rs_flags = 0; 3325 recov_state.rs_num_retry_despite_err = 0; 3326 3327 recov_retry: 3328 mi = sp->mntinfo4_list; 3329 VFS_HOLD(mi->mi_vfsp); 3330 mutex_exit(&sp->s_lock); 3331 ASSERT(mi != NULL); 3332 3333 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3334 if (e.error) { 3335 VFS_RELE(mi->mi_vfsp); 3336 return (e.error); 3337 } 3338 3339 /* Check to see if we're dealing with a marked-dead sp */ 3340 mutex_enter(&sp->s_lock); 3341 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3342 mutex_exit(&sp->s_lock); 3343 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3344 VFS_RELE(mi->mi_vfsp); 3345 return (0); 3346 } 3347 3348 /* Make sure mi hasn't changed on us */ 3349 if (mi != sp->mntinfo4_list) { 3350 /* Must drop sp's lock to avoid a recursive mutex enter */ 3351 mutex_exit(&sp->s_lock); 3352 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3353 VFS_RELE(mi->mi_vfsp); 3354 mutex_enter(&sp->s_lock); 3355 goto recov_retry; 3356 } 3357 mutex_exit(&sp->s_lock); 3358 3359 args.ctag = TAG_RENEW; 3360 3361 args.array_len = 1; 3362 args.array = argop; 3363 3364 argop[0].argop = OP_RENEW; 3365 3366 mutex_enter(&sp->s_lock); 3367 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3368 cr = sp->s_cred; 3369 crhold(cr); 3370 mutex_exit(&sp->s_lock); 3371 3372 ASSERT(cr != NULL); 3373 3374 /* used to figure out RTT for sp */ 3375 gethrestime(&prop_time); 3376 3377 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3378 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first", 3379 (void*)sp)); 3380 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3381 prop_time.tv_sec, prop_time.tv_nsec)); 3382 3383 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3384 mntinfo4_t *, mi); 3385 3386 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3387 crfree(cr); 3388 3389 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3390 mntinfo4_t *, mi); 3391 3392 gethrestime(&after_time); 3393 3394 mutex_enter(&sp->s_lock); 3395 sp->propagation_delay.tv_sec = 3396 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3397 mutex_exit(&sp->s_lock); 3398 3399 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3400 after_time.tv_sec, after_time.tv_nsec)); 3401 3402 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3403 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3404 nfs4_delegreturn_all(sp); 3405 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3406 VFS_RELE(mi->mi_vfsp); 3407 /* 3408 * If the server returns CB_PATH_DOWN, it has renewed 3409 * the lease and informed us that the callback path is 3410 * down. Since the lease is renewed, just return 0 and 3411 * let the renew thread proceed as normal. 
3412 */ 3413 return (0); 3414 } 3415 3416 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3417 if (!needrecov && e.error) { 3418 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3419 VFS_RELE(mi->mi_vfsp); 3420 return (e.error); 3421 } 3422 3423 rpc_error = e.error; 3424 3425 if (needrecov) { 3426 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3427 "nfs4renew: initiating recovery\n")); 3428 3429 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3430 OP_RENEW, NULL) == FALSE) { 3431 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3432 VFS_RELE(mi->mi_vfsp); 3433 if (!e.error) 3434 (void) xdr_free(xdr_COMPOUND4res_clnt, 3435 (caddr_t)&res); 3436 mutex_enter(&sp->s_lock); 3437 goto recov_retry; 3438 } 3439 /* fall through for res.status case */ 3440 } 3441 3442 if (res.status) { 3443 if (res.status == NFS4ERR_LEASE_MOVED) { 3444 /*EMPTY*/ 3445 /* 3446 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3447 * to renew the lease on that server 3448 */ 3449 } 3450 e.error = geterrno4(res.status); 3451 } 3452 3453 if (!rpc_error) 3454 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3455 3456 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3457 3458 VFS_RELE(mi->mi_vfsp); 3459 3460 return (e.error); 3461 } 3462 3463 void 3464 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3465 { 3466 nfs4_server_t *sp; 3467 3468 /* this locks down sp if it is found */ 3469 sp = find_nfs4_server(mi); 3470 3471 if (sp != NULL) { 3472 nfs4_inc_state_ref_count_nolock(sp, mi); 3473 mutex_exit(&sp->s_lock); 3474 nfs4_server_rele(sp); 3475 } 3476 } 3477 3478 /* 3479 * Bump the number of OPEN files (ie: those with state) so we know if this 3480 * nfs4_server has any state to maintain a lease for or not. 3481 * 3482 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 3483 */ 3484 void 3485 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3486 { 3487 ASSERT(mutex_owned(&sp->s_lock)); 3488 3489 sp->state_ref_count++; 3490 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3491 "nfs4_inc_state_ref_count: state_ref_count now %d", 3492 sp->state_ref_count)); 3493 3494 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3495 sp->lease_valid = NFS4_LEASE_VALID; 3496 3497 /* 3498 * If this call caused the lease to be marked valid and/or 3499 * took the state_ref_count from 0 to 1, then start the time 3500 * on lease renewal. 3501 */ 3502 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3503 sp->last_renewal_time = gethrestime_sec(); 3504 3505 /* update the number of open files for mi */ 3506 mi->mi_open_files++; 3507 } 3508 3509 void 3510 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3511 { 3512 nfs4_server_t *sp; 3513 3514 /* this locks down sp if it is found */ 3515 sp = find_nfs4_server_all(mi, 1); 3516 3517 if (sp != NULL) { 3518 nfs4_dec_state_ref_count_nolock(sp, mi); 3519 mutex_exit(&sp->s_lock); 3520 nfs4_server_rele(sp); 3521 } 3522 } 3523 3524 /* 3525 * Decrement the number of OPEN files (ie: those with state) so we know if 3526 * this nfs4_server has any state to maintain a lease for or not. 
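 *
 * When mi_open_files for the mount drops to zero and
 * MI4_REMOVE_ON_LAST_CLOSE is set, the mntinfo4 is also removed from
 * the server's list (see the check at the end of this function).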
3527 */ 3528 void 3529 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3530 { 3531 ASSERT(mutex_owned(&sp->s_lock)); 3532 ASSERT(sp->state_ref_count != 0); 3533 sp->state_ref_count--; 3534 3535 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3536 "nfs4_dec_state_ref_count: state ref count now %d", 3537 sp->state_ref_count)); 3538 3539 mi->mi_open_files--; 3540 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3541 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3542 mi->mi_open_files, mi->mi_flags)); 3543 3544 /* We don't have to hold the mi_lock to test mi_flags */ 3545 if (mi->mi_open_files == 0 && 3546 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3547 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3548 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3549 "we have closed the last open file", (void*)mi)); 3550 nfs4_remove_mi_from_server(mi, sp); 3551 } 3552 } 3553 3554 bool_t 3555 inlease(nfs4_server_t *sp) 3556 { 3557 bool_t result; 3558 3559 ASSERT(mutex_owned(&sp->s_lock)); 3560 3561 if (sp->lease_valid == NFS4_LEASE_VALID && 3562 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3563 result = TRUE; 3564 else 3565 result = FALSE; 3566 3567 return (result); 3568 } 3569 3570 3571 /* 3572 * Return non-zero if the given nfs4_server_t is going through recovery. 3573 */ 3574 3575 int 3576 nfs4_server_in_recovery(nfs4_server_t *sp) 3577 { 3578 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3579 } 3580 3581 /* 3582 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3583 * first is less than, equal to, or greater than the second. 3584 */ 3585 3586 int 3587 sfh4cmp(const void *p1, const void *p2) 3588 { 3589 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3590 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3591 3592 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3593 } 3594 3595 /* 3596 * Create a table for shared filehandle objects. 3597 */ 3598 3599 void 3600 sfh4_createtab(avl_tree_t *tab) 3601 { 3602 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3603 offsetof(nfs4_sharedfh_t, sfh_tree)); 3604 } 3605 3606 /* 3607 * Return a shared filehandle object for the given filehandle. The caller 3608 * is responsible for eventually calling sfh4_rele(). 3609 */ 3610 3611 nfs4_sharedfh_t * 3612 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3613 { 3614 nfs4_sharedfh_t *sfh, *nsfh; 3615 avl_index_t where; 3616 nfs4_sharedfh_t skey; 3617 3618 if (!key) { 3619 skey.sfh_fh = *fh; 3620 key = &skey; 3621 } 3622 3623 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3624 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3625 /* 3626 * We allocate the largest possible filehandle size because it's 3627 * not that big, and it saves us from possibly having to resize the 3628 * buffer later. 
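 * (NFS4_FHSIZE is the protocol's 128 byte maximum, so the
 * over-allocation is modest.)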
3629 */ 3630 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3631 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3632 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3633 nsfh->sfh_refcnt = 1; 3634 nsfh->sfh_flags = SFH4_IN_TREE; 3635 nsfh->sfh_mi = mi; 3636 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3637 (void *)nsfh)); 3638 3639 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3640 sfh = avl_find(&mi->mi_filehandles, key, &where); 3641 if (sfh != NULL) { 3642 mutex_enter(&sfh->sfh_lock); 3643 sfh->sfh_refcnt++; 3644 mutex_exit(&sfh->sfh_lock); 3645 nfs_rw_exit(&mi->mi_fh_lock); 3646 /* free our speculative allocs */ 3647 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3648 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3649 return (sfh); 3650 } 3651 3652 avl_insert(&mi->mi_filehandles, nsfh, where); 3653 nfs_rw_exit(&mi->mi_fh_lock); 3654 3655 return (nsfh); 3656 } 3657 3658 /* 3659 * Return a shared filehandle object for the given filehandle. The caller 3660 * is responsible for eventually calling sfh4_rele(). 3661 */ 3662 3663 nfs4_sharedfh_t * 3664 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3665 { 3666 nfs4_sharedfh_t *sfh; 3667 nfs4_sharedfh_t key; 3668 3669 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3670 3671 #ifdef DEBUG 3672 if (nfs4_sharedfh_debug) { 3673 nfs4_fhandle_t fhandle; 3674 3675 fhandle.fh_len = fh->nfs_fh4_len; 3676 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3677 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3678 nfs4_printfhandle(&fhandle); 3679 } 3680 #endif 3681 3682 /* 3683 * If there's already an object for the given filehandle, bump the 3684 * reference count and return it. Otherwise, create a new object 3685 * and add it to the AVL tree. 3686 */ 3687 3688 key.sfh_fh = *fh; 3689 3690 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3691 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3692 if (sfh != NULL) { 3693 mutex_enter(&sfh->sfh_lock); 3694 sfh->sfh_refcnt++; 3695 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3696 "sfh4_get: found existing %p, new refcnt=%d", 3697 (void *)sfh, sfh->sfh_refcnt)); 3698 mutex_exit(&sfh->sfh_lock); 3699 nfs_rw_exit(&mi->mi_fh_lock); 3700 return (sfh); 3701 } 3702 nfs_rw_exit(&mi->mi_fh_lock); 3703 3704 return (sfh4_put(fh, mi, &key)); 3705 } 3706 3707 /* 3708 * Get a reference to the given shared filehandle object. 3709 */ 3710 3711 void 3712 sfh4_hold(nfs4_sharedfh_t *sfh) 3713 { 3714 ASSERT(sfh->sfh_refcnt > 0); 3715 3716 mutex_enter(&sfh->sfh_lock); 3717 sfh->sfh_refcnt++; 3718 NFS4_DEBUG(nfs4_sharedfh_debug, 3719 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3720 (void *)sfh, sfh->sfh_refcnt)); 3721 mutex_exit(&sfh->sfh_lock); 3722 } 3723 3724 /* 3725 * Release a reference to the given shared filehandle object and null out 3726 * the given pointer. 3727 */ 3728 3729 void 3730 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3731 { 3732 mntinfo4_t *mi; 3733 nfs4_sharedfh_t *sfh = *sfhpp; 3734 3735 ASSERT(sfh->sfh_refcnt > 0); 3736 3737 mutex_enter(&sfh->sfh_lock); 3738 if (sfh->sfh_refcnt > 1) { 3739 sfh->sfh_refcnt--; 3740 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3741 "sfh4_rele %p, new refcnt=%d", 3742 (void *)sfh, sfh->sfh_refcnt)); 3743 mutex_exit(&sfh->sfh_lock); 3744 goto finish; 3745 } 3746 mutex_exit(&sfh->sfh_lock); 3747 3748 /* 3749 * Possibly the last reference, so get the lock for the table in 3750 * case it's time to remove the object from the table. 
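 * Lock order is mi_fh_lock before sfh_lock, matching sfh4_put() and
 * sfh4_update().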
3751 */ 3752 mi = sfh->sfh_mi; 3753 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3754 mutex_enter(&sfh->sfh_lock); 3755 sfh->sfh_refcnt--; 3756 if (sfh->sfh_refcnt > 0) { 3757 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3758 "sfh4_rele %p, new refcnt=%d", 3759 (void *)sfh, sfh->sfh_refcnt)); 3760 mutex_exit(&sfh->sfh_lock); 3761 nfs_rw_exit(&mi->mi_fh_lock); 3762 goto finish; 3763 } 3764 3765 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3766 "sfh4_rele %p, last ref", (void *)sfh)); 3767 if (sfh->sfh_flags & SFH4_IN_TREE) { 3768 avl_remove(&mi->mi_filehandles, sfh); 3769 sfh->sfh_flags &= ~SFH4_IN_TREE; 3770 } 3771 mutex_exit(&sfh->sfh_lock); 3772 nfs_rw_exit(&mi->mi_fh_lock); 3773 mutex_destroy(&sfh->sfh_lock); 3774 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3775 kmem_free(sfh, sizeof (nfs4_sharedfh_t)); 3776 3777 finish: 3778 *sfhpp = NULL; 3779 } 3780 3781 /* 3782 * Update the filehandle for the given shared filehandle object. 3783 */ 3784 3785 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */ 3786 3787 void 3788 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh) 3789 { 3790 mntinfo4_t *mi = sfh->sfh_mi; 3791 nfs4_sharedfh_t *dupsfh; 3792 avl_index_t where; 3793 nfs4_sharedfh_t key; 3794 3795 #ifdef DEBUG 3796 mutex_enter(&sfh->sfh_lock); 3797 ASSERT(sfh->sfh_refcnt > 0); 3798 mutex_exit(&sfh->sfh_lock); 3799 #endif 3800 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE); 3801 3802 /* 3803 * The basic plan is to remove the shared filehandle object from 3804 * the table, update it to have the new filehandle, then reinsert 3805 * it. 3806 */ 3807 3808 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3809 mutex_enter(&sfh->sfh_lock); 3810 if (sfh->sfh_flags & SFH4_IN_TREE) { 3811 avl_remove(&mi->mi_filehandles, sfh); 3812 sfh->sfh_flags &= ~SFH4_IN_TREE; 3813 } 3814 mutex_exit(&sfh->sfh_lock); 3815 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len; 3816 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val, 3817 sfh->sfh_fh.nfs_fh4_len); 3818 3819 /* 3820 * XXX If there is already a shared filehandle object with the new 3821 * filehandle, we're in trouble, because the rnode code assumes 3822 * that there is only one shared filehandle object for a given 3823 * filehandle. So issue a warning (for read-write mounts only) 3824 * and don't try to re-insert the given object into the table. 3825 * Hopefully the given object will quickly go away and everyone 3826 * will use the new object. 3827 */ 3828 key.sfh_fh = *newfh; 3829 dupsfh = avl_find(&mi->mi_filehandles, &key, &where); 3830 if (dupsfh != NULL) { 3831 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) { 3832 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: " 3833 "duplicate filehandle detected"); 3834 sfh4_printfhandle(dupsfh); 3835 } 3836 } else { 3837 avl_insert(&mi->mi_filehandles, sfh, where); 3838 mutex_enter(&sfh->sfh_lock); 3839 sfh->sfh_flags |= SFH4_IN_TREE; 3840 mutex_exit(&sfh->sfh_lock); 3841 } 3842 nfs_rw_exit(&mi->mi_fh_lock); 3843 } 3844 3845 /* 3846 * Copy out the current filehandle for the given shared filehandle object. 
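 * The copy is made while holding mi_fh_lock as a reader, so a
 * concurrent sfh4_update() (which takes it as a writer) cannot change
 * the filehandle in mid-copy.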
3847 */ 3848 3849 void 3850 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp) 3851 { 3852 mntinfo4_t *mi = sfh->sfh_mi; 3853 3854 ASSERT(sfh->sfh_refcnt > 0); 3855 3856 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3857 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len; 3858 ASSERT(fhp->fh_len <= NFS4_FHSIZE); 3859 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len); 3860 nfs_rw_exit(&mi->mi_fh_lock); 3861 } 3862 3863 /* 3864 * Print out the filehandle for the given shared filehandle object. 3865 */ 3866 3867 void 3868 sfh4_printfhandle(const nfs4_sharedfh_t *sfh) 3869 { 3870 nfs4_fhandle_t fhandle; 3871 3872 sfh4_copyval(sfh, &fhandle); 3873 nfs4_printfhandle(&fhandle); 3874 } 3875 3876 /* 3877 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0 3878 * if they're the same, +1 if the first is "greater" than the second. The 3879 * caller (or whoever's calling the AVL package) is responsible for 3880 * handling locking issues. 3881 */ 3882 3883 static int 3884 fncmp(const void *p1, const void *p2) 3885 { 3886 const nfs4_fname_t *f1 = p1; 3887 const nfs4_fname_t *f2 = p2; 3888 int res; 3889 3890 res = strcmp(f1->fn_name, f2->fn_name); 3891 /* 3892 * The AVL package wants +/-1, not arbitrary positive or negative 3893 * integers. 3894 */ 3895 if (res > 0) 3896 res = 1; 3897 else if (res < 0) 3898 res = -1; 3899 return (res); 3900 } 3901 3902 /* 3903 * Get or create an fname with the given name, as a child of the given 3904 * fname. The caller is responsible for eventually releasing the reference 3905 * (fn_rele()). parent may be NULL. 3906 */ 3907 3908 nfs4_fname_t * 3909 fn_get(nfs4_fname_t *parent, char *name) 3910 { 3911 nfs4_fname_t key; 3912 nfs4_fname_t *fnp; 3913 avl_index_t where; 3914 3915 key.fn_name = name; 3916 3917 /* 3918 * If there's already an fname registered with the given name, bump 3919 * its reference count and return it. Otherwise, create a new one 3920 * and add it to the parent's AVL tree. 3921 */ 3922 3923 if (parent != NULL) { 3924 mutex_enter(&parent->fn_lock); 3925 fnp = avl_find(&parent->fn_children, &key, &where); 3926 if (fnp != NULL) { 3927 fn_hold(fnp); 3928 mutex_exit(&parent->fn_lock); 3929 return (fnp); 3930 } 3931 } 3932 3933 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP); 3934 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL); 3935 fnp->fn_parent = parent; 3936 if (parent != NULL) 3937 fn_hold(parent); 3938 fnp->fn_len = strlen(name); 3939 ASSERT(fnp->fn_len < MAXNAMELEN); 3940 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP); 3941 (void) strcpy(fnp->fn_name, name); 3942 fnp->fn_refcnt = 1; 3943 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t), 3944 offsetof(nfs4_fname_t, fn_tree)); 3945 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3946 "fn_get %p:%s, a new nfs4_fname_t!", 3947 (void *)fnp, fnp->fn_name)); 3948 if (parent != NULL) { 3949 avl_insert(&parent->fn_children, fnp, where); 3950 mutex_exit(&parent->fn_lock); 3951 } 3952 3953 return (fnp); 3954 } 3955 3956 void 3957 fn_hold(nfs4_fname_t *fnp) 3958 { 3959 atomic_add_32(&fnp->fn_refcnt, 1); 3960 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3961 "fn_hold %p:%s, new refcnt=%d", 3962 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 3963 } 3964 3965 /* 3966 * Decrement the reference count of the given fname, and destroy it if its 3967 * reference count goes to zero. Nulls out the given pointer. 
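 * Callers pass the address of their reference, e.g. fn_rele(&fnp),
 * so a stale pointer cannot be used after the final release.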
3968 */ 3969 3970 void 3971 fn_rele(nfs4_fname_t **fnpp) 3972 { 3973 nfs4_fname_t *parent; 3974 uint32_t newref; 3975 nfs4_fname_t *fnp; 3976 3977 recur: 3978 fnp = *fnpp; 3979 *fnpp = NULL; 3980 3981 mutex_enter(&fnp->fn_lock); 3982 parent = fnp->fn_parent; 3983 if (parent != NULL) 3984 mutex_enter(&parent->fn_lock); /* prevent new references */ 3985 newref = atomic_add_32_nv(&fnp->fn_refcnt, -1); 3986 if (newref > 0) { 3987 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3988 "fn_rele %p:%s, new refcnt=%d", 3989 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 3990 if (parent != NULL) 3991 mutex_exit(&parent->fn_lock); 3992 mutex_exit(&fnp->fn_lock); 3993 return; 3994 } 3995 3996 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3997 "fn_rele %p:%s, last reference, deleting...", 3998 (void *)fnp, fnp->fn_name)); 3999 if (parent != NULL) { 4000 avl_remove(&parent->fn_children, fnp); 4001 mutex_exit(&parent->fn_lock); 4002 } 4003 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4004 mutex_destroy(&fnp->fn_lock); 4005 avl_destroy(&fnp->fn_children); 4006 kmem_free(fnp, sizeof (nfs4_fname_t)); 4007 /* 4008 * Recursively fn_rele the parent. 4009 * Use goto instead of a recursive call to avoid stack overflow. 4010 */ 4011 if (parent != NULL) { 4012 fnpp = &parent; 4013 goto recur; 4014 } 4015 } 4016 4017 /* 4018 * Returns the single component name of the given fname, in a MAXNAMELEN 4019 * string buffer, which the caller is responsible for freeing. Note that 4020 * the name may become invalid as a result of fn_move(). 4021 */ 4022 4023 char * 4024 fn_name(nfs4_fname_t *fnp) 4025 { 4026 char *name; 4027 4028 ASSERT(fnp->fn_len < MAXNAMELEN); 4029 name = kmem_alloc(MAXNAMELEN, KM_SLEEP); 4030 mutex_enter(&fnp->fn_lock); 4031 (void) strcpy(name, fnp->fn_name); 4032 mutex_exit(&fnp->fn_lock); 4033 4034 return (name); 4035 } 4036 4037 4038 /* 4039 * fn_path_realloc 4040 * 4041 * This function, used only by fn_path, constructs 4042 * a new string which looks like "prepend" + "/" + "current", 4043 * by allocating a new string and freeing the old one. 4044 */ 4045 static void 4046 fn_path_realloc(char **curses, char *prepend) 4047 { 4048 int len, curlen = 0; 4049 char *news; 4050 4051 if (*curses == NULL) { 4052 /* 4053 * Prime the pump, allocate just the 4054 * space for prepend and return that. 4055 */ 4056 len = strlen(prepend) + 1; 4057 news = kmem_alloc(len, KM_SLEEP); 4058 (void) strncpy(news, prepend, len); 4059 } else { 4060 /* 4061 * Allocate the space for a new string; 4062 * +1 +1 is for the "/" and the NULL 4063 * byte at the end of it all. 4064 */ 4065 curlen = strlen(*curses); 4066 len = curlen + strlen(prepend) + 1 + 1; 4067 news = kmem_alloc(len, KM_SLEEP); 4068 (void) strncpy(news, prepend, len); 4069 (void) strcat(news, "/"); 4070 (void) strcat(news, *curses); 4071 kmem_free(*curses, curlen + 1); 4072 } 4073 *curses = news; 4074 } 4075 4076 /* 4077 * Returns the path name (starting from the fs root) for the given fname. 4078 * The caller is responsible for freeing. Note that the path may be or 4079 * become invalid as a result of fn_move(). 4080 */ 4081 4082 char * 4083 fn_path(nfs4_fname_t *fnp) 4084 { 4085 char *path; 4086 nfs4_fname_t *nextfnp; 4087 4088 if (fnp == NULL) 4089 return (NULL); 4090 4091 path = NULL; 4092 4093 /* walk up the tree constructing the pathname.
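 *
 * For example (illustrative), for an fname chain root -> "a" -> "b" -> "c",
 * the successive fn_path_realloc() calls build the result as:
 *
 *	fn_path_realloc(&path, "c");	path is "c"
 *	fn_path_realloc(&path, "b");	path is "b/c"
 *	fn_path_realloc(&path, "a");	path is "a/b/c"
 *
 * with a final call prepending the root fname's own name.  Each parent is
 * held (fn_hold) before the child's lock is dropped, and each fname is
 * released (fn_rele) once its name has been prepended, so the chain stays
 * pinned while we walk it.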
*/ 4094 4095 fn_hold(fnp); /* adjust for later rele */ 4096 do { 4097 mutex_enter(&fnp->fn_lock); 4098 /* 4099 * Add fn_name in front of the current path 4100 */ 4101 fn_path_realloc(&path, fnp->fn_name); 4102 nextfnp = fnp->fn_parent; 4103 if (nextfnp != NULL) 4104 fn_hold(nextfnp); 4105 mutex_exit(&fnp->fn_lock); 4106 fn_rele(&fnp); 4107 fnp = nextfnp; 4108 } while (fnp != NULL); 4109 4110 return (path); 4111 } 4112 4113 /* 4114 * Return a reference to the parent of the given fname, which the caller is 4115 * responsible for eventually releasing. 4116 */ 4117 4118 nfs4_fname_t * 4119 fn_parent(nfs4_fname_t *fnp) 4120 { 4121 nfs4_fname_t *parent; 4122 4123 mutex_enter(&fnp->fn_lock); 4124 parent = fnp->fn_parent; 4125 if (parent != NULL) 4126 fn_hold(parent); 4127 mutex_exit(&fnp->fn_lock); 4128 4129 return (parent); 4130 } 4131 4132 /* 4133 * Update fnp so that its parent is newparent and its name is newname. 4134 */ 4135 4136 void 4137 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname) 4138 { 4139 nfs4_fname_t *parent, *tmpfnp; 4140 ssize_t newlen; 4141 nfs4_fname_t key; 4142 avl_index_t where; 4143 4144 /* 4145 * This assert exists to catch the client trying to rename 4146 * a dir to be a child of itself. This happened at a recent 4147 * bakeoff against a 3rd party (broken) server which allowed 4148 * the rename to succeed. If it trips it means that: 4149 * a) the code in nfs4rename that detects this case is broken 4150 * b) the server is broken (since it allowed the bogus rename) 4151 * 4152 * For non-DEBUG kernels, prepare for a recursive mutex_enter 4153 * panic below from: mutex_enter(&newparent->fn_lock); 4154 */ 4155 ASSERT(fnp != newparent); 4156 4157 /* 4158 * Remove fnp from its current parent, change its name, then add it 4159 * to newparent. 4160 */ 4161 mutex_enter(&fnp->fn_lock); 4162 parent = fnp->fn_parent; 4163 mutex_enter(&parent->fn_lock); 4164 avl_remove(&parent->fn_children, fnp); 4165 mutex_exit(&parent->fn_lock); 4166 fn_rele(&fnp->fn_parent); 4167 4168 newlen = strlen(newname); 4169 if (newlen != fnp->fn_len) { 4170 ASSERT(newlen < MAXNAMELEN); 4171 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4172 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP); 4173 fnp->fn_len = newlen; 4174 } 4175 (void) strcpy(fnp->fn_name, newname); 4176 4177 again: 4178 mutex_enter(&newparent->fn_lock); 4179 key.fn_name = fnp->fn_name; 4180 tmpfnp = avl_find(&newparent->fn_children, &key, &where); 4181 if (tmpfnp != NULL) { 4182 /* 4183 * This could be due to a file that was unlinked while 4184 * open, or perhaps the rnode is in the free list. Remove 4185 * it from newparent and let it go away on its own. The 4186 * contorted code is to deal with lock order issues and 4187 * race conditions. 4188 */ 4189 fn_hold(tmpfnp); 4190 mutex_exit(&newparent->fn_lock); 4191 mutex_enter(&tmpfnp->fn_lock); 4192 if (tmpfnp->fn_parent == newparent) { 4193 mutex_enter(&newparent->fn_lock); 4194 avl_remove(&newparent->fn_children, tmpfnp); 4195 mutex_exit(&newparent->fn_lock); 4196 fn_rele(&tmpfnp->fn_parent); 4197 } 4198 mutex_exit(&tmpfnp->fn_lock); 4199 fn_rele(&tmpfnp); 4200 goto again; 4201 } 4202 fnp->fn_parent = newparent; 4203 fn_hold(newparent); 4204 avl_insert(&newparent->fn_children, fnp, where); 4205 mutex_exit(&newparent->fn_lock); 4206 mutex_exit(&fnp->fn_lock); 4207 } 4208 4209 #ifdef DEBUG 4210 /* 4211 * Return non-zero if the type information makes sense for the given vnode. 4212 * Otherwise panic. 
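 *
 * A typical (illustrative) use is as an ASSERT-style guard in DEBUG code:
 *
 *	ASSERT(nfs4_consistent_type(vp));
 *
 * Since the function always returns 1, the ASSERT itself never fires; the
 * check panics directly, and only when the nfs4_vtype_debug tunable is set
 * and both v_type and the cached va_type carry real (non-VNON) types that
 * disagree.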
4213 */ 4214 int 4215 nfs4_consistent_type(vnode_t *vp) 4216 { 4217 rnode4_t *rp = VTOR4(vp); 4218 4219 if (nfs4_vtype_debug && vp->v_type != VNON && 4220 rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) { 4221 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, " 4222 "rnode attr type=%d", (void *)vp, vp->v_type, 4223 rp->r_attr.va_type); 4224 } 4225 4226 return (1); 4227 } 4228 #endif /* DEBUG */ 4229