/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All Rights Reserved
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/dnlc.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/acl.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <sys/disp.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>

#include <sys/ddi.h>

/*
 * Arguments to page-flush thread.
 */
typedef struct {
	vnode_t *vp;
	cred_t *cr;
} pgflush_t;

#ifdef DEBUG
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

uint_t nfs4_tsd_key;
#endif

static time_t	nfs4_client_resumed = 0;
static callb_id_t cid = 0;

static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);
static void	flush_pages(vnode_t *, cred_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);

struct mi4_globals {
	kmutex_t	mig_lock;  /* lock protecting mig_list */
	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	boolean_t	mig_destructor_called;
};

static zone_key_t mi4_list_key;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_time_attr_inval)
 * which tells whether the attributes are valid.  The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached.  This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time.  There are minimum and maximum
 * timeout values that can be set per mount point.
 */

/*
 * If a cache purge is in progress, wait for it to finish.
 *
 * The current thread must not be in the middle of an
 * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
 * between this thread, a recovery thread, and the page flush thread.
 */
int
nfs4_waitfor_purge_complete(vnode_t *vp)
{
	rnode4_t *rp;
	k_sigset_t smask;

	rp = VTOR4(vp);
	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
		    ((rp->r_flags & R4PGFLUSH) &&
		    rp->r_pgflush != curthread)) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				sigunintr(&smask);
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		sigunintr(&smask);
		mutex_exit(&rp->r_statelock);
	}
	return (0);
}

/*
 * Validate caches by checking cached attributes.  If they have timed out,
 * then get new attributes from the server.  As a side effect, cache
 * invalidation is done if the attributes have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs4_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	nfs4_ga_res_t gar;

	if (ATTRCACHE4_VALID(vp)) {
		error = nfs4_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	gar.n4g_va.va_mask = AT_ALL;
	return (nfs4_getattr_otw(vp, &gar, cr, 0));
}

/*
 * Fill in attribute from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
{
	rnode4_t *rp;

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	mutex_enter(&rp->r_statev4_lock);
	if (ATTRCACHE4_VALID(vp)) {
		mutex_exit(&rp->r_statev4_lock);
		/*
		 * Cached attributes are valid
		 */
		*vap = rp->r_attr;
		mutex_exit(&rp->r_statelock);
		return (0);
	}
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);
	return (1);
}


/*
 * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
 * call is synchronous because all the pages were invalidated by the
 * nfs4_invalidate_pages() call.
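 *
 * A typical caller pattern, as in nfs4_getattr_otw() below, is to map the
 * NFSv4 status to an errno and then call this routine, which is a no-op
 * unless that error is ESTALE:
 *
 *	e.error = geterrno4(e.stat);
 *	nfs4_purge_stale_fh(e.error, vp, cr);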
 */
void
nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
{
	struct rnode4 *rp = VTOR4(vp);

	/* Ensure that the ..._end_op() call has been done */
	ASSERT(tsd_get(nfs4_tsd_key) == NULL);

	if (errno != ESTALE)
		return;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= R4STALE;
	if (!rp->r_error)
		rp->r_error = errno;
	mutex_exit(&rp->r_statelock);
	if (nfs4_has_pages(vp))
		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
}

/*
 * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
 * page purge is done asynchronously.
 */
void
nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
{
	rnode4_t *rp;
	char *contents;
	vnode_t *xattr;
	int size;
	int pgflush;		/* are we the page flush thread? */

	/*
	 * Purge the DNLC for any entries which refer to this file.
	 */
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
		dnlc_purge_vp(vp);

	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4LOOKUP;
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;

	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * Purge pathconf cache too.
	 */
	rp->r_pathconf.pc4_xattr_valid = 0;
	rp->r_pathconf.pc4_cache_valid = 0;

	pgflush = (curthread == rp->r_pgflush);
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	if (xattr != NULL)
		VN_RELE(xattr);

	/*
	 * Flush the page cache.  If the current thread is the page flush
	 * thread, don't initiate a new page flush.  There's no need for
	 * it, and doing it correctly is hard.
	 */
	if (nfs4_has_pages(vp) && !pgflush) {
		if (!asyncpg) {
			(void) nfs4_waitfor_purge_complete(vp);
			flush_pages(vp, cr);
		} else {
			pgflush_t *args;

			/*
			 * We don't hold r_statelock while creating the
			 * thread, in case the call blocks.  So we use a
			 * flag to indicate that a page flush thread is
			 * active.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & R4PGFLUSH) {
				mutex_exit(&rp->r_statelock);
			} else {
				rp->r_flags |= R4PGFLUSH;
				mutex_exit(&rp->r_statelock);

				args = kmem_alloc(sizeof (pgflush_t),
				    KM_SLEEP);
				args->vp = vp;
				VN_HOLD(args->vp);
				args->cr = cr;
				crhold(args->cr);
				(void) zthread_create(NULL, 0,
				    nfs4_pgflush_thread, args, 0,
				    minclsyspri);
			}
		}
	}

	/*
	 * Flush the readdir response cache.
	 */
	nfs4_purge_rddir_cache(vp);
}

/*
 * Invalidate all pages for the given file, after writing back the dirty
 * ones.
 */

static void
flush_pages(vnode_t *vp, cred_t *cr)
{
	int error;
	rnode4_t *rp = VTOR4(vp);

	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
	if (error == ENOSPC || error == EDQUOT) {
		mutex_enter(&rp->r_statelock);
		if (!rp->r_error)
			rp->r_error = error;
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Page flush thread.
 */

static void
nfs4_pgflush_thread(pgflush_t *args)
{
	rnode4_t *rp = VTOR4(args->vp);

	/* remember which thread we are, so we don't deadlock ourselves */
	mutex_enter(&rp->r_statelock);
	ASSERT(rp->r_pgflush == NULL);
	rp->r_pgflush = curthread;
	mutex_exit(&rp->r_statelock);

	flush_pages(args->vp, args->cr);

	mutex_enter(&rp->r_statelock);
	rp->r_pgflush = NULL;
	rp->r_flags &= ~R4PGFLUSH;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);

	VN_RELE(args->vp);
	crfree(args->cr);
	kmem_free(args, sizeof (pgflush_t));
	zthread_exit();
}

/*
 * Purge the readdir cache of all entries which are not currently
 * being filled.
 */
void
nfs4_purge_rddir_cache(vnode_t *vp)
{
	rnode4_t *rp;

	rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	rp->r_direof = NULL;
	rp->r_flags &= ~R4LOOKUP;
	rp->r_flags |= R4READDIRWATTR;
	rddir4_cache_purge(rp);
	mutex_exit(&rp->r_statelock);
}

/*
 * Set attributes cache for given vnode using virtual attributes.  There is
 * no cache validation, but if the attributes are deemed to be stale, they
 * are ignored.  This corresponds to nfs3_attrcache().
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 */
void
nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
{
	rnode4_t *rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	if (rp->r_time_attr_saved <= t)
		nfs4_attrcache_va(vp, garp, FALSE);
	mutex_exit(&rp->r_statelock);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed;
	int ctime_changed;
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * probably made.  If there is an inconsistency in what is
		 * cached, mark them invalid.  If not, don't act on them.
		 */
		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and the dir didn't change on the server before the dirmod op
		 * and the dir didn't change after the dirmod op but before the
		 * getattr, then there's a chance that the client's cached data
		 * for this object is current (not stale).  No immediate cache
		 * flush is required.
		 *
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;

			mtime_changed = 0;
			ctime_changed = 0;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do: flush all caches for the dir and remove
			 * the attr timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;
			else
				mtime_changed = 0;
			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;
			else
				ctime_changed = 0;
		} else {
			mtime_changed = 0;
			ctime_changed = 0;
		}
	}

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
static void
nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	hrtime_t delta;
	hrtime_t now;
	vattr_t *vap = &garp->n4g_va;

	rp = VTOR4(vp);

	ASSERT(MUTEX_HELD(&rp->r_statelock));
	ASSERT(vap->va_mask == AT_ALL);

	/* Switch to master before checking v_flag */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	now = gethrtime();

	mi = VTOMI4(vp);

	/*
	 * Only establish a new cache timeout (if requested).  Never
	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
	 * is done by nfs4_update_dircaches (ancestor in our call chain)
	 */
	if (set_cache_timeout && ! rp->r_time_cache_inval)
		rp->r_time_cache_inval = now + mi->mi_acdirmax;

	/*
	 * Delta is the number of nanoseconds that we will
	 * cache the attributes of the file.  It is based on
	 * the number of nanoseconds since the last time that
	 * we detected a change.  The assumption is that files
	 * that changed recently are likely to change again.
	 * There are, however, enforced minimum and maximum
	 * values for regular files and for directories.
	 *
	 * Using the time since last change was detected
	 * eliminates direct comparison or calculation
	 * using mixed client and server times.  NFS does
	 * not make any assumptions regarding the client
	 * and server clocks being synchronized.
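	 *
	 * For example (illustrative values only): with mi_acregmin of
	 * 3 seconds and mi_acregmax of 60 seconds, a regular file whose
	 * attributes last changed 10 seconds ago gets a delta of 10
	 * seconds, one that changed 1 second ago is clamped up to 3
	 * seconds, and one that has not changed for five minutes is
	 * clamped down to 60 seconds.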
	 */
	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    vap->va_size != rp->r_attr.va_size) {
		rp->r_time_attr_saved = now;
	}

	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_time_attr_saved;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_time_attr_inval = now + delta;

	rp->r_attr = *vap;
	if (garp->n4g_change_valid)
		rp->r_change = garp->n4g_change;

	/*
	 * The attributes that were returned may be valid and can
	 * be used, but they may not be allowed to be cached.
	 * Reset the timers to cause immediate invalidation and
	 * clear r_change so no VERIFY operations will succeed.
	 */
	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
		rp->r_time_attr_inval = now;
		rp->r_time_attr_saved = now;
		rp->r_change = 0;
	}

	/*
	 * If mounted_on_fileid returned AND the object is a stub,
	 * then set object's va_nodeid to the mounted over fid
	 * returned by server.
	 *
	 * If mounted_on_fileid not provided/supported, then
	 * just set it to 0 for now.  Eventually it would be
	 * better to set it to a hashed version of FH.  This
	 * would probably be good enough to provide a unique
	 * fid/d_ino within a dir.
	 *
	 * We don't need to carry mounted_on_fileid in the
	 * rnode as long as the client never requests fileid
	 * without also requesting mounted_on_fileid.  For
	 * now, it stays.
	 */
	if (garp->n4g_mon_fid_valid) {
		rp->r_mntd_fid = garp->n4g_mon_fid;

		if (RP_ISSTUB(rp))
			rp->r_attr.va_nodeid = rp->r_mntd_fid;
	}

	/*
	 * Check to see if there are valid pathconf bits to
	 * cache in the rnode.
	 */
	if (garp->n4g_ext_res) {
		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
		} else {
			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
				rp->r_pathconf.pc4_xattr_valid = TRUE;
				rp->r_pathconf.pc4_xattr_exists =
				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
			}
		}
	}
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != vap->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
		rp->r_size = vap->va_size;
	}
	nfs_setswaplike(vp, vap);
	rp->r_flags &= ~R4WRITEMODIFIED;
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
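 *
 * A minimal call, following nfs4_validate_caches() above (the ACL is not
 * requested):
 *
 *	gar.n4g_va.va_mask = AT_ALL;
 *	error = nfs4_getattr_otw(vp, &gar, cr, 0);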
 */
int
nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
{
	mntinfo4_t *mi = VTOMI4(vp);
	hrtime_t t;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:

	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
	    &recov_state, NULL))) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		return (e.error);
	}

	t = gethrtime();

	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);

	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
			    &recov_state, 1);
			goto recov_retry;
		}
	}

	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);

	if (!e.error) {
		if (e.stat == NFS4_OK) {
			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
		} else {
			e.error = geterrno4(e.stat);

			nfs4_purge_stale_fh(e.error, vp, cr);
		}
	}

	/*
	 * If we got attributes for a node that is a stub for a crossed
	 * mount point, keep the original secinfo flavor for
	 * the current file system, not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	return (e.error);
}

/*
 * Generate a compound to get attributes over-the-wire.
 */
void
nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
    nfs4_error_t *ep, cred_t *cr, int get_acl)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	rnode4_t *rp = VTOR4(vp);
	nfs_argop4 argop[2];

	args.ctag = TAG_GETATTR;

	args.array_len = 2;
	args.array = argop;

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr */
	/*
	 * Unlike nfs version 2 and 3, where getattr returns all the
	 * attributes, nfs version 4 returns only the ones explicitly
	 * asked for.  This creates problems, as some system functions
	 * (e.g. cache check) require certain attributes and if the
	 * cached node lacks some attributes such as uid/gid, it can
	 * affect system utilities (e.g. "ls") that rely on the information
	 * to be there.  This can lead to anything from system crashes to
	 * corrupted information processed by user apps.
	 * So to ensure that all bases are covered, request at least
	 * the AT_ALL attribute mask.
	 */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	if (get_acl)
		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	doqueue = 1;

	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);

	if (ep->error)
		return;

	if (res.status != NFS4_OK) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}

/*
 * Return either cached or remote attributes.  If we get remote attributes,
 * use them to check and invalidate caches, then cache the new attributes.
 */
int
nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
{
	int error;
	rnode4_t *rp;
	nfs4_ga_res_t gar;

	ASSERT(nfs4_consistent_type(vp));

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.  Either way, use the cached attributes for
	 * the caller's vattr_t.
	 *
	 * Note that we ignore the gar set by the OTW call: the attr caching
	 * code may make adjustments when storing to the rnode, and we want
	 * to see those changes here.
	 */
	rp = VTOR4(vp);
	error = 0;
	mutex_enter(&rp->r_statelock);
	if (!ATTRCACHE4_VALID(vp)) {
		mutex_exit(&rp->r_statelock);
		error = nfs4_getattr_otw(vp, &gar, cr, 0);
		mutex_enter(&rp->r_statelock);
	}

	if (!error)
		*vap = rp->r_attr;

	/* Return the client's view of file size */
	vap->va_size = rp->r_size;

	mutex_exit(&rp->r_statelock);

	ASSERT(nfs4_consistent_type(vp));

	return (error);
}

int
nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
    nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	nfs_argop4 argop[2];
	mntinfo4_t *mi = VTOMI4(vp);
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_ext_res_t *gerp;

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = tag_type;

	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* getattr */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	doqueue = 1;

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (!needrecov && e.error) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
		    needrecov);
		return (e.error);
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_attr_otw: initiating recovery\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL);
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
		    needrecov);
		if (!e.error) {
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			e.error = geterrno4(res.status);
		}
		if (abort == FALSE)
			goto recov_retry;
		return (e.error);
	}

	if (res.status) {
		e.error = geterrno4(res.status);
	} else {
		gerp = garp->n4g_ext_res;
		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    garp, sizeof (nfs4_ga_res_t));
		garp->n4g_ext_res = gerp;
		if (garp->n4g_ext_res &&
		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
			bcopy(res.array[1].nfs_resop4_u.opgetattr.
			    ga_res.n4g_ext_res,
			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
	    needrecov);
	return (e.error);
}

/*
 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount.  The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs4_async_putpage and nfs4_async_start.
 */

static void	nfs4_async_start(struct vfs *);

static void
free_async_args4(struct nfs4_async_reqs *args)
{
	rnode4_t *rp;

	if (args->a_io != NFS4_INACTIVE) {
		rp = VTOR4(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS4_PUTAPAGE ||
		    args->a_io == NFS4_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done.  It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 */
void
nfs4_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo4_t *mi;
	uint_t max_threads;

	mi = VFTOMI4(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	mutex_enter(&mi->mi_lock);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount is really going away.
	 *
	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
	 * outstanding requests.
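	 *
	 * (For example, nfs4_async_pageio() below re-checks mi_max_threads
	 * under mi_async_lock and, if it is zero, branches to its "noasync"
	 * path, where an ordinary same-zone write ends up calling the
	 * passed-in pageio routine directly in the caller's context.)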
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	while (!(mi->mi_flags & MI4_ASYNC_MGR_STOP) ||
	    mi->mi_async_req_count > 0) {
		mutex_exit(&mi->mi_lock);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value.  This means that future
			 * (debugger-induced) alterations of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads <
			    MAX(mi->mi_max_threads, max_threads)) {
				mi->mi_threads++;
				mutex_exit(&mi->mi_async_lock);
				MI4_HOLD(mi);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0, nfs4_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			cv_signal(&mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	/*
	 * Wake up the inactive thread.
	 */
	cv_broadcast(&mi->mi_inact_req_cv);
	/*
	 * Wake up anyone sitting in nfs4_async_manager_stop()
	 */
	cv_broadcast(&mi->mi_async_cv);
	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);	/* release thread's hold */
	MI4_RELE(mi);
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 */
void
nfs4_async_manager_stop(vfs_t *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	mutex_enter(&mi->mi_async_lock);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
	mutex_exit(&mi->mi_lock);
	cv_broadcast(&mi->mi_async_reqs_cv);
	/*
	 * Wait for the async manager thread to die.
	 */
	while (mi->mi_manager_thread != NULL)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
}

int
nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
    struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
    u_offset_t, caddr_t, struct seg *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If addr falls in a different segment, don't bother doing readahead.
	 */
	if (addr >= seg->s_base + seg->s_size)
		return (-1);

	/*
	 * If we can't allocate a request structure, punt on the readahead.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		return (-1);

	/*
	 * If a lock operation is pending, don't initiate any new
	 * readaheads.  Otherwise, bump r_count to indicate the new
	 * asynchronous I/O.
	 */
	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
		kmem_free(args, sizeof (*args));
		return (-1);
	}
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	nfs_rw_exit(&rp->r_lkserlock);

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READ_AHEAD;
	args->a_nfs4_readahead = readahead;
	args->a_nfs4_blkoff = blkoff;
	args->a_nfs4_seg = seg;
	args->a_nfs4_addr = addr;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, don't bother readahead.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	VN_RELE(vp);
	crfree(cr);
	kmem_free(args, sizeof (*args));
	return (-1);
}

/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type.  Requests are taken
 * from the queues in a round-robin fashion.  A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.  This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs4_async_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system.  Specifically over the
 * wire calls are cpr-unsafe.  The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
static void
nfs4_async_start(struct vfs *vfsp)
{
	struct nfs4_async_reqs *args;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	extern int nfs_async_timeout;

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
			args = *mi->mi_async_curr;
			if (args != NULL)
				break;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				if (--mi->mi_threads == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				MI4_RELE(mi);
				zthread_exit();
				/* NOTREACHED */
			}
			time_left = cv_timedwait(&mi->mi_async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout + lbolt);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		} else {
			time_left = 1;
		}

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr = args->a_next;
		if (*mi->mi_async_curr == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}

		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 */
		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs4_readahead)(args->a_vp,
			    args->a_nfs4_blkoff, args->a_nfs4_addr,
			    args->a_nfs4_seg, args->a_cred);
		} else if (args->a_io == NFS4_PUTAPAGE) {
			(void) (*args->a_nfs4_putapage)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_PAGEIO) {
			(void) (*args->a_nfs4_pageio)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_READDIR) {
			(void) ((*args->a_nfs4_readdir)(args->a_vp,
			    args->a_nfs4_rdc, args->a_cred));
		} else if (args->a_io == NFS4_COMMIT) {
			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
			    args->a_nfs4_offset, args->a_nfs4_count,
			    args->a_cred);
		} else if (args->a_io == NFS4_INACTIVE) {
			nfs4_inactive_otw(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args4(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}

/*
 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
 * part of VOP_INACTIVE.
 */

void
nfs4_inactive_thread(mntinfo4_t *mi)
{
	struct nfs4_async_reqs *args;
	callb_cpr_t cprinfo;
	vfs_t *vfsp = mi->mi_vfsp;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_inactive_thread");

	for (;;) {
		mutex_enter(&mi->mi_async_lock);
		args = mi->mi_async_reqs[NFS4_INACTIVE];
		if (args == NULL) {
			mutex_enter(&mi->mi_lock);
			/*
			 * We don't want to exit until the async manager is done
			 * with its work; hence the check for mi_manager_thread
			 * being NULL.
			 *
			 * The async manager thread will cv_broadcast() on
			 * mi_inact_req_cv when it's done, at which point we'll
			 * wake up and exit.
			 */
			if (mi->mi_manager_thread == NULL)
				goto die;
			mi->mi_flags |= MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			cv_signal(&mi->mi_async_cv);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
			mutex_exit(&mi->mi_async_lock);
		} else {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
			mutex_exit(&mi->mi_async_lock);
			nfs4_inactive_otw(args->a_vp, args->a_cred);
			crfree(args->a_cred);
			kmem_free(args, sizeof (*args));
		}
	}
die:
	mutex_exit(&mi->mi_lock);
	mi->mi_inactive_thread = NULL;
	cv_signal(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));

	MI4_RELE(mi);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * nfs4_async_stop:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete; nfs4_async_stop_sig() without interruptibility.
 */
void
nfs4_async_stop(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs4_async_stop_sig:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete.  If a signal is delivered we will abort and return non-zero;
 * otherwise return 0.  Since this routine is called from nfs4_unmount, we
 * need to make it interruptible.
 */
int
nfs4_async_stop_sig(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	ushort_t omax;
	bool_t intr = FALSE;

	/*
	 * Wait for all outstanding putpage operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
			intr = TRUE;
			goto interrupted;
		}
	}

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			if (!cv_wait_sig(&mi->mi_async_cv,
			    &mi->mi_async_lock)) {
				intr = TRUE;
				goto interrupted;
			}
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
interrupted:
	if (intr)
		mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);

	return (intr);
}

int
nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PUTAPAGE;
	args->a_nfs4_putapage = putapage;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = off;
	args->a_nfs4_len = (uint_t)len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() == mi->mi_zone) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * or we have run out of memory or we're attempting to
		 * unmount, we refuse to do a sync write, because this may
		 * hang pageout/fsflush and the machine.  In this case,
		 * we just re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	/*
	 * We'll get here only if (nfs_zone() != mi->mi_zone)
	 * which means that this was a cross-zone sync putpage.
	 *
	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
	 * as dirty and unlock them.
	 *
	 * We don't want to clear B_FORCE here as the caller presumably
	 * knows what they're doing if they set it.
	 */
	pvn_write_done(pp, flags | B_ERROR);
	return (EPERM);
}

int
nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PAGEIO;
	args->a_nfs4_pageio = pageio;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = io_off;
	args->a_nfs4_len = (uint_t)io_len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS4_PAGEIO] = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS4_PAGEIO] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine).  In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
}

void
nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
    int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	rp = VTOR4(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, skip the readdir.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_READDIR;
	args->a_nfs4_readdir = readdir;
	args->a_nfs4_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then skip this request
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
		mi->mi_async_reqs[NFS4_READDIR] = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
		mi->mi_async_tail[NFS4_READDIR] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	mutex_enter(&rp->r_statelock);
	rdc->entries = NULL;
	/*
	 * Indicate that no one is trying to fill this entry and
	 * it still needs to be filled.
	 */
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
}

void
nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
    cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;
	page_t *pp;

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
1976 */ 1977 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1978 goto noasync; 1979 1980 args->a_next = NULL; 1981 #ifdef DEBUG 1982 args->a_queuer = curthread; 1983 #endif 1984 VN_HOLD(vp); 1985 args->a_vp = vp; 1986 ASSERT(cr != NULL); 1987 crhold(cr); 1988 args->a_cred = cr; 1989 args->a_io = NFS4_COMMIT; 1990 args->a_nfs4_commit = commit; 1991 args->a_nfs4_plist = plist; 1992 args->a_nfs4_offset = offset; 1993 args->a_nfs4_count = count; 1994 1995 mutex_enter(&mi->mi_async_lock); 1996 1997 /* 1998 * If asyncio has been disabled, then make a synchronous request. 1999 * This check is done a second time in case async io was diabled 2000 * while this thread was blocked waiting for memory pressure to 2001 * reduce or for the queue to drain. 2002 */ 2003 if (mi->mi_max_threads == 0) { 2004 mutex_exit(&mi->mi_async_lock); 2005 2006 VN_RELE(vp); 2007 crfree(cr); 2008 kmem_free(args, sizeof (*args)); 2009 goto noasync; 2010 } 2011 2012 /* 2013 * Link request structure into the async list and 2014 * wakeup async thread to do the i/o. 2015 */ 2016 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) { 2017 mi->mi_async_reqs[NFS4_COMMIT] = args; 2018 mi->mi_async_tail[NFS4_COMMIT] = args; 2019 } else { 2020 mi->mi_async_tail[NFS4_COMMIT]->a_next = args; 2021 mi->mi_async_tail[NFS4_COMMIT] = args; 2022 } 2023 2024 mutex_enter(&rp->r_statelock); 2025 rp->r_count++; 2026 mutex_exit(&rp->r_statelock); 2027 2028 if (mi->mi_io_kstats) { 2029 mutex_enter(&mi->mi_lock); 2030 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2031 mutex_exit(&mi->mi_lock); 2032 } 2033 2034 mi->mi_async_req_count++; 2035 ASSERT(mi->mi_async_req_count != 0); 2036 cv_signal(&mi->mi_async_reqs_cv); 2037 mutex_exit(&mi->mi_async_lock); 2038 return; 2039 2040 noasync: 2041 if (curproc == proc_pageout || curproc == proc_fsflush || 2042 nfs_zone() != mi->mi_zone) { 2043 while (plist != NULL) { 2044 pp = plist; 2045 page_sub(&plist, pp); 2046 pp->p_fsdata = C_COMMIT; 2047 page_unlock(pp); 2048 } 2049 return; 2050 } 2051 (*commit)(vp, plist, offset, count, cr); 2052 } 2053 2054 /* 2055 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The 2056 * reference to the vnode is handed over to the thread; the caller should 2057 * no longer refer to the vnode. 2058 * 2059 * Unlike most of the async routines, this handoff is needed for 2060 * correctness reasons, not just performance. So doing operations in the 2061 * context of the current thread is not an option. 2062 */ 2063 void 2064 nfs4_async_inactive(vnode_t *vp, cred_t *cr) 2065 { 2066 mntinfo4_t *mi; 2067 struct nfs4_async_reqs *args; 2068 boolean_t signal_inactive_thread = B_FALSE; 2069 2070 mi = VTOMI4(vp); 2071 2072 args = kmem_alloc(sizeof (*args), KM_SLEEP); 2073 args->a_next = NULL; 2074 #ifdef DEBUG 2075 args->a_queuer = curthread; 2076 #endif 2077 args->a_vp = vp; 2078 ASSERT(cr != NULL); 2079 crhold(cr); 2080 args->a_cred = cr; 2081 args->a_io = NFS4_INACTIVE; 2082 2083 /* 2084 * Note that we don't check mi->mi_max_threads here, since we 2085 * *need* to get rid of this vnode regardless of whether someone 2086 * set nfs4_max_threads to zero in /etc/system. 2087 * 2088 * The manager thread knows about this and is willing to create 2089 * at least one thread to accommodate us. 
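 *
 * Roughly, the flow below is: if the inactive thread has already
 * exited (as happens during zone shutdown), the rnode is torn down
 * in place - the delegation is discarded, the open streams are
 * cleared and rp4_addfree() is called.  Otherwise the request is
 * queued on mi_async_reqs[NFS4_INACTIVE], and either
 * mi_inact_req_cv (queue was empty or no manager thread) or
 * mi_async_reqs_cv is signalled.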
2090 */ 2091 mutex_enter(&mi->mi_async_lock); 2092 if (mi->mi_inactive_thread == NULL) { 2093 rnode4_t *rp; 2094 vnode_t *unldvp = NULL; 2095 char *unlname; 2096 cred_t *unlcred; 2097 2098 mutex_exit(&mi->mi_async_lock); 2099 /* 2100 * We just need to free up the memory associated with the 2101 * vnode, which can be safely done from within the current 2102 * context. 2103 */ 2104 crfree(cr); /* drop our reference */ 2105 kmem_free(args, sizeof (*args)); 2106 rp = VTOR4(vp); 2107 mutex_enter(&rp->r_statelock); 2108 if (rp->r_unldvp != NULL) { 2109 unldvp = rp->r_unldvp; 2110 rp->r_unldvp = NULL; 2111 unlname = rp->r_unlname; 2112 rp->r_unlname = NULL; 2113 unlcred = rp->r_unlcred; 2114 rp->r_unlcred = NULL; 2115 } 2116 mutex_exit(&rp->r_statelock); 2117 /* 2118 * No need to explicitly throw away any cached pages. The 2119 * eventual r4inactive() will attempt a synchronous 2120 * VOP_PUTPAGE() which will immediately fail since the request 2121 * is coming from the wrong zone, and then will proceed to call 2122 * nfs4_invalidate_pages() which will clean things up for us. 2123 * 2124 * Throw away the delegation here so rp4_addfree()'s attempt to 2125 * return any existing delegations becomes a no-op. 2126 */ 2127 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 2128 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 2129 FALSE); 2130 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 2131 nfs_rw_exit(&mi->mi_recovlock); 2132 } 2133 nfs4_clear_open_streams(rp); 2134 2135 rp4_addfree(rp, cr); 2136 if (unldvp != NULL) { 2137 kmem_free(unlname, MAXNAMELEN); 2138 VN_RELE(unldvp); 2139 crfree(unlcred); 2140 } 2141 return; 2142 } 2143 2144 if (mi->mi_manager_thread == NULL) { 2145 /* 2146 * We want to talk to the inactive thread. 2147 */ 2148 signal_inactive_thread = B_TRUE; 2149 } 2150 2151 /* 2152 * Enqueue the vnode and wake up either the special thread (empty 2153 * list) or an async thread. 2154 */ 2155 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) { 2156 mi->mi_async_reqs[NFS4_INACTIVE] = args; 2157 mi->mi_async_tail[NFS4_INACTIVE] = args; 2158 signal_inactive_thread = B_TRUE; 2159 } else { 2160 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args; 2161 mi->mi_async_tail[NFS4_INACTIVE] = args; 2162 } 2163 if (signal_inactive_thread) { 2164 cv_signal(&mi->mi_inact_req_cv); 2165 } else { 2166 mi->mi_async_req_count++; 2167 ASSERT(mi->mi_async_req_count != 0); 2168 cv_signal(&mi->mi_async_reqs_cv); 2169 } 2170 2171 mutex_exit(&mi->mi_async_lock); 2172 } 2173 2174 int 2175 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2176 { 2177 int pagecreate; 2178 int n; 2179 int saved_n; 2180 caddr_t saved_base; 2181 u_offset_t offset; 2182 int error; 2183 int sm_error; 2184 vnode_t *vp = RTOV(rp); 2185 2186 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2187 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2188 if (!vpm_enable) { 2189 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2190 } 2191 2192 /* 2193 * Move bytes in at most PAGESIZE chunks. We must avoid 2194 * spanning pages in uiomove() because page faults may cause 2195 * the cache to be invalidated out from under us. The r_size is not 2196 * updated until after the uiomove. If we push the last page of a 2197 * file before r_size is correct, we will lose the data written past 2198 * the current (and invalid) r_size. 
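 *
 * As a rough worked example (assuming an 8K PAGESIZE; the actual
 * value is platform-dependent): with uio_loffset == 12388 and
 * tcount == 10000, the first pass copies
 * n = MIN(8192 - (12388 & 8191), 10000) = 3996 bytes, which lands
 * exactly on the next page boundary; the second pass then copies
 * MIN(8192, 6004) = 6004 bytes.  No single uiomove() ever spans a
 * page boundary.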
2199 */
2200 do {
2201 offset = uio->uio_loffset;
2202 pagecreate = 0;
2203
2204 /*
2205 * n is the number of bytes required to satisfy the request
2206 * or the number of bytes to fill out the page.
2207 */
2208 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2209
2210 /*
2211 * Check to see if we can skip reading in the page
2212 * and just allocate the memory. We can do this
2213 * if we are going to rewrite the entire mapping
2214 * or if we are going to write to or beyond the current
2215 * end of file from the beginning of the mapping.
2216 *
2217 * The read of r_size is now protected by r_statelock.
2218 */
2219 mutex_enter(&rp->r_statelock);
2220 /*
2221 * When pgcreated is nonzero the caller has already done
2222 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2223 * segkpm this means we already have at least one page
2224 * created and mapped at base.
2225 */
2226 pagecreate = pgcreated ||
2227 ((offset & PAGEOFFSET) == 0 &&
2228 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2229
2230 mutex_exit(&rp->r_statelock);
2231
2232 if (!vpm_enable && pagecreate) {
2233 /*
2234 * The last argument tells segmap_pagecreate() to
2235 * always lock the page, as opposed to sometimes
2236 * returning with the page locked. This way we avoid a
2237 * fault on the ensuing uiomove(), but also
2238 * more importantly (to fix bug 1094402) we can
2239 * call segmap_fault() to unlock the page in all
2240 * cases. An alternative would be to modify
2241 * segmap_pagecreate() to tell us when it is
2242 * locking a page, but that's a fairly major
2243 * interface change.
2244 */
2245 if (pgcreated == 0)
2246 (void) segmap_pagecreate(segkmap, base,
2247 (uint_t)n, 1);
2248 saved_base = base;
2249 saved_n = n;
2250 }
2251
2252 /*
2253 * The number of bytes of data in the last page cannot be
2254 * accurately determined while the page is being uiomove'd to
2255 * and the size of the file is being updated.
2256 * Thus, inform threads which need to know accurately
2257 * how much data is in the last page of the file. They
2258 * will not do the i/o immediately, but will arrange for
2259 * the i/o to happen later when this modify operation
2260 * has finished.
2261 */
2262 ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2263 mutex_enter(&rp->r_statelock);
2264 rp->r_flags |= R4MODINPROGRESS;
2265 rp->r_modaddr = (offset & MAXBMASK);
2266 mutex_exit(&rp->r_statelock);
2267
2268 if (vpm_enable) {
2269 /*
2270 * Copy data. If new pages are created, part of
2271 * the page that is not written will be initialized
2272 * with zeros.
2273 */
2274 error = vpm_data_copy(vp, offset, n, uio,
2275 !pagecreate, NULL, 0, S_WRITE);
2276 } else {
2277 error = uiomove(base, n, UIO_WRITE, uio);
2278 }
2279
2280 /*
2281 * r_size is the maximum number of
2282 * bytes known to be in the file.
2283 * Make sure it is at least as high as the
2284 * first unwritten byte pointed to by uio_loffset.
2285 */
2286 mutex_enter(&rp->r_statelock);
2287 if (rp->r_size < uio->uio_loffset)
2288 rp->r_size = uio->uio_loffset;
2289 rp->r_flags &= ~R4MODINPROGRESS;
2290 rp->r_flags |= R4DIRTY;
2291 mutex_exit(&rp->r_statelock);
2292
2293 /* n = # of bytes written */
2294 n = (int)(uio->uio_loffset - offset);
2295
2296 if (!vpm_enable) {
2297 base += n;
2298 }
2299
2300 tcount -= n;
2301 /*
2302 * If we created pages w/o initializing them completely,
2303 * we need to zero the part that wasn't set up.
2304 * This happens in most EOF write cases and if
2305 * we had some sort of error during the uiomove.
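 *
 * Note that base has already been advanced past the n bytes that
 * uiomove() copied, so in the common page-aligned case the
 * kzero(base, PAGESIZE - n) below clears just the uninitialized
 * tail of the newly created page (e.g. a 100-byte write into a
 * fresh page zeroes the remaining PAGESIZE - 100 bytes).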
2306 */ 2307 if (!vpm_enable && pagecreate) { 2308 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2309 (void) kzero(base, PAGESIZE - n); 2310 2311 if (pgcreated) { 2312 /* 2313 * Caller is responsible for this page, 2314 * it was not created in this loop. 2315 */ 2316 pgcreated = 0; 2317 } else { 2318 /* 2319 * For bug 1094402: segmap_pagecreate locks 2320 * page. Unlock it. This also unlocks the 2321 * pages allocated by page_create_va() in 2322 * segmap_pagecreate(). 2323 */ 2324 sm_error = segmap_fault(kas.a_hat, segkmap, 2325 saved_base, saved_n, 2326 F_SOFTUNLOCK, S_WRITE); 2327 if (error == 0) 2328 error = sm_error; 2329 } 2330 } 2331 } while (tcount > 0 && error == 0); 2332 2333 return (error); 2334 } 2335 2336 int 2337 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2338 { 2339 rnode4_t *rp; 2340 page_t *pp; 2341 u_offset_t eoff; 2342 u_offset_t io_off; 2343 size_t io_len; 2344 int error; 2345 int rdirty; 2346 int err; 2347 2348 rp = VTOR4(vp); 2349 ASSERT(rp->r_count > 0); 2350 2351 if (!nfs4_has_pages(vp)) 2352 return (0); 2353 2354 ASSERT(vp->v_type != VCHR); 2355 2356 /* 2357 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2358 * writes. B_FORCE is set to force the VM system to actually 2359 * invalidate the pages, even if the i/o failed. The pages 2360 * need to get invalidated because they can't be written out 2361 * because there isn't any space left on either the server's 2362 * file system or in the user's disk quota. The B_FREE bit 2363 * is cleared to avoid confusion as to whether this is a 2364 * request to place the page on the freelist or to destroy 2365 * it. 2366 */ 2367 if ((rp->r_flags & R4OUTOFSPACE) || 2368 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2369 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2370 2371 if (len == 0) { 2372 /* 2373 * If doing a full file synchronous operation, then clear 2374 * the R4DIRTY bit. If a page gets dirtied while the flush 2375 * is happening, then R4DIRTY will get set again. The 2376 * R4DIRTY bit must get cleared before the flush so that 2377 * we don't lose this information. 2378 * 2379 * If there are no full file async write operations 2380 * pending and RDIRTY bit is set, clear it. 2381 */ 2382 if (off == (u_offset_t)0 && 2383 !(flags & B_ASYNC) && 2384 (rp->r_flags & R4DIRTY)) { 2385 mutex_enter(&rp->r_statelock); 2386 rdirty = (rp->r_flags & R4DIRTY); 2387 rp->r_flags &= ~R4DIRTY; 2388 mutex_exit(&rp->r_statelock); 2389 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2390 mutex_enter(&rp->r_statelock); 2391 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2392 rdirty = (rp->r_flags & R4DIRTY); 2393 rp->r_flags &= ~R4DIRTY; 2394 } 2395 mutex_exit(&rp->r_statelock); 2396 } else 2397 rdirty = 0; 2398 2399 /* 2400 * Search the entire vp list for pages >= off, and flush 2401 * the dirty pages. 2402 */ 2403 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2404 flags, cr); 2405 2406 /* 2407 * If an error occurred and the file was marked as dirty 2408 * before and we aren't forcibly invalidating pages, then 2409 * reset the R4DIRTY flag. 2410 */ 2411 if (error && rdirty && 2412 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2413 mutex_enter(&rp->r_statelock); 2414 rp->r_flags |= R4DIRTY; 2415 mutex_exit(&rp->r_statelock); 2416 } 2417 } else { 2418 /* 2419 * Do a range from [off...off + len) looking for pages 2420 * to deal with. 
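 *
 * Sketch of the loop that follows: walk io_off from off towards
 * eoff (bounded by r_size), look each page up with page_lookup()
 * or page_lookup_nowait() depending on the flags, and hand dirty
 * pages to r_putapage.  The io_off/io_len values returned by
 * r_putapage describe the cluster actually written, so the loop
 * can skip ahead past pages handled in a single call.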
2421 */ 2422 error = 0; 2423 io_len = 0; 2424 eoff = off + len; 2425 mutex_enter(&rp->r_statelock); 2426 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2427 io_off += io_len) { 2428 mutex_exit(&rp->r_statelock); 2429 /* 2430 * If we are not invalidating, synchronously 2431 * freeing or writing pages use the routine 2432 * page_lookup_nowait() to prevent reclaiming 2433 * them from the free list. 2434 */ 2435 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2436 pp = page_lookup(vp, io_off, 2437 (flags & (B_INVAL | B_FREE)) ? 2438 SE_EXCL : SE_SHARED); 2439 } else { 2440 pp = page_lookup_nowait(vp, io_off, 2441 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2442 } 2443 2444 if (pp == NULL || !pvn_getdirty(pp, flags)) 2445 io_len = PAGESIZE; 2446 else { 2447 err = (*rp->r_putapage)(vp, pp, &io_off, 2448 &io_len, flags, cr); 2449 if (!error) 2450 error = err; 2451 /* 2452 * "io_off" and "io_len" are returned as 2453 * the range of pages we actually wrote. 2454 * This allows us to skip ahead more quickly 2455 * since several pages may've been dealt 2456 * with by this iteration of the loop. 2457 */ 2458 } 2459 mutex_enter(&rp->r_statelock); 2460 } 2461 mutex_exit(&rp->r_statelock); 2462 } 2463 2464 return (error); 2465 } 2466 2467 void 2468 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2469 { 2470 rnode4_t *rp; 2471 2472 rp = VTOR4(vp); 2473 if (IS_SHADOW(vp, rp)) 2474 vp = RTOV4(rp); 2475 mutex_enter(&rp->r_statelock); 2476 while (rp->r_flags & R4TRUNCATE) 2477 cv_wait(&rp->r_cv, &rp->r_statelock); 2478 rp->r_flags |= R4TRUNCATE; 2479 if (off == (u_offset_t)0) { 2480 rp->r_flags &= ~R4DIRTY; 2481 if (!(rp->r_flags & R4STALE)) 2482 rp->r_error = 0; 2483 } 2484 rp->r_truncaddr = off; 2485 mutex_exit(&rp->r_statelock); 2486 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2487 B_INVAL | B_TRUNC, cr); 2488 mutex_enter(&rp->r_statelock); 2489 rp->r_flags &= ~R4TRUNCATE; 2490 cv_broadcast(&rp->r_cv); 2491 mutex_exit(&rp->r_statelock); 2492 } 2493 2494 static int 2495 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2496 { 2497 mntinfo4_t *mi; 2498 struct mntinfo_kstat *mik; 2499 vfs_t *vfsp; 2500 2501 /* this is a read-only kstat. Bail out on a write */ 2502 if (rw == KSTAT_WRITE) 2503 return (EACCES); 2504 2505 2506 /* 2507 * We don't want to wait here as kstat_chain_lock could be held by 2508 * dounmount(). dounmount() takes vfs_reflock before the chain lock 2509 * and thus could lead to a deadlock. 2510 */ 2511 vfsp = (struct vfs *)ksp->ks_private; 2512 2513 mi = VFTOMI4(vfsp); 2514 mik = (struct mntinfo_kstat *)ksp->ks_data; 2515 2516 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2517 2518 mik->mik_vers = (uint32_t)mi->mi_vers; 2519 mik->mik_flags = mi->mi_flags; 2520 /* 2521 * The sv_secdata holds the flavor the client specifies. 2522 * If the client uses default and a security negotiation 2523 * occurs, sv_currsec will point to the current flavor 2524 * selected from the server flavor list. 2525 * sv_currsec is NULL if no security negotiation takes place. 2526 */ 2527 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 
2528 mi->mi_curr_serv->sv_currsec->secmod : 2529 mi->mi_curr_serv->sv_secdata->secmod; 2530 mik->mik_curread = (uint32_t)mi->mi_curread; 2531 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2532 mik->mik_retrans = mi->mi_retrans; 2533 mik->mik_timeo = mi->mi_timeo; 2534 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2535 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2536 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2537 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2538 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2539 mik->mik_failover = (uint32_t)mi->mi_failover; 2540 mik->mik_remap = (uint32_t)mi->mi_remap; 2541 2542 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2543 2544 return (0); 2545 } 2546 2547 void 2548 nfs4_mnt_kstat_init(struct vfs *vfsp) 2549 { 2550 mntinfo4_t *mi = VFTOMI4(vfsp); 2551 2552 /* 2553 * PSARC 2001/697 Contract Private Interface 2554 * All nfs kstats are under SunMC contract 2555 * Please refer to the PSARC listed above and contact 2556 * SunMC before making any changes! 2557 * 2558 * Changes must be reviewed by Solaris File Sharing 2559 * Changes must be communicated to contract-2001-697@sun.com 2560 * 2561 */ 2562 2563 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2564 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2565 if (mi->mi_io_kstats) { 2566 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2567 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2568 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2569 kstat_install(mi->mi_io_kstats); 2570 } 2571 2572 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2573 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2574 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2575 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2576 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2577 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2578 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2579 kstat_install(mi->mi_ro_kstats); 2580 } 2581 2582 nfs4_mnt_recov_kstat_init(vfsp); 2583 } 2584 2585 void 2586 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2587 { 2588 mntinfo4_t *mi; 2589 2590 mi = VTOMI4(vp); 2591 /* 2592 * In case of forced unmount, do not print any messages 2593 * since it can flood the console with error messages. 2594 */ 2595 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2596 return; 2597 2598 /* 2599 * If the mount point is dead, not recoverable, do not 2600 * print error messages that can flood the console. 2601 */ 2602 if (mi->mi_flags & MI4_RECOV_FAIL) 2603 return; 2604 2605 /* 2606 * No use in flooding the console with ENOSPC 2607 * messages from the same file system. 
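 *
 * The check below rate-limits ENOSPC/EDQUOT reporting: once such an
 * error has been printed, mi_printftime is pushed forward by
 * nfs_write_error_interval * hz ticks and further ENOSPC/EDQUOT
 * errors on this mount are suppressed until lbolt passes that time.
 * Other write errors are always reported.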
2608 */ 2609 if ((error != ENOSPC && error != EDQUOT) || 2610 lbolt - mi->mi_printftime > 0) { 2611 zoneid_t zoneid = mi->mi_zone->zone_id; 2612 2613 #ifdef DEBUG 2614 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2615 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2616 #else 2617 nfs_perror(error, "NFS write error on host %s: %m.\n", 2618 VTOR4(vp)->r_server->sv_hostname, NULL); 2619 #endif 2620 if (error == ENOSPC || error == EDQUOT) { 2621 zcmn_err(zoneid, CE_CONT, 2622 "^File: userid=%d, groupid=%d\n", 2623 crgetuid(cr), crgetgid(cr)); 2624 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2625 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2626 zcmn_err(zoneid, CE_CONT, 2627 "^User: userid=%d, groupid=%d\n", 2628 crgetuid(curthread->t_cred), 2629 crgetgid(curthread->t_cred)); 2630 } 2631 mi->mi_printftime = lbolt + 2632 nfs_write_error_interval * hz; 2633 } 2634 sfh4_printfhandle(VTOR4(vp)->r_fh); 2635 #ifdef DEBUG 2636 if (error == EACCES) { 2637 zcmn_err(zoneid, CE_CONT, 2638 "nfs_bio: cred is%s kcred\n", 2639 cr == kcred ? "" : " not"); 2640 } 2641 #endif 2642 } 2643 } 2644 2645 /* 2646 * Return non-zero if the given file can be safely memory mapped. Locks 2647 * are safe if whole-file (length and offset are both zero). 2648 */ 2649 2650 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2651 2652 static int 2653 nfs4_safemap(const vnode_t *vp) 2654 { 2655 locklist_t *llp, *next_llp; 2656 int safe = 1; 2657 rnode4_t *rp = VTOR4(vp); 2658 2659 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2660 2661 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2662 "vp = %p", (void *)vp)); 2663 2664 /* 2665 * Review all the locks for the vnode, both ones that have been 2666 * acquired and ones that are pending. We assume that 2667 * flk_active_locks_for_vp() has merged any locks that can be 2668 * merged (so that if a process has the entire file locked, it is 2669 * represented as a single lock). 2670 * 2671 * Note that we can't bail out of the loop if we find a non-safe 2672 * lock, because we have to free all the elements in the llp list. 2673 * We might be able to speed up this code slightly by not looking 2674 * at each lock's l_start and l_len fields once we've found a 2675 * non-safe lock. 2676 */ 2677 2678 llp = flk_active_locks_for_vp(vp); 2679 while (llp) { 2680 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2681 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2682 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2683 if (!SAFE_LOCK(llp->ll_flock)) { 2684 safe = 0; 2685 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2686 "nfs4_safemap: unsafe active lock (%" PRId64 2687 ", %" PRId64 ")", llp->ll_flock.l_start, 2688 llp->ll_flock.l_len)); 2689 } 2690 next_llp = llp->ll_next; 2691 VN_RELE(llp->ll_vp); 2692 kmem_free(llp, sizeof (*llp)); 2693 llp = next_llp; 2694 } 2695 2696 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2697 safe ? "safe" : "unsafe")); 2698 return (safe); 2699 } 2700 2701 /* 2702 * Return whether there is a lost LOCK or LOCKU queued up for the given 2703 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 
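 *
 * "Unsafe" here is the same test used by nfs4_safemap(): only
 * whole-file locks (l_start == 0 and l_len == 0, per SAFE_LOCK) are
 * compatible with mmap, so any queued lost LOCK/LOCKU on this vnode
 * that covers less than the whole file is reported as a conflict.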
2704 */ 2705 2706 bool_t 2707 nfs4_map_lost_lock_conflict(vnode_t *vp) 2708 { 2709 bool_t conflict = FALSE; 2710 nfs4_lost_rqst_t *lrp; 2711 mntinfo4_t *mi = VTOMI4(vp); 2712 2713 mutex_enter(&mi->mi_lock); 2714 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2715 lrp = list_next(&mi->mi_lost_state, lrp)) { 2716 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2717 continue; 2718 ASSERT(lrp->lr_vp != NULL); 2719 if (!VOP_CMP(lrp->lr_vp, vp, NULL)) 2720 continue; /* different file */ 2721 if (!SAFE_LOCK(*lrp->lr_flk)) { 2722 conflict = TRUE; 2723 break; 2724 } 2725 } 2726 2727 mutex_exit(&mi->mi_lock); 2728 return (conflict); 2729 } 2730 2731 /* 2732 * nfs_lockcompletion: 2733 * 2734 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2735 * as non cachable (set VNOCACHE bit). 2736 */ 2737 2738 void 2739 nfs4_lockcompletion(vnode_t *vp, int cmd) 2740 { 2741 rnode4_t *rp = VTOR4(vp); 2742 2743 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2744 ASSERT(!IS_SHADOW(vp, rp)); 2745 2746 if (cmd == F_SETLK || cmd == F_SETLKW) { 2747 2748 if (!nfs4_safemap(vp)) { 2749 mutex_enter(&vp->v_lock); 2750 vp->v_flag |= VNOCACHE; 2751 mutex_exit(&vp->v_lock); 2752 } else { 2753 mutex_enter(&vp->v_lock); 2754 vp->v_flag &= ~VNOCACHE; 2755 mutex_exit(&vp->v_lock); 2756 } 2757 } 2758 /* 2759 * The cached attributes of the file are stale after acquiring 2760 * the lock on the file. They were updated when the file was 2761 * opened, but not updated when the lock was acquired. Therefore the 2762 * cached attributes are invalidated after the lock is obtained. 2763 */ 2764 PURGE_ATTRCACHE4(vp); 2765 } 2766 2767 /* ARGSUSED */ 2768 static void * 2769 nfs4_mi_init(zoneid_t zoneid) 2770 { 2771 struct mi4_globals *mig; 2772 2773 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2774 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2775 list_create(&mig->mig_list, sizeof (mntinfo4_t), 2776 offsetof(mntinfo4_t, mi_zone_node)); 2777 mig->mig_destructor_called = B_FALSE; 2778 return (mig); 2779 } 2780 2781 /* 2782 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down 2783 * state and killing off threads. 2784 */ 2785 /* ARGSUSED */ 2786 static void 2787 nfs4_mi_shutdown(zoneid_t zoneid, void *data) 2788 { 2789 struct mi4_globals *mig = data; 2790 mntinfo4_t *mi; 2791 nfs4_server_t *np; 2792 2793 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2794 "nfs4_mi_shutdown zone %d\n", zoneid)); 2795 ASSERT(mig != NULL); 2796 for (;;) { 2797 mutex_enter(&mig->mig_lock); 2798 mi = list_head(&mig->mig_list); 2799 if (mi == NULL) { 2800 mutex_exit(&mig->mig_lock); 2801 break; 2802 } 2803 2804 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2805 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp)); 2806 /* 2807 * purge the DNLC for this filesystem 2808 */ 2809 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2810 /* 2811 * Tell existing async worker threads to exit. 2812 */ 2813 mutex_enter(&mi->mi_async_lock); 2814 mi->mi_max_threads = 0; 2815 cv_broadcast(&mi->mi_async_work_cv); 2816 /* 2817 * Set the appropriate flags, signal and wait for both the 2818 * async manager and the inactive thread to exit when they're 2819 * done with their current work. 
2820 */
2821 mutex_enter(&mi->mi_lock);
2822 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2823 mutex_exit(&mi->mi_lock);
2824 mutex_exit(&mi->mi_async_lock);
2825 if (mi->mi_manager_thread) {
2826 nfs4_async_manager_stop(mi->mi_vfsp);
2827 }
2828 if (mi->mi_inactive_thread) {
2829 mutex_enter(&mi->mi_async_lock);
2830 cv_signal(&mi->mi_inact_req_cv);
2831 /*
2832 * Wait for the inactive thread to exit.
2833 */
2834 while (mi->mi_inactive_thread != NULL) {
2835 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2836 }
2837 mutex_exit(&mi->mi_async_lock);
2838 }
2839 /*
2840 * Wait for the recovery thread to complete; that is, it will
2841 * signal when it is done using the "mi" structure and about
2842 * to exit.
2843 */
2844 mutex_enter(&mi->mi_lock);
2845 while (mi->mi_in_recovery > 0)
2846 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2847 mutex_exit(&mi->mi_lock);
2848 /*
2849 * We're done when every mi has been done or the list is empty.
2850 * This one is done; remove it from the list.
2851 */
2852 list_remove(&mig->mig_list, mi);
2853 mutex_exit(&mig->mig_lock);
2854 zone_rele(mi->mi_zone);
2855 /*
2856 * Release the hold on the vfs and mi taken to prevent a race with
2857 * zone shutdown. This releases the hold taken in nfs4_mi_zonelist_add.
2858 */
2859 VFS_RELE(mi->mi_vfsp);
2860 MI4_RELE(mi);
2861 }
2862 /*
2863 * Tell each renew thread in the zone to exit.
2864 */
2865 mutex_enter(&nfs4_server_lst_lock);
2866 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2867 mutex_enter(&np->s_lock);
2868 if (np->zoneid == zoneid) {
2869 /*
2870 * We add another hold onto the nfs4_server_t
2871 * because this will make sure that the nfs4_server_t
2872 * stays around until nfs4_callback_fini_zone destroys
2873 * the zone. This way, the renew thread can
2874 * unconditionally release its holds on the
2875 * nfs4_server_t.
2876 */
2877 np->s_refcnt++;
2878 nfs4_mark_srv_dead(np);
2879 }
2880 mutex_exit(&np->s_lock);
2881 }
2882 mutex_exit(&nfs4_server_lst_lock);
2883 }
2884
2885 static void
2886 nfs4_mi_free_globals(struct mi4_globals *mig)
2887 {
2888 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2889 mutex_destroy(&mig->mig_lock);
2890 kmem_free(mig, sizeof (*mig));
2891 }
2892
2893 /* ARGSUSED */
2894 static void
2895 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2896 {
2897 struct mi4_globals *mig = data;
2898
2899 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2900 "nfs4_mi_destroy zone %d\n", zoneid));
2901 ASSERT(mig != NULL);
2902 mutex_enter(&mig->mig_lock);
2903 if (list_head(&mig->mig_list) != NULL) {
2904 /* Still waiting for VFS_FREEVFS() */
2905 mig->mig_destructor_called = B_TRUE;
2906 mutex_exit(&mig->mig_lock);
2907 return;
2908 }
2909 nfs4_mi_free_globals(mig);
2910 }
2911
2912 /*
2913 * Add an NFS mount to the per-zone list of NFS mounts.
2914 */
2915 void
2916 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2917 {
2918 struct mi4_globals *mig;
2919
2920 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2921 mutex_enter(&mig->mig_lock);
2922 list_insert_head(&mig->mig_list, mi);
2923 /*
2924 * Hold added to eliminate a race with zone shutdown - this will be
2925 * released in nfs4_mi_shutdown.
2926 */
2927 MI4_HOLD(mi);
2928 VFS_HOLD(mi->mi_vfsp);
2929 mutex_exit(&mig->mig_lock);
2930 }
2931
2932 /*
2933 * Remove an NFS mount from the per-zone list of NFS mounts.
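 *
 * Returns non-zero if this call dropped the zone list's holds on the
 * mount (that is, the mount was still on the list and had not
 * already been marked MI4_DEAD by zone shutdown), and zero otherwise.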
2934 */ 2935 int 2936 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 2937 { 2938 struct mi4_globals *mig; 2939 int ret = 0; 2940 2941 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2942 mutex_enter(&mig->mig_lock); 2943 mutex_enter(&mi->mi_lock); 2944 /* if this mi is marked dead, then the zone already released it */ 2945 if (!(mi->mi_flags & MI4_DEAD)) { 2946 list_remove(&mig->mig_list, mi); 2947 2948 /* release the holds put on in zonelist_add(). */ 2949 VFS_RELE(mi->mi_vfsp); 2950 MI4_RELE(mi); 2951 ret = 1; 2952 } 2953 mutex_exit(&mi->mi_lock); 2954 2955 /* 2956 * We can be called asynchronously by VFS_FREEVFS() after the zone 2957 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2958 * mi globals. 2959 */ 2960 if (list_head(&mig->mig_list) == NULL && 2961 mig->mig_destructor_called == B_TRUE) { 2962 nfs4_mi_free_globals(mig); 2963 return (ret); 2964 } 2965 mutex_exit(&mig->mig_lock); 2966 return (ret); 2967 } 2968 2969 void 2970 nfs_free_mi4(mntinfo4_t *mi) 2971 { 2972 nfs4_open_owner_t *foop; 2973 nfs4_oo_hash_bucket_t *bucketp; 2974 nfs4_debug_msg_t *msgp; 2975 int i; 2976 servinfo4_t *svp; 2977 2978 mutex_enter(&mi->mi_lock); 2979 ASSERT(mi->mi_recovthread == NULL); 2980 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 2981 mutex_exit(&mi->mi_lock); 2982 mutex_enter(&mi->mi_async_lock); 2983 ASSERT(mi->mi_threads == 0); 2984 ASSERT(mi->mi_manager_thread == NULL); 2985 mutex_exit(&mi->mi_async_lock); 2986 svp = mi->mi_servers; 2987 sv4_free(svp); 2988 if (mi->mi_io_kstats) { 2989 kstat_delete(mi->mi_io_kstats); 2990 mi->mi_io_kstats = NULL; 2991 } 2992 if (mi->mi_ro_kstats) { 2993 kstat_delete(mi->mi_ro_kstats); 2994 mi->mi_ro_kstats = NULL; 2995 } 2996 if (mi->mi_recov_ksp) { 2997 kstat_delete(mi->mi_recov_ksp); 2998 mi->mi_recov_ksp = NULL; 2999 } 3000 mutex_enter(&mi->mi_msg_list_lock); 3001 while (msgp = list_head(&mi->mi_msg_list)) { 3002 list_remove(&mi->mi_msg_list, msgp); 3003 nfs4_free_msg(msgp); 3004 } 3005 mutex_exit(&mi->mi_msg_list_lock); 3006 list_destroy(&mi->mi_msg_list); 3007 if (mi->mi_rootfh != NULL) 3008 sfh4_rele(&mi->mi_rootfh); 3009 if (mi->mi_srvparentfh != NULL) 3010 sfh4_rele(&mi->mi_srvparentfh); 3011 mutex_destroy(&mi->mi_lock); 3012 mutex_destroy(&mi->mi_async_lock); 3013 mutex_destroy(&mi->mi_msg_list_lock); 3014 nfs_rw_destroy(&mi->mi_recovlock); 3015 nfs_rw_destroy(&mi->mi_rename_lock); 3016 nfs_rw_destroy(&mi->mi_fh_lock); 3017 cv_destroy(&mi->mi_failover_cv); 3018 cv_destroy(&mi->mi_async_reqs_cv); 3019 cv_destroy(&mi->mi_async_work_cv); 3020 cv_destroy(&mi->mi_async_cv); 3021 cv_destroy(&mi->mi_inact_req_cv); 3022 /* 3023 * Destroy the oo hash lists and mutexes for the cred hash table. 3024 */ 3025 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) { 3026 bucketp = &(mi->mi_oo_list[i]); 3027 /* Destroy any remaining open owners on the list */ 3028 foop = list_head(&bucketp->b_oo_hash_list); 3029 while (foop != NULL) { 3030 list_remove(&bucketp->b_oo_hash_list, foop); 3031 nfs4_destroy_open_owner(foop); 3032 foop = list_head(&bucketp->b_oo_hash_list); 3033 } 3034 list_destroy(&bucketp->b_oo_hash_list); 3035 mutex_destroy(&bucketp->b_lock); 3036 } 3037 /* 3038 * Empty and destroy the freed open owner list. 
3039 */ 3040 foop = list_head(&mi->mi_foo_list); 3041 while (foop != NULL) { 3042 list_remove(&mi->mi_foo_list, foop); 3043 nfs4_destroy_open_owner(foop); 3044 foop = list_head(&mi->mi_foo_list); 3045 } 3046 list_destroy(&mi->mi_foo_list); 3047 list_destroy(&mi->mi_bseqid_list); 3048 list_destroy(&mi->mi_lost_state); 3049 avl_destroy(&mi->mi_filehandles); 3050 fn_rele(&mi->mi_fname); 3051 kmem_free(mi, sizeof (*mi)); 3052 } 3053 void 3054 mi_hold(mntinfo4_t *mi) 3055 { 3056 atomic_add_32(&mi->mi_count, 1); 3057 ASSERT(mi->mi_count != 0); 3058 } 3059 3060 void 3061 mi_rele(mntinfo4_t *mi) 3062 { 3063 ASSERT(mi->mi_count != 0); 3064 if (atomic_add_32_nv(&mi->mi_count, -1) == 0) { 3065 nfs_free_mi4(mi); 3066 } 3067 } 3068 3069 vnode_t nfs4_xattr_notsupp_vnode; 3070 3071 void 3072 nfs4_clnt_init(void) 3073 { 3074 nfs4_vnops_init(); 3075 (void) nfs4_rnode_init(); 3076 (void) nfs4_shadow_init(); 3077 (void) nfs4_acache_init(); 3078 (void) nfs4_subr_init(); 3079 nfs4_acl_init(); 3080 nfs_idmap_init(); 3081 nfs4_callback_init(); 3082 nfs4_secinfo_init(); 3083 #ifdef DEBUG 3084 tsd_create(&nfs4_tsd_key, NULL); 3085 #endif 3086 3087 /* 3088 * Add a CPR callback so that we can update client 3089 * lease after a suspend and resume. 3090 */ 3091 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3092 3093 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3094 nfs4_mi_destroy); 3095 3096 /* 3097 * Initialise the reference count of the notsupp xattr cache vnode to 1 3098 * so that it never goes away (VOP_INACTIVE isn't called on it). 3099 */ 3100 nfs4_xattr_notsupp_vnode.v_count = 1; 3101 } 3102 3103 void 3104 nfs4_clnt_fini(void) 3105 { 3106 (void) zone_key_delete(mi4_list_key); 3107 nfs4_vnops_fini(); 3108 (void) nfs4_rnode_fini(); 3109 (void) nfs4_shadow_fini(); 3110 (void) nfs4_acache_fini(); 3111 (void) nfs4_subr_fini(); 3112 nfs_idmap_fini(); 3113 nfs4_callback_fini(); 3114 nfs4_secinfo_fini(); 3115 #ifdef DEBUG 3116 tsd_destroy(&nfs4_tsd_key); 3117 #endif 3118 if (cid) 3119 (void) callb_delete(cid); 3120 } 3121 3122 /*ARGSUSED*/ 3123 static boolean_t 3124 nfs4_client_cpr_callb(void *arg, int code) 3125 { 3126 /* 3127 * We get called for Suspend and Resume events. 3128 * For the suspend case we simply don't care! 3129 */ 3130 if (code == CB_CODE_CPR_CHKPT) { 3131 return (B_TRUE); 3132 } 3133 3134 /* 3135 * When we get to here we are in the process of 3136 * resuming the system from a previous suspend. 
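 *
 * All that is done here is to record the current time in
 * nfs4_client_resumed; the renew thread compares this timestamp
 * against each nfs4_server's last_renewal_time and forces a RENEW
 * on its next pass if the resume happened after the last renewal.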
3137 */ 3138 nfs4_client_resumed = gethrestime_sec(); 3139 return (B_TRUE); 3140 } 3141 3142 void 3143 nfs4_renew_lease_thread(nfs4_server_t *sp) 3144 { 3145 int error = 0; 3146 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs; 3147 clock_t tick_delay = 0; 3148 clock_t time_left = 0; 3149 callb_cpr_t cpr_info; 3150 kmutex_t cpr_lock; 3151 3152 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3153 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp)); 3154 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 3155 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease"); 3156 3157 mutex_enter(&sp->s_lock); 3158 /* sp->s_lease_time is set via a GETATTR */ 3159 sp->last_renewal_time = gethrestime_sec(); 3160 sp->lease_valid = NFS4_LEASE_UNINITIALIZED; 3161 ASSERT(sp->s_refcnt >= 1); 3162 3163 for (;;) { 3164 if (!sp->state_ref_count || 3165 sp->lease_valid != NFS4_LEASE_VALID) { 3166 3167 kip_secs = MAX((sp->s_lease_time >> 1) - 3168 (3 * sp->propagation_delay.tv_sec), 1); 3169 3170 tick_delay = SEC_TO_TICK(kip_secs); 3171 3172 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3173 "nfs4_renew_lease_thread: no renew : thread " 3174 "wait %ld secs", kip_secs)); 3175 3176 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3177 "nfs4_renew_lease_thread: no renew : " 3178 "state_ref_count %d, lease_valid %d", 3179 sp->state_ref_count, sp->lease_valid)); 3180 3181 mutex_enter(&cpr_lock); 3182 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3183 mutex_exit(&cpr_lock); 3184 time_left = cv_timedwait(&sp->cv_thread_exit, 3185 &sp->s_lock, tick_delay + lbolt); 3186 mutex_enter(&cpr_lock); 3187 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3188 mutex_exit(&cpr_lock); 3189 3190 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3191 "nfs4_renew_lease_thread: no renew: " 3192 "time left %ld", time_left)); 3193 3194 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3195 goto die; 3196 continue; 3197 } 3198 3199 tmp_last_renewal_time = sp->last_renewal_time; 3200 3201 tmp_time = gethrestime_sec() - sp->last_renewal_time + 3202 (3 * sp->propagation_delay.tv_sec); 3203 3204 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3205 "nfs4_renew_lease_thread: tmp_time %ld, " 3206 "sp->last_renewal_time %ld", tmp_time, 3207 sp->last_renewal_time)); 3208 3209 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1); 3210 3211 tick_delay = SEC_TO_TICK(kip_secs); 3212 3213 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3214 "nfs4_renew_lease_thread: valid lease: sleep for %ld " 3215 "secs", kip_secs)); 3216 3217 mutex_enter(&cpr_lock); 3218 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3219 mutex_exit(&cpr_lock); 3220 time_left = cv_timedwait(&sp->cv_thread_exit, &sp->s_lock, 3221 tick_delay + lbolt); 3222 mutex_enter(&cpr_lock); 3223 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3224 mutex_exit(&cpr_lock); 3225 3226 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3227 "nfs4_renew_lease_thread: valid lease: time left %ld :" 3228 "sp last_renewal_time %ld, nfs4_client_resumed %ld, " 3229 "tmp_last_renewal_time %ld", time_left, 3230 sp->last_renewal_time, nfs4_client_resumed, 3231 tmp_last_renewal_time)); 3232 3233 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3234 goto die; 3235 3236 if (tmp_last_renewal_time == sp->last_renewal_time || 3237 (nfs4_client_resumed != 0 && 3238 nfs4_client_resumed > sp->last_renewal_time)) { 3239 /* 3240 * Issue RENEW op since we haven't renewed the lease 3241 * since we slept. 3242 */ 3243 tmp_now_time = gethrestime_sec(); 3244 error = nfs4renew(sp); 3245 /* 3246 * Need to re-acquire sp's lock, nfs4renew() 3247 * relinqueshes it. 
3248 */ 3249 mutex_enter(&sp->s_lock); 3250 3251 /* 3252 * See if someone changed s_thread_exit while we gave 3253 * up s_lock. 3254 */ 3255 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3256 goto die; 3257 3258 if (!error) { 3259 /* 3260 * check to see if we implicitly renewed while 3261 * we waited for a reply for our RENEW call. 3262 */ 3263 if (tmp_last_renewal_time == 3264 sp->last_renewal_time) { 3265 /* no implicit renew came */ 3266 sp->last_renewal_time = tmp_now_time; 3267 } else { 3268 NFS4_DEBUG(nfs4_client_lease_debug, 3269 (CE_NOTE, "renew_thread: did " 3270 "implicit renewal before reply " 3271 "from server for RENEW")); 3272 } 3273 } else { 3274 /* figure out error */ 3275 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3276 "renew_thread: nfs4renew returned error" 3277 " %d", error)); 3278 } 3279 3280 } 3281 } 3282 3283 die: 3284 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3285 "nfs4_renew_lease_thread: thread exiting")); 3286 3287 while (sp->s_otw_call_count != 0) { 3288 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3289 "nfs4_renew_lease_thread: waiting for outstanding " 3290 "otw calls to finish for sp 0x%p, current " 3291 "s_otw_call_count %d", (void *)sp, 3292 sp->s_otw_call_count)); 3293 mutex_enter(&cpr_lock); 3294 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3295 mutex_exit(&cpr_lock); 3296 cv_wait(&sp->s_cv_otw_count, &sp->s_lock); 3297 mutex_enter(&cpr_lock); 3298 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3299 mutex_exit(&cpr_lock); 3300 } 3301 mutex_exit(&sp->s_lock); 3302 3303 nfs4_server_rele(sp); /* free the thread's reference */ 3304 nfs4_server_rele(sp); /* free the list's reference */ 3305 sp = NULL; 3306 3307 done: 3308 mutex_enter(&cpr_lock); 3309 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */ 3310 mutex_destroy(&cpr_lock); 3311 3312 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3313 "nfs4_renew_lease_thread: renew thread exit officially")); 3314 3315 zthread_exit(); 3316 /* NOT REACHED */ 3317 } 3318 3319 /* 3320 * Send out a RENEW op to the server. 3321 * Assumes sp is locked down. 
3322 */ 3323 static int 3324 nfs4renew(nfs4_server_t *sp) 3325 { 3326 COMPOUND4args_clnt args; 3327 COMPOUND4res_clnt res; 3328 nfs_argop4 argop[1]; 3329 int doqueue = 1; 3330 int rpc_error; 3331 cred_t *cr; 3332 mntinfo4_t *mi; 3333 timespec_t prop_time, after_time; 3334 int needrecov = FALSE; 3335 nfs4_recov_state_t recov_state; 3336 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3337 3338 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3339 3340 recov_state.rs_flags = 0; 3341 recov_state.rs_num_retry_despite_err = 0; 3342 3343 recov_retry: 3344 mi = sp->mntinfo4_list; 3345 VFS_HOLD(mi->mi_vfsp); 3346 mutex_exit(&sp->s_lock); 3347 ASSERT(mi != NULL); 3348 3349 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3350 if (e.error) { 3351 VFS_RELE(mi->mi_vfsp); 3352 return (e.error); 3353 } 3354 3355 /* Check to see if we're dealing with a marked-dead sp */ 3356 mutex_enter(&sp->s_lock); 3357 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3358 mutex_exit(&sp->s_lock); 3359 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3360 VFS_RELE(mi->mi_vfsp); 3361 return (0); 3362 } 3363 3364 /* Make sure mi hasn't changed on us */ 3365 if (mi != sp->mntinfo4_list) { 3366 /* Must drop sp's lock to avoid a recursive mutex enter */ 3367 mutex_exit(&sp->s_lock); 3368 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3369 VFS_RELE(mi->mi_vfsp); 3370 mutex_enter(&sp->s_lock); 3371 goto recov_retry; 3372 } 3373 mutex_exit(&sp->s_lock); 3374 3375 args.ctag = TAG_RENEW; 3376 3377 args.array_len = 1; 3378 args.array = argop; 3379 3380 argop[0].argop = OP_RENEW; 3381 3382 mutex_enter(&sp->s_lock); 3383 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3384 cr = sp->s_cred; 3385 crhold(cr); 3386 mutex_exit(&sp->s_lock); 3387 3388 ASSERT(cr != NULL); 3389 3390 /* used to figure out RTT for sp */ 3391 gethrestime(&prop_time); 3392 3393 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3394 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first", 3395 (void*)sp)); 3396 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3397 prop_time.tv_sec, prop_time.tv_nsec)); 3398 3399 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3400 mntinfo4_t *, mi); 3401 3402 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3403 crfree(cr); 3404 3405 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3406 mntinfo4_t *, mi); 3407 3408 gethrestime(&after_time); 3409 3410 mutex_enter(&sp->s_lock); 3411 sp->propagation_delay.tv_sec = 3412 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3413 mutex_exit(&sp->s_lock); 3414 3415 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3416 after_time.tv_sec, after_time.tv_nsec)); 3417 3418 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3419 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3420 nfs4_delegreturn_all(sp); 3421 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3422 VFS_RELE(mi->mi_vfsp); 3423 /* 3424 * If the server returns CB_PATH_DOWN, it has renewed 3425 * the lease and informed us that the callback path is 3426 * down. Since the lease is renewed, just return 0 and 3427 * let the renew thread proceed as normal. 
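 *
 * The nfs4_delegreturn_all() call above is what deals with the
 * broken callback path: since the server can no longer recall
 * delegations, the client returns all delegations held from this
 * server before carrying on.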
3428 */ 3429 return (0); 3430 } 3431 3432 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3433 if (!needrecov && e.error) { 3434 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3435 VFS_RELE(mi->mi_vfsp); 3436 return (e.error); 3437 } 3438 3439 rpc_error = e.error; 3440 3441 if (needrecov) { 3442 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3443 "nfs4renew: initiating recovery\n")); 3444 3445 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3446 OP_RENEW, NULL) == FALSE) { 3447 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3448 VFS_RELE(mi->mi_vfsp); 3449 if (!e.error) 3450 (void) xdr_free(xdr_COMPOUND4res_clnt, 3451 (caddr_t)&res); 3452 mutex_enter(&sp->s_lock); 3453 goto recov_retry; 3454 } 3455 /* fall through for res.status case */ 3456 } 3457 3458 if (res.status) { 3459 if (res.status == NFS4ERR_LEASE_MOVED) { 3460 /*EMPTY*/ 3461 /* 3462 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3463 * to renew the lease on that server 3464 */ 3465 } 3466 e.error = geterrno4(res.status); 3467 } 3468 3469 if (!rpc_error) 3470 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3471 3472 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3473 3474 VFS_RELE(mi->mi_vfsp); 3475 3476 return (e.error); 3477 } 3478 3479 void 3480 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3481 { 3482 nfs4_server_t *sp; 3483 3484 /* this locks down sp if it is found */ 3485 sp = find_nfs4_server(mi); 3486 3487 if (sp != NULL) { 3488 nfs4_inc_state_ref_count_nolock(sp, mi); 3489 mutex_exit(&sp->s_lock); 3490 nfs4_server_rele(sp); 3491 } 3492 } 3493 3494 /* 3495 * Bump the number of OPEN files (ie: those with state) so we know if this 3496 * nfs4_server has any state to maintain a lease for or not. 3497 * 3498 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 3499 */ 3500 void 3501 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3502 { 3503 ASSERT(mutex_owned(&sp->s_lock)); 3504 3505 sp->state_ref_count++; 3506 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3507 "nfs4_inc_state_ref_count: state_ref_count now %d", 3508 sp->state_ref_count)); 3509 3510 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3511 sp->lease_valid = NFS4_LEASE_VALID; 3512 3513 /* 3514 * If this call caused the lease to be marked valid and/or 3515 * took the state_ref_count from 0 to 1, then start the time 3516 * on lease renewal. 3517 */ 3518 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3519 sp->last_renewal_time = gethrestime_sec(); 3520 3521 /* update the number of open files for mi */ 3522 mi->mi_open_files++; 3523 } 3524 3525 void 3526 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3527 { 3528 nfs4_server_t *sp; 3529 3530 /* this locks down sp if it is found */ 3531 sp = find_nfs4_server_all(mi, 1); 3532 3533 if (sp != NULL) { 3534 nfs4_dec_state_ref_count_nolock(sp, mi); 3535 mutex_exit(&sp->s_lock); 3536 nfs4_server_rele(sp); 3537 } 3538 } 3539 3540 /* 3541 * Decrement the number of OPEN files (ie: those with state) so we know if 3542 * this nfs4_server has any state to maintain a lease for or not. 
3543 */ 3544 void 3545 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3546 { 3547 ASSERT(mutex_owned(&sp->s_lock)); 3548 ASSERT(sp->state_ref_count != 0); 3549 sp->state_ref_count--; 3550 3551 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3552 "nfs4_dec_state_ref_count: state ref count now %d", 3553 sp->state_ref_count)); 3554 3555 mi->mi_open_files--; 3556 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3557 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3558 mi->mi_open_files, mi->mi_flags)); 3559 3560 /* We don't have to hold the mi_lock to test mi_flags */ 3561 if (mi->mi_open_files == 0 && 3562 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3563 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3564 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3565 "we have closed the last open file", (void*)mi)); 3566 nfs4_remove_mi_from_server(mi, sp); 3567 } 3568 } 3569 3570 bool_t 3571 inlease(nfs4_server_t *sp) 3572 { 3573 bool_t result; 3574 3575 ASSERT(mutex_owned(&sp->s_lock)); 3576 3577 if (sp->lease_valid == NFS4_LEASE_VALID && 3578 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3579 result = TRUE; 3580 else 3581 result = FALSE; 3582 3583 return (result); 3584 } 3585 3586 3587 /* 3588 * Return non-zero if the given nfs4_server_t is going through recovery. 3589 */ 3590 3591 int 3592 nfs4_server_in_recovery(nfs4_server_t *sp) 3593 { 3594 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3595 } 3596 3597 /* 3598 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3599 * first is less than, equal to, or greater than the second. 3600 */ 3601 3602 int 3603 sfh4cmp(const void *p1, const void *p2) 3604 { 3605 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3606 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3607 3608 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3609 } 3610 3611 /* 3612 * Create a table for shared filehandle objects. 3613 */ 3614 3615 void 3616 sfh4_createtab(avl_tree_t *tab) 3617 { 3618 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3619 offsetof(nfs4_sharedfh_t, sfh_tree)); 3620 } 3621 3622 /* 3623 * Return a shared filehandle object for the given filehandle. The caller 3624 * is responsible for eventually calling sfh4_rele(). 3625 */ 3626 3627 nfs4_sharedfh_t * 3628 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3629 { 3630 nfs4_sharedfh_t *sfh, *nsfh; 3631 avl_index_t where; 3632 nfs4_sharedfh_t skey; 3633 3634 if (!key) { 3635 skey.sfh_fh = *fh; 3636 key = &skey; 3637 } 3638 3639 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3640 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3641 /* 3642 * We allocate the largest possible filehandle size because it's 3643 * not that big, and it saves us from possibly having to resize the 3644 * buffer later. 
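 *
 * If another thread raced us and an entry for this filehandle is
 * found under mi_fh_lock below, the existing object's reference
 * count is bumped and this speculative allocation is simply freed,
 * so the up-front NFS4_FHSIZE allocation costs at most one wasted
 * kmem_alloc()/kmem_free() pair.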
3645 */ 3646 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3647 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3648 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3649 nsfh->sfh_refcnt = 1; 3650 nsfh->sfh_flags = SFH4_IN_TREE; 3651 nsfh->sfh_mi = mi; 3652 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3653 (void *)nsfh)); 3654 3655 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3656 sfh = avl_find(&mi->mi_filehandles, key, &where); 3657 if (sfh != NULL) { 3658 mutex_enter(&sfh->sfh_lock); 3659 sfh->sfh_refcnt++; 3660 mutex_exit(&sfh->sfh_lock); 3661 nfs_rw_exit(&mi->mi_fh_lock); 3662 /* free our speculative allocs */ 3663 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3664 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3665 return (sfh); 3666 } 3667 3668 avl_insert(&mi->mi_filehandles, nsfh, where); 3669 nfs_rw_exit(&mi->mi_fh_lock); 3670 3671 return (nsfh); 3672 } 3673 3674 /* 3675 * Return a shared filehandle object for the given filehandle. The caller 3676 * is responsible for eventually calling sfh4_rele(). 3677 */ 3678 3679 nfs4_sharedfh_t * 3680 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3681 { 3682 nfs4_sharedfh_t *sfh; 3683 nfs4_sharedfh_t key; 3684 3685 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3686 3687 #ifdef DEBUG 3688 if (nfs4_sharedfh_debug) { 3689 nfs4_fhandle_t fhandle; 3690 3691 fhandle.fh_len = fh->nfs_fh4_len; 3692 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3693 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3694 nfs4_printfhandle(&fhandle); 3695 } 3696 #endif 3697 3698 /* 3699 * If there's already an object for the given filehandle, bump the 3700 * reference count and return it. Otherwise, create a new object 3701 * and add it to the AVL tree. 3702 */ 3703 3704 key.sfh_fh = *fh; 3705 3706 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3707 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3708 if (sfh != NULL) { 3709 mutex_enter(&sfh->sfh_lock); 3710 sfh->sfh_refcnt++; 3711 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3712 "sfh4_get: found existing %p, new refcnt=%d", 3713 (void *)sfh, sfh->sfh_refcnt)); 3714 mutex_exit(&sfh->sfh_lock); 3715 nfs_rw_exit(&mi->mi_fh_lock); 3716 return (sfh); 3717 } 3718 nfs_rw_exit(&mi->mi_fh_lock); 3719 3720 return (sfh4_put(fh, mi, &key)); 3721 } 3722 3723 /* 3724 * Get a reference to the given shared filehandle object. 3725 */ 3726 3727 void 3728 sfh4_hold(nfs4_sharedfh_t *sfh) 3729 { 3730 ASSERT(sfh->sfh_refcnt > 0); 3731 3732 mutex_enter(&sfh->sfh_lock); 3733 sfh->sfh_refcnt++; 3734 NFS4_DEBUG(nfs4_sharedfh_debug, 3735 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3736 (void *)sfh, sfh->sfh_refcnt)); 3737 mutex_exit(&sfh->sfh_lock); 3738 } 3739 3740 /* 3741 * Release a reference to the given shared filehandle object and null out 3742 * the given pointer. 3743 */ 3744 3745 void 3746 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3747 { 3748 mntinfo4_t *mi; 3749 nfs4_sharedfh_t *sfh = *sfhpp; 3750 3751 ASSERT(sfh->sfh_refcnt > 0); 3752 3753 mutex_enter(&sfh->sfh_lock); 3754 if (sfh->sfh_refcnt > 1) { 3755 sfh->sfh_refcnt--; 3756 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3757 "sfh4_rele %p, new refcnt=%d", 3758 (void *)sfh, sfh->sfh_refcnt)); 3759 mutex_exit(&sfh->sfh_lock); 3760 goto finish; 3761 } 3762 mutex_exit(&sfh->sfh_lock); 3763 3764 /* 3765 * Possibly the last reference, so get the lock for the table in 3766 * case it's time to remove the object from the table. 
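 *
 * mi_fh_lock must be taken before re-checking sfh_refcnt: between
 * dropping sfh_lock above and acquiring the table lock, another
 * thread may have found this object in mi_filehandles and taken a
 * new reference, in which case the object must not be destroyed.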
3767 */ 3768 mi = sfh->sfh_mi; 3769 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3770 mutex_enter(&sfh->sfh_lock); 3771 sfh->sfh_refcnt--; 3772 if (sfh->sfh_refcnt > 0) { 3773 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3774 "sfh4_rele %p, new refcnt=%d", 3775 (void *)sfh, sfh->sfh_refcnt)); 3776 mutex_exit(&sfh->sfh_lock); 3777 nfs_rw_exit(&mi->mi_fh_lock); 3778 goto finish; 3779 } 3780 3781 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3782 "sfh4_rele %p, last ref", (void *)sfh)); 3783 if (sfh->sfh_flags & SFH4_IN_TREE) { 3784 avl_remove(&mi->mi_filehandles, sfh); 3785 sfh->sfh_flags &= ~SFH4_IN_TREE; 3786 } 3787 mutex_exit(&sfh->sfh_lock); 3788 nfs_rw_exit(&mi->mi_fh_lock); 3789 mutex_destroy(&sfh->sfh_lock); 3790 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3791 kmem_free(sfh, sizeof (nfs4_sharedfh_t)); 3792 3793 finish: 3794 *sfhpp = NULL; 3795 } 3796 3797 /* 3798 * Update the filehandle for the given shared filehandle object. 3799 */ 3800 3801 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */ 3802 3803 void 3804 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh) 3805 { 3806 mntinfo4_t *mi = sfh->sfh_mi; 3807 nfs4_sharedfh_t *dupsfh; 3808 avl_index_t where; 3809 nfs4_sharedfh_t key; 3810 3811 #ifdef DEBUG 3812 mutex_enter(&sfh->sfh_lock); 3813 ASSERT(sfh->sfh_refcnt > 0); 3814 mutex_exit(&sfh->sfh_lock); 3815 #endif 3816 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE); 3817 3818 /* 3819 * The basic plan is to remove the shared filehandle object from 3820 * the table, update it to have the new filehandle, then reinsert 3821 * it. 3822 */ 3823 3824 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3825 mutex_enter(&sfh->sfh_lock); 3826 if (sfh->sfh_flags & SFH4_IN_TREE) { 3827 avl_remove(&mi->mi_filehandles, sfh); 3828 sfh->sfh_flags &= ~SFH4_IN_TREE; 3829 } 3830 mutex_exit(&sfh->sfh_lock); 3831 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len; 3832 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val, 3833 sfh->sfh_fh.nfs_fh4_len); 3834 3835 /* 3836 * XXX If there is already a shared filehandle object with the new 3837 * filehandle, we're in trouble, because the rnode code assumes 3838 * that there is only one shared filehandle object for a given 3839 * filehandle. So issue a warning (for read-write mounts only) 3840 * and don't try to re-insert the given object into the table. 3841 * Hopefully the given object will quickly go away and everyone 3842 * will use the new object. 3843 */ 3844 key.sfh_fh = *newfh; 3845 dupsfh = avl_find(&mi->mi_filehandles, &key, &where); 3846 if (dupsfh != NULL) { 3847 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) { 3848 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: " 3849 "duplicate filehandle detected"); 3850 sfh4_printfhandle(dupsfh); 3851 } 3852 } else { 3853 avl_insert(&mi->mi_filehandles, sfh, where); 3854 mutex_enter(&sfh->sfh_lock); 3855 sfh->sfh_flags |= SFH4_IN_TREE; 3856 mutex_exit(&sfh->sfh_lock); 3857 } 3858 nfs_rw_exit(&mi->mi_fh_lock); 3859 } 3860 3861 /* 3862 * Copy out the current filehandle for the given shared filehandle object. 
3863 */ 3864 3865 void 3866 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp) 3867 { 3868 mntinfo4_t *mi = sfh->sfh_mi; 3869 3870 ASSERT(sfh->sfh_refcnt > 0); 3871 3872 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3873 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len; 3874 ASSERT(fhp->fh_len <= NFS4_FHSIZE); 3875 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len); 3876 nfs_rw_exit(&mi->mi_fh_lock); 3877 } 3878 3879 /* 3880 * Print out the filehandle for the given shared filehandle object. 3881 */ 3882 3883 void 3884 sfh4_printfhandle(const nfs4_sharedfh_t *sfh) 3885 { 3886 nfs4_fhandle_t fhandle; 3887 3888 sfh4_copyval(sfh, &fhandle); 3889 nfs4_printfhandle(&fhandle); 3890 } 3891 3892 /* 3893 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0 3894 * if they're the same, +1 if the first is "greater" than the second. The 3895 * caller (or whoever's calling the AVL package) is responsible for 3896 * handling locking issues. 3897 */ 3898 3899 static int 3900 fncmp(const void *p1, const void *p2) 3901 { 3902 const nfs4_fname_t *f1 = p1; 3903 const nfs4_fname_t *f2 = p2; 3904 int res; 3905 3906 res = strcmp(f1->fn_name, f2->fn_name); 3907 /* 3908 * The AVL package wants +/-1, not arbitrary positive or negative 3909 * integers. 3910 */ 3911 if (res > 0) 3912 res = 1; 3913 else if (res < 0) 3914 res = -1; 3915 return (res); 3916 } 3917 3918 /* 3919 * Get or create an fname with the given name, as a child of the given 3920 * fname. The caller is responsible for eventually releasing the reference 3921 * (fn_rele()). parent may be NULL. 3922 */ 3923 3924 nfs4_fname_t * 3925 fn_get(nfs4_fname_t *parent, char *name) 3926 { 3927 nfs4_fname_t key; 3928 nfs4_fname_t *fnp; 3929 avl_index_t where; 3930 3931 key.fn_name = name; 3932 3933 /* 3934 * If there's already an fname registered with the given name, bump 3935 * its reference count and return it. Otherwise, create a new one 3936 * and add it to the parent's AVL tree. 3937 */ 3938 3939 if (parent != NULL) { 3940 mutex_enter(&parent->fn_lock); 3941 fnp = avl_find(&parent->fn_children, &key, &where); 3942 if (fnp != NULL) { 3943 fn_hold(fnp); 3944 mutex_exit(&parent->fn_lock); 3945 return (fnp); 3946 } 3947 } 3948 3949 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP); 3950 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL); 3951 fnp->fn_parent = parent; 3952 if (parent != NULL) 3953 fn_hold(parent); 3954 fnp->fn_len = strlen(name); 3955 ASSERT(fnp->fn_len < MAXNAMELEN); 3956 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP); 3957 (void) strcpy(fnp->fn_name, name); 3958 fnp->fn_refcnt = 1; 3959 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t), 3960 offsetof(nfs4_fname_t, fn_tree)); 3961 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3962 "fn_get %p:%s, a new nfs4_fname_t!", 3963 (void *)fnp, fnp->fn_name)); 3964 if (parent != NULL) { 3965 avl_insert(&parent->fn_children, fnp, where); 3966 mutex_exit(&parent->fn_lock); 3967 } 3968 3969 return (fnp); 3970 } 3971 3972 void 3973 fn_hold(nfs4_fname_t *fnp) 3974 { 3975 atomic_add_32(&fnp->fn_refcnt, 1); 3976 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3977 "fn_hold %p:%s, new refcnt=%d", 3978 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 3979 } 3980 3981 /* 3982 * Decrement the reference count of the given fname, and destroy it if its 3983 * reference count goes to zero. Nulls out the given pointer. 
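 *
 * The parent's fn_lock is taken before the last reference is
 * dropped so that a concurrent fn_get() of the same name cannot
 * find the child in fn_children while it is being torn down.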
3984 */ 3985 3986 void 3987 fn_rele(nfs4_fname_t **fnpp) 3988 { 3989 nfs4_fname_t *parent; 3990 uint32_t newref; 3991 nfs4_fname_t *fnp; 3992 3993 recur: 3994 fnp = *fnpp; 3995 *fnpp = NULL; 3996 3997 mutex_enter(&fnp->fn_lock); 3998 parent = fnp->fn_parent; 3999 if (parent != NULL) 4000 mutex_enter(&parent->fn_lock); /* prevent new references */ 4001 newref = atomic_add_32_nv(&fnp->fn_refcnt, -1); 4002 if (newref > 0) { 4003 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4004 "fn_rele %p:%s, new refcnt=%d", 4005 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 4006 if (parent != NULL) 4007 mutex_exit(&parent->fn_lock); 4008 mutex_exit(&fnp->fn_lock); 4009 return; 4010 } 4011 4012 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 4013 "fn_rele %p:%s, last reference, deleting...", 4014 (void *)fnp, fnp->fn_name)); 4015 if (parent != NULL) { 4016 avl_remove(&parent->fn_children, fnp); 4017 mutex_exit(&parent->fn_lock); 4018 } 4019 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4020 mutex_destroy(&fnp->fn_lock); 4021 avl_destroy(&fnp->fn_children); 4022 kmem_free(fnp, sizeof (nfs4_fname_t)); 4023 /* 4024 * Recursively fn_rele the parent. 4025 * Use goto instead of a recursive call to avoid stack overflow. 4026 */ 4027 if (parent != NULL) { 4028 fnpp = &parent; 4029 goto recur; 4030 } 4031 } 4032 4033 /* 4034 * Returns the single component name of the given fname, in a MAXNAMELEN 4035 * string buffer, which the caller is responsible for freeing. Note that 4036 * the name may become invalid as a result of fn_move(). 4037 */ 4038 4039 char * 4040 fn_name(nfs4_fname_t *fnp) 4041 { 4042 char *name; 4043 4044 ASSERT(fnp->fn_len < MAXNAMELEN); 4045 name = kmem_alloc(MAXNAMELEN, KM_SLEEP); 4046 mutex_enter(&fnp->fn_lock); 4047 (void) strcpy(name, fnp->fn_name); 4048 mutex_exit(&fnp->fn_lock); 4049 4050 return (name); 4051 } 4052 4053 4054 /* 4055 * fn_path_realloc 4056 * 4057 * This function, used only by fn_path, constructs 4058 * a new string which looks like "prepend" + "/" + "current", 4059 * by allocating a new string and freeing the old one. 4060 */ 4061 static void 4062 fn_path_realloc(char **curses, char *prepend) 4063 { 4064 int len, curlen = 0; 4065 char *news; 4066 4067 if (*curses == NULL) { 4068 /* 4069 * Prime the pump, allocate just the 4070 * space for prepend and return that. 4071 */ 4072 len = strlen(prepend) + 1; 4073 news = kmem_alloc(len, KM_SLEEP); 4074 (void) strncpy(news, prepend, len); 4075 } else { 4076 /* 4077 * Allocate the space for a new string 4078 * +1 +1 is for the "/" and the NULL 4079 * byte at the end of it all. 4080 */ 4081 curlen = strlen(*curses); 4082 len = curlen + strlen(prepend) + 1 + 1; 4083 news = kmem_alloc(len, KM_SLEEP); 4084 (void) strncpy(news, prepend, len); 4085 (void) strcat(news, "/"); 4086 (void) strcat(news, *curses); 4087 kmem_free(*curses, curlen + 1); 4088 } 4089 *curses = news; 4090 } 4091 4092 /* 4093 * Returns the path name (starting from the fs root) for the given fname. 4094 * The caller is responsible for freeing. Note that the path may be or 4095 * become invalid as a result of fn_move(). 4096 */ 4097 4098 char * 4099 fn_path(nfs4_fname_t *fnp) 4100 { 4101 char *path; 4102 nfs4_fname_t *nextfnp; 4103 4104 if (fnp == NULL) 4105 return (NULL); 4106 4107 path = NULL; 4108 4109 /* Walk up the tree, constructing the pathname.
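 * For example, given a chain of fnames  root -> "a" -> "b"  (the names
 * here are hypothetical), the loop builds "b", then "a/b", and finally
 * prepends the root fname's own name, calling fn_path_realloc() once
 * per level.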
*/ 4110 4111 fn_hold(fnp); /* adjust for later rele */ 4112 do { 4113 mutex_enter(&fnp->fn_lock); 4114 /* 4115 * Add fn_name in front of the current path 4116 */ 4117 fn_path_realloc(&path, fnp->fn_name); 4118 nextfnp = fnp->fn_parent; 4119 if (nextfnp != NULL) 4120 fn_hold(nextfnp); 4121 mutex_exit(&fnp->fn_lock); 4122 fn_rele(&fnp); 4123 fnp = nextfnp; 4124 } while (fnp != NULL); 4125 4126 return (path); 4127 } 4128 4129 /* 4130 * Return a reference to the parent of the given fname, which the caller is 4131 * responsible for eventually releasing. 4132 */ 4133 4134 nfs4_fname_t * 4135 fn_parent(nfs4_fname_t *fnp) 4136 { 4137 nfs4_fname_t *parent; 4138 4139 mutex_enter(&fnp->fn_lock); 4140 parent = fnp->fn_parent; 4141 if (parent != NULL) 4142 fn_hold(parent); 4143 mutex_exit(&fnp->fn_lock); 4144 4145 return (parent); 4146 } 4147 4148 /* 4149 * Update fnp so that its parent is newparent and its name is newname. 4150 */ 4151 4152 void 4153 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname) 4154 { 4155 nfs4_fname_t *parent, *tmpfnp; 4156 ssize_t newlen; 4157 nfs4_fname_t key; 4158 avl_index_t where; 4159 4160 /* 4161 * This assert exists to catch the client trying to rename 4162 * a dir to be a child of itself. This happened at a recent 4163 * bakeoff against a 3rd party (broken) server which allowed 4164 * the rename to succeed. If it trips, it means that either: 4165 * a) the code in nfs4rename that detects this case is broken, or 4166 * b) the server is broken (since it allowed the bogus rename). 4167 * 4168 * For non-DEBUG kernels, prepare for a recursive mutex_enter 4169 * panic below from: mutex_enter(&newparent->fn_lock); 4170 */ 4171 ASSERT(fnp != newparent); 4172 4173 /* 4174 * Remove fnp from its current parent, change its name, then add it 4175 * to newparent. 4176 */ 4177 mutex_enter(&fnp->fn_lock); 4178 parent = fnp->fn_parent; 4179 mutex_enter(&parent->fn_lock); 4180 avl_remove(&parent->fn_children, fnp); 4181 mutex_exit(&parent->fn_lock); 4182 fn_rele(&fnp->fn_parent); 4183 4184 newlen = strlen(newname); 4185 if (newlen != fnp->fn_len) { 4186 ASSERT(newlen < MAXNAMELEN); 4187 kmem_free(fnp->fn_name, fnp->fn_len + 1); 4188 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP); 4189 fnp->fn_len = newlen; 4190 } 4191 (void) strcpy(fnp->fn_name, newname); 4192 4193 again: 4194 mutex_enter(&newparent->fn_lock); 4195 key.fn_name = fnp->fn_name; 4196 tmpfnp = avl_find(&newparent->fn_children, &key, &where); 4197 if (tmpfnp != NULL) { 4198 /* 4199 * This could be due to a file that was unlinked while 4200 * open, or perhaps the rnode is in the free list. Remove 4201 * it from newparent and let it go away on its own. The 4202 * contorted code is to deal with lock order issues and 4203 * race conditions. 4204 */ 4205 fn_hold(tmpfnp); 4206 mutex_exit(&newparent->fn_lock); 4207 mutex_enter(&tmpfnp->fn_lock); 4208 if (tmpfnp->fn_parent == newparent) { 4209 mutex_enter(&newparent->fn_lock); 4210 avl_remove(&newparent->fn_children, tmpfnp); 4211 mutex_exit(&newparent->fn_lock); 4212 fn_rele(&tmpfnp->fn_parent); 4213 } 4214 mutex_exit(&tmpfnp->fn_lock); 4215 fn_rele(&tmpfnp); 4216 goto again; 4217 } 4218 fnp->fn_parent = newparent; 4219 fn_hold(newparent); 4220 avl_insert(&newparent->fn_children, fnp, where); 4221 mutex_exit(&newparent->fn_lock); 4222 mutex_exit(&fnp->fn_lock); 4223 } 4224 4225 #ifdef DEBUG 4226 /* 4227 * Return non-zero if the type information makes sense for the given vnode. 4228 * Otherwise panic.
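 * (The panic is conditional: cmn_err(CE_PANIC) fires only when the
 * nfs4_vtype_debug tunable is set and both v_type and the cached
 * va_type are known (non-VNON) yet disagree; otherwise the function
 * simply returns 1.)
 *
 * Since it returns 1 whenever it does not panic, a plausible use
 * (hypothetical caller, not taken from this file) is:
 *
 *	ASSERT(nfs4_consistent_type(vp));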
4229 */ 4230 int 4231 nfs4_consistent_type(vnode_t *vp) 4232 { 4233 rnode4_t *rp = VTOR4(vp); 4234 4235 if (nfs4_vtype_debug && vp->v_type != VNON && 4236 rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) { 4237 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, " 4238 "rnode attr type=%d", (void *)vp, vp->v_type, 4239 rp->r_attr.va_type); 4240 } 4241 4242 return (1); 4243 } 4244 #endif /* DEBUG */ 4245