1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 28 * All Rights Reserved 29 */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 #include <sys/param.h> 34 #include <sys/types.h> 35 #include <sys/systm.h> 36 #include <sys/thread.h> 37 #include <sys/t_lock.h> 38 #include <sys/time.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/errno.h> 42 #include <sys/buf.h> 43 #include <sys/stat.h> 44 #include <sys/cred.h> 45 #include <sys/kmem.h> 46 #include <sys/debug.h> 47 #include <sys/dnlc.h> 48 #include <sys/vmsystm.h> 49 #include <sys/flock.h> 50 #include <sys/share.h> 51 #include <sys/cmn_err.h> 52 #include <sys/tiuser.h> 53 #include <sys/sysmacros.h> 54 #include <sys/callb.h> 55 #include <sys/acl.h> 56 #include <sys/kstat.h> 57 #include <sys/signal.h> 58 #include <sys/disp.h> 59 #include <sys/atomic.h> 60 #include <sys/list.h> 61 #include <sys/sdt.h> 62 63 #include <rpc/types.h> 64 #include <rpc/xdr.h> 65 #include <rpc/auth.h> 66 #include <rpc/clnt.h> 67 68 #include <nfs/nfs.h> 69 #include <nfs/nfs_clnt.h> 70 #include <nfs/nfs_acl.h> 71 72 #include <nfs/nfs4.h> 73 #include <nfs/rnode4.h> 74 #include <nfs/nfs4_clnt.h> 75 76 #include <vm/hat.h> 77 #include <vm/as.h> 78 #include <vm/page.h> 79 #include <vm/pvn.h> 80 #include <vm/seg.h> 81 #include <vm/seg_map.h> 82 #include <vm/seg_vn.h> 83 84 #include <sys/ddi.h> 85 86 /* 87 * Arguments to page-flush thread. 88 */ 89 typedef struct { 90 vnode_t *vp; 91 cred_t *cr; 92 } pgflush_t; 93 94 #ifdef DEBUG 95 int nfs4_client_lease_debug; 96 int nfs4_sharedfh_debug; 97 int nfs4_fname_debug; 98 99 /* temporary: panic if v_type is inconsistent with r_attr va_type */ 100 int nfs4_vtype_debug; 101 102 uint_t nfs4_tsd_key; 103 #endif 104 105 static time_t nfs4_client_resumed = 0; 106 static callb_id_t cid = 0; 107 108 static int nfs4renew(nfs4_server_t *); 109 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int); 110 static void nfs4_pgflush_thread(pgflush_t *); 111 static void flush_pages(vnode_t *, cred_t *); 112 113 static boolean_t nfs4_client_cpr_callb(void *, int); 114 115 struct mi4_globals { 116 kmutex_t mig_lock; /* lock protecting mig_list */ 117 list_t mig_list; /* list of NFS v4 mounts in zone */ 118 boolean_t mig_destructor_called; 119 }; 120 121 static zone_key_t mi4_list_key; 122 123 /* 124 * Attributes caching: 125 * 126 * Attributes are cached in the rnode in struct vattr form. 127 * There is a time associated with the cached attributes (r_time_attr_inval) 128 * which tells whether the attributes are valid. 
The time is initialized 129 * to the difference between current time and the modify time of the vnode 130 * when new attributes are cached. This allows the attributes for 131 * files that have changed recently to be timed out sooner than for files 132 * that have not changed for a long time. There are minimum and maximum 133 * timeout values that can be set per mount point. 134 */ 135 136 /* 137 * If a cache purge is in progress, wait for it to finish. 138 * 139 * The current thread must not be in the middle of an 140 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock 141 * between this thread, a recovery thread, and the page flush thread. 142 */ 143 int 144 nfs4_waitfor_purge_complete(vnode_t *vp) 145 { 146 rnode4_t *rp; 147 k_sigset_t smask; 148 149 rp = VTOR4(vp); 150 if ((rp->r_serial != NULL && rp->r_serial != curthread) || 151 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) { 152 mutex_enter(&rp->r_statelock); 153 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 154 while ((rp->r_serial != NULL && rp->r_serial != curthread) || 155 ((rp->r_flags & R4PGFLUSH) && 156 rp->r_pgflush != curthread)) { 157 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 158 sigunintr(&smask); 159 mutex_exit(&rp->r_statelock); 160 return (EINTR); 161 } 162 } 163 sigunintr(&smask); 164 mutex_exit(&rp->r_statelock); 165 } 166 return (0); 167 } 168 169 /* 170 * Validate caches by checking cached attributes. If they have timed out, 171 * then get new attributes from the server. As a side effect, cache 172 * invalidation is done if the attributes have changed. 173 * 174 * If the attributes have not timed out and if there is a cache 175 * invalidation being done by some other thread, then wait until that 176 * thread has completed the cache invalidation. 177 */ 178 int 179 nfs4_validate_caches(vnode_t *vp, cred_t *cr) 180 { 181 int error; 182 nfs4_ga_res_t gar; 183 184 if (ATTRCACHE4_VALID(vp)) { 185 error = nfs4_waitfor_purge_complete(vp); 186 if (error) 187 return (error); 188 return (0); 189 } 190 191 gar.n4g_va.va_mask = AT_ALL; 192 return (nfs4_getattr_otw(vp, &gar, cr, 0)); 193 } 194 195 /* 196 * Fill in attribute from the cache. 197 * If valid, then return 0 to indicate that no error occurred, 198 * otherwise return 1 to indicate that an error occurred. 199 */ 200 static int 201 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap) 202 { 203 rnode4_t *rp; 204 205 rp = VTOR4(vp); 206 mutex_enter(&rp->r_statelock); 207 mutex_enter(&rp->r_statev4_lock); 208 if (ATTRCACHE4_VALID(vp)) { 209 mutex_exit(&rp->r_statev4_lock); 210 /* 211 * Cached attributes are valid 212 */ 213 *vap = rp->r_attr; 214 mutex_exit(&rp->r_statelock); 215 return (0); 216 } 217 mutex_exit(&rp->r_statev4_lock); 218 mutex_exit(&rp->r_statelock); 219 return (1); 220 } 221 222 223 /* 224 * If returned error is ESTALE flush all caches. The nfs4_purge_caches() 225 * call is synchronous because all the pages were invalidated by the 226 * nfs4_invalidate_pages() call. 
227 */ 228 void 229 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr) 230 { 231 struct rnode4 *rp = VTOR4(vp); 232 233 /* Ensure that the ..._end_op() call has been done */ 234 ASSERT(tsd_get(nfs4_tsd_key) == NULL); 235 236 if (errno != ESTALE) 237 return; 238 239 mutex_enter(&rp->r_statelock); 240 rp->r_flags |= R4STALE; 241 if (!rp->r_error) 242 rp->r_error = errno; 243 mutex_exit(&rp->r_statelock); 244 if (nfs4_has_pages(vp)) 245 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 246 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE); 247 } 248 249 /* 250 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the 251 * page purge is done asynchronously. 252 */ 253 void 254 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg) 255 { 256 rnode4_t *rp; 257 char *contents; 258 vnode_t *xattr; 259 int size; 260 int pgflush; /* are we the page flush thread? */ 261 262 /* 263 * Purge the DNLC for any entries which refer to this file. 264 */ 265 if (vp->v_count > 1 && 266 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC)) 267 dnlc_purge_vp(vp); 268 269 /* 270 * Clear any readdir state bits and purge the readlink response cache. 271 */ 272 rp = VTOR4(vp); 273 mutex_enter(&rp->r_statelock); 274 rp->r_flags &= ~R4LOOKUP; 275 contents = rp->r_symlink.contents; 276 size = rp->r_symlink.size; 277 rp->r_symlink.contents = NULL; 278 279 xattr = rp->r_xattr_dir; 280 rp->r_xattr_dir = NULL; 281 282 /* 283 * Purge pathconf cache too. 284 */ 285 rp->r_pathconf.pc4_xattr_valid = 0; 286 rp->r_pathconf.pc4_cache_valid = 0; 287 288 pgflush = (curthread == rp->r_pgflush); 289 mutex_exit(&rp->r_statelock); 290 291 if (contents != NULL) { 292 293 kmem_free((void *)contents, size); 294 } 295 296 if (xattr != NULL) 297 VN_RELE(xattr); 298 299 /* 300 * Flush the page cache. If the current thread is the page flush 301 * thread, don't initiate a new page flush. There's no need for 302 * it, and doing it correctly is hard. 303 */ 304 if (nfs4_has_pages(vp) && !pgflush) { 305 if (!asyncpg) { 306 (void) nfs4_waitfor_purge_complete(vp); 307 flush_pages(vp, cr); 308 } else { 309 pgflush_t *args; 310 311 /* 312 * We don't hold r_statelock while creating the 313 * thread, in case the call blocks. So we use a 314 * flag to indicate that a page flush thread is 315 * active. 316 */ 317 mutex_enter(&rp->r_statelock); 318 if (rp->r_flags & R4PGFLUSH) { 319 mutex_exit(&rp->r_statelock); 320 } else { 321 rp->r_flags |= R4PGFLUSH; 322 mutex_exit(&rp->r_statelock); 323 324 args = kmem_alloc(sizeof (pgflush_t), 325 KM_SLEEP); 326 args->vp = vp; 327 VN_HOLD(args->vp); 328 args->cr = cr; 329 crhold(args->cr); 330 (void) zthread_create(NULL, 0, 331 nfs4_pgflush_thread, args, 0, 332 minclsyspri); 333 } 334 } 335 } 336 337 /* 338 * Flush the readdir response cache. 339 */ 340 nfs4_purge_rddir_cache(vp); 341 } 342 343 /* 344 * Invalidate all pages for the given file, after writing back the dirty 345 * ones. 346 */ 347 348 static void 349 flush_pages(vnode_t *vp, cred_t *cr) 350 { 351 int error; 352 rnode4_t *rp = VTOR4(vp); 353 354 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr); 355 if (error == ENOSPC || error == EDQUOT) { 356 mutex_enter(&rp->r_statelock); 357 if (!rp->r_error) 358 rp->r_error = error; 359 mutex_exit(&rp->r_statelock); 360 } 361 } 362 363 /* 364 * Page flush thread. 
365 */ 366 367 static void 368 nfs4_pgflush_thread(pgflush_t *args) 369 { 370 rnode4_t *rp = VTOR4(args->vp); 371 372 /* remember which thread we are, so we don't deadlock ourselves */ 373 mutex_enter(&rp->r_statelock); 374 ASSERT(rp->r_pgflush == NULL); 375 rp->r_pgflush = curthread; 376 mutex_exit(&rp->r_statelock); 377 378 flush_pages(args->vp, args->cr); 379 380 mutex_enter(&rp->r_statelock); 381 rp->r_pgflush = NULL; 382 rp->r_flags &= ~R4PGFLUSH; 383 cv_broadcast(&rp->r_cv); 384 mutex_exit(&rp->r_statelock); 385 386 VN_RELE(args->vp); 387 crfree(args->cr); 388 kmem_free(args, sizeof (pgflush_t)); 389 zthread_exit(); 390 } 391 392 /* 393 * Purge the readdir cache of all entries which are not currently 394 * being filled. 395 */ 396 void 397 nfs4_purge_rddir_cache(vnode_t *vp) 398 { 399 rnode4_t *rp; 400 401 rp = VTOR4(vp); 402 403 mutex_enter(&rp->r_statelock); 404 rp->r_direof = NULL; 405 rp->r_flags &= ~R4LOOKUP; 406 rp->r_flags |= R4READDIRWATTR; 407 rddir4_cache_purge(rp); 408 mutex_exit(&rp->r_statelock); 409 } 410 411 /* 412 * Set attributes cache for given vnode using virtual attributes. There is 413 * no cache validation, but if the attributes are deemed to be stale, they 414 * are ignored. This corresponds to nfs3_attrcache(). 415 * 416 * Set the timeout value on the attribute cache and fill it 417 * with the passed in attributes. 418 */ 419 void 420 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t) 421 { 422 rnode4_t *rp = VTOR4(vp); 423 424 mutex_enter(&rp->r_statelock); 425 if (rp->r_time_attr_saved <= t) 426 nfs4_attrcache_va(vp, garp, FALSE); 427 mutex_exit(&rp->r_statelock); 428 } 429 430 /* 431 * Use the passed in virtual attributes to check to see whether the 432 * data and metadata caches are valid, cache the new attributes, and 433 * then do the cache invalidation if required. 434 * 435 * The cache validation and caching of the new attributes is done 436 * atomically via the use of the mutex, r_statelock. If required, 437 * the cache invalidation is done atomically w.r.t. the cache 438 * validation and caching of the attributes via the pseudo lock, 439 * r_serial. 440 * 441 * This routine is used to do cache validation and attributes caching 442 * for operations with a single set of post operation attributes. 443 */ 444 445 void 446 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp, 447 hrtime_t t, cred_t *cr, int async, 448 change_info4 *cinfo) 449 { 450 rnode4_t *rp; 451 int mtime_changed; 452 int ctime_changed; 453 vsecattr_t *vsp; 454 int was_serial, set_time_cache_inval, recov; 455 vattr_t *vap = &garp->n4g_va; 456 mntinfo4_t *mi = VTOMI4(vp); 457 458 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid); 459 460 /* Is curthread the recovery thread? */ 461 mutex_enter(&mi->mi_lock); 462 recov = (VTOMI4(vp)->mi_recovthread == curthread); 463 mutex_exit(&mi->mi_lock); 464 465 rp = VTOR4(vp); 466 mutex_enter(&rp->r_statelock); 467 was_serial = (rp->r_serial == curthread); 468 if (rp->r_serial && !was_serial) { 469 klwp_t *lwp = ttolwp(curthread); 470 471 /* 472 * If we're the recovery thread, then purge current attrs 473 * and bail out to avoid potential deadlock between another 474 * thread caching attrs (r_serial thread), recov thread, 475 * and an async writer thread. 
 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Newer attributes have been cached since these attributes
		 * were generated, so don't act on them.
		 */
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and the directory didn't change on the server before the
		 * dirmod op, and it didn't change after the dirmod op but
		 * before the getattr, then there's a chance that the client's
		 * cached data for this object is current (not stale).  No
		 * immediate cache flush is required.
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;

			mtime_changed = 0;
			ctime_changed = 0;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do: flush all caches for the directory and
			 * remove the attribute timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;
			else
				mtime_changed = 0;
			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;
			else
				ctime_changed = 0;
		} else {
			mtime_changed = 0;
			ctime_changed = 0;
		}
	}

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);
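
	/*
	 * Reader's note (illustrative sketch only, not executed): for the
	 * directory-modifying case above, "keep the caches" boils down to
	 * roughly this predicate, assuming an atomic change_info4:
	 *
	 *	keep = (rp->r_time_cache_inval == 0 ||
	 *	    t < rp->r_time_cache_inval) &&
	 *	    cinfo->before == rp->r_change &&
	 *	    garp->n4g_change_valid &&
	 *	    cinfo->after == garp->n4g_change;
	 *
	 * If any clause fails, both mtime_changed and ctime_changed are set
	 * and r_time_cache_inval is cleared, which forces a full flush below.
	 */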

	/*
	 * If we're the recov thread, then force an asynchronous
	 * nfs4_purge_caches() to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
static void
nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	hrtime_t delta;
	hrtime_t now;
	vattr_t *vap = &garp->n4g_va;

	rp = VTOR4(vp);

	ASSERT(MUTEX_HELD(&rp->r_statelock));
	ASSERT(vap->va_mask == AT_ALL);

	/* Switch to master before checking v_flag */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	now = gethrtime();

	mi = VTOMI4(vp);

	/*
	 * Only establish a new cache timeout (if requested).  Never
	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
	 * is done by nfs4_update_dircaches (ancestor in our call chain).
	 */
	if (set_cache_timeout && ! rp->r_time_cache_inval)
		rp->r_time_cache_inval = now + mi->mi_acdirmax;

	/*
	 * Delta is the number of nanoseconds that we will
	 * cache the attributes of the file.  It is based on
	 * the number of nanoseconds since the last time that
	 * we detected a change.  The assumption is that files
	 * that changed recently are likely to change again.
	 * There are enforced minimum and maximum values for
	 * regular files and for directories.
	 *
	 * Using the time since the last change was detected
	 * eliminates direct comparison or calculation
	 * using mixed client and server times.  NFS does
	 * not make any assumptions regarding the client
	 * and server clocks being synchronized.
	 */
	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    vap->va_size != rp->r_attr.va_size) {
		rp->r_time_attr_saved = now;
	}

	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_time_attr_saved;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_time_attr_inval = now + delta;

	rp->r_attr = *vap;
	if (garp->n4g_change_valid)
		rp->r_change = garp->n4g_change;
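
	/*
	 * Worked example (reader's note, assuming the common mount defaults
	 * of acregmin=3s and acregmax=60s): if a regular file was last seen
	 * to change 10 seconds ago, delta is 10 seconds and the new
	 * attributes are trusted for 10 seconds; if it last changed an hour
	 * ago, delta is clamped to acregmax and they are trusted for 60
	 * seconds; with MI4_NOAC or VNOCACHE set, delta is 0 and they are
	 * not cached at all.
	 */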

	/*
	 * The attributes that were returned may be valid and can
	 * be used, but they may not be allowed to be cached.
	 * Reset the timers to cause immediate invalidation and
	 * clear r_change so no VERIFY operations will succeed.
	 */
	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
		rp->r_time_attr_inval = now;
		rp->r_time_attr_saved = now;
		rp->r_change = 0;
	}

	/*
	 * If mounted_on_fileid was returned AND the object is a stub,
	 * then set the object's va_nodeid to the mounted-over fid
	 * returned by the server.
	 *
	 * If mounted_on_fileid is not provided/supported, then
	 * just set it to 0 for now.  Eventually it would be
	 * better to set it to a hashed version of FH.  This
	 * would probably be good enough to provide a unique
	 * fid/d_ino within a dir.
	 *
	 * We don't need to carry mounted_on_fileid in the
	 * rnode as long as the client never requests fileid
	 * without also requesting mounted_on_fileid.  For
	 * now, it stays.
	 */
	if (garp->n4g_mon_fid_valid) {
		rp->r_mntd_fid = garp->n4g_mon_fid;

		if (rp->r_flags & R4SRVSTUB)
			rp->r_attr.va_nodeid = rp->r_mntd_fid;
	}

	/*
	 * Check to see if there are valid pathconf bits to
	 * cache in the rnode.
	 */
	if (garp->n4g_ext_res) {
		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
		} else {
			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
				rp->r_pathconf.pc4_xattr_valid = TRUE;
				rp->r_pathconf.pc4_xattr_exists =
				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
			}
		}
	}
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != vap->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
		rp->r_size = vap->va_size;
	}
	nfs_setswaplike(vp, vap);
	rp->r_flags &= ~R4WRITEMODIFIED;
}

/*
 * Get attributes over-the-wire and update the attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
{
	mntinfo4_t *mi = VTOMI4(vp);
	hrtime_t t;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
	    &recov_state, NULL))) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		return (e.error);
	}

	t = gethrtime();

	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);

	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_GETATTR, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
			    &recov_state, 1);
			goto recov_retry;
		}
	}

	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);

	if (!e.error) {
		if (e.stat == NFS4_OK) {
			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
		} else {
			e.error = geterrno4(e.stat);

			nfs4_purge_stale_fh(e.error, vp, cr);
		}
	}
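
	/*
	 * Reader's note (descriptive only): the code above is the standard
	 * recovery shape used throughout the v4 client -- bracket the OTW
	 * call with nfs4_start_fop()/nfs4_end_fop(), and if
	 * nfs4_needs_recovery() reports a recoverable failure and
	 * nfs4_start_recovery() does not abort, close the bracket with the
	 * needrecov argument set and retry the whole compound from
	 * recov_retry.  nfs4_attr_otw() below follows the same shape.
	 */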

	/*
	 * If getattr was issued on a node that is a stub for a crossed
	 * mount point, keep the original secinfo flavor for
	 * the current file system, not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	return (e.error);
}

/*
 * Generate a compound to get attributes over-the-wire.
 */
void
nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
    nfs4_error_t *ep, cred_t *cr, int get_acl)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	rnode4_t *rp = VTOR4(vp);
	nfs_argop4 argop[2];

	args.ctag = TAG_GETATTR;

	args.array_len = 2;
	args.array = argop;

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr */
	/*
	 * Unlike NFS versions 2 and 3, where getattr returns all the
	 * attributes, NFS version 4 returns only the ones explicitly
	 * asked for.  This creates problems, as some system functions
	 * (e.g. cache check) require certain attributes, and if the
	 * cached node lacks some attributes such as uid/gid, it can
	 * affect system utilities (e.g. "ls") that rely on the information
	 * being there.  This can lead to anything from system crashes to
	 * corrupted information processed by user apps.
	 * So to ensure that all bases are covered, request at least
	 * the AT_ALL attribute mask.
	 */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	if (get_acl)
		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	doqueue = 1;

	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);

	if (ep->error)
		return;

	if (res.status != NFS4_OK) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}

/*
 * Return either cached or remote attributes.  If we get the remote
 * attributes, use them to check and invalidate the caches, then cache
 * the new attributes.
 */
int
nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
{
	int error;
	rnode4_t *rp;
	nfs4_ga_res_t gar;

	ASSERT(nfs4_consistent_type(vp));

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
907 */ 908 rp = VTOR4(vp); 909 mutex_enter(&rp->r_statelock); 910 mutex_enter(&rp->r_statev4_lock); 911 if (ATTRCACHE4_VALID(vp)) { 912 mutex_exit(&rp->r_statev4_lock); 913 /* 914 * Cached attributes are valid 915 * Return the client's view of file size 916 */ 917 *vap = rp->r_attr; 918 vap->va_size = rp->r_size; 919 mutex_exit(&rp->r_statelock); 920 921 ASSERT(nfs4_consistent_type(vp)); 922 923 return (0); 924 } 925 mutex_exit(&rp->r_statev4_lock); 926 mutex_exit(&rp->r_statelock); 927 928 error = nfs4_getattr_otw(vp, &gar, cr, 0); 929 if (!error) 930 *vap = gar.n4g_va; 931 932 /* Return the client's view of file size */ 933 mutex_enter(&rp->r_statelock); 934 vap->va_size = rp->r_size; 935 mutex_exit(&rp->r_statelock); 936 937 ASSERT(nfs4_consistent_type(vp)); 938 939 return (error); 940 } 941 942 int 943 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type, 944 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr) 945 { 946 COMPOUND4args_clnt args; 947 COMPOUND4res_clnt res; 948 int doqueue; 949 nfs_argop4 argop[2]; 950 mntinfo4_t *mi = VTOMI4(vp); 951 bool_t needrecov = FALSE; 952 nfs4_recov_state_t recov_state; 953 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 954 nfs4_ga_ext_res_t *gerp; 955 956 recov_state.rs_flags = 0; 957 recov_state.rs_num_retry_despite_err = 0; 958 959 recov_retry: 960 args.ctag = tag_type; 961 962 args.array_len = 2; 963 args.array = argop; 964 965 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL); 966 if (e.error) 967 return (e.error); 968 969 /* putfh */ 970 argop[0].argop = OP_CPUTFH; 971 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 972 973 /* getattr */ 974 argop[1].argop = OP_GETATTR; 975 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap; 976 argop[1].nfs_argop4_u.opgetattr.mi = mi; 977 978 doqueue = 1; 979 980 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 981 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first", 982 rnode4info(VTOR4(vp)))); 983 984 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 985 986 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 987 if (!needrecov && e.error) { 988 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 989 needrecov); 990 return (e.error); 991 } 992 993 if (needrecov) { 994 bool_t abort; 995 996 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 997 "nfs4_attr_otw: initiating recovery\n")); 998 999 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 1000 NULL, OP_GETATTR, NULL); 1001 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1002 needrecov); 1003 if (!e.error) { 1004 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1005 e.error = geterrno4(res.status); 1006 } 1007 if (abort == FALSE) 1008 goto recov_retry; 1009 return (e.error); 1010 } 1011 1012 if (res.status) { 1013 e.error = geterrno4(res.status); 1014 } else { 1015 gerp = garp->n4g_ext_res; 1016 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res, 1017 garp, sizeof (nfs4_ga_res_t)); 1018 garp->n4g_ext_res = gerp; 1019 if (garp->n4g_ext_res && 1020 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res) 1021 bcopy(res.array[1].nfs_resop4_u.opgetattr. 1022 ga_res.n4g_ext_res, 1023 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t)); 1024 } 1025 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1026 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1027 needrecov); 1028 return (e.error); 1029 } 1030 1031 /* 1032 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1033 * for the demand-based allocation of async threads per-mount. 
The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs4_async_putpage and nfs4_async_start.
 */

static void	nfs4_async_start(struct vfs *);

static void
free_async_args4(struct nfs4_async_reqs *args)
{
	rnode4_t *rp;

	if (args->a_io != NFS4_INACTIVE) {
		rp = VTOR4(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS4_PUTAPAGE ||
		    args->a_io == NFS4_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done.  It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 */
void
nfs4_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo4_t *mi;
	uint_t max_threads;

	mi = VFTOMI4(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	mutex_enter(&mi->mi_lock);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount is really going away.
	 *
	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
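	 *
	 * Reader's note (descriptive summary only): the loop below sleeps on
	 * mi_async_reqs_cv, creates worker zthreads on demand up to
	 * MAX(mi_max_threads, max_threads), and wakes a worker via
	 * mi_async_work_cv for each queued request it accounts for in
	 * mi_async_req_count.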
1118 */ 1119 while (!(mi->mi_flags & MI4_ASYNC_MGR_STOP) || 1120 mi->mi_async_req_count > 0) { 1121 mutex_exit(&mi->mi_lock); 1122 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1123 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock); 1124 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1125 while (mi->mi_async_req_count > 0) { 1126 /* 1127 * Paranoia: If the mount started out having 1128 * (mi->mi_max_threads == 0), and the value was 1129 * later changed (via a debugger or somesuch), 1130 * we could be confused since we will think we 1131 * can't create any threads, and the calling 1132 * code (which looks at the current value of 1133 * mi->mi_max_threads, now non-zero) thinks we 1134 * can. 1135 * 1136 * So, because we're paranoid, we create threads 1137 * up to the maximum of the original and the 1138 * current value. This means that future 1139 * (debugger-induced) alterations of 1140 * mi->mi_max_threads are ignored for our 1141 * purposes, but who told them they could change 1142 * random values on a live kernel anyhow? 1143 */ 1144 if (mi->mi_threads < 1145 MAX(mi->mi_max_threads, max_threads)) { 1146 mi->mi_threads++; 1147 mutex_exit(&mi->mi_async_lock); 1148 MI4_HOLD(mi); 1149 VFS_HOLD(vfsp); /* hold for new thread */ 1150 (void) zthread_create(NULL, 0, nfs4_async_start, 1151 vfsp, 0, minclsyspri); 1152 mutex_enter(&mi->mi_async_lock); 1153 } 1154 cv_signal(&mi->mi_async_work_cv); 1155 ASSERT(mi->mi_async_req_count != 0); 1156 mi->mi_async_req_count--; 1157 } 1158 mutex_enter(&mi->mi_lock); 1159 } 1160 mutex_exit(&mi->mi_lock); 1161 1162 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 1163 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp)); 1164 /* 1165 * Let everyone know we're done. 1166 */ 1167 mi->mi_manager_thread = NULL; 1168 /* 1169 * Wake up the inactive thread. 1170 */ 1171 cv_broadcast(&mi->mi_inact_req_cv); 1172 /* 1173 * Wake up anyone sitting in nfs4_async_manager_stop() 1174 */ 1175 cv_broadcast(&mi->mi_async_cv); 1176 /* 1177 * There is no explicit call to mutex_exit(&mi->mi_async_lock) 1178 * since CALLB_CPR_EXIT is actually responsible for releasing 1179 * 'mi_async_lock'. 1180 */ 1181 CALLB_CPR_EXIT(&cprinfo); 1182 VFS_RELE(vfsp); /* release thread's hold */ 1183 MI4_RELE(mi); 1184 zthread_exit(); 1185 } 1186 1187 /* 1188 * Signal (and wait for) the async manager thread to clean up and go away. 1189 */ 1190 void 1191 nfs4_async_manager_stop(vfs_t *vfsp) 1192 { 1193 mntinfo4_t *mi = VFTOMI4(vfsp); 1194 1195 mutex_enter(&mi->mi_async_lock); 1196 mutex_enter(&mi->mi_lock); 1197 mi->mi_flags |= MI4_ASYNC_MGR_STOP; 1198 mutex_exit(&mi->mi_lock); 1199 cv_broadcast(&mi->mi_async_reqs_cv); 1200 /* 1201 * Wait for the async manager thread to die. 1202 */ 1203 while (mi->mi_manager_thread != NULL) 1204 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1205 mutex_exit(&mi->mi_async_lock); 1206 } 1207 1208 int 1209 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1210 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1211 u_offset_t, caddr_t, struct seg *, cred_t *)) 1212 { 1213 rnode4_t *rp; 1214 mntinfo4_t *mi; 1215 struct nfs4_async_reqs *args; 1216 1217 rp = VTOR4(vp); 1218 ASSERT(rp->r_freef == NULL); 1219 1220 mi = VTOMI4(vp); 1221 1222 /* 1223 * If addr falls in a different segment, don't bother doing readahead. 1224 */ 1225 if (addr >= seg->s_base + seg->s_size) 1226 return (-1); 1227 1228 /* 1229 * If we can't allocate a request structure, punt on the readahead. 
1230 */ 1231 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1232 return (-1); 1233 1234 /* 1235 * If a lock operation is pending, don't initiate any new 1236 * readaheads. Otherwise, bump r_count to indicate the new 1237 * asynchronous I/O. 1238 */ 1239 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1240 kmem_free(args, sizeof (*args)); 1241 return (-1); 1242 } 1243 mutex_enter(&rp->r_statelock); 1244 rp->r_count++; 1245 mutex_exit(&rp->r_statelock); 1246 nfs_rw_exit(&rp->r_lkserlock); 1247 1248 args->a_next = NULL; 1249 #ifdef DEBUG 1250 args->a_queuer = curthread; 1251 #endif 1252 VN_HOLD(vp); 1253 args->a_vp = vp; 1254 ASSERT(cr != NULL); 1255 crhold(cr); 1256 args->a_cred = cr; 1257 args->a_io = NFS4_READ_AHEAD; 1258 args->a_nfs4_readahead = readahead; 1259 args->a_nfs4_blkoff = blkoff; 1260 args->a_nfs4_seg = seg; 1261 args->a_nfs4_addr = addr; 1262 1263 mutex_enter(&mi->mi_async_lock); 1264 1265 /* 1266 * If asyncio has been disabled, don't bother readahead. 1267 */ 1268 if (mi->mi_max_threads == 0) { 1269 mutex_exit(&mi->mi_async_lock); 1270 goto noasync; 1271 } 1272 1273 /* 1274 * Link request structure into the async list and 1275 * wakeup async thread to do the i/o. 1276 */ 1277 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) { 1278 mi->mi_async_reqs[NFS4_READ_AHEAD] = args; 1279 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1280 } else { 1281 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args; 1282 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1283 } 1284 1285 if (mi->mi_io_kstats) { 1286 mutex_enter(&mi->mi_lock); 1287 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1288 mutex_exit(&mi->mi_lock); 1289 } 1290 1291 mi->mi_async_req_count++; 1292 ASSERT(mi->mi_async_req_count != 0); 1293 cv_signal(&mi->mi_async_reqs_cv); 1294 mutex_exit(&mi->mi_async_lock); 1295 return (0); 1296 1297 noasync: 1298 mutex_enter(&rp->r_statelock); 1299 rp->r_count--; 1300 cv_broadcast(&rp->r_cv); 1301 mutex_exit(&rp->r_statelock); 1302 VN_RELE(vp); 1303 crfree(cr); 1304 kmem_free(args, sizeof (*args)); 1305 return (-1); 1306 } 1307 1308 /* 1309 * The async queues for each mounted file system are arranged as a 1310 * set of queues, one for each async i/o type. Requests are taken 1311 * from the queues in a round-robin fashion. A number of consecutive 1312 * requests are taken from each queue before moving on to the next 1313 * queue. This functionality may allow the NFS Version 2 server to do 1314 * write clustering, even if the client is mixing writes and reads 1315 * because it will take multiple write requests from the queue 1316 * before processing any of the other async i/o types. 1317 * 1318 * XXX The nfs4_async_start thread is unsafe in the light of the present 1319 * model defined by cpr to suspend the system. Specifically over the 1320 * wire calls are cpr-unsafe. The thread should be reevaluated in 1321 * case of future updates to the cpr model. 1322 */ 1323 static void 1324 nfs4_async_start(struct vfs *vfsp) 1325 { 1326 struct nfs4_async_reqs *args; 1327 mntinfo4_t *mi = VFTOMI4(vfsp); 1328 clock_t time_left = 1; 1329 callb_cpr_t cprinfo; 1330 int i; 1331 extern int nfs_async_timeout; 1332 1333 /* 1334 * Dynamic initialization of nfs_async_timeout to allow nfs to be 1335 * built in an implementation independent manner. 
1336 */ 1337 if (nfs_async_timeout == -1) 1338 nfs_async_timeout = NFS_ASYNC_TIMEOUT; 1339 1340 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas"); 1341 1342 mutex_enter(&mi->mi_async_lock); 1343 for (;;) { 1344 /* 1345 * Find the next queue containing an entry. We start 1346 * at the current queue pointer and then round robin 1347 * through all of them until we either find a non-empty 1348 * queue or have looked through all of them. 1349 */ 1350 for (i = 0; i < NFS4_ASYNC_TYPES; i++) { 1351 args = *mi->mi_async_curr; 1352 if (args != NULL) 1353 break; 1354 mi->mi_async_curr++; 1355 if (mi->mi_async_curr == 1356 &mi->mi_async_reqs[NFS4_ASYNC_TYPES]) 1357 mi->mi_async_curr = &mi->mi_async_reqs[0]; 1358 } 1359 /* 1360 * If we didn't find a entry, then block until woken up 1361 * again and then look through the queues again. 1362 */ 1363 if (args == NULL) { 1364 /* 1365 * Exiting is considered to be safe for CPR as well 1366 */ 1367 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1368 1369 /* 1370 * Wakeup thread waiting to unmount the file 1371 * system only if all async threads are inactive. 1372 * 1373 * If we've timed-out and there's nothing to do, 1374 * then get rid of this thread. 1375 */ 1376 if (mi->mi_max_threads == 0 || time_left <= 0) { 1377 if (--mi->mi_threads == 0) 1378 cv_signal(&mi->mi_async_cv); 1379 CALLB_CPR_EXIT(&cprinfo); 1380 VFS_RELE(vfsp); /* release thread's hold */ 1381 MI4_RELE(mi); 1382 zthread_exit(); 1383 /* NOTREACHED */ 1384 } 1385 time_left = cv_timedwait(&mi->mi_async_work_cv, 1386 &mi->mi_async_lock, nfs_async_timeout + lbolt); 1387 1388 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1389 1390 continue; 1391 } else { 1392 time_left = 1; 1393 } 1394 1395 /* 1396 * Remove the request from the async queue and then 1397 * update the current async request queue pointer. If 1398 * the current queue is empty or we have removed enough 1399 * consecutive entries from it, then reset the counter 1400 * for this queue and then move the current pointer to 1401 * the next queue. 1402 */ 1403 *mi->mi_async_curr = args->a_next; 1404 if (*mi->mi_async_curr == NULL || 1405 --mi->mi_async_clusters[args->a_io] == 0) { 1406 mi->mi_async_clusters[args->a_io] = 1407 mi->mi_async_init_clusters; 1408 mi->mi_async_curr++; 1409 if (mi->mi_async_curr == 1410 &mi->mi_async_reqs[NFS4_ASYNC_TYPES]) 1411 mi->mi_async_curr = &mi->mi_async_reqs[0]; 1412 } 1413 1414 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) { 1415 mutex_enter(&mi->mi_lock); 1416 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 1417 mutex_exit(&mi->mi_lock); 1418 } 1419 1420 mutex_exit(&mi->mi_async_lock); 1421 1422 /* 1423 * Obtain arguments from the async request structure. 
1424 */ 1425 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) { 1426 (*args->a_nfs4_readahead)(args->a_vp, 1427 args->a_nfs4_blkoff, 1428 args->a_nfs4_addr, args->a_nfs4_seg, 1429 args->a_cred); 1430 } else if (args->a_io == NFS4_PUTAPAGE) { 1431 (void) (*args->a_nfs4_putapage)(args->a_vp, 1432 args->a_nfs4_pp, args->a_nfs4_off, 1433 args->a_nfs4_len, args->a_nfs4_flags, 1434 args->a_cred); 1435 } else if (args->a_io == NFS4_PAGEIO) { 1436 (void) (*args->a_nfs4_pageio)(args->a_vp, 1437 args->a_nfs4_pp, args->a_nfs4_off, 1438 args->a_nfs4_len, args->a_nfs4_flags, 1439 args->a_cred); 1440 } else if (args->a_io == NFS4_READDIR) { 1441 (void) ((*args->a_nfs4_readdir)(args->a_vp, 1442 args->a_nfs4_rdc, args->a_cred)); 1443 } else if (args->a_io == NFS4_COMMIT) { 1444 (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist, 1445 args->a_nfs4_offset, args->a_nfs4_count, 1446 args->a_cred); 1447 } else if (args->a_io == NFS4_INACTIVE) { 1448 nfs4_inactive_otw(args->a_vp, args->a_cred); 1449 } 1450 1451 /* 1452 * Now, release the vnode and free the credentials 1453 * structure. 1454 */ 1455 free_async_args4(args); 1456 /* 1457 * Reacquire the mutex because it will be needed above. 1458 */ 1459 mutex_enter(&mi->mi_async_lock); 1460 } 1461 } 1462 1463 /* 1464 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as 1465 * part of VOP_INACTIVE. 1466 */ 1467 1468 void 1469 nfs4_inactive_thread(mntinfo4_t *mi) 1470 { 1471 struct nfs4_async_reqs *args; 1472 callb_cpr_t cprinfo; 1473 vfs_t *vfsp = mi->mi_vfsp; 1474 1475 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1476 "nfs4_inactive_thread"); 1477 1478 for (;;) { 1479 mutex_enter(&mi->mi_async_lock); 1480 args = mi->mi_async_reqs[NFS4_INACTIVE]; 1481 if (args == NULL) { 1482 mutex_enter(&mi->mi_lock); 1483 /* 1484 * We don't want to exit until the async manager is done 1485 * with its work; hence the check for mi_manager_thread 1486 * being NULL. 1487 * 1488 * The async manager thread will cv_broadcast() on 1489 * mi_inact_req_cv when it's done, at which point we'll 1490 * wake up and exit. 1491 */ 1492 if (mi->mi_manager_thread == NULL) 1493 goto die; 1494 mi->mi_flags |= MI4_INACTIVE_IDLE; 1495 mutex_exit(&mi->mi_lock); 1496 cv_signal(&mi->mi_async_cv); 1497 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1498 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock); 1499 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1500 mutex_exit(&mi->mi_async_lock); 1501 } else { 1502 mutex_enter(&mi->mi_lock); 1503 mi->mi_flags &= ~MI4_INACTIVE_IDLE; 1504 mutex_exit(&mi->mi_lock); 1505 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next; 1506 mutex_exit(&mi->mi_async_lock); 1507 nfs4_inactive_otw(args->a_vp, args->a_cred); 1508 crfree(args->a_cred); 1509 kmem_free(args, sizeof (*args)); 1510 } 1511 } 1512 die: 1513 mutex_exit(&mi->mi_lock); 1514 mi->mi_inactive_thread = NULL; 1515 cv_signal(&mi->mi_async_cv); 1516 1517 /* 1518 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since 1519 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'. 1520 */ 1521 CALLB_CPR_EXIT(&cprinfo); 1522 1523 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 1524 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp)); 1525 1526 MI4_RELE(mi); 1527 zthread_exit(); 1528 /* NOTREACHED */ 1529 } 1530 1531 /* 1532 * nfs_async_stop: 1533 * Wait for all outstanding putpage operations and the inactive thread to 1534 * complete; nfs4_async_stop_sig() without interruptibility. 
 */
void
nfs4_async_stop(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);

	/*
	 * Wait for all outstanding async operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_async_lock);
}

/*
 * nfs4_async_stop_sig:
 * Wait for all outstanding putpage operations and the inactive thread to
 * complete.  If a signal is delivered we will abort and return non-zero;
 * otherwise return 0.  Since this routine is called from nfs4_unmount, we
 * need to make it interruptible.
 */
int
nfs4_async_stop_sig(struct vfs *vfsp)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	ushort_t omax;
	bool_t intr = FALSE;

	/*
	 * Wait for all outstanding putpage operations to complete and for
	 * worker threads to exit.
	 */
	mutex_enter(&mi->mi_async_lock);
	omax = mi->mi_max_threads;
	mi->mi_max_threads = 0;
	cv_broadcast(&mi->mi_async_work_cv);
	while (mi->mi_threads != 0) {
		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
			intr = TRUE;
			goto interrupted;
		}
	}

	/*
	 * Wait for the inactive thread to finish doing what it's doing.  It
	 * won't exit until the last reference to the vfs_t goes away.
	 */
	if (mi->mi_inactive_thread != NULL) {
		mutex_enter(&mi->mi_lock);
		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
			mutex_exit(&mi->mi_lock);
			if (!cv_wait_sig(&mi->mi_async_cv,
			    &mi->mi_async_lock)) {
				intr = TRUE;
				goto interrupted;
			}
			mutex_enter(&mi->mi_lock);
		}
		mutex_exit(&mi->mi_lock);
	}
interrupted:
	if (intr)
		mi->mi_max_threads = omax;
	mutex_exit(&mi->mi_async_lock);

	return (intr);
}

int
nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	struct nfs4_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI4(vp);
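
	/*
	 * Reader's note (descriptive only): the request structure is
	 * allocated with KM_NOSLEEP, so under memory pressure, or when
	 * mi_max_threads has been forced to zero, control falls through to
	 * the noasync: label below; see the comments there for how
	 * pageout()/fsflush()-initiated and cross-zone writes are handled.
	 */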

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS4_PUTAPAGE;
	args->a_nfs4_putapage = putapage;
	args->a_nfs4_pp = pp;
	args->a_nfs4_off = off;
	args->a_nfs4_len = (uint_t)len;
	args->a_nfs4_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);

		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() == mi->mi_zone) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * or we have run out of memory or we're attempting to
		 * unmount we refuse to do a sync write, because this may
		 * hang pageout/fsflush and the machine.  In this case,
		 * we just re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	/*
	 * We'll get here only if (nfs_zone() != mi->mi_zone)
	 * which means that this was a cross-zone sync putpage.
	 *
	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
	 * as dirty and unlock them.
	 *
	 * We don't want to clear B_FORCE here as the caller presumably
	 * knows what they're doing if they set it.
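	 *
	 * Reader's note (descriptive only): nfs4_async_pageio() below queues
	 * requests the same way, but its noasync path additionally handles
	 * B_READ by calling pvn_read_done(pp, flags | B_ERROR), since a
	 * failed asynchronous read can simply be abandoned.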
1741 */ 1742 pvn_write_done(pp, flags | B_ERROR); 1743 return (EPERM); 1744 } 1745 1746 int 1747 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 1748 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t, 1749 size_t, int, cred_t *)) 1750 { 1751 rnode4_t *rp; 1752 mntinfo4_t *mi; 1753 struct nfs4_async_reqs *args; 1754 1755 ASSERT(flags & B_ASYNC); 1756 ASSERT(vp->v_vfsp != NULL); 1757 1758 rp = VTOR4(vp); 1759 ASSERT(rp->r_count > 0); 1760 1761 mi = VTOMI4(vp); 1762 1763 /* 1764 * If we can't allocate a request structure, do the pageio 1765 * request synchronously in this thread's context. 1766 */ 1767 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1768 goto noasync; 1769 1770 args->a_next = NULL; 1771 #ifdef DEBUG 1772 args->a_queuer = curthread; 1773 #endif 1774 VN_HOLD(vp); 1775 args->a_vp = vp; 1776 ASSERT(cr != NULL); 1777 crhold(cr); 1778 args->a_cred = cr; 1779 args->a_io = NFS4_PAGEIO; 1780 args->a_nfs4_pageio = pageio; 1781 args->a_nfs4_pp = pp; 1782 args->a_nfs4_off = io_off; 1783 args->a_nfs4_len = (uint_t)io_len; 1784 args->a_nfs4_flags = flags; 1785 1786 mutex_enter(&mi->mi_async_lock); 1787 1788 /* 1789 * If asyncio has been disabled, then make a synchronous request. 1790 * This check is done a second time in case async io was diabled 1791 * while this thread was blocked waiting for memory pressure to 1792 * reduce or for the queue to drain. 1793 */ 1794 if (mi->mi_max_threads == 0) { 1795 mutex_exit(&mi->mi_async_lock); 1796 1797 VN_RELE(vp); 1798 crfree(cr); 1799 kmem_free(args, sizeof (*args)); 1800 goto noasync; 1801 } 1802 1803 /* 1804 * Link request structure into the async list and 1805 * wakeup async thread to do the i/o. 1806 */ 1807 if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) { 1808 mi->mi_async_reqs[NFS4_PAGEIO] = args; 1809 mi->mi_async_tail[NFS4_PAGEIO] = args; 1810 } else { 1811 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args; 1812 mi->mi_async_tail[NFS4_PAGEIO] = args; 1813 } 1814 1815 mutex_enter(&rp->r_statelock); 1816 rp->r_count++; 1817 rp->r_awcount++; 1818 mutex_exit(&rp->r_statelock); 1819 1820 if (mi->mi_io_kstats) { 1821 mutex_enter(&mi->mi_lock); 1822 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1823 mutex_exit(&mi->mi_lock); 1824 } 1825 1826 mi->mi_async_req_count++; 1827 ASSERT(mi->mi_async_req_count != 0); 1828 cv_signal(&mi->mi_async_reqs_cv); 1829 mutex_exit(&mi->mi_async_lock); 1830 return (0); 1831 1832 noasync: 1833 /* 1834 * If we can't do it ASYNC, for reads we do nothing (but cleanup 1835 * the page list), for writes we do it synchronously, except for 1836 * proc_pageout/proc_fsflush as described below. 1837 */ 1838 if (flags & B_READ) { 1839 pvn_read_done(pp, flags | B_ERROR); 1840 return (0); 1841 } 1842 1843 if (curproc == proc_pageout || curproc == proc_fsflush) { 1844 /* 1845 * If we get here in the context of the pageout/fsflush, 1846 * we refuse to do a sync write, because this may hang 1847 * pageout/fsflush (and the machine). In this case, we just 1848 * re-mark the page as dirty and punt on the page. 1849 * 1850 * Make sure B_FORCE isn't set. We can re-mark the 1851 * pages as dirty and unlock the pages in one swoop by 1852 * passing in B_ERROR to pvn_write_done(). However, 1853 * we should make sure B_FORCE isn't set - we don't 1854 * want the page tossed before it gets written out. 
1855 */ 1856 if (flags & B_FORCE) 1857 flags &= ~(B_INVAL | B_FORCE); 1858 pvn_write_done(pp, flags | B_ERROR); 1859 return (0); 1860 } 1861 1862 if (nfs_zone() != mi->mi_zone) { 1863 /* 1864 * So this was a cross-zone sync pageio. We pass in B_ERROR 1865 * to pvn_write_done() to re-mark the pages as dirty and unlock 1866 * them. 1867 * 1868 * We don't want to clear B_FORCE here as the caller presumably 1869 * knows what they're doing if they set it. 1870 */ 1871 pvn_write_done(pp, flags | B_ERROR); 1872 return (EPERM); 1873 } 1874 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1875 } 1876 1877 void 1878 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr, 1879 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *)) 1880 { 1881 rnode4_t *rp; 1882 mntinfo4_t *mi; 1883 struct nfs4_async_reqs *args; 1884 1885 rp = VTOR4(vp); 1886 ASSERT(rp->r_freef == NULL); 1887 1888 mi = VTOMI4(vp); 1889 1890 /* 1891 * If we can't allocate a request structure, skip the readdir. 1892 */ 1893 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1894 goto noasync; 1895 1896 args->a_next = NULL; 1897 #ifdef DEBUG 1898 args->a_queuer = curthread; 1899 #endif 1900 VN_HOLD(vp); 1901 args->a_vp = vp; 1902 ASSERT(cr != NULL); 1903 crhold(cr); 1904 args->a_cred = cr; 1905 args->a_io = NFS4_READDIR; 1906 args->a_nfs4_readdir = readdir; 1907 args->a_nfs4_rdc = rdc; 1908 1909 mutex_enter(&mi->mi_async_lock); 1910 1911 /* 1912 * If asyncio has been disabled, then skip this request 1913 */ 1914 if (mi->mi_max_threads == 0) { 1915 mutex_exit(&mi->mi_async_lock); 1916 1917 VN_RELE(vp); 1918 crfree(cr); 1919 kmem_free(args, sizeof (*args)); 1920 goto noasync; 1921 } 1922 1923 /* 1924 * Link request structure into the async list and 1925 * wakeup async thread to do the i/o. 1926 */ 1927 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) { 1928 mi->mi_async_reqs[NFS4_READDIR] = args; 1929 mi->mi_async_tail[NFS4_READDIR] = args; 1930 } else { 1931 mi->mi_async_tail[NFS4_READDIR]->a_next = args; 1932 mi->mi_async_tail[NFS4_READDIR] = args; 1933 } 1934 1935 mutex_enter(&rp->r_statelock); 1936 rp->r_count++; 1937 mutex_exit(&rp->r_statelock); 1938 1939 if (mi->mi_io_kstats) { 1940 mutex_enter(&mi->mi_lock); 1941 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1942 mutex_exit(&mi->mi_lock); 1943 } 1944 1945 mi->mi_async_req_count++; 1946 ASSERT(mi->mi_async_req_count != 0); 1947 cv_signal(&mi->mi_async_reqs_cv); 1948 mutex_exit(&mi->mi_async_lock); 1949 return; 1950 1951 noasync: 1952 mutex_enter(&rp->r_statelock); 1953 rdc->entries = NULL; 1954 /* 1955 * Indicate that no one is trying to fill this entry and 1956 * it still needs to be filled. 1957 */ 1958 rdc->flags &= ~RDDIR; 1959 rdc->flags |= RDDIRREQ; 1960 rddir4_cache_rele(rp, rdc); 1961 mutex_exit(&rp->r_statelock); 1962 } 1963 1964 void 1965 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 1966 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, 1967 cred_t *)) 1968 { 1969 rnode4_t *rp; 1970 mntinfo4_t *mi; 1971 struct nfs4_async_reqs *args; 1972 page_t *pp; 1973 1974 rp = VTOR4(vp); 1975 mi = VTOMI4(vp); 1976 1977 /* 1978 * If we can't allocate a request structure, do the commit 1979 * operation synchronously in this thread's context. 
1980 */ 1981 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1982 goto noasync; 1983 1984 args->a_next = NULL; 1985 #ifdef DEBUG 1986 args->a_queuer = curthread; 1987 #endif 1988 VN_HOLD(vp); 1989 args->a_vp = vp; 1990 ASSERT(cr != NULL); 1991 crhold(cr); 1992 args->a_cred = cr; 1993 args->a_io = NFS4_COMMIT; 1994 args->a_nfs4_commit = commit; 1995 args->a_nfs4_plist = plist; 1996 args->a_nfs4_offset = offset; 1997 args->a_nfs4_count = count; 1998 1999 mutex_enter(&mi->mi_async_lock); 2000 2001 /* 2002 * If asyncio has been disabled, then make a synchronous request. 2003 * This check is done a second time in case async io was disabled 2004 * while this thread was blocked waiting for memory pressure to 2005 * reduce or for the queue to drain. 2006 */ 2007 if (mi->mi_max_threads == 0) { 2008 mutex_exit(&mi->mi_async_lock); 2009 2010 VN_RELE(vp); 2011 crfree(cr); 2012 kmem_free(args, sizeof (*args)); 2013 goto noasync; 2014 } 2015 2016 /* 2017 * Link request structure into the async list and 2018 * wakeup async thread to do the i/o. 2019 */ 2020 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) { 2021 mi->mi_async_reqs[NFS4_COMMIT] = args; 2022 mi->mi_async_tail[NFS4_COMMIT] = args; 2023 } else { 2024 mi->mi_async_tail[NFS4_COMMIT]->a_next = args; 2025 mi->mi_async_tail[NFS4_COMMIT] = args; 2026 } 2027 2028 mutex_enter(&rp->r_statelock); 2029 rp->r_count++; 2030 mutex_exit(&rp->r_statelock); 2031 2032 if (mi->mi_io_kstats) { 2033 mutex_enter(&mi->mi_lock); 2034 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2035 mutex_exit(&mi->mi_lock); 2036 } 2037 2038 mi->mi_async_req_count++; 2039 ASSERT(mi->mi_async_req_count != 0); 2040 cv_signal(&mi->mi_async_reqs_cv); 2041 mutex_exit(&mi->mi_async_lock); 2042 return; 2043 2044 noasync: 2045 if (curproc == proc_pageout || curproc == proc_fsflush || 2046 nfs_zone() != mi->mi_zone) { 2047 while (plist != NULL) { 2048 pp = plist; 2049 page_sub(&plist, pp); 2050 pp->p_fsdata = C_COMMIT; 2051 page_unlock(pp); 2052 } 2053 return; 2054 } 2055 (*commit)(vp, plist, offset, count, cr); 2056 } 2057 2058 /* 2059 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The 2060 * reference to the vnode is handed over to the thread; the caller should 2061 * no longer refer to the vnode. 2062 * 2063 * Unlike most of the async routines, this handoff is needed for 2064 * correctness reasons, not just performance. So doing operations in the 2065 * context of the current thread is not an option. 2066 */ 2067 void 2068 nfs4_async_inactive(vnode_t *vp, cred_t *cr) 2069 { 2070 mntinfo4_t *mi; 2071 struct nfs4_async_reqs *args; 2072 boolean_t signal_inactive_thread = B_FALSE; 2073 2074 mi = VTOMI4(vp); 2075 2076 args = kmem_alloc(sizeof (*args), KM_SLEEP); 2077 args->a_next = NULL; 2078 #ifdef DEBUG 2079 args->a_queuer = curthread; 2080 #endif 2081 args->a_vp = vp; 2082 ASSERT(cr != NULL); 2083 crhold(cr); 2084 args->a_cred = cr; 2085 args->a_io = NFS4_INACTIVE; 2086 2087 /* 2088 * Note that we don't check mi->mi_max_threads here, since we 2089 * *need* to get rid of this vnode regardless of whether someone 2090 * set nfs4_max_threads to zero in /etc/system. 2091 * 2092 * The manager thread knows about this and is willing to create 2093 * at least one thread to accommodate us.
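 *
 * If the inactive thread has already exited (mi_inactive_thread is
 * NULL, e.g. during zone shutdown), the code below does not queue a
 * request at all; it frees the rnode state directly in the current
 * context, discarding any delegation first so that rp4_addfree()'s
 * delegation return becomes a no-op.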
2094 */ 2095 mutex_enter(&mi->mi_async_lock); 2096 if (mi->mi_inactive_thread == NULL) { 2097 rnode4_t *rp; 2098 vnode_t *unldvp = NULL; 2099 char *unlname; 2100 cred_t *unlcred; 2101 2102 mutex_exit(&mi->mi_async_lock); 2103 /* 2104 * We just need to free up the memory associated with the 2105 * vnode, which can be safely done from within the current 2106 * context. 2107 */ 2108 crfree(cr); /* drop our reference */ 2109 kmem_free(args, sizeof (*args)); 2110 rp = VTOR4(vp); 2111 mutex_enter(&rp->r_statelock); 2112 if (rp->r_unldvp != NULL) { 2113 unldvp = rp->r_unldvp; 2114 rp->r_unldvp = NULL; 2115 unlname = rp->r_unlname; 2116 rp->r_unlname = NULL; 2117 unlcred = rp->r_unlcred; 2118 rp->r_unlcred = NULL; 2119 } 2120 mutex_exit(&rp->r_statelock); 2121 /* 2122 * No need to explicitly throw away any cached pages. The 2123 * eventual r4inactive() will attempt a synchronous 2124 * VOP_PUTPAGE() which will immediately fail since the request 2125 * is coming from the wrong zone, and then will proceed to call 2126 * nfs4_invalidate_pages() which will clean things up for us. 2127 * 2128 * Throw away the delegation here so rp4_addfree()'s attempt to 2129 * return any existing delegations becomes a no-op. 2130 */ 2131 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 2132 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 2133 FALSE); 2134 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 2135 nfs_rw_exit(&mi->mi_recovlock); 2136 } 2137 nfs4_clear_open_streams(rp); 2138 2139 rp4_addfree(rp, cr); 2140 if (unldvp != NULL) { 2141 kmem_free(unlname, MAXNAMELEN); 2142 VN_RELE(unldvp); 2143 crfree(unlcred); 2144 } 2145 return; 2146 } 2147 2148 if (mi->mi_manager_thread == NULL) { 2149 /* 2150 * We want to talk to the inactive thread. 2151 */ 2152 signal_inactive_thread = B_TRUE; 2153 } 2154 2155 /* 2156 * Enqueue the vnode and wake up either the special thread (empty 2157 * list) or an async thread. 2158 */ 2159 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) { 2160 mi->mi_async_reqs[NFS4_INACTIVE] = args; 2161 mi->mi_async_tail[NFS4_INACTIVE] = args; 2162 signal_inactive_thread = B_TRUE; 2163 } else { 2164 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args; 2165 mi->mi_async_tail[NFS4_INACTIVE] = args; 2166 } 2167 if (signal_inactive_thread) { 2168 cv_signal(&mi->mi_inact_req_cv); 2169 } else { 2170 mi->mi_async_req_count++; 2171 ASSERT(mi->mi_async_req_count != 0); 2172 cv_signal(&mi->mi_async_reqs_cv); 2173 } 2174 2175 mutex_exit(&mi->mi_async_lock); 2176 } 2177 2178 int 2179 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2180 { 2181 int pagecreate; 2182 int n; 2183 int saved_n; 2184 caddr_t saved_base; 2185 u_offset_t offset; 2186 int error; 2187 int sm_error; 2188 vnode_t *vp = RTOV(rp); 2189 2190 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2191 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2192 if (!vpm_enable) { 2193 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2194 } 2195 2196 /* 2197 * Move bytes in at most PAGESIZE chunks. We must avoid 2198 * spanning pages in uiomove() because page faults may cause 2199 * the cache to be invalidated out from under us. The r_size is not 2200 * updated until after the uiomove. If we push the last page of a 2201 * file before r_size is correct, we will lose the data written past 2202 * the current (and invalid) r_size. 
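 *
 * For example, with 4K pages, a 6K request starting 1K into a page
 * is moved as a 3K chunk (to the end of the first page) followed by
 * a 3K chunk into the next page, so no single uiomove() crosses a
 * page boundary.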
2203 */ 2204 do { 2205 offset = uio->uio_loffset; 2206 pagecreate = 0; 2207 2208 /* 2209 * n is the number of bytes required to satisfy the request 2210 * or the number of bytes to fill out the page. 2211 */ 2212 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2213 2214 /* 2215 * Check to see if we can skip reading in the page 2216 * and just allocate the memory. We can do this 2217 * if we are going to rewrite the entire mapping 2218 * or if we are going to write to or beyond the current 2219 * end of file from the beginning of the mapping. 2220 * 2221 * The read of r_size is now protected by r_statelock. 2222 */ 2223 mutex_enter(&rp->r_statelock); 2224 /* 2225 * When pgcreated is nonzero the caller has already done 2226 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2227 * segkpm this means we already have at least one page 2228 * created and mapped at base. 2229 */ 2230 pagecreate = pgcreated || 2231 ((offset & PAGEOFFSET) == 0 && 2232 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2233 2234 mutex_exit(&rp->r_statelock); 2235 2236 if (!vpm_enable && pagecreate) { 2237 /* 2238 * The last argument tells segmap_pagecreate() to 2239 * always lock the page, as opposed to sometimes 2240 * returning with the page locked. This way we avoid a 2241 * fault on the ensuing uiomove(), but also 2242 * more importantly (to fix bug 1094402) we can 2243 * call segmap_fault() to unlock the page in all 2244 * cases. An alternative would be to modify 2245 * segmap_pagecreate() to tell us when it is 2246 * locking a page, but that's a fairly major 2247 * interface change. 2248 */ 2249 if (pgcreated == 0) 2250 (void) segmap_pagecreate(segkmap, base, 2251 (uint_t)n, 1); 2252 saved_base = base; 2253 saved_n = n; 2254 } 2255 2256 /* 2257 * The number of bytes of data in the last page cannot 2258 * be accurately determined while the page is being 2259 * uiomove'd to and the size of the file is being updated. 2260 * Thus, inform threads which need to know accurately 2261 * how much data is in the last page of the file. They 2262 * will not do the i/o immediately, but will arrange for 2263 * the i/o to happen later, after this modify operation 2264 * has finished. 2265 */ 2266 ASSERT(!(rp->r_flags & R4MODINPROGRESS)); 2267 mutex_enter(&rp->r_statelock); 2268 rp->r_flags |= R4MODINPROGRESS; 2269 rp->r_modaddr = (offset & MAXBMASK); 2270 mutex_exit(&rp->r_statelock); 2271 2272 if (vpm_enable) { 2273 /* 2274 * Copy data. If new pages are created, part of 2275 * the page that is not written will be initialized 2276 * with zeros. 2277 */ 2278 error = vpm_data_copy(vp, offset, n, uio, 2279 !pagecreate, NULL, 0, S_WRITE); 2280 } else { 2281 error = uiomove(base, n, UIO_WRITE, uio); 2282 } 2283 2284 /* 2285 * r_size is the maximum number of 2286 * bytes known to be in the file. 2287 * Make sure it is at least as high as the 2288 * first unwritten byte pointed to by uio_loffset. 2289 */ 2290 mutex_enter(&rp->r_statelock); 2291 if (rp->r_size < uio->uio_loffset) 2292 rp->r_size = uio->uio_loffset; 2293 rp->r_flags &= ~R4MODINPROGRESS; 2294 rp->r_flags |= R4DIRTY; 2295 mutex_exit(&rp->r_statelock); 2296 2297 /* n = # of bytes written */ 2298 n = (int)(uio->uio_loffset - offset); 2299 2300 if (!vpm_enable) { 2301 base += n; 2302 } 2303 2304 tcount -= n; 2305 /* 2306 * If we created pages w/o initializing them completely, 2307 * we need to zero the part that wasn't set up. 2308 * This happens in most EOF write cases and if 2309 * we had some sort of error during the uiomove.
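 *
 * Note: base has already been advanced past the bytes just copied,
 * so the kzero() below clears only the portion of the newly created
 * page that uiomove() did not fill in.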
2310 */ 2311 if (!vpm_enable && pagecreate) { 2312 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2313 (void) kzero(base, PAGESIZE - n); 2314 2315 if (pgcreated) { 2316 /* 2317 * Caller is responsible for this page, 2318 * it was not created in this loop. 2319 */ 2320 pgcreated = 0; 2321 } else { 2322 /* 2323 * For bug 1094402: segmap_pagecreate locks 2324 * page. Unlock it. This also unlocks the 2325 * pages allocated by page_create_va() in 2326 * segmap_pagecreate(). 2327 */ 2328 sm_error = segmap_fault(kas.a_hat, segkmap, 2329 saved_base, saved_n, 2330 F_SOFTUNLOCK, S_WRITE); 2331 if (error == 0) 2332 error = sm_error; 2333 } 2334 } 2335 } while (tcount > 0 && error == 0); 2336 2337 return (error); 2338 } 2339 2340 int 2341 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2342 { 2343 rnode4_t *rp; 2344 page_t *pp; 2345 u_offset_t eoff; 2346 u_offset_t io_off; 2347 size_t io_len; 2348 int error; 2349 int rdirty; 2350 int err; 2351 2352 rp = VTOR4(vp); 2353 ASSERT(rp->r_count > 0); 2354 2355 if (!nfs4_has_pages(vp)) 2356 return (0); 2357 2358 ASSERT(vp->v_type != VCHR); 2359 2360 /* 2361 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2362 * writes. B_FORCE is set to force the VM system to actually 2363 * invalidate the pages, even if the i/o failed. The pages 2364 * need to get invalidated because they can't be written out 2365 * because there isn't any space left on either the server's 2366 * file system or in the user's disk quota. The B_FREE bit 2367 * is cleared to avoid confusion as to whether this is a 2368 * request to place the page on the freelist or to destroy 2369 * it. 2370 */ 2371 if ((rp->r_flags & R4OUTOFSPACE) || 2372 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2373 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2374 2375 if (len == 0) { 2376 /* 2377 * If doing a full file synchronous operation, then clear 2378 * the R4DIRTY bit. If a page gets dirtied while the flush 2379 * is happening, then R4DIRTY will get set again. The 2380 * R4DIRTY bit must get cleared before the flush so that 2381 * we don't lose this information. 2382 * 2383 * If there are no full file async write operations 2384 * pending and RDIRTY bit is set, clear it. 2385 */ 2386 if (off == (u_offset_t)0 && 2387 !(flags & B_ASYNC) && 2388 (rp->r_flags & R4DIRTY)) { 2389 mutex_enter(&rp->r_statelock); 2390 rdirty = (rp->r_flags & R4DIRTY); 2391 rp->r_flags &= ~R4DIRTY; 2392 mutex_exit(&rp->r_statelock); 2393 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2394 mutex_enter(&rp->r_statelock); 2395 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2396 rdirty = (rp->r_flags & R4DIRTY); 2397 rp->r_flags &= ~R4DIRTY; 2398 } 2399 mutex_exit(&rp->r_statelock); 2400 } else 2401 rdirty = 0; 2402 2403 /* 2404 * Search the entire vp list for pages >= off, and flush 2405 * the dirty pages. 2406 */ 2407 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2408 flags, cr); 2409 2410 /* 2411 * If an error occured and the file was marked as dirty 2412 * before and we aren't forcibly invalidating pages, then 2413 * reset the R4DIRTY flag. 2414 */ 2415 if (error && rdirty && 2416 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2417 mutex_enter(&rp->r_statelock); 2418 rp->r_flags |= R4DIRTY; 2419 mutex_exit(&rp->r_statelock); 2420 } 2421 } else { 2422 /* 2423 * Do a range from [off...off + len) looking for pages 2424 * to deal with. 
2425 */ 2426 error = 0; 2427 io_len = 0; 2428 eoff = off + len; 2429 mutex_enter(&rp->r_statelock); 2430 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2431 io_off += io_len) { 2432 mutex_exit(&rp->r_statelock); 2433 /* 2434 * If we are not invalidating, synchronously 2435 * freeing or writing pages use the routine 2436 * page_lookup_nowait() to prevent reclaiming 2437 * them from the free list. 2438 */ 2439 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2440 pp = page_lookup(vp, io_off, 2441 (flags & (B_INVAL | B_FREE)) ? 2442 SE_EXCL : SE_SHARED); 2443 } else { 2444 pp = page_lookup_nowait(vp, io_off, 2445 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2446 } 2447 2448 if (pp == NULL || !pvn_getdirty(pp, flags)) 2449 io_len = PAGESIZE; 2450 else { 2451 err = (*rp->r_putapage)(vp, pp, &io_off, 2452 &io_len, flags, cr); 2453 if (!error) 2454 error = err; 2455 /* 2456 * "io_off" and "io_len" are returned as 2457 * the range of pages we actually wrote. 2458 * This allows us to skip ahead more quickly 2459 * since several pages may've been dealt 2460 * with by this iteration of the loop. 2461 */ 2462 } 2463 mutex_enter(&rp->r_statelock); 2464 } 2465 mutex_exit(&rp->r_statelock); 2466 } 2467 2468 return (error); 2469 } 2470 2471 void 2472 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2473 { 2474 rnode4_t *rp; 2475 2476 rp = VTOR4(vp); 2477 if (IS_SHADOW(vp, rp)) 2478 vp = RTOV4(rp); 2479 mutex_enter(&rp->r_statelock); 2480 while (rp->r_flags & R4TRUNCATE) 2481 cv_wait(&rp->r_cv, &rp->r_statelock); 2482 rp->r_flags |= R4TRUNCATE; 2483 if (off == (u_offset_t)0) { 2484 rp->r_flags &= ~R4DIRTY; 2485 if (!(rp->r_flags & R4STALE)) 2486 rp->r_error = 0; 2487 } 2488 rp->r_truncaddr = off; 2489 mutex_exit(&rp->r_statelock); 2490 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2491 B_INVAL | B_TRUNC, cr); 2492 mutex_enter(&rp->r_statelock); 2493 rp->r_flags &= ~R4TRUNCATE; 2494 cv_broadcast(&rp->r_cv); 2495 mutex_exit(&rp->r_statelock); 2496 } 2497 2498 static int 2499 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2500 { 2501 mntinfo4_t *mi; 2502 struct mntinfo_kstat *mik; 2503 vfs_t *vfsp; 2504 2505 /* this is a read-only kstat. Bail out on a write */ 2506 if (rw == KSTAT_WRITE) 2507 return (EACCES); 2508 2509 2510 /* 2511 * We don't want to wait here as kstat_chain_lock could be held by 2512 * dounmount(). dounmount() takes vfs_reflock before the chain lock 2513 * and thus could lead to a deadlock. 2514 */ 2515 vfsp = (struct vfs *)ksp->ks_private; 2516 2517 mi = VFTOMI4(vfsp); 2518 mik = (struct mntinfo_kstat *)ksp->ks_data; 2519 2520 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2521 2522 mik->mik_vers = (uint32_t)mi->mi_vers; 2523 mik->mik_flags = mi->mi_flags; 2524 /* 2525 * The sv_secdata holds the flavor the client specifies. 2526 * If the client uses default and a security negotiation 2527 * occurs, sv_currsec will point to the current flavor 2528 * selected from the server flavor list. 2529 * sv_currsec is NULL if no security negotiation takes place. 2530 */ 2531 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 
2532 mi->mi_curr_serv->sv_currsec->secmod : 2533 mi->mi_curr_serv->sv_secdata->secmod; 2534 mik->mik_curread = (uint32_t)mi->mi_curread; 2535 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2536 mik->mik_retrans = mi->mi_retrans; 2537 mik->mik_timeo = mi->mi_timeo; 2538 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2539 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2540 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2541 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2542 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2543 mik->mik_failover = (uint32_t)mi->mi_failover; 2544 mik->mik_remap = (uint32_t)mi->mi_remap; 2545 2546 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2547 2548 return (0); 2549 } 2550 2551 void 2552 nfs4_mnt_kstat_init(struct vfs *vfsp) 2553 { 2554 mntinfo4_t *mi = VFTOMI4(vfsp); 2555 2556 /* 2557 * PSARC 2001/697 Contract Private Interface 2558 * All nfs kstats are under SunMC contract 2559 * Please refer to the PSARC listed above and contact 2560 * SunMC before making any changes! 2561 * 2562 * Changes must be reviewed by Solaris File Sharing 2563 * Changes must be communicated to contract-2001-697@sun.com 2564 * 2565 */ 2566 2567 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2568 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2569 if (mi->mi_io_kstats) { 2570 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2571 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2572 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2573 kstat_install(mi->mi_io_kstats); 2574 } 2575 2576 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2577 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2578 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2579 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2580 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2581 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2582 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2583 kstat_install(mi->mi_ro_kstats); 2584 } 2585 2586 nfs4_mnt_recov_kstat_init(vfsp); 2587 } 2588 2589 void 2590 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2591 { 2592 mntinfo4_t *mi; 2593 2594 mi = VTOMI4(vp); 2595 /* 2596 * In case of forced unmount, do not print any messages 2597 * since it can flood the console with error messages. 2598 */ 2599 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2600 return; 2601 2602 /* 2603 * If the mount point is dead, not recoverable, do not 2604 * print error messages that can flood the console. 2605 */ 2606 if (mi->mi_flags & MI4_RECOV_FAIL) 2607 return; 2608 2609 /* 2610 * No use in flooding the console with ENOSPC 2611 * messages from the same file system. 
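 *
 * The check below rate-limits ENOSPC/EDQUOT reports: once one is
 * printed, mi_printftime is pushed nfs_write_error_interval seconds
 * into the future and further ENOSPC/EDQUOT errors are suppressed
 * until then.  All other errors are always reported.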
2612 */ 2613 if ((error != ENOSPC && error != EDQUOT) || 2614 lbolt - mi->mi_printftime > 0) { 2615 zoneid_t zoneid = mi->mi_zone->zone_id; 2616 2617 #ifdef DEBUG 2618 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2619 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2620 #else 2621 nfs_perror(error, "NFS write error on host %s: %m.\n", 2622 VTOR4(vp)->r_server->sv_hostname, NULL); 2623 #endif 2624 if (error == ENOSPC || error == EDQUOT) { 2625 zcmn_err(zoneid, CE_CONT, 2626 "^File: userid=%d, groupid=%d\n", 2627 crgetuid(cr), crgetgid(cr)); 2628 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2629 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2630 zcmn_err(zoneid, CE_CONT, 2631 "^User: userid=%d, groupid=%d\n", 2632 crgetuid(curthread->t_cred), 2633 crgetgid(curthread->t_cred)); 2634 } 2635 mi->mi_printftime = lbolt + 2636 nfs_write_error_interval * hz; 2637 } 2638 sfh4_printfhandle(VTOR4(vp)->r_fh); 2639 #ifdef DEBUG 2640 if (error == EACCES) { 2641 zcmn_err(zoneid, CE_CONT, 2642 "nfs_bio: cred is%s kcred\n", 2643 cr == kcred ? "" : " not"); 2644 } 2645 #endif 2646 } 2647 } 2648 2649 /* 2650 * Return non-zero if the given file can be safely memory mapped. Locks 2651 * are safe if whole-file (length and offset are both zero). 2652 */ 2653 2654 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2655 2656 static int 2657 nfs4_safemap(const vnode_t *vp) 2658 { 2659 locklist_t *llp, *next_llp; 2660 int safe = 1; 2661 rnode4_t *rp = VTOR4(vp); 2662 2663 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2664 2665 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2666 "vp = %p", (void *)vp)); 2667 2668 /* 2669 * Review all the locks for the vnode, both ones that have been 2670 * acquired and ones that are pending. We assume that 2671 * flk_active_locks_for_vp() has merged any locks that can be 2672 * merged (so that if a process has the entire file locked, it is 2673 * represented as a single lock). 2674 * 2675 * Note that we can't bail out of the loop if we find a non-safe 2676 * lock, because we have to free all the elements in the llp list. 2677 * We might be able to speed up this code slightly by not looking 2678 * at each lock's l_start and l_len fields once we've found a 2679 * non-safe lock. 2680 */ 2681 2682 llp = flk_active_locks_for_vp(vp); 2683 while (llp) { 2684 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2685 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2686 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2687 if (!SAFE_LOCK(llp->ll_flock)) { 2688 safe = 0; 2689 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2690 "nfs4_safemap: unsafe active lock (%" PRId64 2691 ", %" PRId64 ")", llp->ll_flock.l_start, 2692 llp->ll_flock.l_len)); 2693 } 2694 next_llp = llp->ll_next; 2695 VN_RELE(llp->ll_vp); 2696 kmem_free(llp, sizeof (*llp)); 2697 llp = next_llp; 2698 } 2699 2700 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2701 safe ? "safe" : "unsafe")); 2702 return (safe); 2703 } 2704 2705 /* 2706 * Return whether there is a lost LOCK or LOCKU queued up for the given 2707 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 
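 *
 * "Unsafe" uses the same SAFE_LOCK() whole-file test as
 * nfs4_safemap(): only a lost LOCK/LOCKU request whose range does not
 * cover the entire file (nonzero l_start or l_len) counts as a
 * conflict.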
2708 */ 2709 2710 bool_t 2711 nfs4_map_lost_lock_conflict(vnode_t *vp) 2712 { 2713 bool_t conflict = FALSE; 2714 nfs4_lost_rqst_t *lrp; 2715 mntinfo4_t *mi = VTOMI4(vp); 2716 2717 mutex_enter(&mi->mi_lock); 2718 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2719 lrp = list_next(&mi->mi_lost_state, lrp)) { 2720 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2721 continue; 2722 ASSERT(lrp->lr_vp != NULL); 2723 if (!VOP_CMP(lrp->lr_vp, vp)) 2724 continue; /* different file */ 2725 if (!SAFE_LOCK(*lrp->lr_flk)) { 2726 conflict = TRUE; 2727 break; 2728 } 2729 } 2730 2731 mutex_exit(&mi->mi_lock); 2732 return (conflict); 2733 } 2734 2735 /* 2736 * nfs_lockcompletion: 2737 * 2738 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2739 * as non cachable (set VNOCACHE bit). 2740 */ 2741 2742 void 2743 nfs4_lockcompletion(vnode_t *vp, int cmd) 2744 { 2745 rnode4_t *rp = VTOR4(vp); 2746 2747 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2748 ASSERT(!IS_SHADOW(vp, rp)); 2749 2750 if (cmd == F_SETLK || cmd == F_SETLKW) { 2751 2752 if (!nfs4_safemap(vp)) { 2753 mutex_enter(&vp->v_lock); 2754 vp->v_flag |= VNOCACHE; 2755 mutex_exit(&vp->v_lock); 2756 } else { 2757 mutex_enter(&vp->v_lock); 2758 vp->v_flag &= ~VNOCACHE; 2759 mutex_exit(&vp->v_lock); 2760 } 2761 } 2762 /* 2763 * The cached attributes of the file are stale after acquiring 2764 * the lock on the file. They were updated when the file was 2765 * opened, but not updated when the lock was acquired. Therefore the 2766 * cached attributes are invalidated after the lock is obtained. 2767 */ 2768 PURGE_ATTRCACHE4(vp); 2769 } 2770 2771 /* ARGSUSED */ 2772 static void * 2773 nfs4_mi_init(zoneid_t zoneid) 2774 { 2775 struct mi4_globals *mig; 2776 2777 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2778 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2779 list_create(&mig->mig_list, sizeof (mntinfo4_t), 2780 offsetof(mntinfo4_t, mi_zone_node)); 2781 mig->mig_destructor_called = B_FALSE; 2782 return (mig); 2783 } 2784 2785 /* 2786 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down 2787 * state and killing off threads. 2788 */ 2789 /* ARGSUSED */ 2790 static void 2791 nfs4_mi_shutdown(zoneid_t zoneid, void *data) 2792 { 2793 struct mi4_globals *mig = data; 2794 mntinfo4_t *mi; 2795 nfs4_server_t *np; 2796 2797 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2798 "nfs4_mi_shutdown zone %d\n", zoneid)); 2799 ASSERT(mig != NULL); 2800 for (;;) { 2801 mutex_enter(&mig->mig_lock); 2802 mi = list_head(&mig->mig_list); 2803 if (mi == NULL) { 2804 mutex_exit(&mig->mig_lock); 2805 break; 2806 } 2807 2808 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2809 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp)); 2810 /* 2811 * purge the DNLC for this filesystem 2812 */ 2813 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2814 /* 2815 * Tell existing async worker threads to exit. 2816 */ 2817 mutex_enter(&mi->mi_async_lock); 2818 mi->mi_max_threads = 0; 2819 cv_broadcast(&mi->mi_async_work_cv); 2820 /* 2821 * Set the appropriate flags, signal and wait for both the 2822 * async manager and the inactive thread to exit when they're 2823 * done with their current work. 
2824 */ 2825 mutex_enter(&mi->mi_lock); 2826 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD); 2827 mutex_exit(&mi->mi_lock); 2828 mutex_exit(&mi->mi_async_lock); 2829 if (mi->mi_manager_thread) { 2830 nfs4_async_manager_stop(mi->mi_vfsp); 2831 } 2832 if (mi->mi_inactive_thread) { 2833 mutex_enter(&mi->mi_async_lock); 2834 cv_signal(&mi->mi_inact_req_cv); 2835 /* 2836 * Wait for the inactive thread to exit. 2837 */ 2838 while (mi->mi_inactive_thread != NULL) { 2839 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2840 } 2841 mutex_exit(&mi->mi_async_lock); 2842 } 2843 /* 2844 * Wait for the recovery thread to complete, that is, it will 2845 * signal when it is done using the "mi" structure and about 2846 * to exit 2847 */ 2848 mutex_enter(&mi->mi_lock); 2849 while (mi->mi_in_recovery > 0) 2850 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock); 2851 mutex_exit(&mi->mi_lock); 2852 /* 2853 * We're done when every mi has been done or the list is empty. 2854 * This one is done, remove it from the list. 2855 */ 2856 list_remove(&mig->mig_list, mi); 2857 mutex_exit(&mig->mig_lock); 2858 zone_rele(mi->mi_zone); 2859 /* 2860 * Release hold on vfs and mi done to prevent race with zone 2861 * shutdown. This releases the hold in nfs4_mi_zonelist_add. 2862 */ 2863 VFS_RELE(mi->mi_vfsp); 2864 MI4_RELE(mi); 2865 } 2866 /* 2867 * Tell each renew thread in the zone to exit 2868 */ 2869 mutex_enter(&nfs4_server_lst_lock); 2870 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) { 2871 mutex_enter(&np->s_lock); 2872 if (np->zoneid == zoneid) { 2873 /* 2874 * We add another hold onto the nfs4_server_t 2875 * because this will make sure tha the nfs4_server_t 2876 * stays around until nfs4_callback_fini_zone destroys 2877 * the zone. This way, the renew thread can 2878 * unconditionally release its holds on the 2879 * nfs4_server_t. 2880 */ 2881 np->s_refcnt++; 2882 nfs4_mark_srv_dead(np); 2883 } 2884 mutex_exit(&np->s_lock); 2885 } 2886 mutex_exit(&nfs4_server_lst_lock); 2887 } 2888 2889 static void 2890 nfs4_mi_free_globals(struct mi4_globals *mig) 2891 { 2892 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2893 mutex_destroy(&mig->mig_lock); 2894 kmem_free(mig, sizeof (*mig)); 2895 } 2896 2897 /* ARGSUSED */ 2898 static void 2899 nfs4_mi_destroy(zoneid_t zoneid, void *data) 2900 { 2901 struct mi4_globals *mig = data; 2902 2903 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2904 "nfs4_mi_destroy zone %d\n", zoneid)); 2905 ASSERT(mig != NULL); 2906 mutex_enter(&mig->mig_lock); 2907 if (list_head(&mig->mig_list) != NULL) { 2908 /* Still waiting for VFS_FREEVFS() */ 2909 mig->mig_destructor_called = B_TRUE; 2910 mutex_exit(&mig->mig_lock); 2911 return; 2912 } 2913 nfs4_mi_free_globals(mig); 2914 } 2915 2916 /* 2917 * Add an NFS mount to the per-zone list of NFS mounts. 2918 */ 2919 void 2920 nfs4_mi_zonelist_add(mntinfo4_t *mi) 2921 { 2922 struct mi4_globals *mig; 2923 2924 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2925 mutex_enter(&mig->mig_lock); 2926 list_insert_head(&mig->mig_list, mi); 2927 /* 2928 * hold added to eliminate race with zone shutdown -this will be 2929 * released in mi_shutdown 2930 */ 2931 MI4_HOLD(mi); 2932 VFS_HOLD(mi->mi_vfsp); 2933 mutex_exit(&mig->mig_lock); 2934 } 2935 2936 /* 2937 * Remove an NFS mount from the per-zone list of NFS mounts. 
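 *
 * Returns 1 if this call removed the mount from the list (and dropped
 * the holds taken in nfs4_mi_zonelist_add()), or 0 if the zone
 * shutdown path had already removed it (MI4_DEAD set).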
2938 */ 2939 int 2940 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 2941 { 2942 struct mi4_globals *mig; 2943 int ret = 0; 2944 2945 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 2946 mutex_enter(&mig->mig_lock); 2947 mutex_enter(&mi->mi_lock); 2948 /* if this mi is marked dead, then the zone already released it */ 2949 if (!(mi->mi_flags & MI4_DEAD)) { 2950 list_remove(&mig->mig_list, mi); 2951 2952 /* release the holds put on in zonelist_add(). */ 2953 VFS_RELE(mi->mi_vfsp); 2954 MI4_RELE(mi); 2955 ret = 1; 2956 } 2957 mutex_exit(&mi->mi_lock); 2958 2959 /* 2960 * We can be called asynchronously by VFS_FREEVFS() after the zone 2961 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2962 * mi globals. 2963 */ 2964 if (list_head(&mig->mig_list) == NULL && 2965 mig->mig_destructor_called == B_TRUE) { 2966 nfs4_mi_free_globals(mig); 2967 return (ret); 2968 } 2969 mutex_exit(&mig->mig_lock); 2970 return (ret); 2971 } 2972 2973 void 2974 nfs_free_mi4(mntinfo4_t *mi) 2975 { 2976 nfs4_open_owner_t *foop; 2977 nfs4_oo_hash_bucket_t *bucketp; 2978 nfs4_debug_msg_t *msgp; 2979 int i; 2980 servinfo4_t *svp; 2981 2982 mutex_enter(&mi->mi_lock); 2983 ASSERT(mi->mi_recovthread == NULL); 2984 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 2985 mutex_exit(&mi->mi_lock); 2986 mutex_enter(&mi->mi_async_lock); 2987 ASSERT(mi->mi_threads == 0); 2988 ASSERT(mi->mi_manager_thread == NULL); 2989 mutex_exit(&mi->mi_async_lock); 2990 svp = mi->mi_servers; 2991 sv4_free(svp); 2992 if (mi->mi_io_kstats) { 2993 kstat_delete(mi->mi_io_kstats); 2994 mi->mi_io_kstats = NULL; 2995 } 2996 if (mi->mi_ro_kstats) { 2997 kstat_delete(mi->mi_ro_kstats); 2998 mi->mi_ro_kstats = NULL; 2999 } 3000 if (mi->mi_recov_ksp) { 3001 kstat_delete(mi->mi_recov_ksp); 3002 mi->mi_recov_ksp = NULL; 3003 } 3004 mutex_enter(&mi->mi_msg_list_lock); 3005 while (msgp = list_head(&mi->mi_msg_list)) { 3006 list_remove(&mi->mi_msg_list, msgp); 3007 nfs4_free_msg(msgp); 3008 } 3009 mutex_exit(&mi->mi_msg_list_lock); 3010 list_destroy(&mi->mi_msg_list); 3011 if (mi->mi_rootfh != NULL) 3012 sfh4_rele(&mi->mi_rootfh); 3013 if (mi->mi_srvparentfh != NULL) 3014 sfh4_rele(&mi->mi_srvparentfh); 3015 mutex_destroy(&mi->mi_lock); 3016 mutex_destroy(&mi->mi_async_lock); 3017 mutex_destroy(&mi->mi_msg_list_lock); 3018 nfs_rw_destroy(&mi->mi_recovlock); 3019 nfs_rw_destroy(&mi->mi_rename_lock); 3020 nfs_rw_destroy(&mi->mi_fh_lock); 3021 cv_destroy(&mi->mi_failover_cv); 3022 cv_destroy(&mi->mi_async_reqs_cv); 3023 cv_destroy(&mi->mi_async_work_cv); 3024 cv_destroy(&mi->mi_async_cv); 3025 cv_destroy(&mi->mi_inact_req_cv); 3026 /* 3027 * Destroy the oo hash lists and mutexes for the cred hash table. 3028 */ 3029 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) { 3030 bucketp = &(mi->mi_oo_list[i]); 3031 /* Destroy any remaining open owners on the list */ 3032 foop = list_head(&bucketp->b_oo_hash_list); 3033 while (foop != NULL) { 3034 list_remove(&bucketp->b_oo_hash_list, foop); 3035 nfs4_destroy_open_owner(foop); 3036 foop = list_head(&bucketp->b_oo_hash_list); 3037 } 3038 list_destroy(&bucketp->b_oo_hash_list); 3039 mutex_destroy(&bucketp->b_lock); 3040 } 3041 /* 3042 * Empty and destroy the freed open owner list. 
3043 */ 3044 foop = list_head(&mi->mi_foo_list); 3045 while (foop != NULL) { 3046 list_remove(&mi->mi_foo_list, foop); 3047 nfs4_destroy_open_owner(foop); 3048 foop = list_head(&mi->mi_foo_list); 3049 } 3050 list_destroy(&mi->mi_foo_list); 3051 list_destroy(&mi->mi_bseqid_list); 3052 list_destroy(&mi->mi_lost_state); 3053 avl_destroy(&mi->mi_filehandles); 3054 fn_rele(&mi->mi_fname); 3055 kmem_free(mi, sizeof (*mi)); 3056 } 3057 void 3058 mi_hold(mntinfo4_t *mi) 3059 { 3060 atomic_add_32(&mi->mi_count, 1); 3061 ASSERT(mi->mi_count != 0); 3062 } 3063 3064 void 3065 mi_rele(mntinfo4_t *mi) 3066 { 3067 ASSERT(mi->mi_count != 0); 3068 if (atomic_add_32_nv(&mi->mi_count, -1) == 0) { 3069 nfs_free_mi4(mi); 3070 } 3071 } 3072 3073 vnode_t nfs4_xattr_notsupp_vnode; 3074 3075 void 3076 nfs4_clnt_init(void) 3077 { 3078 nfs4_vnops_init(); 3079 (void) nfs4_rnode_init(); 3080 (void) nfs4_shadow_init(); 3081 (void) nfs4_acache_init(); 3082 (void) nfs4_subr_init(); 3083 nfs4_acl_init(); 3084 nfs_idmap_init(); 3085 nfs4_callback_init(); 3086 nfs4_secinfo_init(); 3087 #ifdef DEBUG 3088 tsd_create(&nfs4_tsd_key, NULL); 3089 #endif 3090 3091 /* 3092 * Add a CPR callback so that we can update client 3093 * lease after a suspend and resume. 3094 */ 3095 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3096 3097 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3098 nfs4_mi_destroy); 3099 3100 /* 3101 * Initialise the reference count of the notsupp xattr cache vnode to 1 3102 * so that it never goes away (VOP_INACTIVE isn't called on it). 3103 */ 3104 nfs4_xattr_notsupp_vnode.v_count = 1; 3105 } 3106 3107 void 3108 nfs4_clnt_fini(void) 3109 { 3110 (void) zone_key_delete(mi4_list_key); 3111 nfs4_vnops_fini(); 3112 (void) nfs4_rnode_fini(); 3113 (void) nfs4_shadow_fini(); 3114 (void) nfs4_acache_fini(); 3115 (void) nfs4_subr_fini(); 3116 nfs_idmap_fini(); 3117 nfs4_callback_fini(); 3118 nfs4_secinfo_fini(); 3119 #ifdef DEBUG 3120 tsd_destroy(&nfs4_tsd_key); 3121 #endif 3122 if (cid) 3123 (void) callb_delete(cid); 3124 } 3125 3126 /*ARGSUSED*/ 3127 static boolean_t 3128 nfs4_client_cpr_callb(void *arg, int code) 3129 { 3130 /* 3131 * We get called for Suspend and Resume events. 3132 * For the suspend case we simply don't care! 3133 */ 3134 if (code == CB_CODE_CPR_CHKPT) { 3135 return (B_TRUE); 3136 } 3137 3138 /* 3139 * When we get to here we are in the process of 3140 * resuming the system from a previous suspend. 
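 *
 * Recording the resume time lets nfs4_renew_lease_thread() notice
 * that the system was suspended: when nfs4_client_resumed is newer
 * than a server's last_renewal_time, the renew thread issues an
 * explicit RENEW instead of assuming the lease is still current.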
3141 */ 3142 nfs4_client_resumed = gethrestime_sec(); 3143 return (B_TRUE); 3144 } 3145 3146 void 3147 nfs4_renew_lease_thread(nfs4_server_t *sp) 3148 { 3149 int error = 0; 3150 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs; 3151 clock_t tick_delay = 0; 3152 clock_t time_left = 0; 3153 callb_cpr_t cpr_info; 3154 kmutex_t cpr_lock; 3155 3156 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3157 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp)); 3158 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 3159 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease"); 3160 3161 mutex_enter(&sp->s_lock); 3162 /* sp->s_lease_time is set via a GETATTR */ 3163 sp->last_renewal_time = gethrestime_sec(); 3164 sp->lease_valid = NFS4_LEASE_UNINITIALIZED; 3165 ASSERT(sp->s_refcnt >= 1); 3166 3167 for (;;) { 3168 if (!sp->state_ref_count || 3169 sp->lease_valid != NFS4_LEASE_VALID) { 3170 3171 kip_secs = MAX((sp->s_lease_time >> 1) - 3172 (3 * sp->propagation_delay.tv_sec), 1); 3173 3174 tick_delay = SEC_TO_TICK(kip_secs); 3175 3176 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3177 "nfs4_renew_lease_thread: no renew : thread " 3178 "wait %ld secs", kip_secs)); 3179 3180 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3181 "nfs4_renew_lease_thread: no renew : " 3182 "state_ref_count %d, lease_valid %d", 3183 sp->state_ref_count, sp->lease_valid)); 3184 3185 mutex_enter(&cpr_lock); 3186 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3187 mutex_exit(&cpr_lock); 3188 time_left = cv_timedwait(&sp->cv_thread_exit, 3189 &sp->s_lock, tick_delay + lbolt); 3190 mutex_enter(&cpr_lock); 3191 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3192 mutex_exit(&cpr_lock); 3193 3194 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3195 "nfs4_renew_lease_thread: no renew: " 3196 "time left %ld", time_left)); 3197 3198 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3199 goto die; 3200 continue; 3201 } 3202 3203 tmp_last_renewal_time = sp->last_renewal_time; 3204 3205 tmp_time = gethrestime_sec() - sp->last_renewal_time + 3206 (3 * sp->propagation_delay.tv_sec); 3207 3208 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3209 "nfs4_renew_lease_thread: tmp_time %ld, " 3210 "sp->last_renewal_time %ld", tmp_time, 3211 sp->last_renewal_time)); 3212 3213 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1); 3214 3215 tick_delay = SEC_TO_TICK(kip_secs); 3216 3217 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3218 "nfs4_renew_lease_thread: valid lease: sleep for %ld " 3219 "secs", kip_secs)); 3220 3221 mutex_enter(&cpr_lock); 3222 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3223 mutex_exit(&cpr_lock); 3224 time_left = cv_timedwait(&sp->cv_thread_exit, &sp->s_lock, 3225 tick_delay + lbolt); 3226 mutex_enter(&cpr_lock); 3227 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3228 mutex_exit(&cpr_lock); 3229 3230 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3231 "nfs4_renew_lease_thread: valid lease: time left %ld :" 3232 "sp last_renewal_time %ld, nfs4_client_resumed %ld, " 3233 "tmp_last_renewal_time %ld", time_left, 3234 sp->last_renewal_time, nfs4_client_resumed, 3235 tmp_last_renewal_time)); 3236 3237 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3238 goto die; 3239 3240 if (tmp_last_renewal_time == sp->last_renewal_time || 3241 (nfs4_client_resumed != 0 && 3242 nfs4_client_resumed > sp->last_renewal_time)) { 3243 /* 3244 * Issue RENEW op since we haven't renewed the lease 3245 * since we slept. 3246 */ 3247 tmp_now_time = gethrestime_sec(); 3248 error = nfs4renew(sp); 3249 /* 3250 * Need to re-acquire sp's lock, nfs4renew() 3251 * relinqueshes it. 
3252 */ 3253 mutex_enter(&sp->s_lock); 3254 3255 /* 3256 * See if someone changed s_thread_exit while we gave 3257 * up s_lock. 3258 */ 3259 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3260 goto die; 3261 3262 if (!error) { 3263 /* 3264 * check to see if we implicitly renewed while 3265 * we waited for a reply for our RENEW call. 3266 */ 3267 if (tmp_last_renewal_time == 3268 sp->last_renewal_time) { 3269 /* no implicit renew came */ 3270 sp->last_renewal_time = tmp_now_time; 3271 } else { 3272 NFS4_DEBUG(nfs4_client_lease_debug, 3273 (CE_NOTE, "renew_thread: did " 3274 "implicit renewal before reply " 3275 "from server for RENEW")); 3276 } 3277 } else { 3278 /* figure out error */ 3279 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3280 "renew_thread: nfs4renew returned error" 3281 " %d", error)); 3282 } 3283 3284 } 3285 } 3286 3287 die: 3288 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3289 "nfs4_renew_lease_thread: thread exiting")); 3290 3291 while (sp->s_otw_call_count != 0) { 3292 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3293 "nfs4_renew_lease_thread: waiting for outstanding " 3294 "otw calls to finish for sp 0x%p, current " 3295 "s_otw_call_count %d", (void *)sp, 3296 sp->s_otw_call_count)); 3297 mutex_enter(&cpr_lock); 3298 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3299 mutex_exit(&cpr_lock); 3300 cv_wait(&sp->s_cv_otw_count, &sp->s_lock); 3301 mutex_enter(&cpr_lock); 3302 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3303 mutex_exit(&cpr_lock); 3304 } 3305 mutex_exit(&sp->s_lock); 3306 3307 nfs4_server_rele(sp); /* free the thread's reference */ 3308 nfs4_server_rele(sp); /* free the list's reference */ 3309 sp = NULL; 3310 3311 done: 3312 mutex_enter(&cpr_lock); 3313 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */ 3314 mutex_destroy(&cpr_lock); 3315 3316 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3317 "nfs4_renew_lease_thread: renew thread exit officially")); 3318 3319 zthread_exit(); 3320 /* NOT REACHED */ 3321 } 3322 3323 /* 3324 * Send out a RENEW op to the server. 3325 * Assumes sp is locked down. 
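 *
 * nfs4renew() drops sp->s_lock and returns with it released, so the
 * caller must re-acquire it.  A return value of 0 is also used for
 * the NFS4ERR_CB_PATH_DOWN case, since the server has still renewed
 * the lease.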
3326 */ 3327 static int 3328 nfs4renew(nfs4_server_t *sp) 3329 { 3330 COMPOUND4args_clnt args; 3331 COMPOUND4res_clnt res; 3332 nfs_argop4 argop[1]; 3333 int doqueue = 1; 3334 int rpc_error; 3335 cred_t *cr; 3336 mntinfo4_t *mi; 3337 timespec_t prop_time, after_time; 3338 int needrecov = FALSE; 3339 nfs4_recov_state_t recov_state; 3340 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3341 3342 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3343 3344 recov_state.rs_flags = 0; 3345 recov_state.rs_num_retry_despite_err = 0; 3346 3347 recov_retry: 3348 mi = sp->mntinfo4_list; 3349 VFS_HOLD(mi->mi_vfsp); 3350 mutex_exit(&sp->s_lock); 3351 ASSERT(mi != NULL); 3352 3353 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3354 if (e.error) { 3355 VFS_RELE(mi->mi_vfsp); 3356 return (e.error); 3357 } 3358 3359 /* Check to see if we're dealing with a marked-dead sp */ 3360 mutex_enter(&sp->s_lock); 3361 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3362 mutex_exit(&sp->s_lock); 3363 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3364 VFS_RELE(mi->mi_vfsp); 3365 return (0); 3366 } 3367 3368 /* Make sure mi hasn't changed on us */ 3369 if (mi != sp->mntinfo4_list) { 3370 /* Must drop sp's lock to avoid a recursive mutex enter */ 3371 mutex_exit(&sp->s_lock); 3372 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3373 VFS_RELE(mi->mi_vfsp); 3374 mutex_enter(&sp->s_lock); 3375 goto recov_retry; 3376 } 3377 mutex_exit(&sp->s_lock); 3378 3379 args.ctag = TAG_RENEW; 3380 3381 args.array_len = 1; 3382 args.array = argop; 3383 3384 argop[0].argop = OP_RENEW; 3385 3386 mutex_enter(&sp->s_lock); 3387 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3388 cr = sp->s_cred; 3389 crhold(cr); 3390 mutex_exit(&sp->s_lock); 3391 3392 ASSERT(cr != NULL); 3393 3394 /* used to figure out RTT for sp */ 3395 gethrestime(&prop_time); 3396 3397 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3398 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first", 3399 (void*)sp)); 3400 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3401 prop_time.tv_sec, prop_time.tv_nsec)); 3402 3403 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3404 mntinfo4_t *, mi); 3405 3406 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3407 crfree(cr); 3408 3409 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3410 mntinfo4_t *, mi); 3411 3412 gethrestime(&after_time); 3413 3414 mutex_enter(&sp->s_lock); 3415 sp->propagation_delay.tv_sec = 3416 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3417 mutex_exit(&sp->s_lock); 3418 3419 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3420 after_time.tv_sec, after_time.tv_nsec)); 3421 3422 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3423 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3424 nfs4_delegreturn_all(sp); 3425 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3426 VFS_RELE(mi->mi_vfsp); 3427 /* 3428 * If the server returns CB_PATH_DOWN, it has renewed 3429 * the lease and informed us that the callback path is 3430 * down. Since the lease is renewed, just return 0 and 3431 * let the renew thread proceed as normal. 
3432 */ 3433 return (0); 3434 } 3435 3436 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3437 if (!needrecov && e.error) { 3438 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3439 VFS_RELE(mi->mi_vfsp); 3440 return (e.error); 3441 } 3442 3443 rpc_error = e.error; 3444 3445 if (needrecov) { 3446 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3447 "nfs4renew: initiating recovery\n")); 3448 3449 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3450 OP_RENEW, NULL) == FALSE) { 3451 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3452 VFS_RELE(mi->mi_vfsp); 3453 if (!e.error) 3454 (void) xdr_free(xdr_COMPOUND4res_clnt, 3455 (caddr_t)&res); 3456 mutex_enter(&sp->s_lock); 3457 goto recov_retry; 3458 } 3459 /* fall through for res.status case */ 3460 } 3461 3462 if (res.status) { 3463 if (res.status == NFS4ERR_LEASE_MOVED) { 3464 /*EMPTY*/ 3465 /* 3466 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3467 * to renew the lease on that server 3468 */ 3469 } 3470 e.error = geterrno4(res.status); 3471 } 3472 3473 if (!rpc_error) 3474 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3475 3476 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3477 3478 VFS_RELE(mi->mi_vfsp); 3479 3480 return (e.error); 3481 } 3482 3483 void 3484 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3485 { 3486 nfs4_server_t *sp; 3487 3488 /* this locks down sp if it is found */ 3489 sp = find_nfs4_server(mi); 3490 3491 if (sp != NULL) { 3492 nfs4_inc_state_ref_count_nolock(sp, mi); 3493 mutex_exit(&sp->s_lock); 3494 nfs4_server_rele(sp); 3495 } 3496 } 3497 3498 /* 3499 * Bump the number of OPEN files (ie: those with state) so we know if this 3500 * nfs4_server has any state to maintain a lease for or not. 3501 * 3502 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 3503 */ 3504 void 3505 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3506 { 3507 ASSERT(mutex_owned(&sp->s_lock)); 3508 3509 sp->state_ref_count++; 3510 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3511 "nfs4_inc_state_ref_count: state_ref_count now %d", 3512 sp->state_ref_count)); 3513 3514 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3515 sp->lease_valid = NFS4_LEASE_VALID; 3516 3517 /* 3518 * If this call caused the lease to be marked valid and/or 3519 * took the state_ref_count from 0 to 1, then start the time 3520 * on lease renewal. 3521 */ 3522 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3523 sp->last_renewal_time = gethrestime_sec(); 3524 3525 /* update the number of open files for mi */ 3526 mi->mi_open_files++; 3527 } 3528 3529 void 3530 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3531 { 3532 nfs4_server_t *sp; 3533 3534 /* this locks down sp if it is found */ 3535 sp = find_nfs4_server_all(mi, 1); 3536 3537 if (sp != NULL) { 3538 nfs4_dec_state_ref_count_nolock(sp, mi); 3539 mutex_exit(&sp->s_lock); 3540 nfs4_server_rele(sp); 3541 } 3542 } 3543 3544 /* 3545 * Decrement the number of OPEN files (ie: those with state) so we know if 3546 * this nfs4_server has any state to maintain a lease for or not. 
3547 */ 3548 void 3549 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3550 { 3551 ASSERT(mutex_owned(&sp->s_lock)); 3552 ASSERT(sp->state_ref_count != 0); 3553 sp->state_ref_count--; 3554 3555 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3556 "nfs4_dec_state_ref_count: state ref count now %d", 3557 sp->state_ref_count)); 3558 3559 mi->mi_open_files--; 3560 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3561 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3562 mi->mi_open_files, mi->mi_flags)); 3563 3564 /* We don't have to hold the mi_lock to test mi_flags */ 3565 if (mi->mi_open_files == 0 && 3566 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3567 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3568 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3569 "we have closed the last open file", (void*)mi)); 3570 nfs4_remove_mi_from_server(mi, sp); 3571 } 3572 } 3573 3574 bool_t 3575 inlease(nfs4_server_t *sp) 3576 { 3577 bool_t result; 3578 3579 ASSERT(mutex_owned(&sp->s_lock)); 3580 3581 if (sp->lease_valid == NFS4_LEASE_VALID && 3582 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3583 result = TRUE; 3584 else 3585 result = FALSE; 3586 3587 return (result); 3588 } 3589 3590 3591 /* 3592 * Return non-zero if the given nfs4_server_t is going through recovery. 3593 */ 3594 3595 int 3596 nfs4_server_in_recovery(nfs4_server_t *sp) 3597 { 3598 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3599 } 3600 3601 /* 3602 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3603 * first is less than, equal to, or greater than the second. 3604 */ 3605 3606 int 3607 sfh4cmp(const void *p1, const void *p2) 3608 { 3609 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3610 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3611 3612 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3613 } 3614 3615 /* 3616 * Create a table for shared filehandle objects. 3617 */ 3618 3619 void 3620 sfh4_createtab(avl_tree_t *tab) 3621 { 3622 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3623 offsetof(nfs4_sharedfh_t, sfh_tree)); 3624 } 3625 3626 /* 3627 * Return a shared filehandle object for the given filehandle. The caller 3628 * is responsible for eventually calling sfh4_rele(). 3629 */ 3630 3631 nfs4_sharedfh_t * 3632 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3633 { 3634 nfs4_sharedfh_t *sfh, *nsfh; 3635 avl_index_t where; 3636 nfs4_sharedfh_t skey; 3637 3638 if (!key) { 3639 skey.sfh_fh = *fh; 3640 key = &skey; 3641 } 3642 3643 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3644 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3645 /* 3646 * We allocate the largest possible filehandle size because it's 3647 * not that big, and it saves us from possibly having to resize the 3648 * buffer later. 
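 *
 * Because the value buffer is always NFS4_FHSIZE bytes, the matching
 * kmem_free() calls (in the duplicate case below and in sfh4_rele())
 * pass NFS4_FHSIZE rather than the actual filehandle length.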
3649 */ 3650 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3651 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3652 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3653 nsfh->sfh_refcnt = 1; 3654 nsfh->sfh_flags = SFH4_IN_TREE; 3655 nsfh->sfh_mi = mi; 3656 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3657 (void *)nsfh)); 3658 3659 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3660 sfh = avl_find(&mi->mi_filehandles, key, &where); 3661 if (sfh != NULL) { 3662 mutex_enter(&sfh->sfh_lock); 3663 sfh->sfh_refcnt++; 3664 mutex_exit(&sfh->sfh_lock); 3665 nfs_rw_exit(&mi->mi_fh_lock); 3666 /* free our speculative allocs */ 3667 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3668 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3669 return (sfh); 3670 } 3671 3672 avl_insert(&mi->mi_filehandles, nsfh, where); 3673 nfs_rw_exit(&mi->mi_fh_lock); 3674 3675 return (nsfh); 3676 } 3677 3678 /* 3679 * Return a shared filehandle object for the given filehandle. The caller 3680 * is responsible for eventually calling sfh4_rele(). 3681 */ 3682 3683 nfs4_sharedfh_t * 3684 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3685 { 3686 nfs4_sharedfh_t *sfh; 3687 nfs4_sharedfh_t key; 3688 3689 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3690 3691 #ifdef DEBUG 3692 if (nfs4_sharedfh_debug) { 3693 nfs4_fhandle_t fhandle; 3694 3695 fhandle.fh_len = fh->nfs_fh4_len; 3696 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3697 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3698 nfs4_printfhandle(&fhandle); 3699 } 3700 #endif 3701 3702 /* 3703 * If there's already an object for the given filehandle, bump the 3704 * reference count and return it. Otherwise, create a new object 3705 * and add it to the AVL tree. 3706 */ 3707 3708 key.sfh_fh = *fh; 3709 3710 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3711 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3712 if (sfh != NULL) { 3713 mutex_enter(&sfh->sfh_lock); 3714 sfh->sfh_refcnt++; 3715 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3716 "sfh4_get: found existing %p, new refcnt=%d", 3717 (void *)sfh, sfh->sfh_refcnt)); 3718 mutex_exit(&sfh->sfh_lock); 3719 nfs_rw_exit(&mi->mi_fh_lock); 3720 return (sfh); 3721 } 3722 nfs_rw_exit(&mi->mi_fh_lock); 3723 3724 return (sfh4_put(fh, mi, &key)); 3725 } 3726 3727 /* 3728 * Get a reference to the given shared filehandle object. 3729 */ 3730 3731 void 3732 sfh4_hold(nfs4_sharedfh_t *sfh) 3733 { 3734 ASSERT(sfh->sfh_refcnt > 0); 3735 3736 mutex_enter(&sfh->sfh_lock); 3737 sfh->sfh_refcnt++; 3738 NFS4_DEBUG(nfs4_sharedfh_debug, 3739 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3740 (void *)sfh, sfh->sfh_refcnt)); 3741 mutex_exit(&sfh->sfh_lock); 3742 } 3743 3744 /* 3745 * Release a reference to the given shared filehandle object and null out 3746 * the given pointer. 3747 */ 3748 3749 void 3750 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3751 { 3752 mntinfo4_t *mi; 3753 nfs4_sharedfh_t *sfh = *sfhpp; 3754 3755 ASSERT(sfh->sfh_refcnt > 0); 3756 3757 mutex_enter(&sfh->sfh_lock); 3758 if (sfh->sfh_refcnt > 1) { 3759 sfh->sfh_refcnt--; 3760 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3761 "sfh4_rele %p, new refcnt=%d", 3762 (void *)sfh, sfh->sfh_refcnt)); 3763 mutex_exit(&sfh->sfh_lock); 3764 goto finish; 3765 } 3766 mutex_exit(&sfh->sfh_lock); 3767 3768 /* 3769 * Possibly the last reference, so get the lock for the table in 3770 * case it's time to remove the object from the table. 
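 *
 * The reference count is re-checked below after mi_fh_lock is
 * acquired, since a concurrent sfh4_get() may have found this object
 * in the tree and bumped the count while we were unlocked; in that
 * case the object must not be freed.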
3771 */ 3772 mi = sfh->sfh_mi; 3773 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3774 mutex_enter(&sfh->sfh_lock); 3775 sfh->sfh_refcnt--; 3776 if (sfh->sfh_refcnt > 0) { 3777 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3778 "sfh4_rele %p, new refcnt=%d", 3779 (void *)sfh, sfh->sfh_refcnt)); 3780 mutex_exit(&sfh->sfh_lock); 3781 nfs_rw_exit(&mi->mi_fh_lock); 3782 goto finish; 3783 } 3784 3785 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3786 "sfh4_rele %p, last ref", (void *)sfh)); 3787 if (sfh->sfh_flags & SFH4_IN_TREE) { 3788 avl_remove(&mi->mi_filehandles, sfh); 3789 sfh->sfh_flags &= ~SFH4_IN_TREE; 3790 } 3791 mutex_exit(&sfh->sfh_lock); 3792 nfs_rw_exit(&mi->mi_fh_lock); 3793 mutex_destroy(&sfh->sfh_lock); 3794 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3795 kmem_free(sfh, sizeof (nfs4_sharedfh_t)); 3796 3797 finish: 3798 *sfhpp = NULL; 3799 } 3800 3801 /* 3802 * Update the filehandle for the given shared filehandle object. 3803 */ 3804 3805 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */ 3806 3807 void 3808 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh) 3809 { 3810 mntinfo4_t *mi = sfh->sfh_mi; 3811 nfs4_sharedfh_t *dupsfh; 3812 avl_index_t where; 3813 nfs4_sharedfh_t key; 3814 3815 #ifdef DEBUG 3816 mutex_enter(&sfh->sfh_lock); 3817 ASSERT(sfh->sfh_refcnt > 0); 3818 mutex_exit(&sfh->sfh_lock); 3819 #endif 3820 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE); 3821 3822 /* 3823 * The basic plan is to remove the shared filehandle object from 3824 * the table, update it to have the new filehandle, then reinsert 3825 * it. 3826 */ 3827 3828 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3829 mutex_enter(&sfh->sfh_lock); 3830 if (sfh->sfh_flags & SFH4_IN_TREE) { 3831 avl_remove(&mi->mi_filehandles, sfh); 3832 sfh->sfh_flags &= ~SFH4_IN_TREE; 3833 } 3834 mutex_exit(&sfh->sfh_lock); 3835 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len; 3836 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val, 3837 sfh->sfh_fh.nfs_fh4_len); 3838 3839 /* 3840 * XXX If there is already a shared filehandle object with the new 3841 * filehandle, we're in trouble, because the rnode code assumes 3842 * that there is only one shared filehandle object for a given 3843 * filehandle. So issue a warning (for read-write mounts only) 3844 * and don't try to re-insert the given object into the table. 3845 * Hopefully the given object will quickly go away and everyone 3846 * will use the new object. 3847 */ 3848 key.sfh_fh = *newfh; 3849 dupsfh = avl_find(&mi->mi_filehandles, &key, &where); 3850 if (dupsfh != NULL) { 3851 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) { 3852 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: " 3853 "duplicate filehandle detected"); 3854 sfh4_printfhandle(dupsfh); 3855 } 3856 } else { 3857 avl_insert(&mi->mi_filehandles, sfh, where); 3858 mutex_enter(&sfh->sfh_lock); 3859 sfh->sfh_flags |= SFH4_IN_TREE; 3860 mutex_exit(&sfh->sfh_lock); 3861 } 3862 nfs_rw_exit(&mi->mi_fh_lock); 3863 } 3864 3865 /* 3866 * Copy out the current filehandle for the given shared filehandle object. 
3867 */ 3868 3869 void 3870 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp) 3871 { 3872 mntinfo4_t *mi = sfh->sfh_mi; 3873 3874 ASSERT(sfh->sfh_refcnt > 0); 3875 3876 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3877 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len; 3878 ASSERT(fhp->fh_len <= NFS4_FHSIZE); 3879 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len); 3880 nfs_rw_exit(&mi->mi_fh_lock); 3881 } 3882 3883 /* 3884 * Print out the filehandle for the given shared filehandle object. 3885 */ 3886 3887 void 3888 sfh4_printfhandle(const nfs4_sharedfh_t *sfh) 3889 { 3890 nfs4_fhandle_t fhandle; 3891 3892 sfh4_copyval(sfh, &fhandle); 3893 nfs4_printfhandle(&fhandle); 3894 } 3895 3896 /* 3897 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0 3898 * if they're the same, +1 if the first is "greater" than the second. The 3899 * caller (or whoever's calling the AVL package) is responsible for 3900 * handling locking issues. 3901 */ 3902 3903 static int 3904 fncmp(const void *p1, const void *p2) 3905 { 3906 const nfs4_fname_t *f1 = p1; 3907 const nfs4_fname_t *f2 = p2; 3908 int res; 3909 3910 res = strcmp(f1->fn_name, f2->fn_name); 3911 /* 3912 * The AVL package wants +/-1, not arbitrary positive or negative 3913 * integers. 3914 */ 3915 if (res > 0) 3916 res = 1; 3917 else if (res < 0) 3918 res = -1; 3919 return (res); 3920 } 3921 3922 /* 3923 * Get or create an fname with the given name, as a child of the given 3924 * fname. The caller is responsible for eventually releasing the reference 3925 * (fn_rele()). parent may be NULL. 3926 */ 3927 3928 nfs4_fname_t * 3929 fn_get(nfs4_fname_t *parent, char *name) 3930 { 3931 nfs4_fname_t key; 3932 nfs4_fname_t *fnp; 3933 avl_index_t where; 3934 3935 key.fn_name = name; 3936 3937 /* 3938 * If there's already an fname registered with the given name, bump 3939 * its reference count and return it. Otherwise, create a new one 3940 * and add it to the parent's AVL tree. 3941 */ 3942 3943 if (parent != NULL) { 3944 mutex_enter(&parent->fn_lock); 3945 fnp = avl_find(&parent->fn_children, &key, &where); 3946 if (fnp != NULL) { 3947 fn_hold(fnp); 3948 mutex_exit(&parent->fn_lock); 3949 return (fnp); 3950 } 3951 } 3952 3953 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP); 3954 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL); 3955 fnp->fn_parent = parent; 3956 if (parent != NULL) 3957 fn_hold(parent); 3958 fnp->fn_len = strlen(name); 3959 ASSERT(fnp->fn_len < MAXNAMELEN); 3960 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP); 3961 (void) strcpy(fnp->fn_name, name); 3962 fnp->fn_refcnt = 1; 3963 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t), 3964 offsetof(nfs4_fname_t, fn_tree)); 3965 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3966 "fn_get %p:%s, a new nfs4_fname_t!", 3967 (void *)fnp, fnp->fn_name)); 3968 if (parent != NULL) { 3969 avl_insert(&parent->fn_children, fnp, where); 3970 mutex_exit(&parent->fn_lock); 3971 } 3972 3973 return (fnp); 3974 } 3975 3976 void 3977 fn_hold(nfs4_fname_t *fnp) 3978 { 3979 atomic_add_32(&fnp->fn_refcnt, 1); 3980 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE, 3981 "fn_hold %p:%s, new refcnt=%d", 3982 (void *)fnp, fnp->fn_name, fnp->fn_refcnt)); 3983 } 3984 3985 /* 3986 * Decrement the reference count of the given fname, and destroy it if its 3987 * reference count goes to zero. Nulls out the given pointer. 
/*
 * Decrement the reference count of the given fname, and destroy it if its
 * reference count goes to zero. Nulls out the given pointer.
 */

void
fn_rele(nfs4_fname_t **fnpp)
{
	nfs4_fname_t *parent;
	uint32_t newref;
	nfs4_fname_t *fnp;

recur:
	fnp = *fnpp;
	*fnpp = NULL;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		mutex_enter(&parent->fn_lock);	/* prevent new references */
	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
	if (newref > 0) {
		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
		    "fn_rele %p:%s, new refcnt=%d",
		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
		if (parent != NULL)
			mutex_exit(&parent->fn_lock);
		mutex_exit(&fnp->fn_lock);
		return;
	}

	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_rele %p:%s, last reference, deleting...",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
	}
	kmem_free(fnp->fn_name, fnp->fn_len + 1);
	mutex_destroy(&fnp->fn_lock);
	avl_destroy(&fnp->fn_children);
	kmem_free(fnp, sizeof (nfs4_fname_t));
	/*
	 * Recursively fn_rele the parent.
	 * Use goto instead of a recursive call to avoid stack overflow.
	 */
	if (parent != NULL) {
		fnpp = &parent;
		goto recur;
	}
}

/*
 * Returns the single component name of the given fname, in a MAXNAMELEN
 * string buffer, which the caller is responsible for freeing. Note that
 * the name may become invalid as a result of fn_move().
 */

char *
fn_name(nfs4_fname_t *fnp)
{
	char *name;

	ASSERT(fnp->fn_len < MAXNAMELEN);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	mutex_enter(&fnp->fn_lock);
	(void) strcpy(name, fnp->fn_name);
	mutex_exit(&fnp->fn_lock);

	return (name);
}


/*
 * fn_path_realloc
 *
 * This function, used only by fn_path, constructs
 * a new string which looks like "prepend" + "/" + "current"
 * by allocating a new string and freeing the old one.
 */
static void
fn_path_realloc(char **curses, char *prepend)
{
	int len, curlen = 0;
	char *news;

	if (*curses == NULL) {
		/*
		 * Prime the pump, allocate just the
		 * space for prepend and return that.
		 */
		len = strlen(prepend) + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
	} else {
		/*
		 * Allocate the space for a new string;
		 * the +1 +1 is for the "/" and the NULL
		 * byte at the end of it all.
		 */
		curlen = strlen(*curses);
		len = curlen + strlen(prepend) + 1 + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
		(void) strcat(news, "/");
		(void) strcat(news, *curses);
		kmem_free(*curses, curlen + 1);
	}
	*curses = news;
}
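
/*
 * Example (illustrative sketch only): fn_path_realloc() prepends one
 * component per call, so a path is assembled leaf-first. "path" is a
 * hypothetical local used only for this sketch.
 *
 *	char *path = NULL;
 *
 *	fn_path_realloc(&path, "c");	path is now "c"
 *	fn_path_realloc(&path, "b");	path is now "b/c"
 *	fn_path_realloc(&path, "a");	path is now "a/b/c"
 *	...
 *	kmem_free(path, strlen(path) + 1);
 */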
/*
 * Returns the path name (starting from the fs root) for the given fname.
 * The caller is responsible for freeing. Note that the path may be or
 * become invalid as a result of fn_move().
 */

char *
fn_path(nfs4_fname_t *fnp)
{
	char *path;
	nfs4_fname_t *nextfnp;

	if (fnp == NULL)
		return (NULL);

	path = NULL;

	/* walk up the tree constructing the pathname. */

	fn_hold(fnp);			/* adjust for later rele */
	do {
		mutex_enter(&fnp->fn_lock);
		/*
		 * Add fn_name in front of the current path
		 */
		fn_path_realloc(&path, fnp->fn_name);
		nextfnp = fnp->fn_parent;
		if (nextfnp != NULL)
			fn_hold(nextfnp);
		mutex_exit(&fnp->fn_lock);
		fn_rele(&fnp);
		fnp = nextfnp;
	} while (fnp != NULL);

	return (path);
}

/*
 * Return a reference to the parent of the given fname, which the caller is
 * responsible for eventually releasing.
 */

nfs4_fname_t *
fn_parent(nfs4_fname_t *fnp)
{
	nfs4_fname_t *parent;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		fn_hold(parent);
	mutex_exit(&fnp->fn_lock);

	return (parent);
}

/*
 * Update fnp so that its parent is newparent and its name is newname.
 */

void
fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
{
	nfs4_fname_t *parent, *tmpfnp;
	ssize_t newlen;
	nfs4_fname_t key;
	avl_index_t where;

	/*
	 * This assert exists to catch the client trying to rename
	 * a dir to be a child of itself. This happened at a recent
	 * bakeoff against a 3rd party (broken) server which allowed
	 * the rename to succeed. If it trips it means that:
	 *	a) the code in nfs4rename that detects this case is broken
	 *	b) the server is broken (since it allowed the bogus rename)
	 *
	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
	 * panic below from: mutex_enter(&newparent->fn_lock);
	 */
	ASSERT(fnp != newparent);

	/*
	 * Remove fnp from its current parent, change its name, then add it
	 * to newparent.
	 */
	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	mutex_enter(&parent->fn_lock);
	avl_remove(&parent->fn_children, fnp);
	mutex_exit(&parent->fn_lock);
	fn_rele(&fnp->fn_parent);

	newlen = strlen(newname);
	if (newlen != fnp->fn_len) {
		ASSERT(newlen < MAXNAMELEN);
		kmem_free(fnp->fn_name, fnp->fn_len + 1);
		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
		fnp->fn_len = newlen;
	}
	(void) strcpy(fnp->fn_name, newname);

again:
	mutex_enter(&newparent->fn_lock);
	key.fn_name = fnp->fn_name;
	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
	if (tmpfnp != NULL) {
		/*
		 * This could be due to a file that was unlinked while
		 * open, or perhaps the rnode is in the free list. Remove
		 * it from newparent and let it go away on its own. The
		 * contorted code is to deal with lock order issues and
		 * race conditions.
		 */
		fn_hold(tmpfnp);
		mutex_exit(&newparent->fn_lock);
		mutex_enter(&tmpfnp->fn_lock);
		if (tmpfnp->fn_parent == newparent) {
			mutex_enter(&newparent->fn_lock);
			avl_remove(&newparent->fn_children, tmpfnp);
			mutex_exit(&newparent->fn_lock);
			fn_rele(&tmpfnp->fn_parent);
		}
		mutex_exit(&tmpfnp->fn_lock);
		fn_rele(&tmpfnp);
		goto again;
	}
	fnp->fn_parent = newparent;
	fn_hold(newparent);
	avl_insert(&newparent->fn_children, fnp, where);
	mutex_exit(&newparent->fn_lock);
	mutex_exit(&fnp->fn_lock);
}
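
/*
 * Example (illustrative sketch only): after a successful over-the-wire
 * rename, the client keeps the same nfs4_fname_t for the renamed object
 * and simply rehomes it; any path later built with fn_path() on the
 * object or its descendants then reflects the new location. "ofnp",
 * "ndfnp" and "nnm" are hypothetical names used only for this sketch.
 *
 *	fn_move(ofnp, ndfnp, nnm);	rehang ofnp under ndfnp as "nnm"
 */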
#ifdef DEBUG
/*
 * Return non-zero if the type information makes sense for the given vnode.
 * Otherwise panic.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	if (nfs4_vtype_debug && vp->v_type != VNON &&
	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
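
/*
 * Example (illustrative sketch only): because nfs4_consistent_type()
 * either panics or returns 1, a caller can wrap it in an ASSERT so the
 * check compiles away entirely in non-DEBUG kernels:
 *
 *	ASSERT(nfs4_consistent_type(vp));
 */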