1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2017 by Delphix. All rights reserved. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 28 * All Rights Reserved 29 */ 30 31 #include <sys/param.h> 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/thread.h> 35 #include <sys/t_lock.h> 36 #include <sys/time.h> 37 #include <sys/vnode.h> 38 #include <sys/vfs.h> 39 #include <sys/errno.h> 40 #include <sys/buf.h> 41 #include <sys/stat.h> 42 #include <sys/cred.h> 43 #include <sys/kmem.h> 44 #include <sys/debug.h> 45 #include <sys/dnlc.h> 46 #include <sys/vmsystm.h> 47 #include <sys/flock.h> 48 #include <sys/share.h> 49 #include <sys/cmn_err.h> 50 #include <sys/tiuser.h> 51 #include <sys/sysmacros.h> 52 #include <sys/callb.h> 53 #include <sys/acl.h> 54 #include <sys/kstat.h> 55 #include <sys/signal.h> 56 #include <sys/disp.h> 57 #include <sys/atomic.h> 58 #include <sys/list.h> 59 #include <sys/sdt.h> 60 61 #include <rpc/types.h> 62 #include <rpc/xdr.h> 63 #include <rpc/auth.h> 64 #include <rpc/clnt.h> 65 66 #include <nfs/nfs.h> 67 #include <nfs/nfs_clnt.h> 68 #include <nfs/nfs_acl.h> 69 70 #include <nfs/nfs4.h> 71 #include <nfs/rnode4.h> 72 #include <nfs/nfs4_clnt.h> 73 74 #include <vm/hat.h> 75 #include <vm/as.h> 76 #include <vm/page.h> 77 #include <vm/pvn.h> 78 #include <vm/seg.h> 79 #include <vm/seg_map.h> 80 #include <vm/seg_vn.h> 81 82 #include <sys/ddi.h> 83 84 /* 85 * Arguments to page-flush thread. 86 */ 87 typedef struct { 88 vnode_t *vp; 89 cred_t *cr; 90 } pgflush_t; 91 92 #ifdef DEBUG 93 int nfs4_client_lease_debug; 94 int nfs4_sharedfh_debug; 95 int nfs4_fname_debug; 96 97 /* temporary: panic if v_type is inconsistent with r_attr va_type */ 98 int nfs4_vtype_debug; 99 100 uint_t nfs4_tsd_key; 101 #endif 102 103 static time_t nfs4_client_resumed = 0; 104 static callb_id_t cid = 0; 105 106 static int nfs4renew(nfs4_server_t *); 107 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int); 108 static void nfs4_pgflush_thread(pgflush_t *); 109 110 static boolean_t nfs4_client_cpr_callb(void *, int); 111 112 struct mi4_globals { 113 kmutex_t mig_lock; /* lock protecting mig_list */ 114 list_t mig_list; /* list of NFS v4 mounts in zone */ 115 boolean_t mig_destructor_called; 116 }; 117 118 static zone_key_t mi4_list_key; 119 120 /* 121 * Attributes caching: 122 * 123 * Attributes are cached in the rnode in struct vattr form. 124 * There is a time associated with the cached attributes (r_time_attr_inval) 125 * which tells whether the attributes are valid. 
The time is initialized 126 * to the difference between current time and the modify time of the vnode 127 * when new attributes are cached. This allows the attributes for 128 * files that have changed recently to be timed out sooner than for files 129 * that have not changed for a long time. There are minimum and maximum 130 * timeout values that can be set per mount point. 131 */ 132 133 /* 134 * If a cache purge is in progress, wait for it to finish. 135 * 136 * The current thread must not be in the middle of an 137 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock 138 * between this thread, a recovery thread, and the page flush thread. 139 */ 140 int 141 nfs4_waitfor_purge_complete(vnode_t *vp) 142 { 143 rnode4_t *rp; 144 k_sigset_t smask; 145 146 rp = VTOR4(vp); 147 if ((rp->r_serial != NULL && rp->r_serial != curthread) || 148 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) { 149 mutex_enter(&rp->r_statelock); 150 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 151 while ((rp->r_serial != NULL && rp->r_serial != curthread) || 152 ((rp->r_flags & R4PGFLUSH) && 153 rp->r_pgflush != curthread)) { 154 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 155 sigunintr(&smask); 156 mutex_exit(&rp->r_statelock); 157 return (EINTR); 158 } 159 } 160 sigunintr(&smask); 161 mutex_exit(&rp->r_statelock); 162 } 163 return (0); 164 } 165 166 /* 167 * Validate caches by checking cached attributes. If they have timed out, 168 * then get new attributes from the server. As a side effect, cache 169 * invalidation is done if the attributes have changed. 170 * 171 * If the attributes have not timed out and if there is a cache 172 * invalidation being done by some other thread, then wait until that 173 * thread has completed the cache invalidation. 174 */ 175 int 176 nfs4_validate_caches(vnode_t *vp, cred_t *cr) 177 { 178 int error; 179 nfs4_ga_res_t gar; 180 181 if (ATTRCACHE4_VALID(vp)) { 182 error = nfs4_waitfor_purge_complete(vp); 183 if (error) 184 return (error); 185 return (0); 186 } 187 188 return (nfs4_getattr_otw(vp, &gar, cr, 0)); 189 } 190 191 /* 192 * Fill in attribute from the cache. 193 * If valid, then return 0 to indicate that no error occurred, 194 * otherwise return 1 to indicate that an error occurred. 195 */ 196 static int 197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap) 198 { 199 rnode4_t *rp; 200 201 rp = VTOR4(vp); 202 mutex_enter(&rp->r_statelock); 203 mutex_enter(&rp->r_statev4_lock); 204 if (ATTRCACHE4_VALID(vp)) { 205 mutex_exit(&rp->r_statev4_lock); 206 /* 207 * Cached attributes are valid 208 */ 209 *vap = rp->r_attr; 210 mutex_exit(&rp->r_statelock); 211 return (0); 212 } 213 mutex_exit(&rp->r_statev4_lock); 214 mutex_exit(&rp->r_statelock); 215 return (1); 216 } 217 218 219 /* 220 * If returned error is ESTALE flush all caches. The nfs4_purge_caches() 221 * call is synchronous because all the pages were invalidated by the 222 * nfs4_invalidate_pages() call. 
223 */ 224 void 225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr) 226 { 227 struct rnode4 *rp = VTOR4(vp); 228 229 /* Ensure that the ..._end_op() call has been done */ 230 ASSERT(tsd_get(nfs4_tsd_key) == NULL); 231 232 if (errno != ESTALE) 233 return; 234 235 mutex_enter(&rp->r_statelock); 236 rp->r_flags |= R4STALE; 237 if (!rp->r_error) 238 rp->r_error = errno; 239 mutex_exit(&rp->r_statelock); 240 if (nfs4_has_pages(vp)) 241 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 242 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE); 243 } 244 245 /* 246 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the 247 * page purge is done asynchronously. 248 */ 249 void 250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg) 251 { 252 rnode4_t *rp; 253 char *contents; 254 vnode_t *xattr; 255 int size; 256 int pgflush; /* are we the page flush thread? */ 257 258 /* 259 * Purge the DNLC for any entries which refer to this file. 260 */ 261 if (vp->v_count > 1 && 262 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC)) 263 dnlc_purge_vp(vp); 264 265 /* 266 * Clear any readdir state bits and purge the readlink response cache. 267 */ 268 rp = VTOR4(vp); 269 mutex_enter(&rp->r_statelock); 270 rp->r_flags &= ~R4LOOKUP; 271 contents = rp->r_symlink.contents; 272 size = rp->r_symlink.size; 273 rp->r_symlink.contents = NULL; 274 275 xattr = rp->r_xattr_dir; 276 rp->r_xattr_dir = NULL; 277 278 /* 279 * Purge pathconf cache too. 280 */ 281 rp->r_pathconf.pc4_xattr_valid = 0; 282 rp->r_pathconf.pc4_cache_valid = 0; 283 284 pgflush = (curthread == rp->r_pgflush); 285 mutex_exit(&rp->r_statelock); 286 287 if (contents != NULL) { 288 289 kmem_free((void *)contents, size); 290 } 291 292 if (xattr != NULL) 293 VN_RELE(xattr); 294 295 /* 296 * Flush the page cache. If the current thread is the page flush 297 * thread, don't initiate a new page flush. There's no need for 298 * it, and doing it correctly is hard. 299 */ 300 if (nfs4_has_pages(vp) && !pgflush) { 301 if (!asyncpg) { 302 (void) nfs4_waitfor_purge_complete(vp); 303 nfs4_flush_pages(vp, cr); 304 } else { 305 pgflush_t *args; 306 307 /* 308 * We don't hold r_statelock while creating the 309 * thread, in case the call blocks. So we use a 310 * flag to indicate that a page flush thread is 311 * active. 312 */ 313 mutex_enter(&rp->r_statelock); 314 if (rp->r_flags & R4PGFLUSH) { 315 mutex_exit(&rp->r_statelock); 316 } else { 317 rp->r_flags |= R4PGFLUSH; 318 mutex_exit(&rp->r_statelock); 319 320 args = kmem_alloc(sizeof (pgflush_t), 321 KM_SLEEP); 322 args->vp = vp; 323 VN_HOLD(args->vp); 324 args->cr = cr; 325 crhold(args->cr); 326 (void) zthread_create(NULL, 0, 327 nfs4_pgflush_thread, args, 0, 328 minclsyspri); 329 } 330 } 331 } 332 333 /* 334 * Flush the readdir response cache. 335 */ 336 nfs4_purge_rddir_cache(vp); 337 } 338 339 /* 340 * Invalidate all pages for the given file, after writing back the dirty 341 * ones. 342 */ 343 344 void 345 nfs4_flush_pages(vnode_t *vp, cred_t *cr) 346 { 347 int error; 348 rnode4_t *rp = VTOR4(vp); 349 350 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL); 351 if (error == ENOSPC || error == EDQUOT) { 352 mutex_enter(&rp->r_statelock); 353 if (!rp->r_error) 354 rp->r_error = error; 355 mutex_exit(&rp->r_statelock); 356 } 357 } 358 359 /* 360 * Page flush thread. 
361 */ 362 363 static void 364 nfs4_pgflush_thread(pgflush_t *args) 365 { 366 rnode4_t *rp = VTOR4(args->vp); 367 368 /* remember which thread we are, so we don't deadlock ourselves */ 369 mutex_enter(&rp->r_statelock); 370 ASSERT(rp->r_pgflush == NULL); 371 rp->r_pgflush = curthread; 372 mutex_exit(&rp->r_statelock); 373 374 nfs4_flush_pages(args->vp, args->cr); 375 376 mutex_enter(&rp->r_statelock); 377 rp->r_pgflush = NULL; 378 rp->r_flags &= ~R4PGFLUSH; 379 cv_broadcast(&rp->r_cv); 380 mutex_exit(&rp->r_statelock); 381 382 VN_RELE(args->vp); 383 crfree(args->cr); 384 kmem_free(args, sizeof (pgflush_t)); 385 zthread_exit(); 386 } 387 388 /* 389 * Purge the readdir cache of all entries which are not currently 390 * being filled. 391 */ 392 void 393 nfs4_purge_rddir_cache(vnode_t *vp) 394 { 395 rnode4_t *rp; 396 397 rp = VTOR4(vp); 398 399 mutex_enter(&rp->r_statelock); 400 rp->r_direof = NULL; 401 rp->r_flags &= ~R4LOOKUP; 402 rp->r_flags |= R4READDIRWATTR; 403 rddir4_cache_purge(rp); 404 mutex_exit(&rp->r_statelock); 405 } 406 407 /* 408 * Set attributes cache for given vnode using virtual attributes. There is 409 * no cache validation, but if the attributes are deemed to be stale, they 410 * are ignored. This corresponds to nfs3_attrcache(). 411 * 412 * Set the timeout value on the attribute cache and fill it 413 * with the passed in attributes. 414 */ 415 void 416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t) 417 { 418 rnode4_t *rp = VTOR4(vp); 419 420 mutex_enter(&rp->r_statelock); 421 if (rp->r_time_attr_saved <= t) 422 nfs4_attrcache_va(vp, garp, FALSE); 423 mutex_exit(&rp->r_statelock); 424 } 425 426 /* 427 * Use the passed in virtual attributes to check to see whether the 428 * data and metadata caches are valid, cache the new attributes, and 429 * then do the cache invalidation if required. 430 * 431 * The cache validation and caching of the new attributes is done 432 * atomically via the use of the mutex, r_statelock. If required, 433 * the cache invalidation is done atomically w.r.t. the cache 434 * validation and caching of the attributes via the pseudo lock, 435 * r_serial. 436 * 437 * This routine is used to do cache validation and attributes caching 438 * for operations with a single set of post operation attributes. 439 */ 440 441 void 442 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp, 443 hrtime_t t, cred_t *cr, int async, 444 change_info4 *cinfo) 445 { 446 rnode4_t *rp; 447 int mtime_changed = 0; 448 int ctime_changed = 0; 449 vsecattr_t *vsp; 450 int was_serial, set_time_cache_inval, recov; 451 vattr_t *vap = &garp->n4g_va; 452 mntinfo4_t *mi = VTOMI4(vp); 453 len_t preattr_rsize; 454 boolean_t writemodify_set = B_FALSE; 455 boolean_t cachepurge_set = B_FALSE; 456 457 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid); 458 459 /* Is curthread the recovery thread? */ 460 mutex_enter(&mi->mi_lock); 461 recov = (VTOMI4(vp)->mi_recovthread == curthread); 462 mutex_exit(&mi->mi_lock); 463 464 rp = VTOR4(vp); 465 mutex_enter(&rp->r_statelock); 466 was_serial = (rp->r_serial == curthread); 467 if (rp->r_serial && !was_serial) { 468 klwp_t *lwp = ttolwp(curthread); 469 470 /* 471 * If we're the recovery thread, then purge current attrs 472 * and bail out to avoid potential deadlock between another 473 * thread caching attrs (r_serial thread), recov thread, 474 * and an async writer thread. 
475 */ 476 if (recov) { 477 PURGE_ATTRCACHE4_LOCKED(rp); 478 mutex_exit(&rp->r_statelock); 479 return; 480 } 481 482 if (lwp != NULL) 483 lwp->lwp_nostop++; 484 while (rp->r_serial != NULL) { 485 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 486 mutex_exit(&rp->r_statelock); 487 if (lwp != NULL) 488 lwp->lwp_nostop--; 489 return; 490 } 491 } 492 if (lwp != NULL) 493 lwp->lwp_nostop--; 494 } 495 496 /* 497 * If there is a page flush thread, the current thread needs to 498 * bail out, to prevent a possible deadlock between the current 499 * thread (which might be in a start_op/end_op region), the 500 * recovery thread, and the page flush thread. Expire the 501 * attribute cache, so that any attributes the current thread was 502 * going to set are not lost. 503 */ 504 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) { 505 PURGE_ATTRCACHE4_LOCKED(rp); 506 mutex_exit(&rp->r_statelock); 507 return; 508 } 509 510 if (rp->r_time_attr_saved > t) { 511 /* 512 * Attributes have been cached since these attributes were 513 * probably made. If there is an inconsistency in what is 514 * cached, mark them invalid. If not, don't act on them. 515 */ 516 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size)) 517 PURGE_ATTRCACHE4_LOCKED(rp); 518 mutex_exit(&rp->r_statelock); 519 return; 520 } 521 set_time_cache_inval = 0; 522 if (cinfo) { 523 /* 524 * Only directory modifying callers pass non-NULL cinfo. 525 */ 526 ASSERT(vp->v_type == VDIR); 527 /* 528 * If the cache timeout either doesn't exist or hasn't expired, 529 * and dir didn't changed on server before dirmod op 530 * and dir didn't change after dirmod op but before getattr 531 * then there's a chance that the client's cached data for 532 * this object is current (not stale). No immediate cache 533 * flush is required. 534 * 535 */ 536 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) && 537 cinfo->before == rp->r_change && 538 (garp->n4g_change_valid && 539 cinfo->after == garp->n4g_change)) { 540 541 /* 542 * If atomic isn't set, then the before/after info 543 * cannot be blindly trusted. For this case, we tell 544 * nfs4_attrcache_va to cache the attrs but also 545 * establish an absolute maximum cache timeout. When 546 * the timeout is reached, caches will be flushed. 547 */ 548 if (! cinfo->atomic) 549 set_time_cache_inval = 1; 550 } else { 551 552 /* 553 * We're not sure exactly what changed, but we know 554 * what to do. flush all caches for dir. remove the 555 * attr timeout. 556 * 557 * a) timeout expired. flush all caches. 558 * b) r_change != cinfo.before. flush all caches. 559 * c) r_change == cinfo.before, but cinfo.after != 560 * post-op getattr(change). flush all caches. 561 * d) post-op getattr(change) not provided by server. 562 * flush all caches. 563 */ 564 mtime_changed = 1; 565 ctime_changed = 1; 566 rp->r_time_cache_inval = 0; 567 } 568 } else { 569 /* 570 * Write thread after writing data to file on remote server, 571 * will always set R4WRITEMODIFIED to indicate that file on 572 * remote server was modified with a WRITE operation and would 573 * have marked attribute cache as timed out. If R4WRITEMODIFIED 574 * is set, then do not check for mtime and ctime change. 
575 */ 576 if (!(rp->r_flags & R4WRITEMODIFIED)) { 577 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size)) 578 mtime_changed = 1; 579 580 if (rp->r_attr.va_ctime.tv_sec != 581 vap->va_ctime.tv_sec || 582 rp->r_attr.va_ctime.tv_nsec != 583 vap->va_ctime.tv_nsec) 584 ctime_changed = 1; 585 586 /* 587 * If the change attribute was not provided by server 588 * or it differs, then flush all caches. 589 */ 590 if (!garp->n4g_change_valid || 591 rp->r_change != garp->n4g_change) { 592 mtime_changed = 1; 593 ctime_changed = 1; 594 } 595 } else { 596 writemodify_set = B_TRUE; 597 } 598 } 599 600 preattr_rsize = rp->r_size; 601 602 nfs4_attrcache_va(vp, garp, set_time_cache_inval); 603 604 /* 605 * If we have updated filesize in nfs4_attrcache_va, as soon as we 606 * drop statelock we will be in transition of purging all 607 * our caches and updating them. It is possible for another 608 * thread to pick this new file size and read in zeroed data. 609 * stall other threads till cache purge is complete. 610 */ 611 if ((!cinfo) && (rp->r_size != preattr_rsize)) { 612 /* 613 * If R4WRITEMODIFIED was set and we have updated the file 614 * size, Server's returned file size need not necessarily 615 * be because of this Client's WRITE. We need to purge 616 * all caches. 617 */ 618 if (writemodify_set) 619 mtime_changed = 1; 620 621 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) { 622 rp->r_flags |= R4INCACHEPURGE; 623 cachepurge_set = B_TRUE; 624 } 625 } 626 627 if (!mtime_changed && !ctime_changed) { 628 mutex_exit(&rp->r_statelock); 629 return; 630 } 631 632 rp->r_serial = curthread; 633 634 mutex_exit(&rp->r_statelock); 635 636 /* 637 * If we're the recov thread, then force async nfs4_purge_caches 638 * to avoid potential deadlock. 639 */ 640 if (mtime_changed) 641 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async); 642 643 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) { 644 mutex_enter(&rp->r_statelock); 645 rp->r_flags &= ~R4INCACHEPURGE; 646 cv_broadcast(&rp->r_cv); 647 mutex_exit(&rp->r_statelock); 648 cachepurge_set = B_FALSE; 649 } 650 651 if (ctime_changed) { 652 (void) nfs4_access_purge_rp(rp); 653 if (rp->r_secattr != NULL) { 654 mutex_enter(&rp->r_statelock); 655 vsp = rp->r_secattr; 656 rp->r_secattr = NULL; 657 mutex_exit(&rp->r_statelock); 658 if (vsp != NULL) 659 nfs4_acl_free_cache(vsp); 660 } 661 } 662 663 if (!was_serial) { 664 mutex_enter(&rp->r_statelock); 665 rp->r_serial = NULL; 666 cv_broadcast(&rp->r_cv); 667 mutex_exit(&rp->r_statelock); 668 } 669 } 670 671 /* 672 * Set attributes cache for given vnode using virtual attributes. 673 * 674 * Set the timeout value on the attribute cache and fill it 675 * with the passed in attributes. 676 * 677 * The caller must be holding r_statelock. 678 */ 679 static void 680 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout) 681 { 682 rnode4_t *rp; 683 mntinfo4_t *mi; 684 hrtime_t delta; 685 hrtime_t now; 686 vattr_t *vap = &garp->n4g_va; 687 688 rp = VTOR4(vp); 689 690 ASSERT(MUTEX_HELD(&rp->r_statelock)); 691 ASSERT(vap->va_mask == AT_ALL); 692 693 /* Switch to master before checking v_flag */ 694 if (IS_SHADOW(vp, rp)) 695 vp = RTOV4(rp); 696 697 now = gethrtime(); 698 699 mi = VTOMI4(vp); 700 701 /* 702 * Only establish a new cache timeout (if requested). Never 703 * extend a timeout. Never clear a timeout. Clearing a timeout 704 * is done by nfs4_update_dircaches (ancestor in our call chain) 705 */ 706 if (set_cache_timeout && ! 
rp->r_time_cache_inval) 707 rp->r_time_cache_inval = now + mi->mi_acdirmax; 708 709 /* 710 * Delta is the number of nanoseconds that we will 711 * cache the attributes of the file. It is based on 712 * the number of nanoseconds since the last time that 713 * we detected a change. The assumption is that files 714 * that changed recently are likely to change again. 715 * There is a minimum and a maximum for regular files 716 * and for directories which is enforced though. 717 * 718 * Using the time since last change was detected 719 * eliminates direct comparison or calculation 720 * using mixed client and server times. NFS does 721 * not make any assumptions regarding the client 722 * and server clocks being synchronized. 723 */ 724 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec || 725 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec || 726 vap->va_size != rp->r_attr.va_size) { 727 rp->r_time_attr_saved = now; 728 } 729 730 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE)) 731 delta = 0; 732 else { 733 delta = now - rp->r_time_attr_saved; 734 if (vp->v_type == VDIR) { 735 if (delta < mi->mi_acdirmin) 736 delta = mi->mi_acdirmin; 737 else if (delta > mi->mi_acdirmax) 738 delta = mi->mi_acdirmax; 739 } else { 740 if (delta < mi->mi_acregmin) 741 delta = mi->mi_acregmin; 742 else if (delta > mi->mi_acregmax) 743 delta = mi->mi_acregmax; 744 } 745 } 746 rp->r_time_attr_inval = now + delta; 747 748 rp->r_attr = *vap; 749 if (garp->n4g_change_valid) 750 rp->r_change = garp->n4g_change; 751 752 /* 753 * The attributes that were returned may be valid and can 754 * be used, but they may not be allowed to be cached. 755 * Reset the timers to cause immediate invalidation and 756 * clear r_change so no VERIFY operations will suceed 757 */ 758 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) { 759 rp->r_time_attr_inval = now; 760 rp->r_time_attr_saved = now; 761 rp->r_change = 0; 762 } 763 764 /* 765 * If mounted_on_fileid returned AND the object is a stub, 766 * then set object's va_nodeid to the mounted over fid 767 * returned by server. 768 * 769 * If mounted_on_fileid not provided/supported, then 770 * just set it to 0 for now. Eventually it would be 771 * better to set it to a hashed version of FH. This 772 * would probably be good enough to provide a unique 773 * fid/d_ino within a dir. 774 * 775 * We don't need to carry mounted_on_fileid in the 776 * rnode as long as the client never requests fileid 777 * without also requesting mounted_on_fileid. For 778 * now, it stays. 779 */ 780 if (garp->n4g_mon_fid_valid) { 781 rp->r_mntd_fid = garp->n4g_mon_fid; 782 783 if (RP_ISSTUB(rp)) 784 rp->r_attr.va_nodeid = rp->r_mntd_fid; 785 } 786 787 /* 788 * Check to see if there are valid pathconf bits to 789 * cache in the rnode. 790 */ 791 if (garp->n4g_ext_res) { 792 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) { 793 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4; 794 } else { 795 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) { 796 rp->r_pathconf.pc4_xattr_valid = TRUE; 797 rp->r_pathconf.pc4_xattr_exists = 798 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists; 799 } 800 } 801 } 802 /* 803 * Update the size of the file if there is no cached data or if 804 * the cached data is clean and there is no data being written 805 * out. 
806 */ 807 if (rp->r_size != vap->va_size && 808 (!vn_has_cached_data(vp) || 809 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) { 810 rp->r_size = vap->va_size; 811 } 812 nfs_setswaplike(vp, vap); 813 rp->r_flags &= ~R4WRITEMODIFIED; 814 } 815 816 /* 817 * Get attributes over-the-wire and update attributes cache 818 * if no error occurred in the over-the-wire operation. 819 * Return 0 if successful, otherwise error. 820 */ 821 int 822 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl) 823 { 824 mntinfo4_t *mi = VTOMI4(vp); 825 hrtime_t t; 826 nfs4_recov_state_t recov_state; 827 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 828 829 recov_state.rs_flags = 0; 830 recov_state.rs_num_retry_despite_err = 0; 831 832 /* Save the original mount point security flavor */ 833 (void) save_mnt_secinfo(mi->mi_curr_serv); 834 835 recov_retry: 836 837 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, 838 &recov_state, NULL))) { 839 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 840 return (e.error); 841 } 842 843 t = gethrtime(); 844 845 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl); 846 847 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) { 848 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 849 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) { 850 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, 851 &recov_state, 1); 852 goto recov_retry; 853 } 854 } 855 856 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0); 857 858 if (!e.error) { 859 if (e.stat == NFS4_OK) { 860 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 861 } else { 862 e.error = geterrno4(e.stat); 863 864 nfs4_purge_stale_fh(e.error, vp, cr); 865 } 866 } 867 868 /* 869 * If getattr a node that is a stub for a crossed 870 * mount point, keep the original secinfo flavor for 871 * the current file system, not the crossed one. 872 */ 873 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 874 875 return (e.error); 876 } 877 878 /* 879 * Generate a compound to get attributes over-the-wire. 880 */ 881 void 882 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp, 883 nfs4_error_t *ep, cred_t *cr, int get_acl) 884 { 885 COMPOUND4args_clnt args; 886 COMPOUND4res_clnt res; 887 int doqueue; 888 rnode4_t *rp = VTOR4(vp); 889 nfs_argop4 argop[2]; 890 891 args.ctag = TAG_GETATTR; 892 893 args.array_len = 2; 894 args.array = argop; 895 896 /* putfh */ 897 argop[0].argop = OP_CPUTFH; 898 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 899 900 /* getattr */ 901 /* 902 * Unlike nfs version 2 and 3, where getattr returns all the 903 * attributes, nfs version 4 returns only the ones explicitly 904 * asked for. This creates problems, as some system functions 905 * (e.g. cache check) require certain attributes and if the 906 * cached node lacks some attributes such as uid/gid, it can 907 * affect system utilities (e.g. "ls") that rely on the information 908 * to be there. This can lead to anything from system crashes to 909 * corrupted information processed by user apps. 910 * So to ensure that all bases are covered, request at least 911 * the AT_ALL attribute mask. 
912 */ 913 argop[1].argop = OP_GETATTR; 914 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 915 if (get_acl) 916 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK; 917 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 918 919 doqueue = 1; 920 921 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep); 922 923 if (ep->error) 924 return; 925 926 if (res.status != NFS4_OK) { 927 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 928 return; 929 } 930 931 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res; 932 933 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 934 } 935 936 /* 937 * Return either cached or remote attributes. If get remote attr 938 * use them to check and invalidate caches, then cache the new attributes. 939 */ 940 int 941 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr) 942 { 943 int error; 944 rnode4_t *rp; 945 nfs4_ga_res_t gar; 946 947 ASSERT(nfs4_consistent_type(vp)); 948 949 /* 950 * If we've got cached attributes, we're done, otherwise go 951 * to the server to get attributes, which will update the cache 952 * in the process. Either way, use the cached attributes for 953 * the caller's vattr_t. 954 * 955 * Note that we ignore the gar set by the OTW call: the attr caching 956 * code may make adjustments when storing to the rnode, and we want 957 * to see those changes here. 958 */ 959 rp = VTOR4(vp); 960 error = 0; 961 mutex_enter(&rp->r_statelock); 962 if (!ATTRCACHE4_VALID(vp)) { 963 mutex_exit(&rp->r_statelock); 964 error = nfs4_getattr_otw(vp, &gar, cr, 0); 965 mutex_enter(&rp->r_statelock); 966 } 967 968 if (!error) 969 *vap = rp->r_attr; 970 971 /* Return the client's view of file size */ 972 vap->va_size = rp->r_size; 973 974 mutex_exit(&rp->r_statelock); 975 976 ASSERT(nfs4_consistent_type(vp)); 977 978 return (error); 979 } 980 981 int 982 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type, 983 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr) 984 { 985 COMPOUND4args_clnt args; 986 COMPOUND4res_clnt res; 987 int doqueue; 988 nfs_argop4 argop[2]; 989 mntinfo4_t *mi = VTOMI4(vp); 990 bool_t needrecov = FALSE; 991 nfs4_recov_state_t recov_state; 992 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 993 nfs4_ga_ext_res_t *gerp; 994 995 recov_state.rs_flags = 0; 996 recov_state.rs_num_retry_despite_err = 0; 997 998 recov_retry: 999 args.ctag = tag_type; 1000 1001 args.array_len = 2; 1002 args.array = argop; 1003 1004 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL); 1005 if (e.error) 1006 return (e.error); 1007 1008 /* putfh */ 1009 argop[0].argop = OP_CPUTFH; 1010 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 1011 1012 /* getattr */ 1013 argop[1].argop = OP_GETATTR; 1014 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap; 1015 argop[1].nfs_argop4_u.opgetattr.mi = mi; 1016 1017 doqueue = 1; 1018 1019 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1020 "nfs4_attr_otw: %s call, rp %s", needrecov ? 
"recov" : "first", 1021 rnode4info(VTOR4(vp)))); 1022 1023 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 1024 1025 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 1026 if (!needrecov && e.error) { 1027 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1028 needrecov); 1029 return (e.error); 1030 } 1031 1032 if (needrecov) { 1033 bool_t abort; 1034 1035 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1036 "nfs4_attr_otw: initiating recovery\n")); 1037 1038 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 1039 NULL, OP_GETATTR, NULL, NULL, NULL); 1040 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1041 needrecov); 1042 if (!e.error) { 1043 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1044 e.error = geterrno4(res.status); 1045 } 1046 if (abort == FALSE) 1047 goto recov_retry; 1048 return (e.error); 1049 } 1050 1051 if (res.status) { 1052 e.error = geterrno4(res.status); 1053 } else { 1054 gerp = garp->n4g_ext_res; 1055 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res, 1056 garp, sizeof (nfs4_ga_res_t)); 1057 garp->n4g_ext_res = gerp; 1058 if (garp->n4g_ext_res && 1059 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res) 1060 bcopy(res.array[1].nfs_resop4_u.opgetattr. 1061 ga_res.n4g_ext_res, 1062 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t)); 1063 } 1064 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1065 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 1066 needrecov); 1067 return (e.error); 1068 } 1069 1070 /* 1071 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark 1072 * for the demand-based allocation of async threads per-mount. The 1073 * nfs_async_timeout is the amount of time a thread will live after it 1074 * becomes idle, unless new I/O requests are received before the thread 1075 * dies. See nfs4_async_putpage and nfs4_async_start. 1076 */ 1077 1078 static void nfs4_async_start(struct vfs *); 1079 static void nfs4_async_pgops_start(struct vfs *); 1080 static void nfs4_async_common_start(struct vfs *, int); 1081 1082 static void 1083 free_async_args4(struct nfs4_async_reqs *args) 1084 { 1085 rnode4_t *rp; 1086 1087 if (args->a_io != NFS4_INACTIVE) { 1088 rp = VTOR4(args->a_vp); 1089 mutex_enter(&rp->r_statelock); 1090 rp->r_count--; 1091 if (args->a_io == NFS4_PUTAPAGE || 1092 args->a_io == NFS4_PAGEIO) 1093 rp->r_awcount--; 1094 cv_broadcast(&rp->r_cv); 1095 mutex_exit(&rp->r_statelock); 1096 VN_RELE(args->a_vp); 1097 } 1098 crfree(args->a_cred); 1099 kmem_free(args, sizeof (*args)); 1100 } 1101 1102 /* 1103 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and 1104 * pageout(), running in the global zone, have legitimate reasons to do 1105 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by 1106 * use of a a per-mount "asynchronous requests manager thread" which is 1107 * signaled by the various asynchronous work routines when there is 1108 * asynchronous work to be done. It is responsible for creating new 1109 * worker threads if necessary, and notifying existing worker threads 1110 * that there is work to be done. 1111 * 1112 * In other words, it will "take the specifications from the customers and 1113 * give them to the engineers." 1114 * 1115 * Worker threads die off of their own accord if they are no longer 1116 * needed. 1117 * 1118 * This thread is killed when the zone is going away or the filesystem 1119 * is being unmounted. 
1120 */ 1121 void 1122 nfs4_async_manager(vfs_t *vfsp) 1123 { 1124 callb_cpr_t cprinfo; 1125 mntinfo4_t *mi; 1126 uint_t max_threads; 1127 1128 mi = VFTOMI4(vfsp); 1129 1130 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1131 "nfs4_async_manager"); 1132 1133 mutex_enter(&mi->mi_async_lock); 1134 /* 1135 * We want to stash the max number of threads that this mount was 1136 * allowed so we can use it later when the variable is set to zero as 1137 * part of the zone/mount going away. 1138 * 1139 * We want to be able to create at least one thread to handle 1140 * asynchronous inactive calls. 1141 */ 1142 max_threads = MAX(mi->mi_max_threads, 1); 1143 /* 1144 * We don't want to wait for mi_max_threads to go to zero, since that 1145 * happens as part of a failed unmount, but this thread should only 1146 * exit when the mount is really going away. 1147 * 1148 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be 1149 * attempted: the various _async_*() functions know to do things 1150 * inline if mi_max_threads == 0. Henceforth we just drain out the 1151 * outstanding requests. 1152 * 1153 * Note that we still create zthreads even if we notice the zone is 1154 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone 1155 * shutdown sequence to take slightly longer in some cases, but 1156 * doesn't violate the protocol, as all threads will exit as soon as 1157 * they're done processing the remaining requests. 1158 */ 1159 for (;;) { 1160 while (mi->mi_async_req_count > 0) { 1161 /* 1162 * Paranoia: If the mount started out having 1163 * (mi->mi_max_threads == 0), and the value was 1164 * later changed (via a debugger or somesuch), 1165 * we could be confused since we will think we 1166 * can't create any threads, and the calling 1167 * code (which looks at the current value of 1168 * mi->mi_max_threads, now non-zero) thinks we 1169 * can. 1170 * 1171 * So, because we're paranoid, we create threads 1172 * up to the maximum of the original and the 1173 * current value. This means that future 1174 * (debugger-induced) alterations of 1175 * mi->mi_max_threads are ignored for our 1176 * purposes, but who told them they could change 1177 * random values on a live kernel anyhow? 
1178 */ 1179 if (mi->mi_threads[NFS4_ASYNC_QUEUE] < 1180 MAX(mi->mi_max_threads, max_threads)) { 1181 mi->mi_threads[NFS4_ASYNC_QUEUE]++; 1182 mutex_exit(&mi->mi_async_lock); 1183 MI4_HOLD(mi); 1184 VFS_HOLD(vfsp); /* hold for new thread */ 1185 (void) zthread_create(NULL, 0, nfs4_async_start, 1186 vfsp, 0, minclsyspri); 1187 mutex_enter(&mi->mi_async_lock); 1188 } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] < 1189 NUM_ASYNC_PGOPS_THREADS) { 1190 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++; 1191 mutex_exit(&mi->mi_async_lock); 1192 MI4_HOLD(mi); 1193 VFS_HOLD(vfsp); /* hold for new thread */ 1194 (void) zthread_create(NULL, 0, 1195 nfs4_async_pgops_start, vfsp, 0, 1196 minclsyspri); 1197 mutex_enter(&mi->mi_async_lock); 1198 } 1199 NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv); 1200 ASSERT(mi->mi_async_req_count != 0); 1201 mi->mi_async_req_count--; 1202 } 1203 1204 mutex_enter(&mi->mi_lock); 1205 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) { 1206 mutex_exit(&mi->mi_lock); 1207 break; 1208 } 1209 mutex_exit(&mi->mi_lock); 1210 1211 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1212 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock); 1213 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1214 } 1215 1216 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 1217 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp)); 1218 /* 1219 * Let everyone know we're done. 1220 */ 1221 mi->mi_manager_thread = NULL; 1222 /* 1223 * Wake up the inactive thread. 1224 */ 1225 cv_broadcast(&mi->mi_inact_req_cv); 1226 /* 1227 * Wake up anyone sitting in nfs4_async_manager_stop() 1228 */ 1229 cv_broadcast(&mi->mi_async_cv); 1230 /* 1231 * There is no explicit call to mutex_exit(&mi->mi_async_lock) 1232 * since CALLB_CPR_EXIT is actually responsible for releasing 1233 * 'mi_async_lock'. 1234 */ 1235 CALLB_CPR_EXIT(&cprinfo); 1236 VFS_RELE(vfsp); /* release thread's hold */ 1237 MI4_RELE(mi); 1238 zthread_exit(); 1239 } 1240 1241 /* 1242 * Signal (and wait for) the async manager thread to clean up and go away. 1243 */ 1244 void 1245 nfs4_async_manager_stop(vfs_t *vfsp) 1246 { 1247 mntinfo4_t *mi = VFTOMI4(vfsp); 1248 1249 mutex_enter(&mi->mi_async_lock); 1250 mutex_enter(&mi->mi_lock); 1251 mi->mi_flags |= MI4_ASYNC_MGR_STOP; 1252 mutex_exit(&mi->mi_lock); 1253 cv_broadcast(&mi->mi_async_reqs_cv); 1254 /* 1255 * Wait for the async manager thread to die. 1256 */ 1257 while (mi->mi_manager_thread != NULL) 1258 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1259 mutex_exit(&mi->mi_async_lock); 1260 } 1261 1262 int 1263 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, 1264 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, 1265 u_offset_t, caddr_t, struct seg *, cred_t *)) 1266 { 1267 rnode4_t *rp; 1268 mntinfo4_t *mi; 1269 struct nfs4_async_reqs *args; 1270 1271 rp = VTOR4(vp); 1272 ASSERT(rp->r_freef == NULL); 1273 1274 mi = VTOMI4(vp); 1275 1276 /* 1277 * If addr falls in a different segment, don't bother doing readahead. 1278 */ 1279 if (addr >= seg->s_base + seg->s_size) 1280 return (-1); 1281 1282 /* 1283 * If we can't allocate a request structure, punt on the readahead. 1284 */ 1285 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1286 return (-1); 1287 1288 /* 1289 * If a lock operation is pending, don't initiate any new 1290 * readaheads. Otherwise, bump r_count to indicate the new 1291 * asynchronous I/O. 
1292 */ 1293 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) { 1294 kmem_free(args, sizeof (*args)); 1295 return (-1); 1296 } 1297 mutex_enter(&rp->r_statelock); 1298 rp->r_count++; 1299 mutex_exit(&rp->r_statelock); 1300 nfs_rw_exit(&rp->r_lkserlock); 1301 1302 args->a_next = NULL; 1303 #ifdef DEBUG 1304 args->a_queuer = curthread; 1305 #endif 1306 VN_HOLD(vp); 1307 args->a_vp = vp; 1308 ASSERT(cr != NULL); 1309 crhold(cr); 1310 args->a_cred = cr; 1311 args->a_io = NFS4_READ_AHEAD; 1312 args->a_nfs4_readahead = readahead; 1313 args->a_nfs4_blkoff = blkoff; 1314 args->a_nfs4_seg = seg; 1315 args->a_nfs4_addr = addr; 1316 1317 mutex_enter(&mi->mi_async_lock); 1318 1319 /* 1320 * If asyncio has been disabled, don't bother readahead. 1321 */ 1322 if (mi->mi_max_threads == 0) { 1323 mutex_exit(&mi->mi_async_lock); 1324 goto noasync; 1325 } 1326 1327 /* 1328 * Link request structure into the async list and 1329 * wakeup async thread to do the i/o. 1330 */ 1331 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) { 1332 mi->mi_async_reqs[NFS4_READ_AHEAD] = args; 1333 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1334 } else { 1335 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args; 1336 mi->mi_async_tail[NFS4_READ_AHEAD] = args; 1337 } 1338 1339 if (mi->mi_io_kstats) { 1340 mutex_enter(&mi->mi_lock); 1341 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1342 mutex_exit(&mi->mi_lock); 1343 } 1344 1345 mi->mi_async_req_count++; 1346 ASSERT(mi->mi_async_req_count != 0); 1347 cv_signal(&mi->mi_async_reqs_cv); 1348 mutex_exit(&mi->mi_async_lock); 1349 return (0); 1350 1351 noasync: 1352 mutex_enter(&rp->r_statelock); 1353 rp->r_count--; 1354 cv_broadcast(&rp->r_cv); 1355 mutex_exit(&rp->r_statelock); 1356 VN_RELE(vp); 1357 crfree(cr); 1358 kmem_free(args, sizeof (*args)); 1359 return (-1); 1360 } 1361 1362 static void 1363 nfs4_async_start(struct vfs *vfsp) 1364 { 1365 nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE); 1366 } 1367 1368 static void 1369 nfs4_async_pgops_start(struct vfs *vfsp) 1370 { 1371 nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE); 1372 } 1373 1374 /* 1375 * The async queues for each mounted file system are arranged as a 1376 * set of queues, one for each async i/o type. Requests are taken 1377 * from the queues in a round-robin fashion. A number of consecutive 1378 * requests are taken from each queue before moving on to the next 1379 * queue. This functionality may allow the NFS Version 2 server to do 1380 * write clustering, even if the client is mixing writes and reads 1381 * because it will take multiple write requests from the queue 1382 * before processing any of the other async i/o types. 1383 * 1384 * XXX The nfs4_async_common_start thread is unsafe in the light of the present 1385 * model defined by cpr to suspend the system. Specifically over the 1386 * wire calls are cpr-unsafe. The thread should be reevaluated in 1387 * case of future updates to the cpr model. 
1388 */ 1389 static void 1390 nfs4_async_common_start(struct vfs *vfsp, int async_queue) 1391 { 1392 struct nfs4_async_reqs *args; 1393 mntinfo4_t *mi = VFTOMI4(vfsp); 1394 clock_t time_left = 1; 1395 callb_cpr_t cprinfo; 1396 int i; 1397 extern int nfs_async_timeout; 1398 int async_types; 1399 kcondvar_t *async_work_cv; 1400 1401 if (async_queue == NFS4_ASYNC_QUEUE) { 1402 async_types = NFS4_ASYNC_TYPES; 1403 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]; 1404 } else { 1405 async_types = NFS4_ASYNC_PGOPS_TYPES; 1406 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]; 1407 } 1408 1409 /* 1410 * Dynamic initialization of nfs_async_timeout to allow nfs to be 1411 * built in an implementation independent manner. 1412 */ 1413 if (nfs_async_timeout == -1) 1414 nfs_async_timeout = NFS_ASYNC_TIMEOUT; 1415 1416 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas"); 1417 1418 mutex_enter(&mi->mi_async_lock); 1419 for (;;) { 1420 /* 1421 * Find the next queue containing an entry. We start 1422 * at the current queue pointer and then round robin 1423 * through all of them until we either find a non-empty 1424 * queue or have looked through all of them. 1425 */ 1426 for (i = 0; i < async_types; i++) { 1427 args = *mi->mi_async_curr[async_queue]; 1428 if (args != NULL) 1429 break; 1430 mi->mi_async_curr[async_queue]++; 1431 if (mi->mi_async_curr[async_queue] == 1432 &mi->mi_async_reqs[async_types]) { 1433 mi->mi_async_curr[async_queue] = 1434 &mi->mi_async_reqs[0]; 1435 } 1436 } 1437 /* 1438 * If we didn't find a entry, then block until woken up 1439 * again and then look through the queues again. 1440 */ 1441 if (args == NULL) { 1442 /* 1443 * Exiting is considered to be safe for CPR as well 1444 */ 1445 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1446 1447 /* 1448 * Wakeup thread waiting to unmount the file 1449 * system only if all async threads are inactive. 1450 * 1451 * If we've timed-out and there's nothing to do, 1452 * then get rid of this thread. 1453 */ 1454 if (mi->mi_max_threads == 0 || time_left <= 0) { 1455 --mi->mi_threads[async_queue]; 1456 1457 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 && 1458 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0) 1459 cv_signal(&mi->mi_async_cv); 1460 CALLB_CPR_EXIT(&cprinfo); 1461 VFS_RELE(vfsp); /* release thread's hold */ 1462 MI4_RELE(mi); 1463 zthread_exit(); 1464 /* NOTREACHED */ 1465 } 1466 time_left = cv_reltimedwait(async_work_cv, 1467 &mi->mi_async_lock, nfs_async_timeout, 1468 TR_CLOCK_TICK); 1469 1470 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1471 1472 continue; 1473 } else { 1474 time_left = 1; 1475 } 1476 1477 /* 1478 * Remove the request from the async queue and then 1479 * update the current async request queue pointer. If 1480 * the current queue is empty or we have removed enough 1481 * consecutive entries from it, then reset the counter 1482 * for this queue and then move the current pointer to 1483 * the next queue. 
1484 */ 1485 *mi->mi_async_curr[async_queue] = args->a_next; 1486 if (*mi->mi_async_curr[async_queue] == NULL || 1487 --mi->mi_async_clusters[args->a_io] == 0) { 1488 mi->mi_async_clusters[args->a_io] = 1489 mi->mi_async_init_clusters; 1490 mi->mi_async_curr[async_queue]++; 1491 if (mi->mi_async_curr[async_queue] == 1492 &mi->mi_async_reqs[async_types]) { 1493 mi->mi_async_curr[async_queue] = 1494 &mi->mi_async_reqs[0]; 1495 } 1496 } 1497 1498 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) { 1499 mutex_enter(&mi->mi_lock); 1500 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 1501 mutex_exit(&mi->mi_lock); 1502 } 1503 1504 mutex_exit(&mi->mi_async_lock); 1505 1506 /* 1507 * Obtain arguments from the async request structure. 1508 */ 1509 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) { 1510 (*args->a_nfs4_readahead)(args->a_vp, 1511 args->a_nfs4_blkoff, args->a_nfs4_addr, 1512 args->a_nfs4_seg, args->a_cred); 1513 } else if (args->a_io == NFS4_PUTAPAGE) { 1514 (void) (*args->a_nfs4_putapage)(args->a_vp, 1515 args->a_nfs4_pp, args->a_nfs4_off, 1516 args->a_nfs4_len, args->a_nfs4_flags, 1517 args->a_cred); 1518 } else if (args->a_io == NFS4_PAGEIO) { 1519 (void) (*args->a_nfs4_pageio)(args->a_vp, 1520 args->a_nfs4_pp, args->a_nfs4_off, 1521 args->a_nfs4_len, args->a_nfs4_flags, 1522 args->a_cred); 1523 } else if (args->a_io == NFS4_READDIR) { 1524 (void) ((*args->a_nfs4_readdir)(args->a_vp, 1525 args->a_nfs4_rdc, args->a_cred)); 1526 } else if (args->a_io == NFS4_COMMIT) { 1527 (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist, 1528 args->a_nfs4_offset, args->a_nfs4_count, 1529 args->a_cred); 1530 } else if (args->a_io == NFS4_INACTIVE) { 1531 nfs4_inactive_otw(args->a_vp, args->a_cred); 1532 } 1533 1534 /* 1535 * Now, release the vnode and free the credentials 1536 * structure. 1537 */ 1538 free_async_args4(args); 1539 /* 1540 * Reacquire the mutex because it will be needed above. 1541 */ 1542 mutex_enter(&mi->mi_async_lock); 1543 } 1544 } 1545 1546 /* 1547 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as 1548 * part of VOP_INACTIVE. 1549 */ 1550 1551 void 1552 nfs4_inactive_thread(mntinfo4_t *mi) 1553 { 1554 struct nfs4_async_reqs *args; 1555 callb_cpr_t cprinfo; 1556 vfs_t *vfsp = mi->mi_vfsp; 1557 1558 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, 1559 "nfs4_inactive_thread"); 1560 1561 for (;;) { 1562 mutex_enter(&mi->mi_async_lock); 1563 args = mi->mi_async_reqs[NFS4_INACTIVE]; 1564 if (args == NULL) { 1565 mutex_enter(&mi->mi_lock); 1566 /* 1567 * We don't want to exit until the async manager is done 1568 * with its work; hence the check for mi_manager_thread 1569 * being NULL. 1570 * 1571 * The async manager thread will cv_broadcast() on 1572 * mi_inact_req_cv when it's done, at which point we'll 1573 * wake up and exit. 
1574 */ 1575 if (mi->mi_manager_thread == NULL) 1576 goto die; 1577 mi->mi_flags |= MI4_INACTIVE_IDLE; 1578 mutex_exit(&mi->mi_lock); 1579 cv_signal(&mi->mi_async_cv); 1580 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1581 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock); 1582 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock); 1583 mutex_exit(&mi->mi_async_lock); 1584 } else { 1585 mutex_enter(&mi->mi_lock); 1586 mi->mi_flags &= ~MI4_INACTIVE_IDLE; 1587 mutex_exit(&mi->mi_lock); 1588 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next; 1589 mutex_exit(&mi->mi_async_lock); 1590 nfs4_inactive_otw(args->a_vp, args->a_cred); 1591 crfree(args->a_cred); 1592 kmem_free(args, sizeof (*args)); 1593 } 1594 } 1595 die: 1596 mutex_exit(&mi->mi_lock); 1597 mi->mi_inactive_thread = NULL; 1598 cv_signal(&mi->mi_async_cv); 1599 1600 /* 1601 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since 1602 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'. 1603 */ 1604 CALLB_CPR_EXIT(&cprinfo); 1605 1606 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 1607 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp)); 1608 1609 MI4_RELE(mi); 1610 zthread_exit(); 1611 /* NOTREACHED */ 1612 } 1613 1614 /* 1615 * nfs_async_stop: 1616 * Wait for all outstanding putpage operations and the inactive thread to 1617 * complete; nfs4_async_stop_sig() without interruptibility. 1618 */ 1619 void 1620 nfs4_async_stop(struct vfs *vfsp) 1621 { 1622 mntinfo4_t *mi = VFTOMI4(vfsp); 1623 1624 /* 1625 * Wait for all outstanding async operations to complete and for 1626 * worker threads to exit. 1627 */ 1628 mutex_enter(&mi->mi_async_lock); 1629 mi->mi_max_threads = 0; 1630 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 1631 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 || 1632 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) 1633 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1634 1635 /* 1636 * Wait for the inactive thread to finish doing what it's doing. It 1637 * won't exit until the last reference to the vfs_t goes away. 1638 */ 1639 if (mi->mi_inactive_thread != NULL) { 1640 mutex_enter(&mi->mi_lock); 1641 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) || 1642 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) { 1643 mutex_exit(&mi->mi_lock); 1644 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 1645 mutex_enter(&mi->mi_lock); 1646 } 1647 mutex_exit(&mi->mi_lock); 1648 } 1649 mutex_exit(&mi->mi_async_lock); 1650 } 1651 1652 /* 1653 * nfs_async_stop_sig: 1654 * Wait for all outstanding putpage operations and the inactive thread to 1655 * complete. If a signal is delivered we will abort and return non-zero; 1656 * otherwise return 0. Since this routine is called from nfs4_unmount, we 1657 * need to make it interruptible. 1658 */ 1659 int 1660 nfs4_async_stop_sig(struct vfs *vfsp) 1661 { 1662 mntinfo4_t *mi = VFTOMI4(vfsp); 1663 ushort_t omax; 1664 bool_t intr = FALSE; 1665 1666 /* 1667 * Wait for all outstanding putpage operations to complete and for 1668 * worker threads to exit. 1669 */ 1670 mutex_enter(&mi->mi_async_lock); 1671 omax = mi->mi_max_threads; 1672 mi->mi_max_threads = 0; 1673 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 1674 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 || 1675 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) { 1676 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) { 1677 intr = TRUE; 1678 goto interrupted; 1679 } 1680 } 1681 1682 /* 1683 * Wait for the inactive thread to finish doing what it's doing. It 1684 * won't exit until the a last reference to the vfs_t goes away. 
1685 */ 1686 if (mi->mi_inactive_thread != NULL) { 1687 mutex_enter(&mi->mi_lock); 1688 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) || 1689 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) { 1690 mutex_exit(&mi->mi_lock); 1691 if (!cv_wait_sig(&mi->mi_async_cv, 1692 &mi->mi_async_lock)) { 1693 intr = TRUE; 1694 goto interrupted; 1695 } 1696 mutex_enter(&mi->mi_lock); 1697 } 1698 mutex_exit(&mi->mi_lock); 1699 } 1700 interrupted: 1701 if (intr) 1702 mi->mi_max_threads = omax; 1703 mutex_exit(&mi->mi_async_lock); 1704 1705 return (intr); 1706 } 1707 1708 int 1709 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 1710 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *, 1711 u_offset_t, size_t, int, cred_t *)) 1712 { 1713 rnode4_t *rp; 1714 mntinfo4_t *mi; 1715 struct nfs4_async_reqs *args; 1716 1717 ASSERT(flags & B_ASYNC); 1718 ASSERT(vp->v_vfsp != NULL); 1719 1720 rp = VTOR4(vp); 1721 ASSERT(rp->r_count > 0); 1722 1723 mi = VTOMI4(vp); 1724 1725 /* 1726 * If we can't allocate a request structure, do the putpage 1727 * operation synchronously in this thread's context. 1728 */ 1729 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1730 goto noasync; 1731 1732 args->a_next = NULL; 1733 #ifdef DEBUG 1734 args->a_queuer = curthread; 1735 #endif 1736 VN_HOLD(vp); 1737 args->a_vp = vp; 1738 ASSERT(cr != NULL); 1739 crhold(cr); 1740 args->a_cred = cr; 1741 args->a_io = NFS4_PUTAPAGE; 1742 args->a_nfs4_putapage = putapage; 1743 args->a_nfs4_pp = pp; 1744 args->a_nfs4_off = off; 1745 args->a_nfs4_len = (uint_t)len; 1746 args->a_nfs4_flags = flags; 1747 1748 mutex_enter(&mi->mi_async_lock); 1749 1750 /* 1751 * If asyncio has been disabled, then make a synchronous request. 1752 * This check is done a second time in case async io was diabled 1753 * while this thread was blocked waiting for memory pressure to 1754 * reduce or for the queue to drain. 1755 */ 1756 if (mi->mi_max_threads == 0) { 1757 mutex_exit(&mi->mi_async_lock); 1758 1759 VN_RELE(vp); 1760 crfree(cr); 1761 kmem_free(args, sizeof (*args)); 1762 goto noasync; 1763 } 1764 1765 /* 1766 * Link request structure into the async list and 1767 * wakeup async thread to do the i/o. 1768 */ 1769 if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) { 1770 mi->mi_async_reqs[NFS4_PUTAPAGE] = args; 1771 mi->mi_async_tail[NFS4_PUTAPAGE] = args; 1772 } else { 1773 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args; 1774 mi->mi_async_tail[NFS4_PUTAPAGE] = args; 1775 } 1776 1777 mutex_enter(&rp->r_statelock); 1778 rp->r_count++; 1779 rp->r_awcount++; 1780 mutex_exit(&rp->r_statelock); 1781 1782 if (mi->mi_io_kstats) { 1783 mutex_enter(&mi->mi_lock); 1784 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1785 mutex_exit(&mi->mi_lock); 1786 } 1787 1788 mi->mi_async_req_count++; 1789 ASSERT(mi->mi_async_req_count != 0); 1790 cv_signal(&mi->mi_async_reqs_cv); 1791 mutex_exit(&mi->mi_async_lock); 1792 return (0); 1793 1794 noasync: 1795 1796 if (curproc == proc_pageout || curproc == proc_fsflush) { 1797 /* 1798 * If we get here in the context of the pageout/fsflush, 1799 * or we have run out of memory or we're attempting to 1800 * unmount we refuse to do a sync write, because this may 1801 * hang pageout/fsflush and the machine. In this case, 1802 * we just re-mark the page as dirty and punt on the page. 1803 * 1804 * Make sure B_FORCE isn't set. We can re-mark the 1805 * pages as dirty and unlock the pages in one swoop by 1806 * passing in B_ERROR to pvn_write_done(). 
However, 1807 * we should make sure B_FORCE isn't set - we don't 1808 * want the page tossed before it gets written out. 1809 */ 1810 if (flags & B_FORCE) 1811 flags &= ~(B_INVAL | B_FORCE); 1812 pvn_write_done(pp, flags | B_ERROR); 1813 return (0); 1814 } 1815 1816 if (nfs_zone() != mi->mi_zone) { 1817 /* 1818 * So this was a cross-zone sync putpage. 1819 * 1820 * We pass in B_ERROR to pvn_write_done() to re-mark the pages 1821 * as dirty and unlock them. 1822 * 1823 * We don't want to clear B_FORCE here as the caller presumably 1824 * knows what they're doing if they set it. 1825 */ 1826 pvn_write_done(pp, flags | B_ERROR); 1827 return (EPERM); 1828 } 1829 return ((*putapage)(vp, pp, off, len, flags, cr)); 1830 } 1831 1832 int 1833 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 1834 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t, 1835 size_t, int, cred_t *)) 1836 { 1837 rnode4_t *rp; 1838 mntinfo4_t *mi; 1839 struct nfs4_async_reqs *args; 1840 1841 ASSERT(flags & B_ASYNC); 1842 ASSERT(vp->v_vfsp != NULL); 1843 1844 rp = VTOR4(vp); 1845 ASSERT(rp->r_count > 0); 1846 1847 mi = VTOMI4(vp); 1848 1849 /* 1850 * If we can't allocate a request structure, do the pageio 1851 * request synchronously in this thread's context. 1852 */ 1853 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1854 goto noasync; 1855 1856 args->a_next = NULL; 1857 #ifdef DEBUG 1858 args->a_queuer = curthread; 1859 #endif 1860 VN_HOLD(vp); 1861 args->a_vp = vp; 1862 ASSERT(cr != NULL); 1863 crhold(cr); 1864 args->a_cred = cr; 1865 args->a_io = NFS4_PAGEIO; 1866 args->a_nfs4_pageio = pageio; 1867 args->a_nfs4_pp = pp; 1868 args->a_nfs4_off = io_off; 1869 args->a_nfs4_len = (uint_t)io_len; 1870 args->a_nfs4_flags = flags; 1871 1872 mutex_enter(&mi->mi_async_lock); 1873 1874 /* 1875 * If asyncio has been disabled, then make a synchronous request. 1876 * This check is done a second time in case async io was diabled 1877 * while this thread was blocked waiting for memory pressure to 1878 * reduce or for the queue to drain. 1879 */ 1880 if (mi->mi_max_threads == 0) { 1881 mutex_exit(&mi->mi_async_lock); 1882 1883 VN_RELE(vp); 1884 crfree(cr); 1885 kmem_free(args, sizeof (*args)); 1886 goto noasync; 1887 } 1888 1889 /* 1890 * Link request structure into the async list and 1891 * wakeup async thread to do the i/o. 1892 */ 1893 if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) { 1894 mi->mi_async_reqs[NFS4_PAGEIO] = args; 1895 mi->mi_async_tail[NFS4_PAGEIO] = args; 1896 } else { 1897 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args; 1898 mi->mi_async_tail[NFS4_PAGEIO] = args; 1899 } 1900 1901 mutex_enter(&rp->r_statelock); 1902 rp->r_count++; 1903 rp->r_awcount++; 1904 mutex_exit(&rp->r_statelock); 1905 1906 if (mi->mi_io_kstats) { 1907 mutex_enter(&mi->mi_lock); 1908 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 1909 mutex_exit(&mi->mi_lock); 1910 } 1911 1912 mi->mi_async_req_count++; 1913 ASSERT(mi->mi_async_req_count != 0); 1914 cv_signal(&mi->mi_async_reqs_cv); 1915 mutex_exit(&mi->mi_async_lock); 1916 return (0); 1917 1918 noasync: 1919 /* 1920 * If we can't do it ASYNC, for reads we do nothing (but cleanup 1921 * the page list), for writes we do it synchronously, except for 1922 * proc_pageout/proc_fsflush as described below. 
1923 */ 1924 if (flags & B_READ) { 1925 pvn_read_done(pp, flags | B_ERROR); 1926 return (0); 1927 } 1928 1929 if (curproc == proc_pageout || curproc == proc_fsflush) { 1930 /* 1931 * If we get here in the context of the pageout/fsflush, 1932 * we refuse to do a sync write, because this may hang 1933 * pageout/fsflush (and the machine). In this case, we just 1934 * re-mark the page as dirty and punt on the page. 1935 * 1936 * Make sure B_FORCE isn't set. We can re-mark the 1937 * pages as dirty and unlock the pages in one swoop by 1938 * passing in B_ERROR to pvn_write_done(). However, 1939 * we should make sure B_FORCE isn't set - we don't 1940 * want the page tossed before it gets written out. 1941 */ 1942 if (flags & B_FORCE) 1943 flags &= ~(B_INVAL | B_FORCE); 1944 pvn_write_done(pp, flags | B_ERROR); 1945 return (0); 1946 } 1947 1948 if (nfs_zone() != mi->mi_zone) { 1949 /* 1950 * So this was a cross-zone sync pageio. We pass in B_ERROR 1951 * to pvn_write_done() to re-mark the pages as dirty and unlock 1952 * them. 1953 * 1954 * We don't want to clear B_FORCE here as the caller presumably 1955 * knows what they're doing if they set it. 1956 */ 1957 pvn_write_done(pp, flags | B_ERROR); 1958 return (EPERM); 1959 } 1960 return ((*pageio)(vp, pp, io_off, io_len, flags, cr)); 1961 } 1962 1963 void 1964 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr, 1965 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *)) 1966 { 1967 rnode4_t *rp; 1968 mntinfo4_t *mi; 1969 struct nfs4_async_reqs *args; 1970 1971 rp = VTOR4(vp); 1972 ASSERT(rp->r_freef == NULL); 1973 1974 mi = VTOMI4(vp); 1975 1976 /* 1977 * If we can't allocate a request structure, skip the readdir. 1978 */ 1979 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 1980 goto noasync; 1981 1982 args->a_next = NULL; 1983 #ifdef DEBUG 1984 args->a_queuer = curthread; 1985 #endif 1986 VN_HOLD(vp); 1987 args->a_vp = vp; 1988 ASSERT(cr != NULL); 1989 crhold(cr); 1990 args->a_cred = cr; 1991 args->a_io = NFS4_READDIR; 1992 args->a_nfs4_readdir = readdir; 1993 args->a_nfs4_rdc = rdc; 1994 1995 mutex_enter(&mi->mi_async_lock); 1996 1997 /* 1998 * If asyncio has been disabled, then skip this request 1999 */ 2000 if (mi->mi_max_threads == 0) { 2001 mutex_exit(&mi->mi_async_lock); 2002 2003 VN_RELE(vp); 2004 crfree(cr); 2005 kmem_free(args, sizeof (*args)); 2006 goto noasync; 2007 } 2008 2009 /* 2010 * Link request structure into the async list and 2011 * wakeup async thread to do the i/o. 2012 */ 2013 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) { 2014 mi->mi_async_reqs[NFS4_READDIR] = args; 2015 mi->mi_async_tail[NFS4_READDIR] = args; 2016 } else { 2017 mi->mi_async_tail[NFS4_READDIR]->a_next = args; 2018 mi->mi_async_tail[NFS4_READDIR] = args; 2019 } 2020 2021 mutex_enter(&rp->r_statelock); 2022 rp->r_count++; 2023 mutex_exit(&rp->r_statelock); 2024 2025 if (mi->mi_io_kstats) { 2026 mutex_enter(&mi->mi_lock); 2027 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2028 mutex_exit(&mi->mi_lock); 2029 } 2030 2031 mi->mi_async_req_count++; 2032 ASSERT(mi->mi_async_req_count != 0); 2033 cv_signal(&mi->mi_async_reqs_cv); 2034 mutex_exit(&mi->mi_async_lock); 2035 return; 2036 2037 noasync: 2038 mutex_enter(&rp->r_statelock); 2039 rdc->entries = NULL; 2040 /* 2041 * Indicate that no one is trying to fill this entry and 2042 * it still needs to be filled. 
2043 */ 2044 rdc->flags &= ~RDDIR; 2045 rdc->flags |= RDDIRREQ; 2046 rddir4_cache_rele(rp, rdc); 2047 mutex_exit(&rp->r_statelock); 2048 } 2049 2050 void 2051 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 2052 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, 2053 cred_t *)) 2054 { 2055 rnode4_t *rp; 2056 mntinfo4_t *mi; 2057 struct nfs4_async_reqs *args; 2058 page_t *pp; 2059 2060 rp = VTOR4(vp); 2061 mi = VTOMI4(vp); 2062 2063 /* 2064 * If we can't allocate a request structure, do the commit 2065 * operation synchronously in this thread's context. 2066 */ 2067 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL) 2068 goto noasync; 2069 2070 args->a_next = NULL; 2071 #ifdef DEBUG 2072 args->a_queuer = curthread; 2073 #endif 2074 VN_HOLD(vp); 2075 args->a_vp = vp; 2076 ASSERT(cr != NULL); 2077 crhold(cr); 2078 args->a_cred = cr; 2079 args->a_io = NFS4_COMMIT; 2080 args->a_nfs4_commit = commit; 2081 args->a_nfs4_plist = plist; 2082 args->a_nfs4_offset = offset; 2083 args->a_nfs4_count = count; 2084 2085 mutex_enter(&mi->mi_async_lock); 2086 2087 /* 2088 * If asyncio has been disabled, then make a synchronous request. 2089 * This check is done a second time in case async io was diabled 2090 * while this thread was blocked waiting for memory pressure to 2091 * reduce or for the queue to drain. 2092 */ 2093 if (mi->mi_max_threads == 0) { 2094 mutex_exit(&mi->mi_async_lock); 2095 2096 VN_RELE(vp); 2097 crfree(cr); 2098 kmem_free(args, sizeof (*args)); 2099 goto noasync; 2100 } 2101 2102 /* 2103 * Link request structure into the async list and 2104 * wakeup async thread to do the i/o. 2105 */ 2106 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) { 2107 mi->mi_async_reqs[NFS4_COMMIT] = args; 2108 mi->mi_async_tail[NFS4_COMMIT] = args; 2109 } else { 2110 mi->mi_async_tail[NFS4_COMMIT]->a_next = args; 2111 mi->mi_async_tail[NFS4_COMMIT] = args; 2112 } 2113 2114 mutex_enter(&rp->r_statelock); 2115 rp->r_count++; 2116 mutex_exit(&rp->r_statelock); 2117 2118 if (mi->mi_io_kstats) { 2119 mutex_enter(&mi->mi_lock); 2120 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 2121 mutex_exit(&mi->mi_lock); 2122 } 2123 2124 mi->mi_async_req_count++; 2125 ASSERT(mi->mi_async_req_count != 0); 2126 cv_signal(&mi->mi_async_reqs_cv); 2127 mutex_exit(&mi->mi_async_lock); 2128 return; 2129 2130 noasync: 2131 if (curproc == proc_pageout || curproc == proc_fsflush || 2132 nfs_zone() != mi->mi_zone) { 2133 while (plist != NULL) { 2134 pp = plist; 2135 page_sub(&plist, pp); 2136 pp->p_fsdata = C_COMMIT; 2137 page_unlock(pp); 2138 } 2139 return; 2140 } 2141 (*commit)(vp, plist, offset, count, cr); 2142 } 2143 2144 /* 2145 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The 2146 * reference to the vnode is handed over to the thread; the caller should 2147 * no longer refer to the vnode. 2148 * 2149 * Unlike most of the async routines, this handoff is needed for 2150 * correctness reasons, not just performance. So doing operations in the 2151 * context of the current thread is not an option. 
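 * If the inactive thread no longer exists, the parts of the cleanup that
 * are safe in the current context are done inline below instead.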
2152 */ 2153 void 2154 nfs4_async_inactive(vnode_t *vp, cred_t *cr) 2155 { 2156 mntinfo4_t *mi; 2157 struct nfs4_async_reqs *args; 2158 boolean_t signal_inactive_thread = B_FALSE; 2159 2160 mi = VTOMI4(vp); 2161 2162 args = kmem_alloc(sizeof (*args), KM_SLEEP); 2163 args->a_next = NULL; 2164 #ifdef DEBUG 2165 args->a_queuer = curthread; 2166 #endif 2167 args->a_vp = vp; 2168 ASSERT(cr != NULL); 2169 crhold(cr); 2170 args->a_cred = cr; 2171 args->a_io = NFS4_INACTIVE; 2172 2173 /* 2174 * Note that we don't check mi->mi_max_threads here, since we 2175 * *need* to get rid of this vnode regardless of whether someone 2176 * set nfs4_max_threads to zero in /etc/system. 2177 * 2178 * The manager thread knows about this and is willing to create 2179 * at least one thread to accommodate us. 2180 */ 2181 mutex_enter(&mi->mi_async_lock); 2182 if (mi->mi_inactive_thread == NULL) { 2183 rnode4_t *rp; 2184 vnode_t *unldvp = NULL; 2185 char *unlname; 2186 cred_t *unlcred; 2187 2188 mutex_exit(&mi->mi_async_lock); 2189 /* 2190 * We just need to free up the memory associated with the 2191 * vnode, which can be safely done from within the current 2192 * context. 2193 */ 2194 crfree(cr); /* drop our reference */ 2195 kmem_free(args, sizeof (*args)); 2196 rp = VTOR4(vp); 2197 mutex_enter(&rp->r_statelock); 2198 if (rp->r_unldvp != NULL) { 2199 unldvp = rp->r_unldvp; 2200 rp->r_unldvp = NULL; 2201 unlname = rp->r_unlname; 2202 rp->r_unlname = NULL; 2203 unlcred = rp->r_unlcred; 2204 rp->r_unlcred = NULL; 2205 } 2206 mutex_exit(&rp->r_statelock); 2207 /* 2208 * No need to explicitly throw away any cached pages. The 2209 * eventual r4inactive() will attempt a synchronous 2210 * VOP_PUTPAGE() which will immediately fail since the request 2211 * is coming from the wrong zone, and then will proceed to call 2212 * nfs4_invalidate_pages() which will clean things up for us. 2213 * 2214 * Throw away the delegation here so rp4_addfree()'s attempt to 2215 * return any existing delegations becomes a no-op. 2216 */ 2217 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 2218 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 2219 FALSE); 2220 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 2221 nfs_rw_exit(&mi->mi_recovlock); 2222 } 2223 nfs4_clear_open_streams(rp); 2224 2225 rp4_addfree(rp, cr); 2226 if (unldvp != NULL) { 2227 kmem_free(unlname, MAXNAMELEN); 2228 VN_RELE(unldvp); 2229 crfree(unlcred); 2230 } 2231 return; 2232 } 2233 2234 if (mi->mi_manager_thread == NULL) { 2235 /* 2236 * We want to talk to the inactive thread. 2237 */ 2238 signal_inactive_thread = B_TRUE; 2239 } 2240 2241 /* 2242 * Enqueue the vnode and wake up either the special thread (empty 2243 * list) or an async thread. 
2244 */ 2245 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) { 2246 mi->mi_async_reqs[NFS4_INACTIVE] = args; 2247 mi->mi_async_tail[NFS4_INACTIVE] = args; 2248 signal_inactive_thread = B_TRUE; 2249 } else { 2250 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args; 2251 mi->mi_async_tail[NFS4_INACTIVE] = args; 2252 } 2253 if (signal_inactive_thread) { 2254 cv_signal(&mi->mi_inact_req_cv); 2255 } else { 2256 mi->mi_async_req_count++; 2257 ASSERT(mi->mi_async_req_count != 0); 2258 cv_signal(&mi->mi_async_reqs_cv); 2259 } 2260 2261 mutex_exit(&mi->mi_async_lock); 2262 } 2263 2264 int 2265 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated) 2266 { 2267 int pagecreate; 2268 int n; 2269 int saved_n; 2270 caddr_t saved_base; 2271 u_offset_t offset; 2272 int error; 2273 int sm_error; 2274 vnode_t *vp = RTOV(rp); 2275 2276 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid); 2277 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER)); 2278 if (!vpm_enable) { 2279 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE); 2280 } 2281 2282 /* 2283 * Move bytes in at most PAGESIZE chunks. We must avoid 2284 * spanning pages in uiomove() because page faults may cause 2285 * the cache to be invalidated out from under us. The r_size is not 2286 * updated until after the uiomove. If we push the last page of a 2287 * file before r_size is correct, we will lose the data written past 2288 * the current (and invalid) r_size. 2289 */ 2290 do { 2291 offset = uio->uio_loffset; 2292 pagecreate = 0; 2293 2294 /* 2295 * n is the number of bytes required to satisfy the request 2296 * or the number of bytes to fill out the page. 2297 */ 2298 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount); 2299 2300 /* 2301 * Check to see if we can skip reading in the page 2302 * and just allocate the memory. We can do this 2303 * if we are going to rewrite the entire mapping 2304 * or if we are going to write to or beyond the current 2305 * end of file from the beginning of the mapping. 2306 * 2307 * The read of r_size is now protected by r_statelock. 2308 */ 2309 mutex_enter(&rp->r_statelock); 2310 /* 2311 * When pgcreated is nonzero the caller has already done 2312 * a segmap_getmapflt with forcefault 0 and S_WRITE. With 2313 * segkpm this means we already have at least one page 2314 * created and mapped at base. 2315 */ 2316 pagecreate = pgcreated || 2317 ((offset & PAGEOFFSET) == 0 && 2318 (n == PAGESIZE || ((offset + n) >= rp->r_size))); 2319 2320 mutex_exit(&rp->r_statelock); 2321 2322 if (!vpm_enable && pagecreate) { 2323 /* 2324 * The last argument tells segmap_pagecreate() to 2325 * always lock the page, as opposed to sometimes 2326 * returning with the page locked. This way we avoid a 2327 * fault on the ensuing uiomove(), but also 2328 * more importantly (to fix bug 1094402) we can 2329 * call segmap_fault() to unlock the page in all 2330 * cases. An alternative would be to modify 2331 * segmap_pagecreate() to tell us when it is 2332 * locking a page, but that's a fairly major 2333 * interface change. 2334 */ 2335 if (pgcreated == 0) 2336 (void) segmap_pagecreate(segkmap, base, 2337 (uint_t)n, 1); 2338 saved_base = base; 2339 saved_n = n; 2340 } 2341 2342 /* 2343 * The number of bytes of data in the last page can not 2344 * be accurately be determined while page is being 2345 * uiomove'd to and the size of the file being updated. 2346 * Thus, inform threads which need to know accurately 2347 * how much data is in the last page of the file. 
They 2348 * will not do the i/o immediately, but will arrange for 2349 * the i/o to happen later when this modify operation 2350 * will have finished. 2351 */ 2352 ASSERT(!(rp->r_flags & R4MODINPROGRESS)); 2353 mutex_enter(&rp->r_statelock); 2354 rp->r_flags |= R4MODINPROGRESS; 2355 rp->r_modaddr = (offset & MAXBMASK); 2356 mutex_exit(&rp->r_statelock); 2357 2358 if (vpm_enable) { 2359 /* 2360 * Copy data. If new pages are created, part of 2361 * the page that is not written will be initizliazed 2362 * with zeros. 2363 */ 2364 error = vpm_data_copy(vp, offset, n, uio, 2365 !pagecreate, NULL, 0, S_WRITE); 2366 } else { 2367 error = uiomove(base, n, UIO_WRITE, uio); 2368 } 2369 2370 /* 2371 * r_size is the maximum number of 2372 * bytes known to be in the file. 2373 * Make sure it is at least as high as the 2374 * first unwritten byte pointed to by uio_loffset. 2375 */ 2376 mutex_enter(&rp->r_statelock); 2377 if (rp->r_size < uio->uio_loffset) 2378 rp->r_size = uio->uio_loffset; 2379 rp->r_flags &= ~R4MODINPROGRESS; 2380 rp->r_flags |= R4DIRTY; 2381 mutex_exit(&rp->r_statelock); 2382 2383 /* n = # of bytes written */ 2384 n = (int)(uio->uio_loffset - offset); 2385 2386 if (!vpm_enable) { 2387 base += n; 2388 } 2389 2390 tcount -= n; 2391 /* 2392 * If we created pages w/o initializing them completely, 2393 * we need to zero the part that wasn't set up. 2394 * This happens on a most EOF write cases and if 2395 * we had some sort of error during the uiomove. 2396 */ 2397 if (!vpm_enable && pagecreate) { 2398 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2399 (void) kzero(base, PAGESIZE - n); 2400 2401 if (pgcreated) { 2402 /* 2403 * Caller is responsible for this page, 2404 * it was not created in this loop. 2405 */ 2406 pgcreated = 0; 2407 } else { 2408 /* 2409 * For bug 1094402: segmap_pagecreate locks 2410 * page. Unlock it. This also unlocks the 2411 * pages allocated by page_create_va() in 2412 * segmap_pagecreate(). 2413 */ 2414 sm_error = segmap_fault(kas.a_hat, segkmap, 2415 saved_base, saved_n, 2416 F_SOFTUNLOCK, S_WRITE); 2417 if (error == 0) 2418 error = sm_error; 2419 } 2420 } 2421 } while (tcount > 0 && error == 0); 2422 2423 return (error); 2424 } 2425 2426 int 2427 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2428 { 2429 rnode4_t *rp; 2430 page_t *pp; 2431 u_offset_t eoff; 2432 u_offset_t io_off; 2433 size_t io_len; 2434 int error; 2435 int rdirty; 2436 int err; 2437 2438 rp = VTOR4(vp); 2439 ASSERT(rp->r_count > 0); 2440 2441 if (!nfs4_has_pages(vp)) 2442 return (0); 2443 2444 ASSERT(vp->v_type != VCHR); 2445 2446 /* 2447 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL 2448 * writes. B_FORCE is set to force the VM system to actually 2449 * invalidate the pages, even if the i/o failed. The pages 2450 * need to get invalidated because they can't be written out 2451 * because there isn't any space left on either the server's 2452 * file system or in the user's disk quota. The B_FREE bit 2453 * is cleared to avoid confusion as to whether this is a 2454 * request to place the page on the freelist or to destroy 2455 * it. 2456 */ 2457 if ((rp->r_flags & R4OUTOFSPACE) || 2458 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2459 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2460 2461 if (len == 0) { 2462 /* 2463 * If doing a full file synchronous operation, then clear 2464 * the R4DIRTY bit. If a page gets dirtied while the flush 2465 * is happening, then R4DIRTY will get set again. 
The 2466 * R4DIRTY bit must get cleared before the flush so that 2467 * we don't lose this information. 2468 * 2469 * If there are no full file async write operations 2470 * pending and RDIRTY bit is set, clear it. 2471 */ 2472 if (off == (u_offset_t)0 && 2473 !(flags & B_ASYNC) && 2474 (rp->r_flags & R4DIRTY)) { 2475 mutex_enter(&rp->r_statelock); 2476 rdirty = (rp->r_flags & R4DIRTY); 2477 rp->r_flags &= ~R4DIRTY; 2478 mutex_exit(&rp->r_statelock); 2479 } else if (flags & B_ASYNC && off == (u_offset_t)0) { 2480 mutex_enter(&rp->r_statelock); 2481 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) { 2482 rdirty = (rp->r_flags & R4DIRTY); 2483 rp->r_flags &= ~R4DIRTY; 2484 } 2485 mutex_exit(&rp->r_statelock); 2486 } else 2487 rdirty = 0; 2488 2489 /* 2490 * Search the entire vp list for pages >= off, and flush 2491 * the dirty pages. 2492 */ 2493 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2494 flags, cr); 2495 2496 /* 2497 * If an error occurred and the file was marked as dirty 2498 * before and we aren't forcibly invalidating pages, then 2499 * reset the R4DIRTY flag. 2500 */ 2501 if (error && rdirty && 2502 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2503 mutex_enter(&rp->r_statelock); 2504 rp->r_flags |= R4DIRTY; 2505 mutex_exit(&rp->r_statelock); 2506 } 2507 } else { 2508 /* 2509 * Do a range from [off...off + len) looking for pages 2510 * to deal with. 2511 */ 2512 error = 0; 2513 io_len = 0; 2514 eoff = off + len; 2515 mutex_enter(&rp->r_statelock); 2516 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2517 io_off += io_len) { 2518 mutex_exit(&rp->r_statelock); 2519 /* 2520 * If we are not invalidating, synchronously 2521 * freeing or writing pages use the routine 2522 * page_lookup_nowait() to prevent reclaiming 2523 * them from the free list. 2524 */ 2525 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2526 pp = page_lookup(vp, io_off, 2527 (flags & (B_INVAL | B_FREE)) ? 2528 SE_EXCL : SE_SHARED); 2529 } else { 2530 pp = page_lookup_nowait(vp, io_off, 2531 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2532 } 2533 2534 if (pp == NULL || !pvn_getdirty(pp, flags)) 2535 io_len = PAGESIZE; 2536 else { 2537 err = (*rp->r_putapage)(vp, pp, &io_off, 2538 &io_len, flags, cr); 2539 if (!error) 2540 error = err; 2541 /* 2542 * "io_off" and "io_len" are returned as 2543 * the range of pages we actually wrote. 2544 * This allows us to skip ahead more quickly 2545 * since several pages may've been dealt 2546 * with by this iteration of the loop. 
2547 */ 2548 } 2549 mutex_enter(&rp->r_statelock); 2550 } 2551 mutex_exit(&rp->r_statelock); 2552 } 2553 2554 return (error); 2555 } 2556 2557 void 2558 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2559 { 2560 rnode4_t *rp; 2561 2562 rp = VTOR4(vp); 2563 if (IS_SHADOW(vp, rp)) 2564 vp = RTOV4(rp); 2565 mutex_enter(&rp->r_statelock); 2566 while (rp->r_flags & R4TRUNCATE) 2567 cv_wait(&rp->r_cv, &rp->r_statelock); 2568 rp->r_flags |= R4TRUNCATE; 2569 if (off == (u_offset_t)0) { 2570 rp->r_flags &= ~R4DIRTY; 2571 if (!(rp->r_flags & R4STALE)) 2572 rp->r_error = 0; 2573 } 2574 rp->r_truncaddr = off; 2575 mutex_exit(&rp->r_statelock); 2576 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2577 B_INVAL | B_TRUNC, cr); 2578 mutex_enter(&rp->r_statelock); 2579 rp->r_flags &= ~R4TRUNCATE; 2580 cv_broadcast(&rp->r_cv); 2581 mutex_exit(&rp->r_statelock); 2582 } 2583 2584 static int 2585 nfs4_mnt_kstat_update(kstat_t *ksp, int rw) 2586 { 2587 mntinfo4_t *mi; 2588 struct mntinfo_kstat *mik; 2589 vfs_t *vfsp; 2590 2591 /* this is a read-only kstat. Bail out on a write */ 2592 if (rw == KSTAT_WRITE) 2593 return (EACCES); 2594 2595 2596 /* 2597 * We don't want to wait here as kstat_chain_lock could be held by 2598 * dounmount(). dounmount() takes vfs_reflock before the chain lock 2599 * and thus could lead to a deadlock. 2600 */ 2601 vfsp = (struct vfs *)ksp->ks_private; 2602 2603 mi = VFTOMI4(vfsp); 2604 mik = (struct mntinfo_kstat *)ksp->ks_data; 2605 2606 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 2607 2608 mik->mik_vers = (uint32_t)mi->mi_vers; 2609 mik->mik_flags = mi->mi_flags; 2610 /* 2611 * The sv_secdata holds the flavor the client specifies. 2612 * If the client uses default and a security negotiation 2613 * occurs, sv_currsec will point to the current flavor 2614 * selected from the server flavor list. 2615 * sv_currsec is NULL if no security negotiation takes place. 2616 */ 2617 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ? 2618 mi->mi_curr_serv->sv_currsec->secmod : 2619 mi->mi_curr_serv->sv_secdata->secmod; 2620 mik->mik_curread = (uint32_t)mi->mi_curread; 2621 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 2622 mik->mik_retrans = mi->mi_retrans; 2623 mik->mik_timeo = mi->mi_timeo; 2624 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 2625 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 2626 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 2627 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 2628 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 2629 mik->mik_failover = (uint32_t)mi->mi_failover; 2630 mik->mik_remap = (uint32_t)mi->mi_remap; 2631 2632 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 2633 2634 return (0); 2635 } 2636 2637 void 2638 nfs4_mnt_kstat_init(struct vfs *vfsp) 2639 { 2640 mntinfo4_t *mi = VFTOMI4(vfsp); 2641 2642 /* 2643 * PSARC 2001/697 Contract Private Interface 2644 * All nfs kstats are under SunMC contract 2645 * Please refer to the PSARC listed above and contact 2646 * SunMC before making any changes! 
2647 * 2648 * Changes must be reviewed by Solaris File Sharing 2649 * Changes must be communicated to contract-2001-697@sun.com 2650 * 2651 */ 2652 2653 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 2654 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 2655 if (mi->mi_io_kstats) { 2656 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2657 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 2658 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 2659 kstat_install(mi->mi_io_kstats); 2660 } 2661 2662 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 2663 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 2664 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 2665 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 2666 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 2667 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update; 2668 mi->mi_ro_kstats->ks_private = (void *)vfsp; 2669 kstat_install(mi->mi_ro_kstats); 2670 } 2671 2672 nfs4_mnt_recov_kstat_init(vfsp); 2673 } 2674 2675 void 2676 nfs4_write_error(vnode_t *vp, int error, cred_t *cr) 2677 { 2678 mntinfo4_t *mi; 2679 clock_t now = ddi_get_lbolt(); 2680 2681 mi = VTOMI4(vp); 2682 /* 2683 * In case of forced unmount, do not print any messages 2684 * since it can flood the console with error messages. 2685 */ 2686 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 2687 return; 2688 2689 /* 2690 * If the mount point is dead, not recoverable, do not 2691 * print error messages that can flood the console. 2692 */ 2693 if (mi->mi_flags & MI4_RECOV_FAIL) 2694 return; 2695 2696 /* 2697 * No use in flooding the console with ENOSPC 2698 * messages from the same file system. 2699 */ 2700 if ((error != ENOSPC && error != EDQUOT) || 2701 now - mi->mi_printftime > 0) { 2702 zoneid_t zoneid = mi->mi_zone->zone_id; 2703 2704 #ifdef DEBUG 2705 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2706 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL); 2707 #else 2708 nfs_perror(error, "NFS write error on host %s: %m.\n", 2709 VTOR4(vp)->r_server->sv_hostname, NULL); 2710 #endif 2711 if (error == ENOSPC || error == EDQUOT) { 2712 zcmn_err(zoneid, CE_CONT, 2713 "^File: userid=%d, groupid=%d\n", 2714 crgetuid(cr), crgetgid(cr)); 2715 if (crgetuid(curthread->t_cred) != crgetuid(cr) || 2716 crgetgid(curthread->t_cred) != crgetgid(cr)) { 2717 zcmn_err(zoneid, CE_CONT, 2718 "^User: userid=%d, groupid=%d\n", 2719 crgetuid(curthread->t_cred), 2720 crgetgid(curthread->t_cred)); 2721 } 2722 mi->mi_printftime = now + 2723 nfs_write_error_interval * hz; 2724 } 2725 sfh4_printfhandle(VTOR4(vp)->r_fh); 2726 #ifdef DEBUG 2727 if (error == EACCES) { 2728 zcmn_err(zoneid, CE_CONT, 2729 "nfs_bio: cred is%s kcred\n", 2730 cr == kcred ? "" : " not"); 2731 } 2732 #endif 2733 } 2734 } 2735 2736 /* 2737 * Return non-zero if the given file can be safely memory mapped. Locks 2738 * are safe if whole-file (length and offset are both zero). 2739 */ 2740 2741 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0) 2742 2743 static int 2744 nfs4_safemap(const vnode_t *vp) 2745 { 2746 locklist_t *llp, *next_llp; 2747 int safe = 1; 2748 rnode4_t *rp = VTOR4(vp); 2749 2750 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2751 2752 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: " 2753 "vp = %p", (void *)vp)); 2754 2755 /* 2756 * Review all the locks for the vnode, both ones that have been 2757 * acquired and ones that are pending. 
We assume that 2758 * flk_active_locks_for_vp() has merged any locks that can be 2759 * merged (so that if a process has the entire file locked, it is 2760 * represented as a single lock). 2761 * 2762 * Note that we can't bail out of the loop if we find a non-safe 2763 * lock, because we have to free all the elements in the llp list. 2764 * We might be able to speed up this code slightly by not looking 2765 * at each lock's l_start and l_len fields once we've found a 2766 * non-safe lock. 2767 */ 2768 2769 llp = flk_active_locks_for_vp(vp); 2770 while (llp) { 2771 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2772 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")", 2773 llp->ll_flock.l_start, llp->ll_flock.l_len)); 2774 if (!SAFE_LOCK(llp->ll_flock)) { 2775 safe = 0; 2776 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, 2777 "nfs4_safemap: unsafe active lock (%" PRId64 2778 ", %" PRId64 ")", llp->ll_flock.l_start, 2779 llp->ll_flock.l_len)); 2780 } 2781 next_llp = llp->ll_next; 2782 VN_RELE(llp->ll_vp); 2783 kmem_free(llp, sizeof (*llp)); 2784 llp = next_llp; 2785 } 2786 2787 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s", 2788 safe ? "safe" : "unsafe")); 2789 return (safe); 2790 } 2791 2792 /* 2793 * Return whether there is a lost LOCK or LOCKU queued up for the given 2794 * file that would make an mmap request unsafe. cf. nfs4_safemap(). 2795 */ 2796 2797 bool_t 2798 nfs4_map_lost_lock_conflict(vnode_t *vp) 2799 { 2800 bool_t conflict = FALSE; 2801 nfs4_lost_rqst_t *lrp; 2802 mntinfo4_t *mi = VTOMI4(vp); 2803 2804 mutex_enter(&mi->mi_lock); 2805 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL; 2806 lrp = list_next(&mi->mi_lost_state, lrp)) { 2807 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2808 continue; 2809 ASSERT(lrp->lr_vp != NULL); 2810 if (!VOP_CMP(lrp->lr_vp, vp, NULL)) 2811 continue; /* different file */ 2812 if (!SAFE_LOCK(*lrp->lr_flk)) { 2813 conflict = TRUE; 2814 break; 2815 } 2816 } 2817 2818 mutex_exit(&mi->mi_lock); 2819 return (conflict); 2820 } 2821 2822 /* 2823 * nfs_lockcompletion: 2824 * 2825 * If the vnode has a lock that makes it unsafe to cache the file, mark it 2826 * as non cachable (set VNOCACHE bit). 2827 */ 2828 2829 void 2830 nfs4_lockcompletion(vnode_t *vp, int cmd) 2831 { 2832 rnode4_t *rp = VTOR4(vp); 2833 2834 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER)); 2835 ASSERT(!IS_SHADOW(vp, rp)); 2836 2837 if (cmd == F_SETLK || cmd == F_SETLKW) { 2838 2839 if (!nfs4_safemap(vp)) { 2840 mutex_enter(&vp->v_lock); 2841 vp->v_flag |= VNOCACHE; 2842 mutex_exit(&vp->v_lock); 2843 } else { 2844 mutex_enter(&vp->v_lock); 2845 vp->v_flag &= ~VNOCACHE; 2846 mutex_exit(&vp->v_lock); 2847 } 2848 } 2849 /* 2850 * The cached attributes of the file are stale after acquiring 2851 * the lock on the file. They were updated when the file was 2852 * opened, but not updated when the lock was acquired. Therefore the 2853 * cached attributes are invalidated after the lock is obtained. 2854 */ 2855 PURGE_ATTRCACHE4(vp); 2856 } 2857 2858 /* ARGSUSED */ 2859 static void * 2860 nfs4_mi_init(zoneid_t zoneid) 2861 { 2862 struct mi4_globals *mig; 2863 2864 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2865 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2866 list_create(&mig->mig_list, sizeof (mntinfo4_t), 2867 offsetof(mntinfo4_t, mi_zone_node)); 2868 mig->mig_destructor_called = B_FALSE; 2869 return (mig); 2870 } 2871 2872 /* 2873 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down 2874 * state and killing off threads. 
2875 */ 2876 /* ARGSUSED */ 2877 static void 2878 nfs4_mi_shutdown(zoneid_t zoneid, void *data) 2879 { 2880 struct mi4_globals *mig = data; 2881 mntinfo4_t *mi; 2882 nfs4_server_t *np; 2883 2884 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2885 "nfs4_mi_shutdown zone %d\n", zoneid)); 2886 ASSERT(mig != NULL); 2887 for (;;) { 2888 mutex_enter(&mig->mig_lock); 2889 mi = list_head(&mig->mig_list); 2890 if (mi == NULL) { 2891 mutex_exit(&mig->mig_lock); 2892 break; 2893 } 2894 2895 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2896 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp)); 2897 /* 2898 * purge the DNLC for this filesystem 2899 */ 2900 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2901 /* 2902 * Tell existing async worker threads to exit. 2903 */ 2904 mutex_enter(&mi->mi_async_lock); 2905 mi->mi_max_threads = 0; 2906 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv); 2907 /* 2908 * Set the appropriate flags, signal and wait for both the 2909 * async manager and the inactive thread to exit when they're 2910 * done with their current work. 2911 */ 2912 mutex_enter(&mi->mi_lock); 2913 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD); 2914 mutex_exit(&mi->mi_lock); 2915 mutex_exit(&mi->mi_async_lock); 2916 if (mi->mi_manager_thread) { 2917 nfs4_async_manager_stop(mi->mi_vfsp); 2918 } 2919 if (mi->mi_inactive_thread) { 2920 mutex_enter(&mi->mi_async_lock); 2921 cv_signal(&mi->mi_inact_req_cv); 2922 /* 2923 * Wait for the inactive thread to exit. 2924 */ 2925 while (mi->mi_inactive_thread != NULL) { 2926 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock); 2927 } 2928 mutex_exit(&mi->mi_async_lock); 2929 } 2930 /* 2931 * Wait for the recovery thread to complete, that is, it will 2932 * signal when it is done using the "mi" structure and about 2933 * to exit 2934 */ 2935 mutex_enter(&mi->mi_lock); 2936 while (mi->mi_in_recovery > 0) 2937 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock); 2938 mutex_exit(&mi->mi_lock); 2939 /* 2940 * We're done when every mi has been done or the list is empty. 2941 * This one is done, remove it from the list. 2942 */ 2943 list_remove(&mig->mig_list, mi); 2944 mutex_exit(&mig->mig_lock); 2945 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4); 2946 2947 /* 2948 * Release hold on vfs and mi done to prevent race with zone 2949 * shutdown. This releases the hold in nfs4_mi_zonelist_add. 2950 */ 2951 VFS_RELE(mi->mi_vfsp); 2952 MI4_RELE(mi); 2953 } 2954 /* 2955 * Tell each renew thread in the zone to exit 2956 */ 2957 mutex_enter(&nfs4_server_lst_lock); 2958 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) { 2959 mutex_enter(&np->s_lock); 2960 if (np->zoneid == zoneid) { 2961 /* 2962 * We add another hold onto the nfs4_server_t 2963 * because this will make sure tha the nfs4_server_t 2964 * stays around until nfs4_callback_fini_zone destroys 2965 * the zone. This way, the renew thread can 2966 * unconditionally release its holds on the 2967 * nfs4_server_t. 
2968 */ 2969 np->s_refcnt++; 2970 nfs4_mark_srv_dead(np); 2971 } 2972 mutex_exit(&np->s_lock); 2973 } 2974 mutex_exit(&nfs4_server_lst_lock); 2975 } 2976 2977 static void 2978 nfs4_mi_free_globals(struct mi4_globals *mig) 2979 { 2980 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2981 mutex_destroy(&mig->mig_lock); 2982 kmem_free(mig, sizeof (*mig)); 2983 } 2984 2985 /* ARGSUSED */ 2986 static void 2987 nfs4_mi_destroy(zoneid_t zoneid, void *data) 2988 { 2989 struct mi4_globals *mig = data; 2990 2991 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE, 2992 "nfs4_mi_destroy zone %d\n", zoneid)); 2993 ASSERT(mig != NULL); 2994 mutex_enter(&mig->mig_lock); 2995 if (list_head(&mig->mig_list) != NULL) { 2996 /* Still waiting for VFS_FREEVFS() */ 2997 mig->mig_destructor_called = B_TRUE; 2998 mutex_exit(&mig->mig_lock); 2999 return; 3000 } 3001 nfs4_mi_free_globals(mig); 3002 } 3003 3004 /* 3005 * Add an NFS mount to the per-zone list of NFS mounts. 3006 */ 3007 void 3008 nfs4_mi_zonelist_add(mntinfo4_t *mi) 3009 { 3010 struct mi4_globals *mig; 3011 3012 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 3013 mutex_enter(&mig->mig_lock); 3014 list_insert_head(&mig->mig_list, mi); 3015 /* 3016 * hold added to eliminate race with zone shutdown -this will be 3017 * released in mi_shutdown 3018 */ 3019 MI4_HOLD(mi); 3020 VFS_HOLD(mi->mi_vfsp); 3021 mutex_exit(&mig->mig_lock); 3022 } 3023 3024 /* 3025 * Remove an NFS mount from the per-zone list of NFS mounts. 3026 */ 3027 int 3028 nfs4_mi_zonelist_remove(mntinfo4_t *mi) 3029 { 3030 struct mi4_globals *mig; 3031 int ret = 0; 3032 3033 mig = zone_getspecific(mi4_list_key, mi->mi_zone); 3034 mutex_enter(&mig->mig_lock); 3035 mutex_enter(&mi->mi_lock); 3036 /* if this mi is marked dead, then the zone already released it */ 3037 if (!(mi->mi_flags & MI4_DEAD)) { 3038 list_remove(&mig->mig_list, mi); 3039 mutex_exit(&mi->mi_lock); 3040 3041 /* release the holds put on in zonelist_add(). */ 3042 VFS_RELE(mi->mi_vfsp); 3043 MI4_RELE(mi); 3044 ret = 1; 3045 } else { 3046 mutex_exit(&mi->mi_lock); 3047 } 3048 3049 /* 3050 * We can be called asynchronously by VFS_FREEVFS() after the zone 3051 * shutdown/destroy callbacks have executed; if so, clean up the zone's 3052 * mi globals. 3053 */ 3054 if (list_head(&mig->mig_list) == NULL && 3055 mig->mig_destructor_called == B_TRUE) { 3056 nfs4_mi_free_globals(mig); 3057 return (ret); 3058 } 3059 mutex_exit(&mig->mig_lock); 3060 return (ret); 3061 } 3062 3063 void 3064 nfs_free_mi4(mntinfo4_t *mi) 3065 { 3066 nfs4_open_owner_t *foop; 3067 nfs4_oo_hash_bucket_t *bucketp; 3068 nfs4_debug_msg_t *msgp; 3069 int i; 3070 servinfo4_t *svp; 3071 3072 /* 3073 * Code introduced here should be carefully evaluated to make 3074 * sure none of the freed resources are accessed either directly 3075 * or indirectly after freeing them. For eg: Introducing calls to 3076 * NFS4_DEBUG that use mntinfo4_t structure member after freeing 3077 * the structure members or other routines calling back into NFS 3078 * accessing freed mntinfo4_t structure member. 
3079 */ 3080 mutex_enter(&mi->mi_lock); 3081 ASSERT(mi->mi_recovthread == NULL); 3082 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP); 3083 mutex_exit(&mi->mi_lock); 3084 mutex_enter(&mi->mi_async_lock); 3085 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 && 3086 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0); 3087 ASSERT(mi->mi_manager_thread == NULL); 3088 mutex_exit(&mi->mi_async_lock); 3089 if (mi->mi_io_kstats) { 3090 kstat_delete(mi->mi_io_kstats); 3091 mi->mi_io_kstats = NULL; 3092 } 3093 if (mi->mi_ro_kstats) { 3094 kstat_delete(mi->mi_ro_kstats); 3095 mi->mi_ro_kstats = NULL; 3096 } 3097 if (mi->mi_recov_ksp) { 3098 kstat_delete(mi->mi_recov_ksp); 3099 mi->mi_recov_ksp = NULL; 3100 } 3101 mutex_enter(&mi->mi_msg_list_lock); 3102 while (msgp = list_head(&mi->mi_msg_list)) { 3103 list_remove(&mi->mi_msg_list, msgp); 3104 nfs4_free_msg(msgp); 3105 } 3106 mutex_exit(&mi->mi_msg_list_lock); 3107 list_destroy(&mi->mi_msg_list); 3108 if (mi->mi_fname != NULL) 3109 fn_rele(&mi->mi_fname); 3110 if (mi->mi_rootfh != NULL) 3111 sfh4_rele(&mi->mi_rootfh); 3112 if (mi->mi_srvparentfh != NULL) 3113 sfh4_rele(&mi->mi_srvparentfh); 3114 svp = mi->mi_servers; 3115 sv4_free(svp); 3116 mutex_destroy(&mi->mi_lock); 3117 mutex_destroy(&mi->mi_async_lock); 3118 mutex_destroy(&mi->mi_msg_list_lock); 3119 mutex_destroy(&mi->mi_rnodes_lock); 3120 nfs_rw_destroy(&mi->mi_recovlock); 3121 nfs_rw_destroy(&mi->mi_rename_lock); 3122 nfs_rw_destroy(&mi->mi_fh_lock); 3123 cv_destroy(&mi->mi_failover_cv); 3124 cv_destroy(&mi->mi_async_reqs_cv); 3125 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]); 3126 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]); 3127 cv_destroy(&mi->mi_async_cv); 3128 cv_destroy(&mi->mi_inact_req_cv); 3129 /* 3130 * Destroy the oo hash lists and mutexes for the cred hash table. 3131 */ 3132 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) { 3133 bucketp = &(mi->mi_oo_list[i]); 3134 /* Destroy any remaining open owners on the list */ 3135 foop = list_head(&bucketp->b_oo_hash_list); 3136 while (foop != NULL) { 3137 list_remove(&bucketp->b_oo_hash_list, foop); 3138 nfs4_destroy_open_owner(foop); 3139 foop = list_head(&bucketp->b_oo_hash_list); 3140 } 3141 list_destroy(&bucketp->b_oo_hash_list); 3142 mutex_destroy(&bucketp->b_lock); 3143 } 3144 /* 3145 * Empty and destroy the freed open owner list. 
3146 */ 3147 foop = list_head(&mi->mi_foo_list); 3148 while (foop != NULL) { 3149 list_remove(&mi->mi_foo_list, foop); 3150 nfs4_destroy_open_owner(foop); 3151 foop = list_head(&mi->mi_foo_list); 3152 } 3153 list_destroy(&mi->mi_foo_list); 3154 list_destroy(&mi->mi_bseqid_list); 3155 list_destroy(&mi->mi_lost_state); 3156 list_destroy(&mi->mi_rnodes); 3157 avl_destroy(&mi->mi_filehandles); 3158 kmem_free(mi, sizeof (*mi)); 3159 } 3160 void 3161 mi_hold(mntinfo4_t *mi) 3162 { 3163 atomic_inc_32(&mi->mi_count); 3164 ASSERT(mi->mi_count != 0); 3165 } 3166 3167 void 3168 mi_rele(mntinfo4_t *mi) 3169 { 3170 ASSERT(mi->mi_count != 0); 3171 if (atomic_dec_32_nv(&mi->mi_count) == 0) { 3172 nfs_free_mi4(mi); 3173 } 3174 } 3175 3176 vnode_t nfs4_xattr_notsupp_vnode; 3177 3178 void 3179 nfs4_clnt_init(void) 3180 { 3181 nfs4_vnops_init(); 3182 (void) nfs4_rnode_init(); 3183 (void) nfs4_shadow_init(); 3184 (void) nfs4_acache_init(); 3185 (void) nfs4_subr_init(); 3186 nfs4_acl_init(); 3187 nfs_idmap_init(); 3188 nfs4_callback_init(); 3189 nfs4_secinfo_init(); 3190 #ifdef DEBUG 3191 tsd_create(&nfs4_tsd_key, NULL); 3192 #endif 3193 3194 /* 3195 * Add a CPR callback so that we can update client 3196 * lease after a suspend and resume. 3197 */ 3198 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4"); 3199 3200 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown, 3201 nfs4_mi_destroy); 3202 3203 /* 3204 * Initialize the reference count of the notsupp xattr cache vnode to 1 3205 * so that it never goes away (VOP_INACTIVE isn't called on it). 3206 */ 3207 vn_reinit(&nfs4_xattr_notsupp_vnode); 3208 } 3209 3210 void 3211 nfs4_clnt_fini(void) 3212 { 3213 (void) zone_key_delete(mi4_list_key); 3214 nfs4_vnops_fini(); 3215 (void) nfs4_rnode_fini(); 3216 (void) nfs4_shadow_fini(); 3217 (void) nfs4_acache_fini(); 3218 (void) nfs4_subr_fini(); 3219 nfs_idmap_fini(); 3220 nfs4_callback_fini(); 3221 nfs4_secinfo_fini(); 3222 #ifdef DEBUG 3223 tsd_destroy(&nfs4_tsd_key); 3224 #endif 3225 if (cid) 3226 (void) callb_delete(cid); 3227 } 3228 3229 /*ARGSUSED*/ 3230 static boolean_t 3231 nfs4_client_cpr_callb(void *arg, int code) 3232 { 3233 /* 3234 * We get called for Suspend and Resume events. 3235 * For the suspend case we simply don't care! 3236 */ 3237 if (code == CB_CODE_CPR_CHKPT) { 3238 return (B_TRUE); 3239 } 3240 3241 /* 3242 * When we get to here we are in the process of 3243 * resuming the system from a previous suspend. 
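 * Record the resume time; the renew thread compares it against the
 * last renewal time to decide whether the lease must be renewed right away.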
3244 */ 3245 nfs4_client_resumed = gethrestime_sec(); 3246 return (B_TRUE); 3247 } 3248 3249 void 3250 nfs4_renew_lease_thread(nfs4_server_t *sp) 3251 { 3252 int error = 0; 3253 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs; 3254 clock_t tick_delay = 0; 3255 clock_t time_left = 0; 3256 callb_cpr_t cpr_info; 3257 kmutex_t cpr_lock; 3258 3259 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3260 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp)); 3261 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 3262 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease"); 3263 3264 mutex_enter(&sp->s_lock); 3265 /* sp->s_lease_time is set via a GETATTR */ 3266 sp->last_renewal_time = gethrestime_sec(); 3267 sp->lease_valid = NFS4_LEASE_UNINITIALIZED; 3268 ASSERT(sp->s_refcnt >= 1); 3269 3270 for (;;) { 3271 if (!sp->state_ref_count || 3272 sp->lease_valid != NFS4_LEASE_VALID) { 3273 3274 kip_secs = MAX((sp->s_lease_time >> 1) - 3275 (3 * sp->propagation_delay.tv_sec), 1); 3276 3277 tick_delay = SEC_TO_TICK(kip_secs); 3278 3279 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3280 "nfs4_renew_lease_thread: no renew : thread " 3281 "wait %ld secs", kip_secs)); 3282 3283 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3284 "nfs4_renew_lease_thread: no renew : " 3285 "state_ref_count %d, lease_valid %d", 3286 sp->state_ref_count, sp->lease_valid)); 3287 3288 mutex_enter(&cpr_lock); 3289 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3290 mutex_exit(&cpr_lock); 3291 time_left = cv_reltimedwait(&sp->cv_thread_exit, 3292 &sp->s_lock, tick_delay, TR_CLOCK_TICK); 3293 mutex_enter(&cpr_lock); 3294 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3295 mutex_exit(&cpr_lock); 3296 3297 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3298 "nfs4_renew_lease_thread: no renew: " 3299 "time left %ld", time_left)); 3300 3301 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3302 goto die; 3303 continue; 3304 } 3305 3306 tmp_last_renewal_time = sp->last_renewal_time; 3307 3308 tmp_time = gethrestime_sec() - sp->last_renewal_time + 3309 (3 * sp->propagation_delay.tv_sec); 3310 3311 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3312 "nfs4_renew_lease_thread: tmp_time %ld, " 3313 "sp->last_renewal_time %ld", tmp_time, 3314 sp->last_renewal_time)); 3315 3316 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1); 3317 3318 tick_delay = SEC_TO_TICK(kip_secs); 3319 3320 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3321 "nfs4_renew_lease_thread: valid lease: sleep for %ld " 3322 "secs", kip_secs)); 3323 3324 mutex_enter(&cpr_lock); 3325 CALLB_CPR_SAFE_BEGIN(&cpr_info); 3326 mutex_exit(&cpr_lock); 3327 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock, 3328 tick_delay, TR_CLOCK_TICK); 3329 mutex_enter(&cpr_lock); 3330 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 3331 mutex_exit(&cpr_lock); 3332 3333 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3334 "nfs4_renew_lease_thread: valid lease: time left %ld :" 3335 "sp last_renewal_time %ld, nfs4_client_resumed %ld, " 3336 "tmp_last_renewal_time %ld", time_left, 3337 sp->last_renewal_time, nfs4_client_resumed, 3338 tmp_last_renewal_time)); 3339 3340 if (sp->s_thread_exit == NFS4_THREAD_EXIT) 3341 goto die; 3342 3343 if (tmp_last_renewal_time == sp->last_renewal_time || 3344 (nfs4_client_resumed != 0 && 3345 nfs4_client_resumed > sp->last_renewal_time)) { 3346 /* 3347 * Issue RENEW op since we haven't renewed the lease 3348 * since we slept. 
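 * Either no OTW call updated last_renewal_time while we slept, or the
 * system was suspended and resumed after the last renewal.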
3349 */
3350 tmp_now_time = gethrestime_sec();
3351 error = nfs4renew(sp);
3352 /*
3353 * Need to re-acquire sp's lock, nfs4renew()
3354 * relinquishes it.
3355 */
3356 mutex_enter(&sp->s_lock);
3357
3358 /*
3359 * See if someone changed s_thread_exit while we gave
3360 * up s_lock.
3361 */
3362 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3363 goto die;
3364
3365 if (!error) {
3366 /*
3367 * check to see if we implicitly renewed while
3368 * we waited for a reply for our RENEW call.
3369 */
3370 if (tmp_last_renewal_time ==
3371 sp->last_renewal_time) {
3372 /* no implicit renew came */
3373 sp->last_renewal_time = tmp_now_time;
3374 } else {
3375 NFS4_DEBUG(nfs4_client_lease_debug,
3376 (CE_NOTE, "renew_thread: did "
3377 "implicit renewal before reply "
3378 "from server for RENEW"));
3379 }
3380 } else {
3381 /* figure out error */
3382 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3383 "renew_thread: nfs4renew returned error"
3384 " %d", error));
3385 }
3386
3387 }
3388 }
3389
3390 die:
3391 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3392 "nfs4_renew_lease_thread: thread exiting"));
3393
3394 while (sp->s_otw_call_count != 0) {
3395 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3396 "nfs4_renew_lease_thread: waiting for outstanding "
3397 "otw calls to finish for sp 0x%p, current "
3398 "s_otw_call_count %d", (void *)sp,
3399 sp->s_otw_call_count));
3400 mutex_enter(&cpr_lock);
3401 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3402 mutex_exit(&cpr_lock);
3403 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3404 mutex_enter(&cpr_lock);
3405 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3406 mutex_exit(&cpr_lock);
3407 }
3408 mutex_exit(&sp->s_lock);
3409
3410 nfs4_server_rele(sp); /* free the thread's reference */
3411 nfs4_server_rele(sp); /* free the list's reference */
3412 sp = NULL;
3413
3414 done:
3415 mutex_enter(&cpr_lock);
3416 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */
3417 mutex_destroy(&cpr_lock);
3418
3419 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3420 "nfs4_renew_lease_thread: renew thread exit officially"));
3421
3422 zthread_exit();
3423 /* NOT REACHED */
3424 }
3425
3426 /*
3427 * Send out a RENEW op to the server.
3428 * Assumes sp is locked down.
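 * Returns with sp->s_lock dropped; the caller must re-acquire it.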
3429 */ 3430 static int 3431 nfs4renew(nfs4_server_t *sp) 3432 { 3433 COMPOUND4args_clnt args; 3434 COMPOUND4res_clnt res; 3435 nfs_argop4 argop[1]; 3436 int doqueue = 1; 3437 int rpc_error; 3438 cred_t *cr; 3439 mntinfo4_t *mi; 3440 timespec_t prop_time, after_time; 3441 int needrecov = FALSE; 3442 nfs4_recov_state_t recov_state; 3443 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3444 3445 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew")); 3446 3447 recov_state.rs_flags = 0; 3448 recov_state.rs_num_retry_despite_err = 0; 3449 3450 recov_retry: 3451 mi = sp->mntinfo4_list; 3452 VFS_HOLD(mi->mi_vfsp); 3453 mutex_exit(&sp->s_lock); 3454 ASSERT(mi != NULL); 3455 3456 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state); 3457 if (e.error) { 3458 VFS_RELE(mi->mi_vfsp); 3459 return (e.error); 3460 } 3461 3462 /* Check to see if we're dealing with a marked-dead sp */ 3463 mutex_enter(&sp->s_lock); 3464 if (sp->s_thread_exit == NFS4_THREAD_EXIT) { 3465 mutex_exit(&sp->s_lock); 3466 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3467 VFS_RELE(mi->mi_vfsp); 3468 return (0); 3469 } 3470 3471 /* Make sure mi hasn't changed on us */ 3472 if (mi != sp->mntinfo4_list) { 3473 /* Must drop sp's lock to avoid a recursive mutex enter */ 3474 mutex_exit(&sp->s_lock); 3475 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3476 VFS_RELE(mi->mi_vfsp); 3477 mutex_enter(&sp->s_lock); 3478 goto recov_retry; 3479 } 3480 mutex_exit(&sp->s_lock); 3481 3482 args.ctag = TAG_RENEW; 3483 3484 args.array_len = 1; 3485 args.array = argop; 3486 3487 argop[0].argop = OP_RENEW; 3488 3489 mutex_enter(&sp->s_lock); 3490 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid; 3491 cr = sp->s_cred; 3492 crhold(cr); 3493 mutex_exit(&sp->s_lock); 3494 3495 ASSERT(cr != NULL); 3496 3497 /* used to figure out RTT for sp */ 3498 gethrestime(&prop_time); 3499 3500 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3501 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first", 3502 (void*)sp)); 3503 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ", 3504 prop_time.tv_sec, prop_time.tv_nsec)); 3505 3506 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp, 3507 mntinfo4_t *, mi); 3508 3509 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3510 crfree(cr); 3511 3512 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp, 3513 mntinfo4_t *, mi); 3514 3515 gethrestime(&after_time); 3516 3517 mutex_enter(&sp->s_lock); 3518 sp->propagation_delay.tv_sec = 3519 MAX(1, after_time.tv_sec - prop_time.tv_sec); 3520 mutex_exit(&sp->s_lock); 3521 3522 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ", 3523 after_time.tv_sec, after_time.tv_nsec)); 3524 3525 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) { 3526 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3527 nfs4_delegreturn_all(sp); 3528 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3529 VFS_RELE(mi->mi_vfsp); 3530 /* 3531 * If the server returns CB_PATH_DOWN, it has renewed 3532 * the lease and informed us that the callback path is 3533 * down. Since the lease is renewed, just return 0 and 3534 * let the renew thread proceed as normal. 
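 * The delegations were returned above because the server cannot recall
 * them while the callback path is down.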
3535 */ 3536 return (0); 3537 } 3538 3539 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3540 if (!needrecov && e.error) { 3541 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3542 VFS_RELE(mi->mi_vfsp); 3543 return (e.error); 3544 } 3545 3546 rpc_error = e.error; 3547 3548 if (needrecov) { 3549 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3550 "nfs4renew: initiating recovery\n")); 3551 3552 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL, 3553 OP_RENEW, NULL, NULL, NULL) == FALSE) { 3554 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3555 VFS_RELE(mi->mi_vfsp); 3556 if (!e.error) 3557 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3558 mutex_enter(&sp->s_lock); 3559 goto recov_retry; 3560 } 3561 /* fall through for res.status case */ 3562 } 3563 3564 if (res.status) { 3565 if (res.status == NFS4ERR_LEASE_MOVED) { 3566 /*EMPTY*/ 3567 /* 3568 * XXX need to try every mntinfo4 in sp->mntinfo4_list 3569 * to renew the lease on that server 3570 */ 3571 } 3572 e.error = geterrno4(res.status); 3573 } 3574 3575 if (!rpc_error) 3576 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3577 3578 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov); 3579 3580 VFS_RELE(mi->mi_vfsp); 3581 3582 return (e.error); 3583 } 3584 3585 void 3586 nfs4_inc_state_ref_count(mntinfo4_t *mi) 3587 { 3588 nfs4_server_t *sp; 3589 3590 /* this locks down sp if it is found */ 3591 sp = find_nfs4_server(mi); 3592 3593 if (sp != NULL) { 3594 nfs4_inc_state_ref_count_nolock(sp, mi); 3595 mutex_exit(&sp->s_lock); 3596 nfs4_server_rele(sp); 3597 } 3598 } 3599 3600 /* 3601 * Bump the number of OPEN files (ie: those with state) so we know if this 3602 * nfs4_server has any state to maintain a lease for or not. 3603 * 3604 * Also, marks the nfs4_server's lease valid if it hasn't been done so already. 3605 */ 3606 void 3607 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3608 { 3609 ASSERT(mutex_owned(&sp->s_lock)); 3610 3611 sp->state_ref_count++; 3612 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3613 "nfs4_inc_state_ref_count: state_ref_count now %d", 3614 sp->state_ref_count)); 3615 3616 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED) 3617 sp->lease_valid = NFS4_LEASE_VALID; 3618 3619 /* 3620 * If this call caused the lease to be marked valid and/or 3621 * took the state_ref_count from 0 to 1, then start the time 3622 * on lease renewal. 3623 */ 3624 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1) 3625 sp->last_renewal_time = gethrestime_sec(); 3626 3627 /* update the number of open files for mi */ 3628 mi->mi_open_files++; 3629 } 3630 3631 void 3632 nfs4_dec_state_ref_count(mntinfo4_t *mi) 3633 { 3634 nfs4_server_t *sp; 3635 3636 /* this locks down sp if it is found */ 3637 sp = find_nfs4_server_all(mi, 1); 3638 3639 if (sp != NULL) { 3640 nfs4_dec_state_ref_count_nolock(sp, mi); 3641 mutex_exit(&sp->s_lock); 3642 nfs4_server_rele(sp); 3643 } 3644 } 3645 3646 /* 3647 * Decrement the number of OPEN files (ie: those with state) so we know if 3648 * this nfs4_server has any state to maintain a lease for or not. 
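 * The caller must hold sp->s_lock. If this closes the last open file and
 * MI4_REMOVE_ON_LAST_CLOSE is set, the mntinfo4 is removed from the server.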
3649 */ 3650 void 3651 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi) 3652 { 3653 ASSERT(mutex_owned(&sp->s_lock)); 3654 ASSERT(sp->state_ref_count != 0); 3655 sp->state_ref_count--; 3656 3657 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3658 "nfs4_dec_state_ref_count: state ref count now %d", 3659 sp->state_ref_count)); 3660 3661 mi->mi_open_files--; 3662 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3663 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x", 3664 mi->mi_open_files, mi->mi_flags)); 3665 3666 /* We don't have to hold the mi_lock to test mi_flags */ 3667 if (mi->mi_open_files == 0 && 3668 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) { 3669 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, 3670 "nfs4_dec_state_ref_count: remove mntinfo4 %p since " 3671 "we have closed the last open file", (void*)mi)); 3672 nfs4_remove_mi_from_server(mi, sp); 3673 } 3674 } 3675 3676 bool_t 3677 inlease(nfs4_server_t *sp) 3678 { 3679 bool_t result; 3680 3681 ASSERT(mutex_owned(&sp->s_lock)); 3682 3683 if (sp->lease_valid == NFS4_LEASE_VALID && 3684 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time) 3685 result = TRUE; 3686 else 3687 result = FALSE; 3688 3689 return (result); 3690 } 3691 3692 3693 /* 3694 * Return non-zero if the given nfs4_server_t is going through recovery. 3695 */ 3696 3697 int 3698 nfs4_server_in_recovery(nfs4_server_t *sp) 3699 { 3700 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER)); 3701 } 3702 3703 /* 3704 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the 3705 * first is less than, equal to, or greater than the second. 3706 */ 3707 3708 int 3709 sfh4cmp(const void *p1, const void *p2) 3710 { 3711 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1; 3712 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2; 3713 3714 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh)); 3715 } 3716 3717 /* 3718 * Create a table for shared filehandle objects. 3719 */ 3720 3721 void 3722 sfh4_createtab(avl_tree_t *tab) 3723 { 3724 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t), 3725 offsetof(nfs4_sharedfh_t, sfh_tree)); 3726 } 3727 3728 /* 3729 * Return a shared filehandle object for the given filehandle. The caller 3730 * is responsible for eventually calling sfh4_rele(). 3731 */ 3732 3733 nfs4_sharedfh_t * 3734 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key) 3735 { 3736 nfs4_sharedfh_t *sfh, *nsfh; 3737 avl_index_t where; 3738 nfs4_sharedfh_t skey; 3739 3740 if (!key) { 3741 skey.sfh_fh = *fh; 3742 key = &skey; 3743 } 3744 3745 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP); 3746 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len; 3747 /* 3748 * We allocate the largest possible filehandle size because it's 3749 * not that big, and it saves us from possibly having to resize the 3750 * buffer later. 
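 * If another thread has already inserted an entry for this filehandle,
 * the speculative allocation is freed and the existing entry is returned.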
3751 */ 3752 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); 3753 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len); 3754 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL); 3755 nsfh->sfh_refcnt = 1; 3756 nsfh->sfh_flags = SFH4_IN_TREE; 3757 nsfh->sfh_mi = mi; 3758 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)", 3759 (void *)nsfh)); 3760 3761 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0); 3762 sfh = avl_find(&mi->mi_filehandles, key, &where); 3763 if (sfh != NULL) { 3764 mutex_enter(&sfh->sfh_lock); 3765 sfh->sfh_refcnt++; 3766 mutex_exit(&sfh->sfh_lock); 3767 nfs_rw_exit(&mi->mi_fh_lock); 3768 /* free our speculative allocs */ 3769 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE); 3770 kmem_free(nsfh, sizeof (nfs4_sharedfh_t)); 3771 return (sfh); 3772 } 3773 3774 avl_insert(&mi->mi_filehandles, nsfh, where); 3775 nfs_rw_exit(&mi->mi_fh_lock); 3776 3777 return (nsfh); 3778 } 3779 3780 /* 3781 * Return a shared filehandle object for the given filehandle. The caller 3782 * is responsible for eventually calling sfh4_rele(). 3783 */ 3784 3785 nfs4_sharedfh_t * 3786 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi) 3787 { 3788 nfs4_sharedfh_t *sfh; 3789 nfs4_sharedfh_t key; 3790 3791 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE); 3792 3793 #ifdef DEBUG 3794 if (nfs4_sharedfh_debug) { 3795 nfs4_fhandle_t fhandle; 3796 3797 fhandle.fh_len = fh->nfs_fh4_len; 3798 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len); 3799 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:"); 3800 nfs4_printfhandle(&fhandle); 3801 } 3802 #endif 3803 3804 /* 3805 * If there's already an object for the given filehandle, bump the 3806 * reference count and return it. Otherwise, create a new object 3807 * and add it to the AVL tree. 3808 */ 3809 3810 key.sfh_fh = *fh; 3811 3812 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 3813 sfh = avl_find(&mi->mi_filehandles, &key, NULL); 3814 if (sfh != NULL) { 3815 mutex_enter(&sfh->sfh_lock); 3816 sfh->sfh_refcnt++; 3817 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3818 "sfh4_get: found existing %p, new refcnt=%d", 3819 (void *)sfh, sfh->sfh_refcnt)); 3820 mutex_exit(&sfh->sfh_lock); 3821 nfs_rw_exit(&mi->mi_fh_lock); 3822 return (sfh); 3823 } 3824 nfs_rw_exit(&mi->mi_fh_lock); 3825 3826 return (sfh4_put(fh, mi, &key)); 3827 } 3828 3829 /* 3830 * Get a reference to the given shared filehandle object. 3831 */ 3832 3833 void 3834 sfh4_hold(nfs4_sharedfh_t *sfh) 3835 { 3836 ASSERT(sfh->sfh_refcnt > 0); 3837 3838 mutex_enter(&sfh->sfh_lock); 3839 sfh->sfh_refcnt++; 3840 NFS4_DEBUG(nfs4_sharedfh_debug, 3841 (CE_NOTE, "sfh4_hold %p, new refcnt=%d", 3842 (void *)sfh, sfh->sfh_refcnt)); 3843 mutex_exit(&sfh->sfh_lock); 3844 } 3845 3846 /* 3847 * Release a reference to the given shared filehandle object and null out 3848 * the given pointer. 3849 */ 3850 3851 void 3852 sfh4_rele(nfs4_sharedfh_t **sfhpp) 3853 { 3854 mntinfo4_t *mi; 3855 nfs4_sharedfh_t *sfh = *sfhpp; 3856 3857 ASSERT(sfh->sfh_refcnt > 0); 3858 3859 mutex_enter(&sfh->sfh_lock); 3860 if (sfh->sfh_refcnt > 1) { 3861 sfh->sfh_refcnt--; 3862 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, 3863 "sfh4_rele %p, new refcnt=%d", 3864 (void *)sfh, sfh->sfh_refcnt)); 3865 mutex_exit(&sfh->sfh_lock); 3866 goto finish; 3867 } 3868 mutex_exit(&sfh->sfh_lock); 3869 3870 /* 3871 * Possibly the last reference, so get the lock for the table in 3872 * case it's time to remove the object from the table. 
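 * The reference count is re-checked after mi_fh_lock is acquired, since
 * another thread may have taken a new reference in the meantime.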
/*
 * Update the filehandle for the given shared filehandle object.
 */

int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */

void
sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
{
	mntinfo4_t *mi = sfh->sfh_mi;
	nfs4_sharedfh_t *dupsfh;
	avl_index_t where;
	nfs4_sharedfh_t key;

#ifdef DEBUG
	mutex_enter(&sfh->sfh_lock);
	ASSERT(sfh->sfh_refcnt > 0);
	mutex_exit(&sfh->sfh_lock);
#endif
	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);

	/*
	 * The basic plan is to remove the shared filehandle object from
	 * the table, update it to have the new filehandle, then reinsert
	 * it.
	 */

	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
	mutex_enter(&sfh->sfh_lock);
	if (sfh->sfh_flags & SFH4_IN_TREE) {
		avl_remove(&mi->mi_filehandles, sfh);
		sfh->sfh_flags &= ~SFH4_IN_TREE;
	}
	mutex_exit(&sfh->sfh_lock);
	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
	    sfh->sfh_fh.nfs_fh4_len);

	/*
	 * XXX If there is already a shared filehandle object with the new
	 * filehandle, we're in trouble, because the rnode code assumes
	 * that there is only one shared filehandle object for a given
	 * filehandle. So issue a warning (for read-write mounts only)
	 * and don't try to re-insert the given object into the table.
	 * Hopefully the given object will quickly go away and everyone
	 * will use the new object.
	 */
	key.sfh_fh = *newfh;
	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
	if (dupsfh != NULL) {
		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
			    "duplicate filehandle detected");
			sfh4_printfhandle(dupsfh);
		}
	} else {
		avl_insert(&mi->mi_filehandles, sfh, where);
		mutex_enter(&sfh->sfh_lock);
		sfh->sfh_flags |= SFH4_IN_TREE;
		mutex_exit(&sfh->sfh_lock);
	}
	nfs_rw_exit(&mi->mi_fh_lock);
}
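
/*
 * Illustrative note (not part of the build): sfh4_update() exists for
 * the case where the server starts answering with a different
 * filehandle for an object the client already tracks, for instance
 * when volatile filehandles expire and are replaced. A hypothetical
 * path that holds a reference to an rnode's shared filehandle and has
 * just decoded a replacement handle "newfh" would simply call:
 *
 *	sfh4_update(rp->r_fh, &newfh);
 *
 * after which every holder of that nfs4_sharedfh_t sees the new value;
 * no rnode relookup is required. "rp" and "newfh" are placeholder
 * names for this sketch.
 */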
/*
 * Copy out the current filehandle for the given shared filehandle object.
 */

void
sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
{
	mntinfo4_t *mi = sfh->sfh_mi;

	ASSERT(sfh->sfh_refcnt > 0);

	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
	nfs_rw_exit(&mi->mi_fh_lock);
}

/*
 * Print out the filehandle for the given shared filehandle object.
 */

void
sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
{
	nfs4_fhandle_t fhandle;

	sfh4_copyval(sfh, &fhandle);
	nfs4_printfhandle(&fhandle);
}
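
/*
 * Illustrative sketch (not compiled): because sfh4_update() can change
 * the embedded filehandle at any time, code that needs a private copy
 * of the current handle, e.g. for printing or comparison, snapshots it
 * under the table lock via sfh4_copyval() rather than reading
 * sfh->sfh_fh directly:
 *
 *	nfs4_fhandle_t fhandle;
 *
 *	sfh4_copyval(sfh, &fhandle);
 *	// fhandle.fh_buf / fhandle.fh_len are now a stable private copy
 *
 * "sfh" here stands for any held nfs4_sharedfh_t.
 */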
/*
 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0
 * if they're the same, +1 if the first is "greater" than the second. The
 * caller (or whoever's calling the AVL package) is responsible for
 * handling locking issues.
 */

static int
fncmp(const void *p1, const void *p2)
{
	const nfs4_fname_t *f1 = p1;
	const nfs4_fname_t *f2 = p2;
	int res;

	res = strcmp(f1->fn_name, f2->fn_name);
	/*
	 * The AVL package wants +/-1, not arbitrary positive or negative
	 * integers.
	 */
	if (res > 0)
		res = 1;
	else if (res < 0)
		res = -1;
	return (res);
}

/*
 * Get or create an fname with the given name, as a child of the given
 * fname. The caller is responsible for eventually releasing the reference
 * (fn_rele()). parent may be NULL.
 */

nfs4_fname_t *
fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
{
	nfs4_fname_t key;
	nfs4_fname_t *fnp;
	avl_index_t where;

	key.fn_name = name;

	/*
	 * If there's already an fname registered with the given name, bump
	 * its reference count and return it. Otherwise, create a new one
	 * and add it to the parent's AVL tree.
	 *
	 * The fname entry we are looking for must match both the name
	 * and the sfh stored in the fname.
	 */
again:
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		fnp = avl_find(&parent->fn_children, &key, &where);
		if (fnp != NULL) {
			/*
			 * This hold on fnp is released later, in case
			 * this is not the fnp we want.
			 */
			fn_hold(fnp);

			if (fnp->fn_sfh == sfh) {
				/*
				 * We have found our entry; return it
				 * with the hold just taken.
				 */
				mutex_exit(&parent->fn_lock);
				return (fnp);
			}

			/*
			 * We have found an entry that has a mismatching
			 * fn_sfh. This could be a stale entry due to a
			 * server-side rename. We will remove this entry
			 * and make sure no such entries exist.
			 */
			mutex_exit(&parent->fn_lock);
			mutex_enter(&fnp->fn_lock);
			if (fnp->fn_parent == parent) {
				/*
				 * Remove ourselves from parent's
				 * fn_children tree.
				 */
				mutex_enter(&parent->fn_lock);
				avl_remove(&parent->fn_children, fnp);
				mutex_exit(&parent->fn_lock);
				fn_rele(&fnp->fn_parent);
			}
			mutex_exit(&fnp->fn_lock);
			fn_rele(&fnp);
			goto again;
		}
	}

	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
	fnp->fn_parent = parent;
	if (parent != NULL)
		fn_hold(parent);
	fnp->fn_len = strlen(name);
	ASSERT(fnp->fn_len < MAXNAMELEN);
	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
	(void) strcpy(fnp->fn_name, name);
	fnp->fn_refcnt = 1;

	/*
	 * This hold on sfh is later released
	 * when we do the final fn_rele() on this fname.
	 */
	sfh4_hold(sfh);
	fnp->fn_sfh = sfh;

	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
	    offsetof(nfs4_fname_t, fn_tree));
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_get %p:%s, a new nfs4_fname_t!",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_insert(&parent->fn_children, fnp, where);
		mutex_exit(&parent->fn_lock);
	}

	return (fnp);
}

void
fn_hold(nfs4_fname_t *fnp)
{
	atomic_inc_32(&fnp->fn_refcnt);
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_hold %p:%s, new refcnt=%d",
	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
}

/*
 * Decrement the reference count of the given fname, and destroy it if its
 * reference count goes to zero. Nulls out the given pointer.
 */

void
fn_rele(nfs4_fname_t **fnpp)
{
	nfs4_fname_t *parent;
	uint32_t newref;
	nfs4_fname_t *fnp;

recur:
	fnp = *fnpp;
	*fnpp = NULL;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		mutex_enter(&parent->fn_lock);	/* prevent new references */
	newref = atomic_dec_32_nv(&fnp->fn_refcnt);
	if (newref > 0) {
		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
		    "fn_rele %p:%s, new refcnt=%d",
		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
		if (parent != NULL)
			mutex_exit(&parent->fn_lock);
		mutex_exit(&fnp->fn_lock);
		return;
	}

	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_rele %p:%s, last reference, deleting...",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
	}
	kmem_free(fnp->fn_name, fnp->fn_len + 1);
	sfh4_rele(&fnp->fn_sfh);
	mutex_destroy(&fnp->fn_lock);
	avl_destroy(&fnp->fn_children);
	kmem_free(fnp, sizeof (nfs4_fname_t));
	/*
	 * Recursively fn_rele the parent.
	 * Use goto instead of a recursive call to avoid stack overflow.
	 */
	if (parent != NULL) {
		fnpp = &parent;
		goto recur;
	}
}
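
/*
 * Usage sketch (illustrative only, not compiled): fnames form a tree of
 * reference-counted path components. A hypothetical caller that already
 * holds the fname of a directory ("dfnp") and a shared filehandle for a
 * child it has just looked up ("sfh") would track the child like this:
 *
 *	nfs4_fname_t *fnp;
 *
 *	fnp = fn_get(dfnp, "foo", sfh);	// find or create child "foo"
 *	// ... stash fnp in the child's rnode, call fn_path(fnp), etc. ...
 *	fn_rele(&fnp);			// drop the reference, NULLs fnp
 *
 * The final fn_rele() frees the fname, drops its hold on sfh, and
 * releases the reference it kept on its parent. "dfnp", "foo", and
 * "sfh" are placeholders for this sketch.
 */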
/*
 * Returns the single component name of the given fname, in a MAXNAMELEN
 * string buffer, which the caller is responsible for freeing. Note that
 * the name may become invalid as a result of fn_move().
 */

char *
fn_name(nfs4_fname_t *fnp)
{
	char *name;

	ASSERT(fnp->fn_len < MAXNAMELEN);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	mutex_enter(&fnp->fn_lock);
	(void) strcpy(name, fnp->fn_name);
	mutex_exit(&fnp->fn_lock);

	return (name);
}


/*
 * fn_path_realloc
 *
 * This function, used only by fn_path, constructs a new string that
 * looks like "prepend" + "/" + "current" by allocating a new string
 * and freeing the old one.
 */
static void
fn_path_realloc(char **curses, char *prepend)
{
	int len, curlen = 0;
	char *news;

	if (*curses == NULL) {
		/*
		 * Prime the pump, allocate just the
		 * space for prepend and return that.
		 */
		len = strlen(prepend) + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
	} else {
		/*
		 * Allocate the space for the new string;
		 * the +1 +1 is for the "/" and the terminating
		 * null byte at the end of it all.
		 */
		curlen = strlen(*curses);
		len = curlen + strlen(prepend) + 1 + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
		(void) strcat(news, "/");
		(void) strcat(news, *curses);
		kmem_free(*curses, curlen + 1);
	}
	*curses = news;
}

/*
 * Returns the path name (starting from the fs root) for the given fname.
 * The caller is responsible for freeing. Note that the path may be or
 * become invalid as a result of fn_move().
 */

char *
fn_path(nfs4_fname_t *fnp)
{
	char *path;
	nfs4_fname_t *nextfnp;

	if (fnp == NULL)
		return (NULL);

	path = NULL;

	/* walk up the tree constructing the pathname. */

	fn_hold(fnp);			/* adjust for later rele */
	do {
		mutex_enter(&fnp->fn_lock);
		/*
		 * Add fn_name in front of the current path
		 */
		fn_path_realloc(&path, fnp->fn_name);
		nextfnp = fnp->fn_parent;
		if (nextfnp != NULL)
			fn_hold(nextfnp);
		mutex_exit(&fnp->fn_lock);
		fn_rele(&fnp);
		fnp = nextfnp;
	} while (fnp != NULL);

	return (path);
}

/*
 * Return a reference to the parent of the given fname, which the caller is
 * responsible for eventually releasing.
 */

nfs4_fname_t *
fn_parent(nfs4_fname_t *fnp)
{
	nfs4_fname_t *parent;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		fn_hold(parent);
	mutex_exit(&fnp->fn_lock);

	return (parent);
}
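
/*
 * Worked example (illustrative only): fn_path() builds the path from
 * the leaf upward, calling fn_path_realloc() once per component. For a
 * hypothetical fname chain root -> "a" -> "b" -> "c", starting at the
 * fname for "c":
 *
 *	iteration 1:  path = "c"
 *	iteration 2:  path = "b/c"
 *	iteration 3:  path = "a/b/c"
 *	iteration 4:  path = "<root name>/a/b/c"
 *
 * Each step allocates a new buffer and frees the previous one, so the
 * total copying cost is roughly quadratic in the path length, which is
 * acceptable for the short paths involved. The component names above
 * are placeholders.
 */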
/*
 * Update fnp so that its parent is newparent and its name is newname.
 */

void
fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
{
	nfs4_fname_t *parent, *tmpfnp;
	ssize_t newlen;
	nfs4_fname_t key;
	avl_index_t where;

	/*
	 * This assert exists to catch the client trying to rename
	 * a dir to be a child of itself. This happened at a recent
	 * bakeoff against a 3rd party (broken) server which allowed
	 * the rename to succeed. If it trips, it means that either:
	 * a) the code in nfs4rename that detects this case is broken, or
	 * b) the server is broken (since it allowed the bogus rename).
	 *
	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
	 * panic below, from: mutex_enter(&newparent->fn_lock);
	 */
	ASSERT(fnp != newparent);

	/*
	 * Remove fnp from its current parent, change its name, then add it
	 * to newparent. It might happen that fnp was replaced by another
	 * nfs4_fname_t with the same fn_name in parent->fn_children.
	 * In such a case, fnp->fn_parent is NULL and we skip the removal
	 * of fnp from its current parent.
	 */
	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
		fn_rele(&fnp->fn_parent);
	}

	newlen = strlen(newname);
	if (newlen != fnp->fn_len) {
		ASSERT(newlen < MAXNAMELEN);
		kmem_free(fnp->fn_name, fnp->fn_len + 1);
		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
		fnp->fn_len = newlen;
	}
	(void) strcpy(fnp->fn_name, newname);

again:
	mutex_enter(&newparent->fn_lock);
	key.fn_name = fnp->fn_name;
	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
	if (tmpfnp != NULL) {
		/*
		 * This could be due to a file that was unlinked while
		 * open, or perhaps the rnode is in the free list. Remove
		 * it from newparent and let it go away on its own. The
		 * contorted code is to deal with lock order issues and
		 * race conditions.
		 */
		fn_hold(tmpfnp);
		mutex_exit(&newparent->fn_lock);
		mutex_enter(&tmpfnp->fn_lock);
		if (tmpfnp->fn_parent == newparent) {
			mutex_enter(&newparent->fn_lock);
			avl_remove(&newparent->fn_children, tmpfnp);
			mutex_exit(&newparent->fn_lock);
			fn_rele(&tmpfnp->fn_parent);
		}
		mutex_exit(&tmpfnp->fn_lock);
		fn_rele(&tmpfnp);
		goto again;
	}
	fnp->fn_parent = newparent;
	fn_hold(newparent);
	avl_insert(&newparent->fn_children, fnp, where);
	mutex_exit(&newparent->fn_lock);
	mutex_exit(&fnp->fn_lock);
}

#ifdef DEBUG
/*
 * Return non-zero if the type information makes sense for the given vnode.
 * Otherwise panic.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	if (nfs4_vtype_debug && vp->v_type != VNON &&
	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
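
/*
 * Illustrative note (not part of the build): nfs4_consistent_type()
 * always returns 1 so that callers on DEBUG kernels can wrap it in an
 * assertion at points where the vnode and its cached attributes are
 * expected to agree, roughly:
 *
 *	ASSERT(nfs4_consistent_type(vp));
 *
 * On non-DEBUG kernels the function is compiled out and ASSERT is a
 * no-op, so such checks cost nothing in production builds.
 */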