1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 28 * All Rights Reserved 29 */ 30 31 /* 32 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 33 * Copyright (c) 2017 by Delphix. All rights reserved. 
34 */ 35 36 #include <sys/param.h> 37 #include <sys/types.h> 38 #include <sys/systm.h> 39 #include <sys/cred.h> 40 #include <sys/proc.h> 41 #include <sys/user.h> 42 #include <sys/time.h> 43 #include <sys/buf.h> 44 #include <sys/vfs.h> 45 #include <sys/vnode.h> 46 #include <sys/socket.h> 47 #include <sys/uio.h> 48 #include <sys/tiuser.h> 49 #include <sys/swap.h> 50 #include <sys/errno.h> 51 #include <sys/debug.h> 52 #include <sys/kmem.h> 53 #include <sys/kstat.h> 54 #include <sys/cmn_err.h> 55 #include <sys/vtrace.h> 56 #include <sys/session.h> 57 #include <sys/dnlc.h> 58 #include <sys/bitmap.h> 59 #include <sys/acl.h> 60 #include <sys/ddi.h> 61 #include <sys/pathname.h> 62 #include <sys/flock.h> 63 #include <sys/dirent.h> 64 #include <sys/flock.h> 65 #include <sys/callb.h> 66 #include <sys/sdt.h> 67 68 #include <vm/pvn.h> 69 70 #include <rpc/types.h> 71 #include <rpc/xdr.h> 72 #include <rpc/auth.h> 73 #include <rpc/rpcsec_gss.h> 74 #include <rpc/clnt.h> 75 76 #include <nfs/nfs.h> 77 #include <nfs/nfs_clnt.h> 78 #include <nfs/nfs_acl.h> 79 80 #include <nfs/nfs4.h> 81 #include <nfs/rnode4.h> 82 #include <nfs/nfs4_clnt.h> 83 84 /* 85 * The hash queues for the access to active and cached rnodes 86 * are organized as doubly linked lists. A reader/writer lock 87 * for each hash bucket is used to control access and to synchronize 88 * lookups, additions, and deletions from the hash queue. 89 * 90 * The rnode freelist is organized as a doubly linked list with 91 * a head pointer. Additions and deletions are synchronized via 92 * a single mutex. 93 * 94 * In order to add an rnode to the free list, it must be hashed into 95 * a hash queue and the exclusive lock to the hash queue be held. 96 * If an rnode is not hashed into a hash queue, then it is destroyed 97 * because it represents no valuable information that can be reused 98 * about the file. 
The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
124 * 125 * The lock ordering is: 126 * 127 * hash bucket lock -> vnode lock 128 * hash bucket lock -> freelist lock -> r_statelock 129 */ 130 r4hashq_t *rtable4; 131 132 static kmutex_t rp4freelist_lock; 133 static rnode4_t *rp4freelist = NULL; 134 static long rnode4_new = 0; 135 int rtable4size; 136 static int rtable4mask; 137 static struct kmem_cache *rnode4_cache; 138 static int rnode4_hashlen = 4; 139 140 static void r4inactive(rnode4_t *, cred_t *); 141 static vnode_t *make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *, 142 struct vnodeops *, 143 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 144 cred_t *), 145 int *, cred_t *); 146 static void rp4_rmfree(rnode4_t *); 147 int nfs4_free_data_reclaim(rnode4_t *); 148 static int nfs4_active_data_reclaim(rnode4_t *); 149 static int nfs4_free_reclaim(void); 150 static int nfs4_active_reclaim(void); 151 static int nfs4_rnode_reclaim(void); 152 static void nfs4_reclaim(void *); 153 static int isrootfh(nfs4_sharedfh_t *, rnode4_t *); 154 static void uninit_rnode4(rnode4_t *); 155 static void destroy_rnode4(rnode4_t *); 156 static void r4_stub_set(rnode4_t *, nfs4_stub_type_t); 157 158 #ifdef DEBUG 159 static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */ 160 static int nfs4_rnode_debug = 0; 161 /* if nonzero, kmem_cache_free() rnodes rather than place on freelist */ 162 static int nfs4_rnode_nofreelist = 0; 163 /* give messages on colliding shared filehandles */ 164 static void r4_dup_check(rnode4_t *, vfs_t *); 165 #endif 166 167 /* 168 * If the vnode has pages, run the list and check for any that are 169 * still dangling. We call this routine before putting an rnode on 170 * the free list. 
171 */ 172 static int 173 nfs4_dross_pages(vnode_t *vp) 174 { 175 page_t *pp; 176 kmutex_t *vphm; 177 178 vphm = page_vnode_mutex(vp); 179 mutex_enter(vphm); 180 if ((pp = vp->v_pages) != NULL) { 181 do { 182 if (pp->p_hash != PVN_VPLIST_HASH_TAG && 183 pp->p_fsdata != C_NOCOMMIT) { 184 mutex_exit(vphm); 185 return (1); 186 } 187 } while ((pp = pp->p_vpnext) != vp->v_pages); 188 } 189 mutex_exit(vphm); 190 191 return (0); 192 } 193 194 /* 195 * Flush any pages left on this rnode. 196 */ 197 static void 198 r4flushpages(rnode4_t *rp, cred_t *cr) 199 { 200 vnode_t *vp; 201 int error; 202 203 /* 204 * Before freeing anything, wait until all asynchronous 205 * activity is done on this rnode. This will allow all 206 * asynchronous read ahead and write behind i/o's to 207 * finish. 208 */ 209 mutex_enter(&rp->r_statelock); 210 while (rp->r_count > 0) 211 cv_wait(&rp->r_cv, &rp->r_statelock); 212 mutex_exit(&rp->r_statelock); 213 214 /* 215 * Flush and invalidate all pages associated with the vnode. 216 */ 217 vp = RTOV4(rp); 218 if (nfs4_has_pages(vp)) { 219 ASSERT(vp->v_type != VCHR); 220 if ((rp->r_flags & R4DIRTY) && !rp->r_error) { 221 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 222 if (error && (error == ENOSPC || error == EDQUOT)) { 223 mutex_enter(&rp->r_statelock); 224 if (!rp->r_error) 225 rp->r_error = error; 226 mutex_exit(&rp->r_statelock); 227 } 228 } 229 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 230 } 231 } 232 233 /* 234 * Free the resources associated with an rnode. 235 */ 236 static void 237 r4inactive(rnode4_t *rp, cred_t *cr) 238 { 239 vnode_t *vp; 240 char *contents; 241 int size; 242 vsecattr_t *vsp; 243 vnode_t *xattr; 244 245 r4flushpages(rp, cr); 246 247 vp = RTOV4(rp); 248 249 /* 250 * Free any held caches which may be 251 * associated with this rnode. 
252 */ 253 mutex_enter(&rp->r_statelock); 254 contents = rp->r_symlink.contents; 255 size = rp->r_symlink.size; 256 rp->r_symlink.contents = NULL; 257 vsp = rp->r_secattr; 258 rp->r_secattr = NULL; 259 xattr = rp->r_xattr_dir; 260 rp->r_xattr_dir = NULL; 261 mutex_exit(&rp->r_statelock); 262 263 /* 264 * Free the access cache entries. 265 */ 266 (void) nfs4_access_purge_rp(rp); 267 268 /* 269 * Free the readdir cache entries. 270 */ 271 nfs4_purge_rddir_cache(vp); 272 273 /* 274 * Free the symbolic link cache. 275 */ 276 if (contents != NULL) { 277 278 kmem_free((void *)contents, size); 279 } 280 281 /* 282 * Free any cached ACL. 283 */ 284 if (vsp != NULL) 285 nfs4_acl_free_cache(vsp); 286 287 /* 288 * Release the cached xattr_dir 289 */ 290 if (xattr != NULL) 291 VN_RELE(xattr); 292 } 293 294 /* 295 * We have seen a case that the fh passed in is for "." which 296 * should be a VROOT node, however, the fh is different from the 297 * root fh stored in the mntinfo4_t. The invalid fh might be 298 * from a misbehaved server and will panic the client system at 299 * a later time. To avoid the panic, we drop the bad fh, use 300 * the root fh from mntinfo4_t, and print an error message 301 * for attention. 302 */ 303 nfs4_sharedfh_t * 304 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi, 305 int *wasbad) 306 { 307 char *s; 308 309 *wasbad = 0; 310 s = fn_name(nm); 311 ASSERT(strcmp(s, "..") != 0); 312 313 if ((s[0] == '.' 
&& s[1] == '\0') && fh && 314 !SFH4_SAME(mi->mi_rootfh, fh)) { 315 #ifdef DEBUG 316 nfs4_fhandle_t fhandle; 317 318 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 319 "Server %s returns a different " 320 "root filehandle for the path %s:", 321 mi->mi_curr_serv->sv_hostname, 322 mi->mi_curr_serv->sv_path); 323 324 /* print the bad fh */ 325 fhandle.fh_len = fh->sfh_fh.nfs_fh4_len; 326 bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 327 fhandle.fh_len); 328 nfs4_printfhandle(&fhandle); 329 330 /* print mi_rootfh */ 331 fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len; 332 bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 333 fhandle.fh_len); 334 nfs4_printfhandle(&fhandle); 335 #endif 336 /* use mi_rootfh instead; fh will be rele by the caller */ 337 fh = mi->mi_rootfh; 338 *wasbad = 1; 339 } 340 341 kmem_free(s, MAXNAMELEN); 342 return (fh); 343 } 344 345 void 346 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode, 347 hrtime_t t, cred_t *cr, int index) 348 { 349 int is_stub; 350 vattr_t *attr; 351 /* 352 * Don't add to attrcache if time overflow, but 353 * no need to check because either attr is null or the time 354 * values in it were processed by nfs4_time_ntov(), which checks 355 * for time overflows. 356 */ 357 attr = garp ? &garp->n4g_va : NULL; 358 359 if (attr) { 360 if (!newnode) { 361 rw_exit(&rtable4[index].r_lock); 362 #ifdef DEBUG 363 if (vp->v_type != attr->va_type && 364 vp->v_type != VNON && attr->va_type != VNON) { 365 zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN, 366 "makenfs4node: type (%d) doesn't " 367 "match type of found node at %p (%d)", 368 attr->va_type, (void *)vp, vp->v_type); 369 } 370 #endif 371 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 372 } else { 373 rnode4_t *rp = VTOR4(vp); 374 375 vp->v_type = attr->va_type; 376 vp->v_rdev = attr->va_rdev; 377 378 /* 379 * Turn this object into a "stub" object if we 380 * crossed an underlying server fs boundary. 
381 * To make this check, during mount we save the 382 * fsid of the server object being mounted. 383 * Here we compare this object's server fsid 384 * with the fsid we saved at mount. If they 385 * are different, we crossed server fs boundary. 386 * 387 * The stub type is set (or not) at rnode 388 * creation time and it never changes for life 389 * of the rnode. 390 * 391 * This stub will be for a mirror-mount, rather than 392 * a referral (the latter also sets R4SRVSTUB). 393 * 394 * The stub type is also set during RO failover, 395 * nfs4_remap_file(). 396 * 397 * We don't bother with taking r_state_lock to 398 * set the stub type because this is a new rnode 399 * and we're holding the hash bucket r_lock RW_WRITER. 400 * No other thread could have obtained access 401 * to this rnode. 402 */ 403 is_stub = 0; 404 if (garp->n4g_fsid_valid) { 405 fattr4_fsid ga_fsid = garp->n4g_fsid; 406 servinfo4_t *svp = rp->r_server; 407 408 rp->r_srv_fsid = ga_fsid; 409 410 (void) nfs_rw_enter_sig(&svp->sv_lock, 411 RW_READER, 0); 412 if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid)) 413 is_stub = 1; 414 nfs_rw_exit(&svp->sv_lock); 415 } 416 417 if (is_stub) 418 r4_stub_mirrormount(rp); 419 else 420 r4_stub_none(rp); 421 422 /* Can not cache partial attr */ 423 if (attr->va_mask == AT_ALL) 424 nfs4_attrcache_noinval(vp, garp, t); 425 else 426 PURGE_ATTRCACHE4(vp); 427 428 rw_exit(&rtable4[index].r_lock); 429 } 430 } else { 431 if (newnode) { 432 PURGE_ATTRCACHE4(vp); 433 } 434 rw_exit(&rtable4[index].r_lock); 435 } 436 } 437 438 /* 439 * Find or create an rnode based primarily on filehandle. To be 440 * used when dvp (vnode for parent directory) is not available; 441 * otherwise, makenfs4node() should be used. 442 * 443 * The nfs4_fname_t argument *npp is consumed and nulled out. 
444 */ 445 446 vnode_t * 447 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh, 448 nfs4_fname_t **npp, nfs4_ga_res_t *garp, 449 mntinfo4_t *mi, cred_t *cr, hrtime_t t) 450 { 451 vfs_t *vfsp = mi->mi_vfsp; 452 int newnode = 0; 453 vnode_t *vp; 454 rnode4_t *rp; 455 svnode_t *svp; 456 nfs4_fname_t *name, *svpname; 457 int index; 458 459 ASSERT(npp && *npp); 460 name = *npp; 461 *npp = NULL; 462 463 index = rtable4hash(sfh); 464 rw_enter(&rtable4[index].r_lock, RW_READER); 465 466 vp = make_rnode4(sfh, &rtable4[index], vfsp, 467 nfs4_vnodeops, nfs4_putapage, &newnode, cr); 468 469 svp = VTOSV(vp); 470 rp = VTOR4(vp); 471 if (newnode) { 472 svp->sv_forw = svp->sv_back = svp; 473 svp->sv_name = name; 474 if (psfh != NULL) 475 sfh4_hold(psfh); 476 svp->sv_dfh = psfh; 477 } else { 478 /* 479 * It is possible that due to a server 480 * side rename fnames have changed. 481 * update the fname here. 482 */ 483 mutex_enter(&rp->r_svlock); 484 svpname = svp->sv_name; 485 if (svp->sv_name != name) { 486 svp->sv_name = name; 487 mutex_exit(&rp->r_svlock); 488 fn_rele(&svpname); 489 } else { 490 mutex_exit(&rp->r_svlock); 491 fn_rele(&name); 492 } 493 } 494 495 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 496 r4_do_attrcache(vp, garp, newnode, t, cr, index); 497 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 498 499 return (vp); 500 } 501 502 /* 503 * Find or create a vnode for the given filehandle, filesystem, parent, and 504 * name. The reference to nm is consumed, so the caller must first do an 505 * fn_hold() if it wants to continue using nm after this call. 
506 */ 507 vnode_t * 508 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp, 509 hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm) 510 { 511 vnode_t *vp; 512 int newnode; 513 int index; 514 mntinfo4_t *mi = VFTOMI4(vfsp); 515 int had_badfh = 0; 516 rnode4_t *rp; 517 518 ASSERT(dvp != NULL); 519 520 fh = badrootfh_check(fh, nm, mi, &had_badfh); 521 522 index = rtable4hash(fh); 523 rw_enter(&rtable4[index].r_lock, RW_READER); 524 525 /* 526 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 527 */ 528 vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops, 529 nfs4_putapage, &newnode, cr); 530 531 rp = VTOR4(vp); 532 sv_activate(&vp, dvp, &nm, newnode); 533 if (dvp->v_flag & V_XATTRDIR) { 534 mutex_enter(&rp->r_statelock); 535 rp->r_flags |= R4ISXATTR; 536 mutex_exit(&rp->r_statelock); 537 } 538 539 /* if getting a bad file handle, do not cache the attributes. */ 540 if (had_badfh) { 541 rw_exit(&rtable4[index].r_lock); 542 return (vp); 543 } 544 545 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 546 r4_do_attrcache(vp, garp, newnode, t, cr, index); 547 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 548 549 return (vp); 550 } 551 552 /* 553 * Hash on address of filehandle object. 554 * XXX totally untuned. 555 */ 556 557 int 558 rtable4hash(nfs4_sharedfh_t *fh) 559 { 560 return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask); 561 } 562 563 /* 564 * Find or create the vnode for the given filehandle and filesystem. 565 * *newnode is set to zero if the vnode already existed; non-zero if it had 566 * to be created. 567 * 568 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 
569 */ 570 571 static vnode_t * 572 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp, 573 struct vnodeops *vops, 574 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 575 int *newnode, cred_t *cr) 576 { 577 rnode4_t *rp; 578 rnode4_t *trp; 579 vnode_t *vp; 580 mntinfo4_t *mi; 581 582 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 583 584 mi = VFTOMI4(vfsp); 585 586 start: 587 if ((rp = r4find(rhtp, fh, vfsp)) != NULL) { 588 vp = RTOV4(rp); 589 *newnode = 0; 590 return (vp); 591 } 592 rw_exit(&rhtp->r_lock); 593 594 mutex_enter(&rp4freelist_lock); 595 596 if (rp4freelist != NULL && rnode4_new >= nrnode) { 597 rp = rp4freelist; 598 rp4_rmfree(rp); 599 mutex_exit(&rp4freelist_lock); 600 601 vp = RTOV4(rp); 602 603 if (rp->r_flags & R4HASHED) { 604 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 605 mutex_enter(&vp->v_lock); 606 if (vp->v_count > 1) { 607 VN_RELE_LOCKED(vp); 608 mutex_exit(&vp->v_lock); 609 rw_exit(&rp->r_hashq->r_lock); 610 rw_enter(&rhtp->r_lock, RW_READER); 611 goto start; 612 } 613 mutex_exit(&vp->v_lock); 614 rp4_rmhash_locked(rp); 615 rw_exit(&rp->r_hashq->r_lock); 616 } 617 618 r4inactive(rp, cr); 619 620 mutex_enter(&vp->v_lock); 621 if (vp->v_count > 1) { 622 VN_RELE_LOCKED(vp); 623 mutex_exit(&vp->v_lock); 624 rw_enter(&rhtp->r_lock, RW_READER); 625 goto start; 626 } 627 mutex_exit(&vp->v_lock); 628 vn_invalid(vp); 629 630 /* 631 * destroy old locks before bzero'ing and 632 * recreating the locks below. 633 */ 634 uninit_rnode4(rp); 635 636 /* 637 * Make sure that if rnode is recycled then 638 * VFS count is decremented properly before 639 * reuse. 
640 */ 641 VFS_RELE(vp->v_vfsp); 642 vn_reinit(vp); 643 } else { 644 vnode_t *new_vp; 645 646 mutex_exit(&rp4freelist_lock); 647 648 rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP); 649 new_vp = vn_alloc(KM_SLEEP); 650 651 atomic_inc_ulong((ulong_t *)&rnode4_new); 652 #ifdef DEBUG 653 clstat4_debug.nrnode.value.ui64++; 654 #endif 655 vp = new_vp; 656 } 657 658 bzero(rp, sizeof (*rp)); 659 rp->r_vnode = vp; 660 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 661 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 662 mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL); 663 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 664 mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL); 665 mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL); 666 rp->created_v4 = 0; 667 list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t), 668 offsetof(nfs4_open_stream_t, os_node)); 669 rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head; 670 rp->r_lo_head.lo_next_rnode = &rp->r_lo_head; 671 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 672 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 673 rp->r_flags = R4READDIRWATTR; 674 rp->r_fh = fh; 675 rp->r_hashq = rhtp; 676 sfh4_hold(rp->r_fh); 677 rp->r_server = mi->mi_curr_serv; 678 rp->r_deleg_type = OPEN_DELEGATE_NONE; 679 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE; 680 nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL); 681 682 rddir4_cache_create(rp); 683 rp->r_putapage = putapage; 684 vn_setops(vp, vops); 685 vp->v_data = (caddr_t)rp; 686 vp->v_vfsp = vfsp; 687 VFS_HOLD(vfsp); 688 vp->v_type = VNON; 689 vp->v_flag |= VMODSORT; 690 if (isrootfh(fh, rp)) 691 vp->v_flag = VROOT; 692 vn_exists(vp); 693 694 /* 695 * There is a race condition if someone else 696 * alloc's the rnode while no locks are held, so we 697 * check again and recover if found. 
698 */ 699 rw_enter(&rhtp->r_lock, RW_WRITER); 700 if ((trp = r4find(rhtp, fh, vfsp)) != NULL) { 701 vp = RTOV4(trp); 702 *newnode = 0; 703 rw_exit(&rhtp->r_lock); 704 rp4_addfree(rp, cr); 705 rw_enter(&rhtp->r_lock, RW_READER); 706 return (vp); 707 } 708 rp4_addhash(rp); 709 *newnode = 1; 710 return (vp); 711 } 712 713 static void 714 uninit_rnode4(rnode4_t *rp) 715 { 716 vnode_t *vp = RTOV4(rp); 717 718 ASSERT(rp != NULL); 719 ASSERT(vp != NULL); 720 ASSERT(vp->v_count == 1); 721 ASSERT(rp->r_count == 0); 722 ASSERT(rp->r_mapcnt == 0); 723 if (rp->r_flags & R4LODANGLERS) { 724 nfs4_flush_lock_owners(rp); 725 } 726 ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head); 727 ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head); 728 ASSERT(!(rp->r_flags & R4HASHED)); 729 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 730 nfs4_clear_open_streams(rp); 731 list_destroy(&rp->r_open_streams); 732 733 /* 734 * Destroy the rddir cache first since we need to grab the r_statelock. 735 */ 736 mutex_enter(&rp->r_statelock); 737 rddir4_cache_destroy(rp); 738 mutex_exit(&rp->r_statelock); 739 sv_uninit(&rp->r_svnode); 740 sfh4_rele(&rp->r_fh); 741 nfs_rw_destroy(&rp->r_rwlock); 742 nfs_rw_destroy(&rp->r_lkserlock); 743 mutex_destroy(&rp->r_statelock); 744 mutex_destroy(&rp->r_statev4_lock); 745 mutex_destroy(&rp->r_os_lock); 746 cv_destroy(&rp->r_cv); 747 cv_destroy(&rp->r_commit.c_cv); 748 nfs_rw_destroy(&rp->r_deleg_recall_lock); 749 if (rp->r_flags & R4DELMAPLIST) 750 list_destroy(&rp->r_indelmap); 751 } 752 753 /* 754 * Put an rnode on the free list. 755 * 756 * Rnodes which were allocated above and beyond the normal limit 757 * are immediately freed. 
758 */ 759 void 760 rp4_addfree(rnode4_t *rp, cred_t *cr) 761 { 762 vnode_t *vp; 763 vnode_t *xattr; 764 struct vfs *vfsp; 765 766 vp = RTOV4(rp); 767 ASSERT(vp->v_count >= 1); 768 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 769 770 /* 771 * If we have too many rnodes allocated and there are no 772 * references to this rnode, or if the rnode is no longer 773 * accessible by it does not reside in the hash queues, 774 * or if an i/o error occurred while writing to the file, 775 * then just free it instead of putting it on the rnode 776 * freelist. 777 */ 778 vfsp = vp->v_vfsp; 779 if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) || 780 #ifdef DEBUG 781 (nfs4_rnode_nofreelist != 0) || 782 #endif 783 rp->r_error || (rp->r_flags & R4RECOVERR) || 784 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) { 785 if (rp->r_flags & R4HASHED) { 786 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 787 mutex_enter(&vp->v_lock); 788 if (vp->v_count > 1) { 789 VN_RELE_LOCKED(vp); 790 mutex_exit(&vp->v_lock); 791 rw_exit(&rp->r_hashq->r_lock); 792 return; 793 } 794 mutex_exit(&vp->v_lock); 795 rp4_rmhash_locked(rp); 796 rw_exit(&rp->r_hashq->r_lock); 797 } 798 799 /* 800 * Make sure we don't have a delegation on this rnode 801 * before destroying it. 802 */ 803 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 804 (void) nfs4delegreturn(rp, 805 NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN); 806 } 807 808 r4inactive(rp, cr); 809 810 /* 811 * Recheck the vnode reference count. We need to 812 * make sure that another reference has not been 813 * acquired while we were not holding v_lock. The 814 * rnode is not in the rnode hash queues; one 815 * way for a reference to have been acquired 816 * is for a VOP_PUTPAGE because the rnode was marked 817 * with R4DIRTY or for a modified page. This 818 * reference may have been acquired before our call 819 * to r4inactive. 
The i/o may have been completed, 820 * thus allowing r4inactive to complete, but the 821 * reference to the vnode may not have been released 822 * yet. In any case, the rnode can not be destroyed 823 * until the other references to this vnode have been 824 * released. The other references will take care of 825 * either destroying the rnode or placing it on the 826 * rnode freelist. If there are no other references, 827 * then the rnode may be safely destroyed. 828 */ 829 mutex_enter(&vp->v_lock); 830 if (vp->v_count > 1) { 831 VN_RELE_LOCKED(vp); 832 mutex_exit(&vp->v_lock); 833 return; 834 } 835 mutex_exit(&vp->v_lock); 836 837 destroy_rnode4(rp); 838 return; 839 } 840 841 /* 842 * Lock the hash queue and then recheck the reference count 843 * to ensure that no other threads have acquired a reference 844 * to indicate that the rnode should not be placed on the 845 * freelist. If another reference has been acquired, then 846 * just release this one and let the other thread complete 847 * the processing of adding this rnode to the freelist. 848 */ 849 again: 850 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 851 852 mutex_enter(&vp->v_lock); 853 if (vp->v_count > 1) { 854 VN_RELE_LOCKED(vp); 855 mutex_exit(&vp->v_lock); 856 rw_exit(&rp->r_hashq->r_lock); 857 return; 858 } 859 mutex_exit(&vp->v_lock); 860 861 /* 862 * Make sure we don't put an rnode with a delegation 863 * on the free list. 864 */ 865 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 866 rw_exit(&rp->r_hashq->r_lock); 867 (void) nfs4delegreturn(rp, 868 NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN); 869 goto again; 870 } 871 872 /* 873 * Now that we have the hash queue lock, and we know there 874 * are not anymore references on the vnode, check to make 875 * sure there aren't any open streams still on the rnode. 876 * If so, drop the hash queue lock, remove the open streams, 877 * and recheck the v_count. 
878 */ 879 mutex_enter(&rp->r_os_lock); 880 if (list_head(&rp->r_open_streams) != NULL) { 881 mutex_exit(&rp->r_os_lock); 882 rw_exit(&rp->r_hashq->r_lock); 883 if (nfs_zone() != VTOMI4(vp)->mi_zone) 884 nfs4_clear_open_streams(rp); 885 else 886 (void) nfs4close_all(vp, cr); 887 goto again; 888 } 889 mutex_exit(&rp->r_os_lock); 890 891 /* 892 * Before we put it on the freelist, make sure there are no pages. 893 * If there are, flush and commit of all of the dirty and 894 * uncommitted pages, assuming the file system isn't read only. 895 */ 896 if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) { 897 rw_exit(&rp->r_hashq->r_lock); 898 r4flushpages(rp, cr); 899 goto again; 900 } 901 902 /* 903 * Before we put it on the freelist, make sure there is no 904 * active xattr directory cached, the freelist will not 905 * have its entries r4inactive'd if there is still an active 906 * rnode, thus nothing in the freelist can hold another 907 * rnode active. 908 */ 909 xattr = rp->r_xattr_dir; 910 rp->r_xattr_dir = NULL; 911 912 /* 913 * If there is no cached data or metadata for this file, then 914 * put the rnode on the front of the freelist so that it will 915 * be reused before other rnodes which may have cached data or 916 * metadata associated with them. 917 */ 918 mutex_enter(&rp4freelist_lock); 919 if (rp4freelist == NULL) { 920 rp->r_freef = rp; 921 rp->r_freeb = rp; 922 rp4freelist = rp; 923 } else { 924 rp->r_freef = rp4freelist; 925 rp->r_freeb = rp4freelist->r_freeb; 926 rp4freelist->r_freeb->r_freef = rp; 927 rp4freelist->r_freeb = rp; 928 if (!nfs4_has_pages(vp) && rp->r_dir == NULL && 929 rp->r_symlink.contents == NULL && rp->r_secattr == NULL) 930 rp4freelist = rp; 931 } 932 mutex_exit(&rp4freelist_lock); 933 934 rw_exit(&rp->r_hashq->r_lock); 935 936 if (xattr) 937 VN_RELE(xattr); 938 } 939 940 /* 941 * Remove an rnode from the free list. 942 * 943 * The caller must be holding rp4freelist_lock and the rnode 944 * must be on the freelist. 
945 */ 946 static void 947 rp4_rmfree(rnode4_t *rp) 948 { 949 950 ASSERT(MUTEX_HELD(&rp4freelist_lock)); 951 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 952 953 if (rp == rp4freelist) { 954 rp4freelist = rp->r_freef; 955 if (rp == rp4freelist) 956 rp4freelist = NULL; 957 } 958 rp->r_freeb->r_freef = rp->r_freef; 959 rp->r_freef->r_freeb = rp->r_freeb; 960 961 rp->r_freef = rp->r_freeb = NULL; 962 } 963 964 /* 965 * Put a rnode in the hash table. 966 * 967 * The caller must be holding the exclusive hash queue lock 968 */ 969 void 970 rp4_addhash(rnode4_t *rp) 971 { 972 mntinfo4_t *mi; 973 974 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 975 ASSERT(!(rp->r_flags & R4HASHED)); 976 977 #ifdef DEBUG 978 r4_dup_check(rp, RTOV4(rp)->v_vfsp); 979 #endif 980 981 rp->r_hashf = rp->r_hashq->r_hashf; 982 rp->r_hashq->r_hashf = rp; 983 rp->r_hashb = (rnode4_t *)rp->r_hashq; 984 rp->r_hashf->r_hashb = rp; 985 986 mutex_enter(&rp->r_statelock); 987 rp->r_flags |= R4HASHED; 988 mutex_exit(&rp->r_statelock); 989 990 mi = VTOMI4(RTOV4(rp)); 991 mutex_enter(&mi->mi_rnodes_lock); 992 list_insert_tail(&mi->mi_rnodes, rp); 993 mutex_exit(&mi->mi_rnodes_lock); 994 } 995 996 /* 997 * Remove a rnode from the hash table. 998 * 999 * The caller must be holding the hash queue lock. 1000 */ 1001 void 1002 rp4_rmhash_locked(rnode4_t *rp) 1003 { 1004 mntinfo4_t *mi; 1005 1006 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 1007 ASSERT(rp->r_flags & R4HASHED); 1008 1009 rp->r_hashb->r_hashf = rp->r_hashf; 1010 rp->r_hashf->r_hashb = rp->r_hashb; 1011 1012 mutex_enter(&rp->r_statelock); 1013 rp->r_flags &= ~R4HASHED; 1014 mutex_exit(&rp->r_statelock); 1015 1016 mi = VTOMI4(RTOV4(rp)); 1017 mutex_enter(&mi->mi_rnodes_lock); 1018 if (list_link_active(&rp->r_mi_link)) 1019 list_remove(&mi->mi_rnodes, rp); 1020 mutex_exit(&mi->mi_rnodes_lock); 1021 } 1022 1023 /* 1024 * Remove a rnode from the hash table. 1025 * 1026 * The caller must not be holding the hash queue lock. 
1027 */ 1028 void 1029 rp4_rmhash(rnode4_t *rp) 1030 { 1031 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1032 rp4_rmhash_locked(rp); 1033 rw_exit(&rp->r_hashq->r_lock); 1034 } 1035 1036 /* 1037 * Lookup a rnode by fhandle. Ignores rnodes that had failed recovery. 1038 * Returns NULL if no match. If an rnode is returned, the reference count 1039 * on the master vnode is incremented. 1040 * 1041 * The caller must be holding the hash queue lock, either shared or exclusive. 1042 */ 1043 rnode4_t * 1044 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp) 1045 { 1046 rnode4_t *rp; 1047 vnode_t *vp; 1048 1049 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 1050 1051 for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) { 1052 vp = RTOV4(rp); 1053 if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) { 1054 1055 mutex_enter(&rp->r_statelock); 1056 if (rp->r_flags & R4RECOVERR) { 1057 mutex_exit(&rp->r_statelock); 1058 continue; 1059 } 1060 mutex_exit(&rp->r_statelock); 1061 #ifdef DEBUG 1062 r4_dup_check(rp, vfsp); 1063 #endif 1064 if (rp->r_freef != NULL) { 1065 mutex_enter(&rp4freelist_lock); 1066 /* 1067 * If the rnode is on the freelist, 1068 * then remove it and use that reference 1069 * as the new reference. Otherwise, 1070 * need to increment the reference count. 1071 */ 1072 if (rp->r_freef != NULL) { 1073 rp4_rmfree(rp); 1074 mutex_exit(&rp4freelist_lock); 1075 } else { 1076 mutex_exit(&rp4freelist_lock); 1077 VN_HOLD(vp); 1078 } 1079 } else 1080 VN_HOLD(vp); 1081 1082 /* 1083 * if root vnode, set v_flag to indicate that 1084 */ 1085 if (isrootfh(fh, rp)) { 1086 if (!(vp->v_flag & VROOT)) { 1087 mutex_enter(&vp->v_lock); 1088 vp->v_flag |= VROOT; 1089 mutex_exit(&vp->v_lock); 1090 } 1091 } 1092 return (rp); 1093 } 1094 } 1095 return (NULL); 1096 } 1097 1098 /* 1099 * Lookup an rnode by fhandle. Just a wrapper for r4find() 1100 * that assumes the caller hasn't already got the lock 1101 * on the hash bucket. 
1102 */ 1103 rnode4_t * 1104 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp) 1105 { 1106 rnode4_t *rp; 1107 int index; 1108 1109 index = rtable4hash(fh); 1110 rw_enter(&rtable4[index].r_lock, RW_READER); 1111 rp = r4find(&rtable4[index], fh, vfsp); 1112 rw_exit(&rtable4[index].r_lock); 1113 1114 return (rp); 1115 } 1116 1117 /* 1118 * Return 1 if there is an active vnode belonging to this vfs in the 1119 * rtable4 cache. 1120 * 1121 * Several of these checks are done without holding the usual 1122 * locks. This is safe because destroy_rtable4(), rp4_addfree(), 1123 * etc. will redo the necessary checks before actually destroying 1124 * any rnodes. 1125 */ 1126 int 1127 check_rtable4(struct vfs *vfsp) 1128 { 1129 rnode4_t *rp; 1130 vnode_t *vp; 1131 mntinfo4_t *mi; 1132 1133 ASSERT(vfsp != NULL); 1134 mi = VFTOMI4(vfsp); 1135 1136 mutex_enter(&mi->mi_rnodes_lock); 1137 for (rp = list_head(&mi->mi_rnodes); rp != NULL; 1138 rp = list_next(&mi->mi_rnodes, rp)) { 1139 vp = RTOV4(rp); 1140 1141 if (rp->r_freef == NULL || 1142 (nfs4_has_pages(vp) && (rp->r_flags & R4DIRTY)) || 1143 rp->r_count > 0) { 1144 mutex_exit(&mi->mi_rnodes_lock); 1145 return (1); 1146 } 1147 } 1148 mutex_exit(&mi->mi_rnodes_lock); 1149 1150 return (0); 1151 } 1152 1153 /* 1154 * Destroy inactive vnodes from the hash queues which 1155 * belong to this vfs. All of the vnodes should be inactive. 1156 * It is essential that we destroy all rnodes in case of 1157 * forced unmount as well as in normal unmount case. 1158 */ 1159 1160 void 1161 destroy_rtable4(struct vfs *vfsp, cred_t *cr) 1162 { 1163 rnode4_t *rp; 1164 mntinfo4_t *mi; 1165 1166 ASSERT(vfsp != NULL); 1167 1168 mi = VFTOMI4(vfsp); 1169 1170 mutex_enter(&rp4freelist_lock); 1171 mutex_enter(&mi->mi_rnodes_lock); 1172 while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) { 1173 /* 1174 * If the rnode is no longer on the freelist it is not 1175 * ours and it will be handled by some other thread, so 1176 * skip it. 
1177 */ 1178 if (rp->r_freef == NULL) 1179 continue; 1180 mutex_exit(&mi->mi_rnodes_lock); 1181 1182 rp4_rmfree(rp); 1183 mutex_exit(&rp4freelist_lock); 1184 1185 rp4_rmhash(rp); 1186 1187 /* 1188 * This call to rp4_addfree will end up destroying the 1189 * rnode, but in a safe way with the appropriate set 1190 * of checks done. 1191 */ 1192 rp4_addfree(rp, cr); 1193 1194 mutex_enter(&rp4freelist_lock); 1195 mutex_enter(&mi->mi_rnodes_lock); 1196 } 1197 mutex_exit(&mi->mi_rnodes_lock); 1198 mutex_exit(&rp4freelist_lock); 1199 } 1200 1201 /* 1202 * This routine destroys all the resources of an rnode 1203 * and finally the rnode itself. 1204 */ 1205 static void 1206 destroy_rnode4(rnode4_t *rp) 1207 { 1208 vnode_t *vp; 1209 vfs_t *vfsp; 1210 1211 ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE); 1212 1213 vp = RTOV4(rp); 1214 vfsp = vp->v_vfsp; 1215 1216 uninit_rnode4(rp); 1217 atomic_dec_ulong((ulong_t *)&rnode4_new); 1218 #ifdef DEBUG 1219 clstat4_debug.nrnode.value.ui64--; 1220 #endif 1221 kmem_cache_free(rnode4_cache, rp); 1222 vn_invalid(vp); 1223 vn_free(vp); 1224 VFS_RELE(vfsp); 1225 } 1226 1227 /* 1228 * Invalidate the attributes on all rnodes forcing the next getattr 1229 * to go over the wire. Used to flush stale uid and gid mappings. 1230 * Maybe done on a per vfsp, or all rnodes (vfsp == NULL) 1231 */ 1232 void 1233 nfs4_rnode_invalidate(struct vfs *vfsp) 1234 { 1235 int index; 1236 rnode4_t *rp; 1237 vnode_t *vp; 1238 1239 /* 1240 * Walk the hash queues looking for rnodes. 1241 */ 1242 for (index = 0; index < rtable4size; index++) { 1243 rw_enter(&rtable4[index].r_lock, RW_READER); 1244 for (rp = rtable4[index].r_hashf; 1245 rp != (rnode4_t *)(&rtable4[index]); 1246 rp = rp->r_hashf) { 1247 vp = RTOV4(rp); 1248 if (vfsp != NULL && vp->v_vfsp != vfsp) 1249 continue; 1250 1251 if (!mutex_tryenter(&rp->r_statelock)) 1252 continue; 1253 1254 /* 1255 * Expire the attributes by resetting the change 1256 * and attr timeout. 
1257 */ 1258 rp->r_change = 0; 1259 PURGE_ATTRCACHE4_LOCKED(rp); 1260 mutex_exit(&rp->r_statelock); 1261 } 1262 rw_exit(&rtable4[index].r_lock); 1263 } 1264 } 1265 1266 /* 1267 * Flush all vnodes in this (or every) vfs. 1268 * Used by nfs_sync and by nfs_unmount. 1269 */ 1270 void 1271 r4flush(struct vfs *vfsp, cred_t *cr) 1272 { 1273 int index; 1274 rnode4_t *rp; 1275 vnode_t *vp, **vplist; 1276 long num, cnt; 1277 1278 /* 1279 * Check to see whether there is anything to do. 1280 */ 1281 num = rnode4_new; 1282 if (num == 0) 1283 return; 1284 1285 /* 1286 * Allocate a slot for all currently active rnodes on the 1287 * supposition that they all may need flushing. 1288 */ 1289 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 1290 cnt = 0; 1291 1292 /* 1293 * If the vfs is known we can do fast path by iterating all rnodes that 1294 * belongs to this vfs. This is much faster than the traditional way 1295 * of iterating rtable4 (below) in a case there is a lot of rnodes that 1296 * does not belong to our vfs. 1297 */ 1298 if (vfsp != NULL) { 1299 mntinfo4_t *mi = VFTOMI4(vfsp); 1300 1301 mutex_enter(&mi->mi_rnodes_lock); 1302 for (rp = list_head(&mi->mi_rnodes); rp != NULL; 1303 rp = list_next(&mi->mi_rnodes, rp)) { 1304 vp = RTOV4(rp); 1305 /* 1306 * Don't bother sync'ing a vp if it 1307 * is part of virtual swap device or 1308 * if VFS is read-only 1309 */ 1310 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 1311 continue; 1312 /* 1313 * If the vnode has pages and is marked as either dirty 1314 * or mmap'd, hold and add this vnode to the list of 1315 * vnodes to flush. 1316 */ 1317 ASSERT(vp->v_vfsp == vfsp); 1318 if (nfs4_has_pages(vp) && 1319 ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) { 1320 VN_HOLD(vp); 1321 vplist[cnt++] = vp; 1322 if (cnt == num) { 1323 /* 1324 * The vplist is full because there is 1325 * too many rnodes. We are done for 1326 * now. 
1327 */ 1328 break; 1329 } 1330 } 1331 } 1332 mutex_exit(&mi->mi_rnodes_lock); 1333 1334 goto done; 1335 } 1336 1337 ASSERT(vfsp == NULL); 1338 1339 /* 1340 * Walk the hash queues looking for rnodes with page 1341 * lists associated with them. Make a list of these 1342 * files. 1343 */ 1344 for (index = 0; index < rtable4size; index++) { 1345 rw_enter(&rtable4[index].r_lock, RW_READER); 1346 for (rp = rtable4[index].r_hashf; 1347 rp != (rnode4_t *)(&rtable4[index]); 1348 rp = rp->r_hashf) { 1349 vp = RTOV4(rp); 1350 /* 1351 * Don't bother sync'ing a vp if it 1352 * is part of virtual swap device or 1353 * if VFS is read-only 1354 */ 1355 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 1356 continue; 1357 /* 1358 * If the vnode has pages and is marked as either dirty 1359 * or mmap'd, hold and add this vnode to the list of 1360 * vnodes to flush. 1361 */ 1362 if (nfs4_has_pages(vp) && 1363 ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) { 1364 VN_HOLD(vp); 1365 vplist[cnt++] = vp; 1366 if (cnt == num) { 1367 rw_exit(&rtable4[index].r_lock); 1368 /* 1369 * The vplist is full because there is 1370 * too many rnodes. We are done for 1371 * now. 1372 */ 1373 goto done; 1374 } 1375 } 1376 } 1377 rw_exit(&rtable4[index].r_lock); 1378 } 1379 1380 done: 1381 1382 /* 1383 * Flush and release all of the files on the list. 1384 */ 1385 while (cnt-- > 0) { 1386 vp = vplist[cnt]; 1387 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 1388 VN_RELE(vp); 1389 } 1390 1391 /* 1392 * Free the space allocated to hold the list. 1393 */ 1394 kmem_free(vplist, num * sizeof (*vplist)); 1395 } 1396 1397 int 1398 nfs4_free_data_reclaim(rnode4_t *rp) 1399 { 1400 char *contents; 1401 vnode_t *xattr; 1402 int size; 1403 vsecattr_t *vsp; 1404 int freed; 1405 bool_t rdc = FALSE; 1406 1407 /* 1408 * Free any held caches which may 1409 * be associated with this rnode. 
1410 */ 1411 mutex_enter(&rp->r_statelock); 1412 if (rp->r_dir != NULL) 1413 rdc = TRUE; 1414 contents = rp->r_symlink.contents; 1415 size = rp->r_symlink.size; 1416 rp->r_symlink.contents = NULL; 1417 vsp = rp->r_secattr; 1418 rp->r_secattr = NULL; 1419 xattr = rp->r_xattr_dir; 1420 rp->r_xattr_dir = NULL; 1421 mutex_exit(&rp->r_statelock); 1422 1423 /* 1424 * Free the access cache entries. 1425 */ 1426 freed = nfs4_access_purge_rp(rp); 1427 1428 if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL) 1429 return (freed); 1430 1431 /* 1432 * Free the readdir cache entries, incompletely if we can't block. 1433 */ 1434 nfs4_purge_rddir_cache(RTOV4(rp)); 1435 1436 /* 1437 * Free the symbolic link cache. 1438 */ 1439 if (contents != NULL) { 1440 1441 kmem_free((void *)contents, size); 1442 } 1443 1444 /* 1445 * Free any cached ACL. 1446 */ 1447 if (vsp != NULL) 1448 nfs4_acl_free_cache(vsp); 1449 1450 /* 1451 * Release the xattr directory vnode 1452 */ 1453 if (xattr != NULL) 1454 VN_RELE(xattr); 1455 1456 return (1); 1457 } 1458 1459 static int 1460 nfs4_active_data_reclaim(rnode4_t *rp) 1461 { 1462 char *contents; 1463 vnode_t *xattr = NULL; 1464 int size; 1465 vsecattr_t *vsp; 1466 int freed; 1467 bool_t rdc = FALSE; 1468 1469 /* 1470 * Free any held credentials and caches which 1471 * may be associated with this rnode. 1472 */ 1473 if (!mutex_tryenter(&rp->r_statelock)) 1474 return (0); 1475 contents = rp->r_symlink.contents; 1476 size = rp->r_symlink.size; 1477 rp->r_symlink.contents = NULL; 1478 vsp = rp->r_secattr; 1479 rp->r_secattr = NULL; 1480 if (rp->r_dir != NULL) 1481 rdc = TRUE; 1482 /* 1483 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed 1484 * on the same r_hashq queue. We are not mandated to free all caches. 1485 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the 1486 * rnode 'rp' is freed or put on the free list. 
1487 * 1488 * We will retain NFS4_XATTR_DIR_NOTSUPP because: 1489 * - it has no associated rnode4_t (its v_data is NULL), 1490 * - it is preallocated statically and will never go away, 1491 * so we cannot save anything by releasing it. 1492 */ 1493 if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP && 1494 VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) { 1495 xattr = rp->r_xattr_dir; 1496 rp->r_xattr_dir = NULL; 1497 } 1498 mutex_exit(&rp->r_statelock); 1499 1500 /* 1501 * Free the access cache entries. 1502 */ 1503 freed = nfs4_access_purge_rp(rp); 1504 1505 if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL) 1506 return (freed); 1507 1508 /* 1509 * Free the symbolic link cache. 1510 */ 1511 if (contents != NULL) { 1512 1513 kmem_free((void *)contents, size); 1514 } 1515 1516 /* 1517 * Free any cached ACL. 1518 */ 1519 if (vsp != NULL) 1520 nfs4_acl_free_cache(vsp); 1521 1522 nfs4_purge_rddir_cache(RTOV4(rp)); 1523 1524 /* 1525 * Release the xattr directory vnode 1526 */ 1527 if (xattr != NULL) 1528 VN_RELE(xattr); 1529 1530 return (1); 1531 } 1532 1533 static int 1534 nfs4_free_reclaim(void) 1535 { 1536 int freed; 1537 rnode4_t *rp; 1538 1539 #ifdef DEBUG 1540 clstat4_debug.f_reclaim.value.ui64++; 1541 #endif 1542 freed = 0; 1543 mutex_enter(&rp4freelist_lock); 1544 rp = rp4freelist; 1545 if (rp != NULL) { 1546 do { 1547 if (nfs4_free_data_reclaim(rp)) 1548 freed = 1; 1549 } while ((rp = rp->r_freef) != rp4freelist); 1550 } 1551 mutex_exit(&rp4freelist_lock); 1552 return (freed); 1553 } 1554 1555 static int 1556 nfs4_active_reclaim(void) 1557 { 1558 int freed; 1559 int index; 1560 rnode4_t *rp; 1561 1562 #ifdef DEBUG 1563 clstat4_debug.a_reclaim.value.ui64++; 1564 #endif 1565 freed = 0; 1566 for (index = 0; index < rtable4size; index++) { 1567 rw_enter(&rtable4[index].r_lock, RW_READER); 1568 for (rp = rtable4[index].r_hashf; 1569 rp != (rnode4_t *)(&rtable4[index]); 1570 rp = rp->r_hashf) { 1571 if (nfs4_active_data_reclaim(rp)) 
1572 freed = 1; 1573 } 1574 rw_exit(&rtable4[index].r_lock); 1575 } 1576 return (freed); 1577 } 1578 1579 static int 1580 nfs4_rnode_reclaim(void) 1581 { 1582 int freed; 1583 rnode4_t *rp; 1584 vnode_t *vp; 1585 1586 #ifdef DEBUG 1587 clstat4_debug.r_reclaim.value.ui64++; 1588 #endif 1589 freed = 0; 1590 mutex_enter(&rp4freelist_lock); 1591 while ((rp = rp4freelist) != NULL) { 1592 rp4_rmfree(rp); 1593 mutex_exit(&rp4freelist_lock); 1594 if (rp->r_flags & R4HASHED) { 1595 vp = RTOV4(rp); 1596 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1597 mutex_enter(&vp->v_lock); 1598 if (vp->v_count > 1) { 1599 VN_RELE_LOCKED(vp); 1600 mutex_exit(&vp->v_lock); 1601 rw_exit(&rp->r_hashq->r_lock); 1602 mutex_enter(&rp4freelist_lock); 1603 continue; 1604 } 1605 mutex_exit(&vp->v_lock); 1606 rp4_rmhash_locked(rp); 1607 rw_exit(&rp->r_hashq->r_lock); 1608 } 1609 /* 1610 * This call to rp_addfree will end up destroying the 1611 * rnode, but in a safe way with the appropriate set 1612 * of checks done. 1613 */ 1614 rp4_addfree(rp, CRED()); 1615 mutex_enter(&rp4freelist_lock); 1616 } 1617 mutex_exit(&rp4freelist_lock); 1618 return (freed); 1619 } 1620 1621 /*ARGSUSED*/ 1622 static void 1623 nfs4_reclaim(void *cdrarg) 1624 { 1625 #ifdef DEBUG 1626 clstat4_debug.reclaim.value.ui64++; 1627 #endif 1628 if (nfs4_free_reclaim()) 1629 return; 1630 1631 if (nfs4_active_reclaim()) 1632 return; 1633 1634 (void) nfs4_rnode_reclaim(); 1635 } 1636 1637 /* 1638 * Returns the clientid4 to use for the given mntinfo4. Note that the 1639 * clientid can change if the caller drops mi_recovlock. 
1640 */ 1641 1642 clientid4 1643 mi2clientid(mntinfo4_t *mi) 1644 { 1645 nfs4_server_t *sp; 1646 clientid4 clientid = 0; 1647 1648 /* this locks down sp if it is found */ 1649 sp = find_nfs4_server(mi); 1650 if (sp != NULL) { 1651 clientid = sp->clientid; 1652 mutex_exit(&sp->s_lock); 1653 nfs4_server_rele(sp); 1654 } 1655 return (clientid); 1656 } 1657 1658 /* 1659 * Return the current lease time for the server associated with the given 1660 * file. Note that the lease time could change immediately after this 1661 * call. 1662 */ 1663 1664 time_t 1665 r2lease_time(rnode4_t *rp) 1666 { 1667 nfs4_server_t *sp; 1668 time_t lease_time; 1669 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 1670 1671 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0); 1672 1673 /* this locks down sp if it is found */ 1674 sp = find_nfs4_server(VTOMI4(RTOV4(rp))); 1675 1676 if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1677 if (sp != NULL) { 1678 mutex_exit(&sp->s_lock); 1679 nfs4_server_rele(sp); 1680 } 1681 nfs_rw_exit(&mi->mi_recovlock); 1682 return (1); /* 1 second */ 1683 } 1684 1685 ASSERT(sp != NULL); 1686 1687 lease_time = sp->s_lease_time; 1688 1689 mutex_exit(&sp->s_lock); 1690 nfs4_server_rele(sp); 1691 nfs_rw_exit(&mi->mi_recovlock); 1692 1693 return (lease_time); 1694 } 1695 1696 /* 1697 * Return a list with information about all the known open instances for 1698 * a filesystem. The caller must call r4releopenlist() when done with the 1699 * list. 1700 * 1701 * We are safe at looking at os_valid and os_pending_close across dropping 1702 * the 'os_sync_lock' to count up the number of open streams and then 1703 * allocate memory for the osp list due to: 1704 * -Looking at os_pending_close is safe since this routine is 1705 * only called via recovery, and os_pending_close can only be set via 1706 * a non-recovery operation (which are all blocked when recovery 1707 * is active). 
1708 * 1709 * -Examining os_valid is safe since non-recovery operations, which 1710 * could potentially switch os_valid to 0, are blocked (via 1711 * nfs4_start_fop) and recovery is single-threaded per mntinfo4_t 1712 * (which means we are the only recovery thread potentially acting 1713 * on this open stream). 1714 */ 1715 1716 nfs4_opinst_t * 1717 r4mkopenlist(mntinfo4_t *mi) 1718 { 1719 nfs4_opinst_t *reopenlist, *rep; 1720 rnode4_t *rp; 1721 vnode_t *vp; 1722 vfs_t *vfsp = mi->mi_vfsp; 1723 int numosp; 1724 nfs4_open_stream_t *osp; 1725 int index; 1726 open_delegation_type4 dtype; 1727 int hold_vnode; 1728 1729 reopenlist = NULL; 1730 1731 for (index = 0; index < rtable4size; index++) { 1732 rw_enter(&rtable4[index].r_lock, RW_READER); 1733 for (rp = rtable4[index].r_hashf; 1734 rp != (rnode4_t *)(&rtable4[index]); 1735 rp = rp->r_hashf) { 1736 1737 vp = RTOV4(rp); 1738 if (vp->v_vfsp != vfsp) 1739 continue; 1740 hold_vnode = 0; 1741 1742 mutex_enter(&rp->r_os_lock); 1743 1744 /* Count the number of valid open_streams of the file */ 1745 numosp = 0; 1746 for (osp = list_head(&rp->r_open_streams); osp != NULL; 1747 osp = list_next(&rp->r_open_streams, osp)) { 1748 mutex_enter(&osp->os_sync_lock); 1749 if (osp->os_valid && !osp->os_pending_close) 1750 numosp++; 1751 mutex_exit(&osp->os_sync_lock); 1752 } 1753 1754 /* Fill in the valid open streams per vp */ 1755 if (numosp > 0) { 1756 int j; 1757 1758 hold_vnode = 1; 1759 1760 /* 1761 * Add a new open instance to the list 1762 */ 1763 rep = kmem_zalloc(sizeof (*reopenlist), 1764 KM_SLEEP); 1765 rep->re_next = reopenlist; 1766 reopenlist = rep; 1767 1768 rep->re_vp = vp; 1769 rep->re_osp = kmem_zalloc( 1770 numosp * sizeof (*(rep->re_osp)), 1771 KM_SLEEP); 1772 rep->re_numosp = numosp; 1773 1774 j = 0; 1775 for (osp = list_head(&rp->r_open_streams); 1776 osp != NULL; 1777 osp = list_next(&rp->r_open_streams, osp)) { 1778 1779 mutex_enter(&osp->os_sync_lock); 1780 if (osp->os_valid && 1781 !osp->os_pending_close) { 
1782 osp->os_ref_count++; 1783 rep->re_osp[j] = osp; 1784 j++; 1785 } 1786 mutex_exit(&osp->os_sync_lock); 1787 } 1788 /* 1789 * Assuming valid osp(s) stays valid between 1790 * the time obtaining j and numosp. 1791 */ 1792 ASSERT(j == numosp); 1793 } 1794 1795 mutex_exit(&rp->r_os_lock); 1796 /* do this here to keep v_lock > r_os_lock */ 1797 if (hold_vnode) 1798 VN_HOLD(vp); 1799 mutex_enter(&rp->r_statev4_lock); 1800 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 1801 /* 1802 * If this rnode holds a delegation, 1803 * but if there are no valid open streams, 1804 * then just discard the delegation 1805 * without doing delegreturn. 1806 */ 1807 if (numosp > 0) 1808 rp->r_deleg_needs_recovery = 1809 rp->r_deleg_type; 1810 } 1811 /* Save the delegation type for use outside the lock */ 1812 dtype = rp->r_deleg_type; 1813 mutex_exit(&rp->r_statev4_lock); 1814 1815 /* 1816 * If we have a delegation then get rid of it. 1817 * We've set rp->r_deleg_needs_recovery so we have 1818 * enough information to recover. 1819 */ 1820 if (dtype != OPEN_DELEGATE_NONE) { 1821 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 1822 } 1823 } 1824 rw_exit(&rtable4[index].r_lock); 1825 } 1826 return (reopenlist); 1827 } 1828 1829 /* 1830 * Given a filesystem id, check to see if any rnodes 1831 * within this fsid reside in the rnode cache, other 1832 * than one we know about. 
1833 * 1834 * Return 1 if an rnode is found, 0 otherwise 1835 */ 1836 int 1837 r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid) 1838 { 1839 rnode4_t *rp; 1840 vnode_t *vp; 1841 vfs_t *vfsp = mi->mi_vfsp; 1842 fattr4_fsid *fsid; 1843 int index, found = 0; 1844 1845 for (index = 0; index < rtable4size; index++) { 1846 rw_enter(&rtable4[index].r_lock, RW_READER); 1847 for (rp = rtable4[index].r_hashf; 1848 rp != (rnode4_t *)(&rtable4[index]); 1849 rp = rp->r_hashf) { 1850 1851 vp = RTOV4(rp); 1852 if (vp->v_vfsp != vfsp) 1853 continue; 1854 1855 /* 1856 * XXX there might be a case where a 1857 * replicated fs may have the same fsid 1858 * across two different servers. This 1859 * check isn't good enough in that case 1860 */ 1861 fsid = &rp->r_srv_fsid; 1862 if (FATTR4_FSID_EQ(moved_fsid, fsid)) { 1863 found = 1; 1864 break; 1865 } 1866 } 1867 rw_exit(&rtable4[index].r_lock); 1868 1869 if (found) 1870 break; 1871 } 1872 return (found); 1873 } 1874 1875 /* 1876 * Release the list of open instance references. 
1877 */ 1878 1879 void 1880 r4releopenlist(nfs4_opinst_t *reopenp) 1881 { 1882 nfs4_opinst_t *rep, *next; 1883 int i; 1884 1885 for (rep = reopenp; rep; rep = next) { 1886 next = rep->re_next; 1887 1888 for (i = 0; i < rep->re_numosp; i++) 1889 open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp)); 1890 1891 VN_RELE(rep->re_vp); 1892 kmem_free(rep->re_osp, 1893 rep->re_numosp * sizeof (*(rep->re_osp))); 1894 1895 kmem_free(rep, sizeof (*rep)); 1896 } 1897 } 1898 1899 int 1900 nfs4_rnode_init(void) 1901 { 1902 ulong_t nrnode4_max; 1903 int i; 1904 1905 /* 1906 * Compute the size of the rnode4 hash table 1907 */ 1908 if (nrnode <= 0) 1909 nrnode = ncsize; 1910 nrnode4_max = 1911 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4)); 1912 if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) { 1913 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 1914 "!setting nrnode to max value of %ld", nrnode4_max); 1915 nrnode = nrnode4_max; 1916 } 1917 rtable4size = 1 << highbit(nrnode / rnode4_hashlen); 1918 rtable4mask = rtable4size - 1; 1919 1920 /* 1921 * Allocate and initialize the hash buckets 1922 */ 1923 rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP); 1924 for (i = 0; i < rtable4size; i++) { 1925 rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]); 1926 rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]); 1927 rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL); 1928 } 1929 1930 rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t), 1931 0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0); 1932 1933 return (0); 1934 } 1935 1936 int 1937 nfs4_rnode_fini(void) 1938 { 1939 int i; 1940 1941 /* 1942 * Deallocate the rnode hash queues 1943 */ 1944 kmem_cache_destroy(rnode4_cache); 1945 1946 for (i = 0; i < rtable4size; i++) 1947 rw_destroy(&rtable4[i].r_lock); 1948 1949 kmem_free(rtable4, rtable4size * sizeof (*rtable4)); 1950 1951 return (0); 1952 } 1953 1954 /* 1955 * Return non-zero if the given filehandle refers to the root filehandle 1956 * for the given rnode. 
1957 */ 1958 1959 static int 1960 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp) 1961 { 1962 int isroot; 1963 1964 isroot = 0; 1965 if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh)) 1966 isroot = 1; 1967 1968 return (isroot); 1969 } 1970 1971 /* 1972 * The r4_stub_* routines assume that the rnode is newly activated, and 1973 * that the caller either holds the hash bucket r_lock for this rnode as 1974 * RW_WRITER, or holds r_statelock. 1975 */ 1976 static void 1977 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type) 1978 { 1979 vnode_t *vp = RTOV4(rp); 1980 krwlock_t *hash_lock = &rp->r_hashq->r_lock; 1981 1982 ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock)); 1983 1984 rp->r_stub_type = type; 1985 1986 /* 1987 * Safely switch this vnode to the trigger vnodeops. 1988 * 1989 * Currently, we don't ever switch a trigger vnode back to using 1990 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that 1991 * a new v4 object is not a trigger, and it will already have the 1992 * correct v4 vnodeops by default. So, no "else" case required here. 1993 */ 1994 if (type != NFS4_STUB_NONE) 1995 vn_setops(vp, nfs4_trigger_vnodeops); 1996 } 1997 1998 void 1999 r4_stub_mirrormount(rnode4_t *rp) 2000 { 2001 r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT); 2002 } 2003 2004 void 2005 r4_stub_referral(rnode4_t *rp) 2006 { 2007 DTRACE_PROBE1(nfs4clnt__func__referral__moved, 2008 vnode_t *, RTOV4(rp)); 2009 r4_stub_set(rp, NFS4_STUB_REFERRAL); 2010 } 2011 2012 void 2013 r4_stub_none(rnode4_t *rp) 2014 { 2015 r4_stub_set(rp, NFS4_STUB_NONE); 2016 } 2017 2018 #ifdef DEBUG 2019 2020 /* 2021 * Look in the rnode table for other rnodes that have the same filehandle. 
2022 * Assume the lock is held for the hash chain of checkrp 2023 */ 2024 2025 static void 2026 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp) 2027 { 2028 rnode4_t *rp; 2029 vnode_t *tvp; 2030 nfs4_fhandle_t fh, fh2; 2031 int index; 2032 2033 if (!r4_check_for_dups) 2034 return; 2035 2036 ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock)); 2037 2038 sfh4_copyval(checkrp->r_fh, &fh); 2039 2040 for (index = 0; index < rtable4size; index++) { 2041 2042 if (&rtable4[index] != checkrp->r_hashq) 2043 rw_enter(&rtable4[index].r_lock, RW_READER); 2044 2045 for (rp = rtable4[index].r_hashf; 2046 rp != (rnode4_t *)(&rtable4[index]); 2047 rp = rp->r_hashf) { 2048 2049 if (rp == checkrp) 2050 continue; 2051 2052 tvp = RTOV4(rp); 2053 if (tvp->v_vfsp != vfsp) 2054 continue; 2055 2056 sfh4_copyval(rp->r_fh, &fh2); 2057 if (nfs4cmpfhandle(&fh, &fh2) == 0) { 2058 cmn_err(CE_PANIC, "rnodes with same fs, fh " 2059 "(%p, %p)", (void *)checkrp, (void *)rp); 2060 } 2061 } 2062 2063 if (&rtable4[index] != checkrp->r_hashq) 2064 rw_exit(&rtable4[index].r_lock); 2065 } 2066 } 2067 2068 #endif /* DEBUG */ 2069