1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 29 * All Rights Reserved 30 */ 31 32 #pragma ident "%Z%%M% %I% %E% SMI" 33 34 #include <sys/param.h> 35 #include <sys/types.h> 36 #include <sys/systm.h> 37 #include <sys/cred.h> 38 #include <sys/proc.h> 39 #include <sys/user.h> 40 #include <sys/time.h> 41 #include <sys/buf.h> 42 #include <sys/vfs.h> 43 #include <sys/vnode.h> 44 #include <sys/socket.h> 45 #include <sys/uio.h> 46 #include <sys/tiuser.h> 47 #include <sys/swap.h> 48 #include <sys/errno.h> 49 #include <sys/debug.h> 50 #include <sys/kmem.h> 51 #include <sys/kstat.h> 52 #include <sys/cmn_err.h> 53 #include <sys/vtrace.h> 54 #include <sys/session.h> 55 #include <sys/dnlc.h> 56 #include <sys/bitmap.h> 57 #include <sys/acl.h> 58 #include <sys/ddi.h> 59 #include <sys/pathname.h> 60 #include <sys/flock.h> 61 #include <sys/dirent.h> 62 #include <sys/flock.h> 63 #include <sys/callb.h> 64 65 #include <rpc/types.h> 66 #include <rpc/xdr.h> 67 #include <rpc/auth.h> 68 #include <rpc/rpcsec_gss.h> 69 #include <rpc/clnt.h> 70 71 #include <nfs/nfs.h> 72 #include <nfs/nfs_clnt.h> 73 #include <nfs/nfs_acl.h> 74 75 #include <nfs/nfs4.h> 76 #include <nfs/rnode4.h> 77 #include <nfs/nfs4_clnt.h> 78 79 /* 80 * The hash queues for the access to active and cached rnodes 81 * are organized as doubly linked lists. A reader/writer lock 82 * for each hash bucket is used to control access and to synchronize 83 * lookups, additions, and deletions from the hash queue. 84 * 85 * The rnode freelist is organized as a doubly linked list with 86 * a head pointer. Additions and deletions are synchronized via 87 * a single mutex. 88 * 89 * In order to add an rnode to the free list, it must be hashed into 90 * a hash queue and the exclusive lock to the hash queue be held. 91 * If an rnode is not hashed into a hash queue, then it is destroyed 92 * because it represents no valuable information that can be reused 93 * about the file. The exclusive lock to the hash queue must be 94 * held in order to prevent a lookup in the hash queue from finding 95 * the rnode and using it and assuming that the rnode is not on the 96 * freelist. The lookup in the hash queue will have the hash queue 97 * locked, either exclusive or shared. 98 * 99 * The vnode reference count for each rnode is not allowed to drop 100 * below 1. This prevents external entities, such as the VM 101 * subsystem, from acquiring references to vnodes already on the 102 * freelist and then trying to place them back on the freelist 103 * when their reference is released. 
This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock -> mi_fileid_lock -> r_statelock
 */
r4hashq_t *rtable4;

static kmutex_t rp4freelist_lock;
static rnode4_t *rp4freelist = NULL;
static long rnode4_new = 0;
int rtable4size;
static int rtable4mask;
static struct kmem_cache *rnode4_cache;
static int rnode4_hashlen = 4;

static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static vnode_t	*nfs4fidcollide(rnode4_t *, mntinfo4_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);
static int	rp4_fileid_cmp(const void *, const void *);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static int nfs4_fidcollide_debug = 0;
static void	r4_dup_check(rnode4_t *, vfs_t *);
#endif

/*
 * Free the resources associated with an rnode.
 */
static void
r4inactive(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	char *contents;
	int size;
	vsecattr_t *vsp;
	vnode_t *xattr;
	int error;

	/*
	 * Before freeing anything, wait until all asynchronous
	 * activity is done on this rnode.  This will allow all
	 * asynchronous read ahead and write behind i/o's to
	 * finish.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_count > 0)
		cv_wait(&rp->r_cv, &rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	/*
	 * Flush and invalidate all pages associated with the vnode.
191 */ 192 vp = RTOV4(rp); 193 if (nfs4_has_pages(vp)) { 194 ASSERT(vp->v_type != VCHR); 195 if ((rp->r_flags & R4DIRTY) && !rp->r_error) { 196 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr); 197 if (error && (error == ENOSPC || error == EDQUOT)) { 198 mutex_enter(&rp->r_statelock); 199 if (!rp->r_error) 200 rp->r_error = error; 201 mutex_exit(&rp->r_statelock); 202 } 203 } 204 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 205 } 206 207 /* 208 * Free any held caches which may be 209 * associated with this rnode. 210 */ 211 mutex_enter(&rp->r_statelock); 212 contents = rp->r_symlink.contents; 213 size = rp->r_symlink.size; 214 rp->r_symlink.contents = NULL; 215 vsp = rp->r_secattr; 216 rp->r_secattr = NULL; 217 xattr = rp->r_xattr_dir; 218 rp->r_xattr_dir = NULL; 219 mutex_exit(&rp->r_statelock); 220 221 /* 222 * Free the access cache entries. 223 */ 224 (void) nfs4_access_purge_rp(rp); 225 226 /* 227 * Free the readdir cache entries. 228 */ 229 nfs4_purge_rddir_cache(vp); 230 231 /* 232 * Free the symbolic link cache. 233 */ 234 if (contents != NULL) { 235 236 kmem_free((void *)contents, size); 237 } 238 239 /* 240 * Free any cached ACL. 241 */ 242 if (vsp != NULL) 243 nfs4_acl_free_cache(vsp); 244 245 /* 246 * Release the cached xattr_dir 247 */ 248 if (xattr != NULL) 249 VN_RELE(xattr); 250 } 251 252 /* 253 * We have seen a case that the fh passed in is for "." which 254 * should be a VROOT node, however, the fh is different from the 255 * root fh stored in the mntinfo4_t. The invalid fh might be 256 * from a misbehaved server and will panic the client system at 257 * a later time. To avoid the panic, we drop the bad fh, use 258 * the root fh from mntinfo4_t, and print an error message 259 * for attention. 260 */ 261 nfs4_sharedfh_t * 262 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi, 263 int *wasbad) 264 { 265 char *s; 266 267 *wasbad = 0; 268 s = fn_name(nm); 269 ASSERT(strcmp(s, "..") != 0); 270 271 if ((s[0] == '.' && s[1] == '\0') && fh && 272 !SFH4_SAME(mi->mi_rootfh, fh)) { 273 #ifdef DEBUG 274 nfs4_fhandle_t fhandle; 275 276 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 277 "Server %s returns a different " 278 "root filehandle for the path %s:", 279 mi->mi_curr_serv->sv_hostname, 280 mi->mi_curr_serv->sv_path); 281 282 /* print the bad fh */ 283 fhandle.fh_len = fh->sfh_fh.nfs_fh4_len; 284 bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 285 fhandle.fh_len); 286 nfs4_printfhandle(&fhandle); 287 288 /* print mi_rootfh */ 289 fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len; 290 bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 291 fhandle.fh_len); 292 nfs4_printfhandle(&fhandle); 293 #endif 294 /* use mi_rootfh instead; fh will be rele by the caller */ 295 fh = mi->mi_rootfh; 296 *wasbad = 1; 297 } 298 299 kmem_free(s, MAXNAMELEN); 300 return (fh); 301 } 302 303 /* 304 * If we have volatile filehandles that may be expired while 305 * a file is held open, we need to check to see if this new 306 * rnode has the same fileid as an existing rnode. If so, 307 * then we drop this rnode and start again with the other 308 * filehandle. 309 */ 310 vnode_t * 311 shfh_collide_check(vnode_t *vp, vnode_t **badvp, mntinfo4_t *mi, 312 nfs4_ga_res_t *garp) 313 { 314 *badvp = NULL; 315 316 if (((mi->mi_fh_expire_type & 317 (FH4_VOLATILE_ANY | 318 FH4_VOL_MIGRATION | 319 FH4_VOL_RENAME)) != 0) && 320 ((mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) == 0)) { 321 rnode4_t *rp = VTOR4(vp); 322 vnode_t *tmpvp; 323 324 if (! 
(rp->r_attr.va_mask & AT_NODEID)) { 325 /* 326 * if the rnode doesn't have its nodeid cached, 327 * try to get it from the garp. 328 */ 329 if (garp != NULL) { 330 rp->r_attr.va_nodeid = garp->n4g_va.va_nodeid; 331 rp->r_attr.va_mask |= AT_NODEID; 332 } 333 } 334 if (rp->r_attr.va_mask & AT_NODEID) { 335 mutex_enter(&mi->mi_fileid_lock); 336 tmpvp = nfs4fidcollide(rp, mi); 337 mutex_exit(&mi->mi_fileid_lock); 338 if (tmpvp != NULL) { 339 /* 340 * We got a collision. 341 * badvp needs to be released, but not until 342 * after we drop the hash bucket lock. 343 * tmpvp is returned held. 344 */ 345 *badvp = vp; 346 vp = tmpvp; 347 } 348 } else if (! (vp->v_flag & VROOT)) { 349 /* 350 * Don't issue a warning for the root, because when 351 * we're creating the rootvp at mount time, we never 352 * have the fileid. 353 */ 354 NFS4_DEBUG(nfs4_fidcollide_debug, 355 (CE_NOTE, "rp %p: " 356 "cannot get fileid for duplicate check", 357 (void *) rp)); 358 } 359 } 360 361 return (vp); 362 } 363 364 void 365 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode, 366 hrtime_t t, cred_t *cr, int index) 367 { 368 vattr_t *attr; 369 /* 370 * Don't add to attrcache if time overflow, but 371 * no need to check because either attr is null or the time 372 * values in it were processed by nfs4_time_ntov(), which checks 373 * for time overflows. 374 */ 375 attr = garp ? &garp->n4g_va : NULL; 376 377 if (attr) { 378 if (!newnode) { 379 rw_exit(&rtable4[index].r_lock); 380 #ifdef DEBUG 381 if (vp->v_type != attr->va_type && 382 vp->v_type != VNON && attr->va_type != VNON) { 383 zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN, 384 "makenfs4node: type (%d) doesn't " 385 "match type of found node at %p (%d)", 386 attr->va_type, (void *)vp, vp->v_type); 387 } 388 #endif 389 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 390 } else { 391 rnode4_t *rp = VTOR4(vp); 392 393 vp->v_type = attr->va_type; 394 vp->v_rdev = attr->va_rdev; 395 396 /* 397 * Turn this object into a "stub" object if we 398 * crossed an underlying server fs boundary. To 399 * make this check, during mount we save the 400 * fsid of the server object being mounted. 401 * Here we compare this object's server fsid 402 * with the fsid we saved at mount. If they 403 * are different, we crossed server fs boundary. 404 * 405 * The stub flag is set (or not) at rnode 406 * creation time and it never changes for life 407 * of rnode. 408 * 409 * We don't bother with taking r_state_lock 410 * to set R4SRVSTUB flag because this is a new 411 * rnode and we're holding rtable lock. No other 412 * thread could have obtained access to this 413 * rnode. 414 */ 415 if (garp->n4g_fsid_valid) { 416 rp->r_srv_fsid = garp->n4g_fsid; 417 418 if (vp->v_type == VDIR) { 419 servinfo4_t *svp = rp->r_server; 420 421 (void) nfs_rw_enter_sig(&svp->sv_lock, 422 RW_READER, 0); 423 if (!FATTR4_FSID_EQ(&garp->n4g_fsid, 424 &svp->sv_fsid)) { 425 rp->r_flags |= R4SRVSTUB; 426 } 427 nfs_rw_exit(&svp->sv_lock); 428 } 429 } 430 431 /* Can not cache partial attr */ 432 if (attr->va_mask == AT_ALL) 433 nfs4_attrcache_noinval(vp, garp, t); 434 else 435 PURGE_ATTRCACHE4(vp); 436 437 rw_exit(&rtable4[index].r_lock); 438 } 439 } else { 440 if (newnode) { 441 PURGE_ATTRCACHE4(vp); 442 } 443 rw_exit(&rtable4[index].r_lock); 444 } 445 } 446 447 /* 448 * Find or create an rnode based primarily on filehandle. To be 449 * used when dvp (vnode for parent directory) is not available; 450 * otherwise, makenfs4node() should be used. 451 * 452 * The nfs4_fname_t argument *npp is consumed and nulled out. 
453 */ 454 455 vnode_t * 456 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh, 457 nfs4_fname_t **npp, nfs4_ga_res_t *garp, 458 mntinfo4_t *mi, cred_t *cr, hrtime_t t) 459 { 460 vfs_t *vfsp = mi->mi_vfsp; 461 int newnode = 0; 462 vnode_t *vp; 463 rnode4_t *rp; 464 svnode_t *svp; 465 nfs4_fname_t *name; 466 int index; 467 468 ASSERT(npp && *npp); 469 name = *npp; 470 *npp = NULL; 471 472 index = rtable4hash(sfh); 473 rw_enter(&rtable4[index].r_lock, RW_READER); 474 475 rp = r4find(&rtable4[index], sfh, vfsp); 476 if (rp != NULL) { 477 rw_exit(&rtable4[index].r_lock); 478 vp = RTOV4(rp); 479 fn_rele(&name); 480 return (vp); 481 } 482 483 vp = make_rnode4(sfh, &rtable4[index], vfsp, 484 nfs4_vnodeops, nfs4_putapage, &newnode, cr); 485 if (newnode) { 486 svp = vtosv(vp); 487 svp->sv_forw = svp->sv_back = svp; 488 svp->sv_name = name; 489 if (psfh != NULL) 490 sfh4_hold(psfh); 491 svp->sv_dfh = psfh; 492 } else { 493 fn_rele(&name); 494 } 495 496 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 497 r4_do_attrcache(vp, garp, newnode, t, cr, index); 498 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 499 500 return (vp); 501 } 502 503 /* 504 * Find or create a vnode for the given filehandle, filesystem, parent, and 505 * name. The reference to nm is consumed, so the caller must first do an 506 * fn_hold() if it wants to continue using nm after this call. 507 */ 508 vnode_t * 509 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp, 510 hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm) 511 { 512 vnode_t *vp; 513 vnode_t *badvp = NULL; 514 int newnode; 515 int index; 516 mntinfo4_t *mi = VFTOMI4(vfsp); 517 int had_badfh = 0; 518 rnode4_t *rp; 519 520 ASSERT(dvp != NULL); 521 522 fh = badrootfh_check(fh, nm, mi, &had_badfh); 523 524 index = rtable4hash(fh); 525 rw_enter(&rtable4[index].r_lock, RW_READER); 526 527 /* 528 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 529 */ 530 vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops, 531 nfs4_putapage, &newnode, cr); 532 533 /* 534 * Check for shared filehandle collisions. This only applies 535 * to servers with volatile filehandles. 536 */ 537 vp = shfh_collide_check(vp, &badvp, mi, garp); 538 rp = VTOR4(vp); 539 /* If we had a shfh collision... */ 540 if (badvp != NULL) { 541 int newindex; 542 nfs4_fname_t *tname = nm; 543 fn_hold(tname); 544 545 /* 546 * We must activate the shadow vnode, even though the 547 * rnode will be short-lived. This is because other 548 * things, especially things in inactive, 549 * assume that sv_dfh and sv_name are non-NULL. 550 */ 551 sv_activate(&badvp, dvp, &tname, newnode); 552 553 /* 554 * Since the vnode we're replacing badvp with already 555 * exists, it's not a newnode. 556 */ 557 newnode = 0; 558 559 /* check to see if we need a different hashq lock */ 560 newindex = rtable4hash(rp->r_fh); 561 if (newindex != index) { 562 rw_exit(&rtable4[index].r_lock); 563 rw_enter(&rtable4[newindex].r_lock, RW_READER); 564 index = newindex; 565 } 566 } 567 568 sv_activate(&vp, dvp, &nm, newnode); 569 if (dvp->v_flag & V_XATTRDIR) { 570 mutex_enter(&rp->r_statelock); 571 rp->r_flags |= R4ISXATTR; 572 mutex_exit(&rp->r_statelock); 573 } 574 575 /* if getting a bad file handle, do not cache the attributes. 
 */
	if (had_badfh) {
		rw_exit(&rtable4[index].r_lock);
		if (badvp != NULL) {
			VN_RELE(badvp);
		}
		return (vp);
	}

	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
	r4_do_attrcache(vp, garp, newnode, t, cr, index);
	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);

	/*
	 * If a shared filehandle collision occurred, release the newly
	 * created rnode (in favor of the extant one).
	 */
	if (badvp != NULL) {
		VN_RELE(badvp);
	}

	return (vp);
}

/*
 * Detect if there are any extant rnodes with the same fileid.  If
 * not, store this rnode in the table.
 *
 * Only call this if r_attr.va_nodeid is set with the correct fileid.
 *
 * Returns NULL if no collision; otherwise, returns the extant vnode that
 * has the same fileid as the one passed in.  The vnode is returned
 * held.
 */

vnode_t *
nfs4fidcollide(rnode4_t *rp, mntinfo4_t *mi)
{
	avl_index_t where;
	rnode4_t *conflict;
	vnode_t *rvp;

	ASSERT(RW_LOCK_HELD(&rp->r_hashq->r_lock));
	ASSERT(MUTEX_HELD(&mi->mi_fileid_lock));
	ASSERT(rp->r_attr.va_mask & AT_NODEID);

	conflict = avl_find(&mi->mi_fileid_map, rp, &where);

	if (conflict == rp)
		return (NULL);

	if (conflict == NULL) {
		avl_insert(&mi->mi_fileid_map, rp, where);
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4FILEIDMAP;
		mutex_exit(&rp->r_statelock);
		return (NULL);
	}

	NFS4_DEBUG(nfs4_fidcollide_debug, (CE_NOTE,
	    "nfs4fidcollide: fileid %lld remapping to rnode %p",
	    rp->r_attr.va_nodeid, (void *) conflict));
	rvp = RTOV4(conflict);
	VN_HOLD(rvp);
	return (rvp);
}

/*
 * Hash on address of filehandle object.
 * XXX totally untuned.
 */

int
rtable4hash(nfs4_sharedfh_t *fh)
{
	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
}

/*
 * Find or create the vnode for the given filehandle and filesystem.
 * *newnode is set to zero if the vnode already existed; non-zero if it had
 * to be created.
 *
 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
660 */ 661 662 static vnode_t * 663 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp, 664 struct vnodeops *vops, 665 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 666 int *newnode, cred_t *cr) 667 { 668 rnode4_t *rp; 669 rnode4_t *trp; 670 vnode_t *vp; 671 mntinfo4_t *mi; 672 673 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 674 675 mi = VFTOMI4(vfsp); 676 677 start: 678 if ((rp = r4find(rhtp, fh, vfsp)) != NULL) { 679 vp = RTOV4(rp); 680 *newnode = 0; 681 return (vp); 682 } 683 rw_exit(&rhtp->r_lock); 684 685 mutex_enter(&rp4freelist_lock); 686 687 if (rp4freelist != NULL && rnode4_new >= nrnode) { 688 rp = rp4freelist; 689 rp4_rmfree(rp); 690 mutex_exit(&rp4freelist_lock); 691 692 vp = RTOV4(rp); 693 694 if (rp->r_flags & R4FILEIDMAP) 695 rp4_fileid_map_remove(rp); 696 if (rp->r_flags & R4HASHED) { 697 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 698 mutex_enter(&vp->v_lock); 699 if (vp->v_count > 1) { 700 vp->v_count--; 701 mutex_exit(&vp->v_lock); 702 rw_exit(&rp->r_hashq->r_lock); 703 rw_enter(&rhtp->r_lock, RW_READER); 704 goto start; 705 } 706 mutex_exit(&vp->v_lock); 707 rp4_rmhash_locked(rp); 708 rw_exit(&rp->r_hashq->r_lock); 709 } 710 711 r4inactive(rp, cr); 712 713 mutex_enter(&vp->v_lock); 714 if (vp->v_count > 1) { 715 vp->v_count--; 716 mutex_exit(&vp->v_lock); 717 rw_enter(&rhtp->r_lock, RW_READER); 718 goto start; 719 } 720 mutex_exit(&vp->v_lock); 721 vn_invalid(vp); 722 723 /* 724 * destroy old locks before bzero'ing and 725 * recreating the locks below. 726 */ 727 uninit_rnode4(rp); 728 729 /* 730 * Make sure that if rnode is recycled then 731 * VFS count is decremented properly before 732 * reuse. 733 */ 734 VFS_RELE(vp->v_vfsp); 735 vn_reinit(vp); 736 } else { 737 vnode_t *new_vp; 738 739 mutex_exit(&rp4freelist_lock); 740 741 rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP); 742 new_vp = vn_alloc(KM_SLEEP); 743 744 atomic_add_long((ulong_t *)&rnode4_new, 1); 745 #ifdef DEBUG 746 clstat4_debug.nrnode.value.ui64++; 747 #endif 748 vp = new_vp; 749 } 750 751 bzero(rp, sizeof (*rp)); 752 rp->r_vnode = vp; 753 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 754 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 755 mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL); 756 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 757 mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL); 758 mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL); 759 rp->created_v4 = 0; 760 list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t), 761 offsetof(nfs4_open_stream_t, os_node)); 762 rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head; 763 rp->r_lo_head.lo_next_rnode = &rp->r_lo_head; 764 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 765 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 766 rp->r_flags = R4READDIRWATTR; 767 rp->r_fh = fh; 768 rp->r_hashq = rhtp; 769 sfh4_hold(rp->r_fh); 770 rp->r_server = mi->mi_curr_serv; 771 rp->r_deleg_type = OPEN_DELEGATE_NONE; 772 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE; 773 nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL); 774 775 rddir4_cache_create(rp); 776 rp->r_putapage = putapage; 777 vn_setops(vp, vops); 778 vp->v_data = (caddr_t)rp; 779 vp->v_vfsp = vfsp; 780 VFS_HOLD(vfsp); 781 vp->v_type = VNON; 782 if (isrootfh(fh, rp)) 783 vp->v_flag = VROOT; 784 vn_exists(vp); 785 786 /* 787 * There is a race condition if someone else 788 * alloc's the rnode while no locks are held, so we 789 * check again and recover if found. 
 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(trp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp4_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	rp4_addhash(rp);
	*newnode = 1;
	return (vp);
}

static void
uninit_rnode4(rnode4_t *rp)
{
	vnode_t *vp = RTOV4(rp);

	ASSERT(rp != NULL);
	ASSERT(vp != NULL);
	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_mapcnt == 0);
	if (rp->r_flags & R4LODANGLERS) {
		nfs4_flush_lock_owners(rp);
	}
	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
	ASSERT(!(rp->r_flags & R4HASHED));
	ASSERT(!(rp->r_flags & R4FILEIDMAP));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	nfs4_clear_open_streams(rp);
	list_destroy(&rp->r_open_streams);

	/*
	 * Destroy the rddir cache first since we need to grab the r_statelock.
	 */
	mutex_enter(&rp->r_statelock);
	rddir4_cache_destroy(rp);
	mutex_exit(&rp->r_statelock);
	sv_uninit(&rp->r_svnode);
	sfh4_rele(&rp->r_fh);
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	mutex_destroy(&rp->r_statev4_lock);
	mutex_destroy(&rp->r_os_lock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	nfs_rw_destroy(&rp->r_deleg_recall_lock);
	if (rp->r_flags & R4DELMAPLIST)
		list_destroy(&rp->r_indelmap);
}

/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4FILEIDMAP)
			rp4_fileid_map_remove(rp);
		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 *
		 * Another way for a reference to be acquired
		 * is through the mi_fileid_map, which is used for
		 * detecting and correcting shared fh collisions.
		 * A race between this thread and the one using
		 * mi_fileid_map would have blocked us, above, when
		 * we called rp4_fileid_map_remove() and needed the
		 * mi_fileid_lock mutex.  By the time the other thread
		 * released that mutex, it would have done a VN_HOLD(),
		 * which we check for here.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are no more references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		if (curproc->p_zone != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached; the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.
1002 */ 1003 xattr = rp->r_xattr_dir; 1004 rp->r_xattr_dir = NULL; 1005 1006 /* 1007 * If there is no cached data or metadata for this file, then 1008 * put the rnode on the front of the freelist so that it will 1009 * be reused before other rnodes which may have cached data or 1010 * metadata associated with them. 1011 */ 1012 mutex_enter(&rp4freelist_lock); 1013 if (rp4freelist == NULL) { 1014 rp->r_freef = rp; 1015 rp->r_freeb = rp; 1016 rp4freelist = rp; 1017 } else { 1018 rp->r_freef = rp4freelist; 1019 rp->r_freeb = rp4freelist->r_freeb; 1020 rp4freelist->r_freeb->r_freef = rp; 1021 rp4freelist->r_freeb = rp; 1022 if (!nfs4_has_pages(vp) && rp->r_dir == NULL && 1023 rp->r_symlink.contents == NULL && 1024 rp->r_secattr == NULL) 1025 rp4freelist = rp; 1026 } 1027 mutex_exit(&rp4freelist_lock); 1028 1029 rw_exit(&rp->r_hashq->r_lock); 1030 1031 if (xattr) 1032 VN_RELE(xattr); 1033 } 1034 1035 /* 1036 * Remove an rnode from the free list. 1037 * 1038 * The caller must be holding rp4freelist_lock and the rnode 1039 * must be on the freelist. 1040 */ 1041 static void 1042 rp4_rmfree(rnode4_t *rp) 1043 { 1044 1045 ASSERT(MUTEX_HELD(&rp4freelist_lock)); 1046 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 1047 1048 if (rp == rp4freelist) { 1049 rp4freelist = rp->r_freef; 1050 if (rp == rp4freelist) 1051 rp4freelist = NULL; 1052 } 1053 rp->r_freeb->r_freef = rp->r_freef; 1054 rp->r_freef->r_freeb = rp->r_freeb; 1055 1056 rp->r_freef = rp->r_freeb = NULL; 1057 } 1058 1059 /* 1060 * Put a rnode in the hash table. 1061 * 1062 * The caller must be holding the exclusive hash queue lock 1063 */ 1064 void 1065 rp4_addhash(rnode4_t *rp) 1066 { 1067 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 1068 ASSERT(!(rp->r_flags & R4HASHED)); 1069 1070 #ifdef DEBUG 1071 r4_dup_check(rp, RTOV4(rp)->v_vfsp); 1072 #endif 1073 1074 rp->r_hashf = rp->r_hashq->r_hashf; 1075 rp->r_hashq->r_hashf = rp; 1076 rp->r_hashb = (rnode4_t *)rp->r_hashq; 1077 rp->r_hashf->r_hashb = rp; 1078 1079 mutex_enter(&rp->r_statelock); 1080 rp->r_flags |= R4HASHED; 1081 mutex_exit(&rp->r_statelock); 1082 } 1083 1084 /* 1085 * Remove a rnode from the hash table. 1086 * 1087 * The caller must be holding the hash queue lock. 1088 */ 1089 void 1090 rp4_rmhash_locked(rnode4_t *rp) 1091 { 1092 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 1093 ASSERT(rp->r_flags & R4HASHED); 1094 1095 rp->r_hashb->r_hashf = rp->r_hashf; 1096 rp->r_hashf->r_hashb = rp->r_hashb; 1097 1098 mutex_enter(&rp->r_statelock); 1099 rp->r_flags &= ~R4HASHED; 1100 mutex_exit(&rp->r_statelock); 1101 } 1102 1103 /* 1104 * Remove a rnode from the hash table. 1105 * 1106 * The caller must not be holding the hash queue lock. 
1107 */ 1108 void 1109 rp4_rmhash(rnode4_t *rp) 1110 { 1111 1112 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1113 rp4_rmhash_locked(rp); 1114 rw_exit(&rp->r_hashq->r_lock); 1115 } 1116 1117 /* 1118 * fileid map routines 1119 */ 1120 1121 void 1122 rp4_fileid_map_init(avl_tree_t *map) 1123 { 1124 avl_create(map, rp4_fileid_cmp, sizeof (rnode4_t), 1125 offsetof(rnode4_t, r_fileid_map)); 1126 } 1127 1128 int 1129 rp4_fileid_cmp(const void *p1, const void *p2) 1130 { 1131 const rnode4_t *rp1 = (const rnode4_t *) p1; 1132 const rnode4_t *rp2 = (const rnode4_t *) p2; 1133 1134 if (rp1->r_attr.va_nodeid < rp2->r_attr.va_nodeid) 1135 return (-1); 1136 if (rp1->r_attr.va_nodeid > rp2->r_attr.va_nodeid) 1137 return (1); 1138 return (0); 1139 } 1140 1141 void 1142 rp4_fileid_map_remove(rnode4_t *rp) 1143 { 1144 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 1145 ASSERT(rp->r_flags & R4FILEIDMAP); 1146 1147 mutex_enter(&mi->mi_fileid_lock); 1148 mutex_enter(&rp->r_statelock); 1149 avl_remove(&mi->mi_fileid_map, rp); 1150 rp->r_flags &= ~R4FILEIDMAP; 1151 mutex_exit(&rp->r_statelock); 1152 mutex_exit(&mi->mi_fileid_lock); 1153 } 1154 1155 void 1156 destroy_fileid_map(struct vfs *vfsp) 1157 { 1158 mntinfo4_t *mi; 1159 rnode4_t *rp; 1160 void *cookie = NULL; 1161 1162 if (vfsp == NULL) 1163 return; 1164 1165 mi = VFTOMI4(vfsp); 1166 1167 /* 1168 * We cannot assert that any locks (e.g. hash bucket, free list) are 1169 * held. 1170 */ 1171 1172 mutex_enter(&mi->mi_fileid_lock); 1173 while ((rp = avl_destroy_nodes(&mi->mi_fileid_map, &cookie)) != NULL) { 1174 mutex_enter(&rp->r_statelock); 1175 rp->r_flags &= ~R4FILEIDMAP; 1176 mutex_exit(&rp->r_statelock); 1177 } 1178 mutex_exit(&mi->mi_fileid_lock); 1179 } 1180 1181 /* 1182 * Lookup a rnode by fhandle. Ignores rnodes that had failed recovery. 1183 * Returns NULL if no match. If an rnode is returned, the reference count 1184 * on the master vnode is incremented. 1185 * 1186 * The caller must be holding the hash queue lock, either shared or exclusive. 1187 */ 1188 rnode4_t * 1189 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp) 1190 { 1191 rnode4_t *rp; 1192 vnode_t *vp; 1193 1194 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 1195 1196 for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) { 1197 vp = RTOV4(rp); 1198 if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) { 1199 1200 mutex_enter(&rp->r_statelock); 1201 if (rp->r_flags & R4RECOVERR) { 1202 mutex_exit(&rp->r_statelock); 1203 continue; 1204 } 1205 mutex_exit(&rp->r_statelock); 1206 #ifdef DEBUG 1207 r4_dup_check(rp, vfsp); 1208 #endif 1209 if (rp->r_freef != NULL) { 1210 mutex_enter(&rp4freelist_lock); 1211 /* 1212 * If the rnode is on the freelist, 1213 * then remove it and use that reference 1214 * as the new reference. Otherwise, 1215 * need to increment the reference count. 1216 */ 1217 if (rp->r_freef != NULL) { 1218 rp4_rmfree(rp); 1219 mutex_exit(&rp4freelist_lock); 1220 } else { 1221 mutex_exit(&rp4freelist_lock); 1222 VN_HOLD(vp); 1223 } 1224 } else 1225 VN_HOLD(vp); 1226 1227 /* 1228 * if root vnode, set v_flag to indicate that 1229 */ 1230 if (isrootfh(fh, rp)) { 1231 if (!(vp->v_flag & VROOT)) { 1232 mutex_enter(&vp->v_lock); 1233 vp->v_flag |= VROOT; 1234 mutex_exit(&vp->v_lock); 1235 } 1236 } 1237 return (rp); 1238 } 1239 } 1240 return (NULL); 1241 } 1242 1243 /* 1244 * Lookup an rnode by fhandle. Just a wrapper for r4find() 1245 * that assumes the caller hasn't already got the lock 1246 * on the hash bucket. 
 */
rnode4_t *
r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
	rnode4_t *rp;
	int index;

	index = rtable4hash(fh);
	rw_enter(&rtable4[index].r_lock, RW_READER);
	rp = r4find(&rtable4[index], fh, vfsp);
	rw_exit(&rtable4[index].r_lock);

	return (rp);
}

/*
 * Return 1 if there is an active vnode belonging to this vfs in the
 * rtable4 cache.
 *
 * Several of these checks are done without holding the usual
 * locks.  This is safe because destroy_rtable(), rp_addfree(),
 * etc. will redo the necessary checks before actually destroying
 * any rnodes.
 */
int
check_rtable4(struct vfs *vfsp)
{
	rnode4_t *rp;
	vnode_t *vp;
	char *busy = NULL;
	int index;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);

		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			vp = RTOV4(rp);
			if (vp->v_vfsp == vfsp) {
				if (rp->r_freef == NULL) {
					busy = "not on free list";
				} else if (nfs4_has_pages(vp) &&
				    (rp->r_flags & R4DIRTY)) {
					busy = "dirty pages";
				} else if (rp->r_count > 0) {
					busy = "r_count > 0";
				}

				if (busy != NULL) {
#ifdef DEBUG
					char *path;

					path = fn_path(rp->r_svnode.sv_name);
					NFS4_DEBUG(nfs4_rnode_debug,
					    (CE_NOTE, "check_rtable4: " "%s %s",
					    path, busy));
					kmem_free(path, strlen(path)+1);
#endif
					rw_exit(&rtable4[index].r_lock);
					return (1);
				}
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (0);
}

/*
 * Destroy inactive vnodes from the hash queues which
 * belong to this vfs.  All of the vnodes should be inactive.
 * It is essential that we destroy all rnodes in case of
 * forced unmount as well as in the normal unmount case.
 */

void
destroy_rtable4(struct vfs *vfsp, cred_t *cr)
{
	int index;
	vnode_t *vp;
	rnode4_t *rp, *r_hashf, *rlist;

	rlist = NULL;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_WRITER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = r_hashf) {
			/* save the hash pointer before destroying */
			r_hashf = rp->r_hashf;

			vp = RTOV4(rp);
			if (vp->v_vfsp == vfsp) {
				mutex_enter(&rp4freelist_lock);
				if (rp->r_freef != NULL) {
					rp4_rmfree(rp);
					mutex_exit(&rp4freelist_lock);
					rp4_rmhash_locked(rp);
					rp->r_hashf = rlist;
					rlist = rp;
				} else
					mutex_exit(&rp4freelist_lock);
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}

	for (rp = rlist; rp != NULL; rp = r_hashf) {
		r_hashf = rp->r_hashf;
		/*
		 * This call to rp4_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp4_addfree(rp, cr);
	}
}

/*
 * This routine destroys all the resources of an rnode
 * and finally the rnode itself.
1371 */ 1372 static void 1373 destroy_rnode4(rnode4_t *rp) 1374 { 1375 vnode_t *vp; 1376 vfs_t *vfsp; 1377 1378 ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE); 1379 1380 vp = RTOV4(rp); 1381 vfsp = vp->v_vfsp; 1382 1383 uninit_rnode4(rp); 1384 atomic_add_long((ulong_t *)&rnode4_new, -1); 1385 #ifdef DEBUG 1386 clstat4_debug.nrnode.value.ui64--; 1387 #endif 1388 kmem_cache_free(rnode4_cache, rp); 1389 vn_invalid(vp); 1390 vn_free(vp); 1391 VFS_RELE(vfsp); 1392 } 1393 1394 /* 1395 * Invalidate the attributes on all rnodes forcing the next getattr 1396 * to go over the wire. Used to flush stale uid and gid mappings. 1397 * Maybe done on a per vfsp, or all rnodes (vfsp == NULL) 1398 */ 1399 void 1400 nfs4_rnode_invalidate(struct vfs *vfsp) 1401 { 1402 int index; 1403 rnode4_t *rp; 1404 vnode_t *vp; 1405 1406 /* 1407 * Walk the hash queues looking for rnodes. 1408 */ 1409 for (index = 0; index < rtable4size; index++) { 1410 rw_enter(&rtable4[index].r_lock, RW_READER); 1411 for (rp = rtable4[index].r_hashf; 1412 rp != (rnode4_t *)(&rtable4[index]); 1413 rp = rp->r_hashf) { 1414 vp = RTOV4(rp); 1415 if (vfsp != NULL && vp->v_vfsp != vfsp) 1416 continue; 1417 1418 if (!mutex_tryenter(&rp->r_statelock)) 1419 continue; 1420 1421 /* 1422 * Expire the attributes by resetting the change 1423 * and attr timeout. 1424 */ 1425 rp->r_change = 0; 1426 PURGE_ATTRCACHE4_LOCKED(rp); 1427 mutex_exit(&rp->r_statelock); 1428 } 1429 rw_exit(&rtable4[index].r_lock); 1430 } 1431 } 1432 1433 /* 1434 * Flush all vnodes in this (or every) vfs. 1435 * Used by nfs_sync and by nfs_unmount. 1436 */ 1437 void 1438 r4flush(struct vfs *vfsp, cred_t *cr) 1439 { 1440 int index; 1441 rnode4_t *rp; 1442 vnode_t *vp, **vplist; 1443 long num, cnt; 1444 1445 /* 1446 * Check to see whether there is anything to do. 1447 */ 1448 num = rnode4_new; 1449 if (num == 0) 1450 return; 1451 1452 /* 1453 * Allocate a slot for all currently active rnodes on the 1454 * supposition that they all may need flushing. 1455 */ 1456 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 1457 cnt = 0; 1458 1459 /* 1460 * Walk the hash queues looking for rnodes with page 1461 * lists associated with them. Make a list of these 1462 * files. 1463 */ 1464 for (index = 0; index < rtable4size; index++) { 1465 rw_enter(&rtable4[index].r_lock, RW_READER); 1466 for (rp = rtable4[index].r_hashf; 1467 rp != (rnode4_t *)(&rtable4[index]); 1468 rp = rp->r_hashf) { 1469 vp = RTOV4(rp); 1470 /* 1471 * Don't bother sync'ing a vp if it 1472 * is part of virtual swap device or 1473 * if VFS is read-only 1474 */ 1475 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 1476 continue; 1477 /* 1478 * If flushing all mounted file systems or 1479 * the vnode belongs to this vfs, has pages 1480 * and is marked as either dirty or mmap'd, 1481 * hold and add this vnode to the list of 1482 * vnodes to flush. 1483 */ 1484 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 1485 nfs4_has_pages(vp) && 1486 ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) { 1487 VN_HOLD(vp); 1488 vplist[cnt++] = vp; 1489 if (cnt == num) { 1490 rw_exit(&rtable4[index].r_lock); 1491 goto toomany; 1492 } 1493 } 1494 } 1495 rw_exit(&rtable4[index].r_lock); 1496 } 1497 toomany: 1498 1499 /* 1500 * Flush and release all of the files on the list. 1501 */ 1502 while (cnt-- > 0) { 1503 vp = vplist[cnt]; 1504 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr); 1505 VN_RELE(vp); 1506 } 1507 1508 /* 1509 * Free the space allocated to hold the list. 
1510 */ 1511 kmem_free(vplist, num * sizeof (*vplist)); 1512 } 1513 1514 int 1515 nfs4_free_data_reclaim(rnode4_t *rp) 1516 { 1517 char *contents; 1518 vnode_t *xattr; 1519 int size; 1520 vsecattr_t *vsp; 1521 int freed; 1522 bool_t rdc = FALSE; 1523 1524 /* 1525 * Free any held caches which may 1526 * be associated with this rnode. 1527 */ 1528 mutex_enter(&rp->r_statelock); 1529 if (rp->r_dir != NULL) 1530 rdc = TRUE; 1531 contents = rp->r_symlink.contents; 1532 size = rp->r_symlink.size; 1533 rp->r_symlink.contents = NULL; 1534 vsp = rp->r_secattr; 1535 rp->r_secattr = NULL; 1536 xattr = rp->r_xattr_dir; 1537 rp->r_xattr_dir = NULL; 1538 mutex_exit(&rp->r_statelock); 1539 1540 /* 1541 * Free the access cache entries. 1542 */ 1543 freed = nfs4_access_purge_rp(rp); 1544 1545 if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL) 1546 return (freed); 1547 1548 /* 1549 * Free the readdir cache entries, incompletely if we can't block. 1550 */ 1551 nfs4_purge_rddir_cache(RTOV4(rp)); 1552 1553 /* 1554 * Free the symbolic link cache. 1555 */ 1556 if (contents != NULL) { 1557 1558 kmem_free((void *)contents, size); 1559 } 1560 1561 /* 1562 * Free any cached ACL. 1563 */ 1564 if (vsp != NULL) 1565 nfs4_acl_free_cache(vsp); 1566 1567 /* 1568 * Release the xattr directory vnode 1569 */ 1570 if (xattr != NULL) 1571 VN_RELE(xattr); 1572 1573 return (1); 1574 } 1575 1576 static int 1577 nfs4_active_data_reclaim(rnode4_t *rp) 1578 { 1579 char *contents; 1580 vnode_t *xattr; 1581 int size; 1582 vsecattr_t *vsp; 1583 int freed; 1584 bool_t rdc = FALSE; 1585 1586 /* 1587 * Free any held credentials and caches which 1588 * may be associated with this rnode. 1589 */ 1590 if (!mutex_tryenter(&rp->r_statelock)) 1591 return (0); 1592 contents = rp->r_symlink.contents; 1593 size = rp->r_symlink.size; 1594 rp->r_symlink.contents = NULL; 1595 vsp = rp->r_secattr; 1596 rp->r_secattr = NULL; 1597 if (rp->r_dir != NULL) 1598 rdc = TRUE; 1599 xattr = rp->r_xattr_dir; 1600 rp->r_xattr_dir = NULL; 1601 mutex_exit(&rp->r_statelock); 1602 1603 /* 1604 * Free the access cache entries. 1605 */ 1606 freed = nfs4_access_purge_rp(rp); 1607 1608 if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL) 1609 return (freed); 1610 1611 /* 1612 * Free the symbolic link cache. 1613 */ 1614 if (contents != NULL) { 1615 1616 kmem_free((void *)contents, size); 1617 } 1618 1619 /* 1620 * Free any cached ACL. 
1621 */ 1622 if (vsp != NULL) 1623 nfs4_acl_free_cache(vsp); 1624 1625 nfs4_purge_rddir_cache(RTOV4(rp)); 1626 1627 /* 1628 * Release the xattr directory vnode 1629 */ 1630 if (xattr != NULL) 1631 VN_RELE(xattr); 1632 1633 return (1); 1634 } 1635 1636 static int 1637 nfs4_free_reclaim(void) 1638 { 1639 int freed; 1640 rnode4_t *rp; 1641 1642 #ifdef DEBUG 1643 clstat4_debug.f_reclaim.value.ui64++; 1644 #endif 1645 freed = 0; 1646 mutex_enter(&rp4freelist_lock); 1647 rp = rp4freelist; 1648 if (rp != NULL) { 1649 do { 1650 if (nfs4_free_data_reclaim(rp)) 1651 freed = 1; 1652 } while ((rp = rp->r_freef) != rp4freelist); 1653 } 1654 mutex_exit(&rp4freelist_lock); 1655 return (freed); 1656 } 1657 1658 static int 1659 nfs4_active_reclaim(void) 1660 { 1661 int freed; 1662 int index; 1663 rnode4_t *rp; 1664 1665 #ifdef DEBUG 1666 clstat4_debug.a_reclaim.value.ui64++; 1667 #endif 1668 freed = 0; 1669 for (index = 0; index < rtable4size; index++) { 1670 rw_enter(&rtable4[index].r_lock, RW_READER); 1671 for (rp = rtable4[index].r_hashf; 1672 rp != (rnode4_t *)(&rtable4[index]); 1673 rp = rp->r_hashf) { 1674 if (nfs4_active_data_reclaim(rp)) 1675 freed = 1; 1676 } 1677 rw_exit(&rtable4[index].r_lock); 1678 } 1679 return (freed); 1680 } 1681 1682 static int 1683 nfs4_rnode_reclaim(void) 1684 { 1685 int freed; 1686 rnode4_t *rp; 1687 vnode_t *vp; 1688 1689 #ifdef DEBUG 1690 clstat4_debug.r_reclaim.value.ui64++; 1691 #endif 1692 freed = 0; 1693 mutex_enter(&rp4freelist_lock); 1694 while ((rp = rp4freelist) != NULL) { 1695 rp4_rmfree(rp); 1696 mutex_exit(&rp4freelist_lock); 1697 if (rp->r_flags & R4FILEIDMAP) 1698 rp4_fileid_map_remove(rp); 1699 if (rp->r_flags & R4HASHED) { 1700 vp = RTOV4(rp); 1701 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1702 mutex_enter(&vp->v_lock); 1703 if (vp->v_count > 1) { 1704 vp->v_count--; 1705 mutex_exit(&vp->v_lock); 1706 rw_exit(&rp->r_hashq->r_lock); 1707 mutex_enter(&rp4freelist_lock); 1708 continue; 1709 } 1710 mutex_exit(&vp->v_lock); 1711 rp4_rmhash_locked(rp); 1712 rw_exit(&rp->r_hashq->r_lock); 1713 } 1714 /* 1715 * This call to rp_addfree will end up destroying the 1716 * rnode, but in a safe way with the appropriate set 1717 * of checks done. 1718 */ 1719 rp4_addfree(rp, CRED()); 1720 mutex_enter(&rp4freelist_lock); 1721 } 1722 mutex_exit(&rp4freelist_lock); 1723 return (freed); 1724 } 1725 1726 /*ARGSUSED*/ 1727 static void 1728 nfs4_reclaim(void *cdrarg) 1729 { 1730 1731 #ifdef DEBUG 1732 clstat4_debug.reclaim.value.ui64++; 1733 #endif 1734 if (nfs4_free_reclaim()) 1735 return; 1736 1737 if (nfs4_active_reclaim()) 1738 return; 1739 1740 (void) nfs4_rnode_reclaim(); 1741 } 1742 1743 /* 1744 * Returns the clientid4 to use for the given mntinfo4. Note that the 1745 * clientid can change if the caller drops mi_recovlock. 1746 */ 1747 1748 clientid4 1749 mi2clientid(mntinfo4_t *mi) 1750 { 1751 nfs4_server_t *sp; 1752 clientid4 clientid = 0; 1753 1754 /* this locks down sp if it is found */ 1755 sp = find_nfs4_server(mi); 1756 if (sp != NULL) { 1757 clientid = sp->clientid; 1758 mutex_exit(&sp->s_lock); 1759 nfs4_server_rele(sp); 1760 } 1761 return (clientid); 1762 } 1763 1764 /* 1765 * Return the current lease time for the server associated with the given 1766 * file. Note that the lease time could change immediately after this 1767 * call. 
1768 */ 1769 1770 time_t 1771 r2lease_time(rnode4_t *rp) 1772 { 1773 nfs4_server_t *sp; 1774 time_t lease_time; 1775 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 1776 1777 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0); 1778 1779 /* this locks down sp if it is found */ 1780 sp = find_nfs4_server(VTOMI4(RTOV4(rp))); 1781 1782 if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1783 if (sp != NULL) { 1784 mutex_exit(&sp->s_lock); 1785 nfs4_server_rele(sp); 1786 } 1787 nfs_rw_exit(&mi->mi_recovlock); 1788 return (1); /* 1 second */ 1789 } 1790 1791 ASSERT(sp != NULL); 1792 1793 lease_time = sp->s_lease_time; 1794 1795 mutex_exit(&sp->s_lock); 1796 nfs4_server_rele(sp); 1797 nfs_rw_exit(&mi->mi_recovlock); 1798 1799 return (lease_time); 1800 } 1801 1802 /* 1803 * Return a list with information about all the known open instances for 1804 * a filesystem. The caller must call r4releopenlist() when done with the 1805 * list. 1806 * 1807 * We are safe at looking at os_valid and os_pending_close across dropping 1808 * the 'os_sync_lock' to count up the number of open streams and then 1809 * allocate memory for the osp list due to: 1810 * -Looking at os_pending_close is safe since this routine is 1811 * only called via recovery, and os_pending_close can only be set via 1812 * a non-recovery operation (which are all blocked when recovery 1813 * is active). 1814 * 1815 * -Examining os_valid is safe since non-recovery operations, which 1816 * could potentially switch os_valid to 0, are blocked (via 1817 * nfs4_start_fop) and recovery is single-threaded per mntinfo4_t 1818 * (which means we are the only recovery thread potentially acting 1819 * on this open stream). 1820 */ 1821 1822 nfs4_opinst_t * 1823 r4mkopenlist(mntinfo4_t *mi) 1824 { 1825 nfs4_opinst_t *reopenlist, *rep; 1826 rnode4_t *rp; 1827 vnode_t *vp; 1828 vfs_t *vfsp = mi->mi_vfsp; 1829 int numosp; 1830 nfs4_open_stream_t *osp; 1831 int index; 1832 open_delegation_type4 dtype; 1833 int hold_vnode; 1834 1835 reopenlist = NULL; 1836 1837 for (index = 0; index < rtable4size; index++) { 1838 rw_enter(&rtable4[index].r_lock, RW_READER); 1839 for (rp = rtable4[index].r_hashf; 1840 rp != (rnode4_t *)(&rtable4[index]); 1841 rp = rp->r_hashf) { 1842 1843 vp = RTOV4(rp); 1844 if (vp->v_vfsp != vfsp) 1845 continue; 1846 hold_vnode = 0; 1847 1848 mutex_enter(&rp->r_os_lock); 1849 1850 /* Count the number of valid open_streams of the file */ 1851 numosp = 0; 1852 for (osp = list_head(&rp->r_open_streams); osp != NULL; 1853 osp = list_next(&rp->r_open_streams, osp)) { 1854 mutex_enter(&osp->os_sync_lock); 1855 if (osp->os_valid && !osp->os_pending_close) 1856 numosp++; 1857 mutex_exit(&osp->os_sync_lock); 1858 } 1859 1860 /* Fill in the valid open streams per vp */ 1861 if (numosp > 0) { 1862 int j; 1863 1864 hold_vnode = 1; 1865 1866 /* 1867 * Add a new open instance to the list 1868 */ 1869 rep = kmem_zalloc(sizeof (*reopenlist), 1870 KM_SLEEP); 1871 rep->re_next = reopenlist; 1872 reopenlist = rep; 1873 1874 rep->re_vp = vp; 1875 rep->re_osp = kmem_zalloc( 1876 numosp * sizeof (*(rep->re_osp)), 1877 KM_SLEEP); 1878 rep->re_numosp = numosp; 1879 1880 j = 0; 1881 for (osp = list_head(&rp->r_open_streams); 1882 osp != NULL; 1883 osp = list_next(&rp->r_open_streams, osp)) { 1884 1885 mutex_enter(&osp->os_sync_lock); 1886 if (osp->os_valid && 1887 !osp->os_pending_close) { 1888 osp->os_ref_count++; 1889 rep->re_osp[j] = osp; 1890 j++; 1891 } 1892 mutex_exit(&osp->os_sync_lock); 1893 } 1894 /* 1895 * Assuming valid osp(s) stays valid between 1896 * 
the time obtaining j and numosp. 1897 */ 1898 ASSERT(j == numosp); 1899 } 1900 1901 mutex_exit(&rp->r_os_lock); 1902 /* do this here to keep v_lock > r_os_lock */ 1903 if (hold_vnode) 1904 VN_HOLD(vp); 1905 mutex_enter(&rp->r_statev4_lock); 1906 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 1907 /* 1908 * If this rnode holds a delegation, 1909 * but if there are no valid open streams, 1910 * then just discard the delegation 1911 * without doing delegreturn. 1912 */ 1913 if (numosp > 0) 1914 rp->r_deleg_needs_recovery = 1915 rp->r_deleg_type; 1916 } 1917 /* Save the delegation type for use outside the lock */ 1918 dtype = rp->r_deleg_type; 1919 mutex_exit(&rp->r_statev4_lock); 1920 1921 /* 1922 * If we have a delegation then get rid of it. 1923 * We've set rp->r_deleg_needs_recovery so we have 1924 * enough information to recover. 1925 */ 1926 if (dtype != OPEN_DELEGATE_NONE) { 1927 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 1928 } 1929 } 1930 rw_exit(&rtable4[index].r_lock); 1931 } 1932 return (reopenlist); 1933 } 1934 1935 /* 1936 * Release the list of open instance references. 1937 */ 1938 1939 void 1940 r4releopenlist(nfs4_opinst_t *reopenp) 1941 { 1942 nfs4_opinst_t *rep, *next; 1943 int i; 1944 1945 for (rep = reopenp; rep; rep = next) { 1946 next = rep->re_next; 1947 1948 for (i = 0; i < rep->re_numosp; i++) 1949 open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp)); 1950 1951 VN_RELE(rep->re_vp); 1952 kmem_free(rep->re_osp, 1953 rep->re_numosp * sizeof (*(rep->re_osp))); 1954 1955 kmem_free(rep, sizeof (*rep)); 1956 } 1957 } 1958 1959 int 1960 nfs4_rnode_init(void) 1961 { 1962 ulong_t nrnode4_max; 1963 int i; 1964 1965 /* 1966 * Compute the size of the rnode4 hash table 1967 */ 1968 if (nrnode <= 0) 1969 nrnode = ncsize; 1970 nrnode4_max = 1971 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4)); 1972 if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) { 1973 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 1974 "setting nrnode to max value of %ld", nrnode4_max); 1975 nrnode = nrnode4_max; 1976 } 1977 rtable4size = 1 << highbit(nrnode / rnode4_hashlen); 1978 rtable4mask = rtable4size - 1; 1979 1980 /* 1981 * Allocate and initialize the hash buckets 1982 */ 1983 rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP); 1984 for (i = 0; i < rtable4size; i++) { 1985 rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]); 1986 rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]); 1987 rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL); 1988 } 1989 1990 rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t), 1991 0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0); 1992 1993 return (0); 1994 } 1995 1996 int 1997 nfs4_rnode_fini(void) 1998 { 1999 int i; 2000 2001 /* 2002 * Deallocate the rnode hash queues 2003 */ 2004 kmem_cache_destroy(rnode4_cache); 2005 2006 for (i = 0; i < rtable4size; i++) 2007 rw_destroy(&rtable4[i].r_lock); 2008 2009 kmem_free(rtable4, rtable4size * sizeof (*rtable4)); 2010 2011 return (0); 2012 } 2013 2014 /* 2015 * Return non-zero if the given filehandle refers to the root filehandle 2016 * for the given rnode. 2017 */ 2018 2019 static int 2020 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp) 2021 { 2022 int isroot; 2023 2024 isroot = 0; 2025 if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh)) 2026 isroot = 1; 2027 2028 return (isroot); 2029 } 2030 2031 #ifdef DEBUG 2032 2033 /* 2034 * Look in the rnode table for other rnodes that have the same filehandle. 
2035 * Assume the lock is held for the hash chain of checkrp 2036 */ 2037 2038 static void 2039 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp) 2040 { 2041 rnode4_t *rp; 2042 vnode_t *tvp; 2043 nfs4_fhandle_t fh, fh2; 2044 int index; 2045 2046 if (!r4_check_for_dups) 2047 return; 2048 2049 ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock)); 2050 2051 sfh4_copyval(checkrp->r_fh, &fh); 2052 2053 for (index = 0; index < rtable4size; index++) { 2054 2055 if (&rtable4[index] != checkrp->r_hashq) 2056 rw_enter(&rtable4[index].r_lock, RW_READER); 2057 2058 for (rp = rtable4[index].r_hashf; 2059 rp != (rnode4_t *)(&rtable4[index]); 2060 rp = rp->r_hashf) { 2061 2062 if (rp == checkrp) 2063 continue; 2064 2065 tvp = RTOV4(rp); 2066 if (tvp->v_vfsp != vfsp) 2067 continue; 2068 2069 sfh4_copyval(rp->r_fh, &fh2); 2070 if (nfs4cmpfhandle(&fh, &fh2) == 0) { 2071 cmn_err(CE_PANIC, "rnodes with same fs, fh " 2072 "(%p, %p)", (void *)checkrp, (void *)rp); 2073 } 2074 } 2075 2076 if (&rtable4[index] != checkrp->r_hashq) 2077 rw_exit(&rtable4[index].r_lock); 2078 } 2079 } 2080 2081 #endif /* DEBUG */ 2082