/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All Rights Reserved
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/flock.h>
#include <sys/callb.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/rpcsec_gss.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information that can be reused
 * about the file.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock -> r_statelock
 */
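
/*
 * File-scope state for the NFSv4 rnode cache (a brief summary; the locking
 * rules are described in the block comment above):
 *
 *	rtable4/rtable4size/rtable4mask - the hash buckets, their count,
 *		and the mask used by rtable4hash().
 *	rp4freelist/rp4freelist_lock - circular list of unreferenced but
 *		still cached rnodes, and the mutex protecting it.
 *	rnode4_new - number of rnodes currently allocated; compared against
 *		the nrnode tunable to decide whether to recycle an rnode
 *		from the freelist or allocate a fresh one.
 *	rnode4_cache - kmem cache from which rnodes are allocated.
 */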
r4hashq_t *rtable4;

static kmutex_t rp4freelist_lock;
static rnode4_t *rp4freelist = NULL;
static long rnode4_new = 0;
int rtable4size;
static int rtable4mask;
static struct kmem_cache *rnode4_cache;
static int rnode4_hashlen = 4;

static void r4inactive(rnode4_t *, cred_t *);
static vnode_t *make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
	struct vnodeops *,
	int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
	cred_t *),
	int *, cred_t *);
static void rp4_rmfree(rnode4_t *);
int nfs4_free_data_reclaim(rnode4_t *);
static int nfs4_active_data_reclaim(rnode4_t *);
static int nfs4_free_reclaim(void);
static int nfs4_active_reclaim(void);
static int nfs4_rnode_reclaim(void);
static void nfs4_reclaim(void *);
static int isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void uninit_rnode4(rnode4_t *);
static void destroy_rnode4(rnode4_t *);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void r4_dup_check(rnode4_t *, vfs_t *);
#endif

/*
 * If the vnode has pages, run the list and check for any that are
 * still dangling.  We call this routine before putting an rnode on
 * the free list.
 */
static int
nfs4_dross_pages(vnode_t *vp)
{
	page_t *pp;
	kmutex_t *vphm;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if ((pp = vp->v_pages) != NULL) {
		do {
			if (pp->p_fsdata != C_NOCOMMIT) {
				mutex_exit(vphm);
				return (1);
			}
		} while ((pp = pp->p_vpnext) != vp->v_pages);
	}
	mutex_exit(vphm);

	return (0);
}

/*
 * Flush any pages left on this rnode.
 */
static void
r4flushpages(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	int error;

	/*
	 * Before freeing anything, wait until all asynchronous
	 * activity is done on this rnode.  This will allow all
	 * asynchronous read ahead and write behind i/o's to
	 * finish.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_count > 0)
		cv_wait(&rp->r_cv, &rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	/*
	 * Flush and invalidate all pages associated with the vnode.
	 */
	vp = RTOV4(rp);
	if (nfs4_has_pages(vp)) {
		ASSERT(vp->v_type != VCHR);
		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr);
			if (error && (error == ENOSPC || error == EDQUOT)) {
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
		}
		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
	}
}

/*
 * Free the resources associated with an rnode.
 */
static void
r4inactive(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	char *contents;
	int size;
	vsecattr_t *vsp;
	vnode_t *xattr;

	r4flushpages(rp, cr);

	vp = RTOV4(rp);

	/*
	 * Free any held caches which may be
	 * associated with this rnode.
	 */
	mutex_enter(&rp->r_statelock);
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	vsp = rp->r_secattr;
	rp->r_secattr = NULL;
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Free the access cache entries.
	 */
	(void) nfs4_access_purge_rp(rp);

	/*
	 * Free the readdir cache entries.
	 */
	nfs4_purge_rddir_cache(vp);

	/*
	 * Free the symbolic link cache.
	 */
	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	/*
	 * Free any cached ACL.
	 */
	if (vsp != NULL)
		nfs4_acl_free_cache(vsp);

	/*
	 * Release the cached xattr_dir
	 */
	if (xattr != NULL)
		VN_RELE(xattr);
}

/*
 * We have seen a case where the fh passed in is for "." which
 * should be a VROOT node; however, the fh is different from the
 * root fh stored in the mntinfo4_t.  The invalid fh might be
 * from a misbehaved server and will panic the client system at
 * a later time.  To avoid the panic, we drop the bad fh, use
 * the root fh from mntinfo4_t, and print an error message
 * for attention.
 */
nfs4_sharedfh_t *
badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
    int *wasbad)
{
	char *s;

	*wasbad = 0;
	s = fn_name(nm);
	ASSERT(strcmp(s, "..") != 0);

	if ((s[0] == '.' && s[1] == '\0') && fh &&
	    !SFH4_SAME(mi->mi_rootfh, fh)) {
#ifdef DEBUG
		nfs4_fhandle_t fhandle;

		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
		    "Server %s returns a different "
		    "root filehandle for the path %s:",
		    mi->mi_curr_serv->sv_hostname,
		    mi->mi_curr_serv->sv_path);

		/* print the bad fh */
		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
		    fhandle.fh_len);
		nfs4_printfhandle(&fhandle);

		/* print mi_rootfh */
		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
		    fhandle.fh_len);
		nfs4_printfhandle(&fhandle);
#endif
		/* use mi_rootfh instead; fh will be rele by the caller */
		fh = mi->mi_rootfh;
		*wasbad = 1;
	}

	kmem_free(s, MAXNAMELEN);
	return (fh);
}

void
r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
    hrtime_t t, cred_t *cr, int index)
{
	vattr_t *attr;
	/*
	 * Don't add to the attrcache if the time values overflowed.  There
	 * is no need to check for that here because either attr is null or
	 * the time values in it were processed by nfs4_time_ntov(), which
	 * checks for time overflows.
	 */
	attr = garp ? &garp->n4g_va : NULL;

	if (attr) {
		if (!newnode) {
			rw_exit(&rtable4[index].r_lock);
#ifdef DEBUG
			if (vp->v_type != attr->va_type &&
			    vp->v_type != VNON && attr->va_type != VNON) {
				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
				    "makenfs4node: type (%d) doesn't "
				    "match type of found node at %p (%d)",
				    attr->va_type, (void *)vp, vp->v_type);
			}
#endif
			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
		} else {
			rnode4_t *rp = VTOR4(vp);

			vp->v_type = attr->va_type;
			vp->v_rdev = attr->va_rdev;

			/*
			 * Turn this object into a "stub" object if we
			 * crossed an underlying server fs boundary.  To
			 * make this check, during mount we save the
			 * fsid of the server object being mounted.
			 * Here we compare this object's server fsid
			 * with the fsid we saved at mount.  If they
			 * are different, we crossed a server fs boundary.
			 *
			 * The stub flag is set (or not) at rnode
			 * creation time and it never changes for the life
			 * of the rnode.
			 *
			 * We don't bother with taking r_state_lock
			 * to set R4SRVSTUB flag because this is a new
			 * rnode and we're holding rtable lock.  No other
			 * thread could have obtained access to this
			 * rnode.
			 */
			if (garp->n4g_fsid_valid) {
				rp->r_srv_fsid = garp->n4g_fsid;

				if (vp->v_type == VDIR) {
					servinfo4_t *svp = rp->r_server;

					(void) nfs_rw_enter_sig(&svp->sv_lock,
					    RW_READER, 0);
					if (!FATTR4_FSID_EQ(&garp->n4g_fsid,
					    &svp->sv_fsid)) {
						rp->r_flags |= R4SRVSTUB;
					}
					nfs_rw_exit(&svp->sv_lock);
				}
			}

			/* Can not cache partial attr */
			if (attr->va_mask == AT_ALL)
				nfs4_attrcache_noinval(vp, garp, t);
			else
				PURGE_ATTRCACHE4(vp);

			rw_exit(&rtable4[index].r_lock);
		}
	} else {
		if (newnode) {
			PURGE_ATTRCACHE4(vp);
		}
		rw_exit(&rtable4[index].r_lock);
	}
}

/*
 * Find or create an rnode based primarily on filehandle.  To be
 * used when dvp (vnode for parent directory) is not available;
 * otherwise, makenfs4node() should be used.
 *
 * The nfs4_fname_t argument *npp is consumed and nulled out.
 */

vnode_t *
makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
    nfs4_fname_t **npp, nfs4_ga_res_t *garp,
    mntinfo4_t *mi, cred_t *cr, hrtime_t t)
{
	vfs_t *vfsp = mi->mi_vfsp;
	int newnode = 0;
	vnode_t *vp;
	rnode4_t *rp;
	svnode_t *svp;
	nfs4_fname_t *name;
	int index;

	ASSERT(npp && *npp);
	name = *npp;
	*npp = NULL;

	index = rtable4hash(sfh);
	rw_enter(&rtable4[index].r_lock, RW_READER);

	rp = r4find(&rtable4[index], sfh, vfsp);
	if (rp != NULL) {
		rw_exit(&rtable4[index].r_lock);
		vp = RTOV4(rp);
		fn_rele(&name);
		return (vp);
	}

	vp = make_rnode4(sfh, &rtable4[index], vfsp,
	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
	if (newnode) {
		svp = vtosv(vp);
		svp->sv_forw = svp->sv_back = svp;
		svp->sv_name = name;
		if (psfh != NULL)
			sfh4_hold(psfh);
		svp->sv_dfh = psfh;
	} else {
		fn_rele(&name);
	}

	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
	r4_do_attrcache(vp, garp, newnode, t, cr, index);
	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);

	return (vp);
}

/*
 * Find or create a vnode for the given filehandle, filesystem, parent, and
 * name.  The reference to nm is consumed, so the caller must first do an
 * fn_hold() if it wants to continue using nm after this call.
 */
vnode_t *
makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
    hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
{
	vnode_t *vp;
	int newnode;
	int index;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	int had_badfh = 0;
	rnode4_t *rp;

	ASSERT(dvp != NULL);

	fh = badrootfh_check(fh, nm, mi, &had_badfh);

	index = rtable4hash(fh);
	rw_enter(&rtable4[index].r_lock, RW_READER);

	/*
	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
	 */
	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
	    nfs4_putapage, &newnode, cr);

	rp = VTOR4(vp);
	sv_activate(&vp, dvp, &nm, newnode);
	if (dvp->v_flag & V_XATTRDIR) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4ISXATTR;
		mutex_exit(&rp->r_statelock);
	}

	/* if getting a bad file handle, do not cache the attributes. */
	if (had_badfh) {
		rw_exit(&rtable4[index].r_lock);
		return (vp);
	}

	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
	r4_do_attrcache(vp, garp, newnode, t, cr, index);
	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);

	return (vp);
}
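
/*
 * Both makenfs4node_by_fh() and makenfs4node() follow the same locking
 * pattern: the hash bucket lock is taken as a reader, make_rnode4() may
 * drop that lock and re-acquire it as a writer while creating a new rnode,
 * and r4_do_attrcache() is responsible for dropping it on every path
 * (hence the ASSERTs surrounding those calls above).
 */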

/*
 * Hash on address of filehandle object.
 * XXX totally untuned.
 */

int
rtable4hash(nfs4_sharedfh_t *fh)
{
	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
}

/*
 * Find or create the vnode for the given filehandle and filesystem.
 * *newnode is set to zero if the vnode already existed; non-zero if it had
 * to be created.
 *
 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
 */

static vnode_t *
make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int *newnode, cred_t *cr)
{
	rnode4_t *rp;
	rnode4_t *trp;
	vnode_t *vp;
	mntinfo4_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI4(vfsp);

start:
	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(rp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	mutex_enter(&rp4freelist_lock);

	if (rp4freelist != NULL && rnode4_new >= nrnode) {
		rp = rp4freelist;
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);

		vp = RTOV4(rp);

		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		r4inactive(rp, cr);

		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);

		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		uninit_rnode4(rp);

		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rp4freelist_lock);

		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		atomic_add_long((ulong_t *)&rnode4_new, 1);
#ifdef DEBUG
		clstat4_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
	rp->created_v4 = 0;
	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
	    offsetof(nfs4_open_stream_t, os_node));
	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_flags = R4READDIRWATTR;
	rp->r_fh = fh;
	rp->r_hashq = rhtp;
	sfh4_hold(rp->r_fh);
	rp->r_server = mi->mi_curr_serv;
	rp->r_deleg_type = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);

	rddir4_cache_create(rp);
	rp->r_putapage = putapage;
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	VFS_HOLD(vfsp);
	vp->v_type = VNON;
	if (isrootfh(fh, rp))
		vp->v_flag = VROOT;
	vn_exists(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(trp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp4_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	rp4_addhash(rp);
	*newnode = 1;
	return (vp);
}

static void
uninit_rnode4(rnode4_t *rp)
{
	vnode_t *vp = RTOV4(rp);

	ASSERT(rp != NULL);
	ASSERT(vp != NULL);
	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_mapcnt == 0);
	if (rp->r_flags & R4LODANGLERS) {
		nfs4_flush_lock_owners(rp);
	}
	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
	ASSERT(!(rp->r_flags & R4HASHED));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	nfs4_clear_open_streams(rp);
	list_destroy(&rp->r_open_streams);

	/*
	 * Destroy the rddir cache first since we need to grab the r_statelock.
	 */
	mutex_enter(&rp->r_statelock);
	rddir4_cache_destroy(rp);
	mutex_exit(&rp->r_statelock);
	sv_uninit(&rp->r_svnode);
	sfh4_rele(&rp->r_fh);
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	mutex_destroy(&rp->r_statev4_lock);
	mutex_destroy(&rp->r_os_lock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	nfs_rw_destroy(&rp->r_deleg_recall_lock);
	if (rp->r_flags & R4DELMAPLIST)
		list_destroy(&rp->r_indelmap);
}

/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are no more references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		if (nfs_zone() != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there are no pages.
	 * If there are, flush and commit all of the dirty and
	 * uncommitted pages, assuming the file system isn't read only.
	 */
	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
		rw_exit(&rp->r_hashq->r_lock);
		r4flushpages(rp, cr);
		goto again;
	}

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached; the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.
	 */
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rp4freelist_lock);
	if (rp4freelist == NULL) {
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rp4freelist = rp;
	} else {
		rp->r_freef = rp4freelist;
		rp->r_freeb = rp4freelist->r_freeb;
		rp4freelist->r_freeb->r_freef = rp;
		rp4freelist->r_freeb = rp;
		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
			rp4freelist = rp;
	}
	mutex_exit(&rp4freelist_lock);

	rw_exit(&rp->r_hashq->r_lock);

	if (xattr)
		VN_RELE(xattr);
}

/*
 * Remove an rnode from the free list.
 *
 * The caller must be holding rp4freelist_lock and the rnode
 * must be on the freelist.
 */
static void
rp4_rmfree(rnode4_t *rp)
{

	ASSERT(MUTEX_HELD(&rp4freelist_lock));
	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);

	if (rp == rp4freelist) {
		rp4freelist = rp->r_freef;
		if (rp == rp4freelist)
			rp4freelist = NULL;
	}
	rp->r_freeb->r_freef = rp->r_freef;
	rp->r_freef->r_freeb = rp->r_freeb;

	rp->r_freef = rp->r_freeb = NULL;
}

/*
 * Put an rnode in the hash table.
 *
 * The caller must be holding the exclusive hash queue lock
 */
void
rp4_addhash(rnode4_t *rp)
{
	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(!(rp->r_flags & R4HASHED));

#ifdef DEBUG
	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
#endif

	rp->r_hashf = rp->r_hashq->r_hashf;
	rp->r_hashq->r_hashf = rp;
	rp->r_hashb = (rnode4_t *)rp->r_hashq;
	rp->r_hashf->r_hashb = rp;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= R4HASHED;
	mutex_exit(&rp->r_statelock);
}

/*
 * Remove an rnode from the hash table.
 *
 * The caller must be holding the hash queue lock.
 */
void
rp4_rmhash_locked(rnode4_t *rp)
{
	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(rp->r_flags & R4HASHED);

	rp->r_hashb->r_hashf = rp->r_hashf;
	rp->r_hashf->r_hashb = rp->r_hashb;

	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4HASHED;
	mutex_exit(&rp->r_statelock);
}

/*
 * Remove an rnode from the hash table.
 *
 * The caller must not be holding the hash queue lock.
 */
void
rp4_rmhash(rnode4_t *rp)
{

	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
	rp4_rmhash_locked(rp);
	rw_exit(&rp->r_hashq->r_lock);
}

/*
 * Look up an rnode by fhandle.  Ignores rnodes that had failed recovery.
 * Returns NULL if no match.  If an rnode is returned, the reference count
 * on the master vnode is incremented.
 *
 * The caller must be holding the hash queue lock, either shared or exclusive.
 */
rnode4_t *
r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
	rnode4_t *rp;
	vnode_t *vp;

	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));

	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
		vp = RTOV4(rp);
		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {

			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & R4RECOVERR) {
				mutex_exit(&rp->r_statelock);
				continue;
			}
			mutex_exit(&rp->r_statelock);
#ifdef DEBUG
			r4_dup_check(rp, vfsp);
#endif
			if (rp->r_freef != NULL) {
				mutex_enter(&rp4freelist_lock);
				/*
				 * If the rnode is on the freelist,
				 * then remove it and use that reference
				 * as the new reference.  Otherwise,
				 * we need to increment the reference count.
				 */
				if (rp->r_freef != NULL) {
					rp4_rmfree(rp);
					mutex_exit(&rp4freelist_lock);
				} else {
					mutex_exit(&rp4freelist_lock);
					VN_HOLD(vp);
				}
			} else
				VN_HOLD(vp);

			/*
			 * if root vnode, set v_flag to indicate that
			 */
			if (isrootfh(fh, rp)) {
				if (!(vp->v_flag & VROOT)) {
					mutex_enter(&vp->v_lock);
					vp->v_flag |= VROOT;
					mutex_exit(&vp->v_lock);
				}
			}
			return (rp);
		}
	}
	return (NULL);
}

/*
 * Look up an rnode by fhandle.  Just a wrapper for r4find()
 * that assumes the caller hasn't already got the lock
 * on the hash bucket.
 */
rnode4_t *
r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
	rnode4_t *rp;
	int index;

	index = rtable4hash(fh);
	rw_enter(&rtable4[index].r_lock, RW_READER);
	rp = r4find(&rtable4[index], fh, vfsp);
	rw_exit(&rtable4[index].r_lock);

	return (rp);
}

/*
 * Return 1 if there is an active vnode belonging to this vfs in the
 * rtable4 cache.
 *
 * Several of these checks are done without holding the usual
 * locks.  This is safe because destroy_rtable4(), rp4_addfree(),
 * etc. will redo the necessary checks before actually destroying
 * any rnodes.
 */
int
check_rtable4(struct vfs *vfsp)
{
	rnode4_t *rp;
	vnode_t *vp;
	char *busy = NULL;
	int index;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);

		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			vp = RTOV4(rp);
			if (vp->v_vfsp == vfsp) {
				if (rp->r_freef == NULL) {
					busy = "not on free list";
				} else if (nfs4_has_pages(vp) &&
				    (rp->r_flags & R4DIRTY)) {
					busy = "dirty pages";
				} else if (rp->r_count > 0) {
					busy = "r_count > 0";
				}

				if (busy != NULL) {
#ifdef DEBUG
					char *path;

					path = fn_path(rp->r_svnode.sv_name);
					NFS4_DEBUG(nfs4_rnode_debug,
					    (CE_NOTE, "check_rtable4: " "%s %s",
					    path, busy));
					kmem_free(path, strlen(path)+1);
#endif
					rw_exit(&rtable4[index].r_lock);
					return (1);
				}
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (0);
}

/*
 * Destroy inactive vnodes from the hash queues which
 * belong to this vfs.  All of the vnodes should be inactive.
 * It is essential that we destroy all rnodes in case of
 * forced unmount as well as in normal unmount case.
 */
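
/*
 * Note that the rnodes are collected onto a private list (rlist) while
 * the hash bucket lock is held and are only handed to rp4_addfree() after
 * that lock has been dropped; rp4_addfree() re-acquires the rnode's hash
 * bucket lock itself, so it cannot be called from inside the walk.
 */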

void
destroy_rtable4(struct vfs *vfsp, cred_t *cr)
{
	int index;
	vnode_t *vp;
	rnode4_t *rp, *r_hashf, *rlist;

	rlist = NULL;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_WRITER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = r_hashf) {
			/* save the hash pointer before destroying */
			r_hashf = rp->r_hashf;

			vp = RTOV4(rp);
			if (vp->v_vfsp == vfsp) {
				mutex_enter(&rp4freelist_lock);
				if (rp->r_freef != NULL) {
					rp4_rmfree(rp);
					mutex_exit(&rp4freelist_lock);
					rp4_rmhash_locked(rp);
					rp->r_hashf = rlist;
					rlist = rp;
				} else
					mutex_exit(&rp4freelist_lock);
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}

	for (rp = rlist; rp != NULL; rp = r_hashf) {
		r_hashf = rp->r_hashf;
		/*
		 * This call to rp4_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp4_addfree(rp, cr);
	}
}

/*
 * This routine destroys all the resources of an rnode
 * and finally the rnode itself.
 */
static void
destroy_rnode4(rnode4_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);

	vp = RTOV4(rp);
	vfsp = vp->v_vfsp;

	uninit_rnode4(rp);
	atomic_add_long((ulong_t *)&rnode4_new, -1);
#ifdef DEBUG
	clstat4_debug.nrnode.value.ui64--;
#endif
	kmem_cache_free(rnode4_cache, rp);
	vn_invalid(vp);
	vn_free(vp);
	VFS_RELE(vfsp);
}

/*
 * Invalidate the attributes on all rnodes forcing the next getattr
 * to go over the wire.  Used to flush stale uid and gid mappings.
 * May be done on a per-vfsp basis, or on all rnodes (vfsp == NULL).
 */
void
nfs4_rnode_invalidate(struct vfs *vfsp)
{
	int index;
	rnode4_t *rp;
	vnode_t *vp;

	/*
	 * Walk the hash queues looking for rnodes.
	 */
	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {
			vp = RTOV4(rp);
			if (vfsp != NULL && vp->v_vfsp != vfsp)
				continue;

			if (!mutex_tryenter(&rp->r_statelock))
				continue;

			/*
			 * Expire the attributes by resetting the change
			 * and attr timeout.
			 */
			rp->r_change = 0;
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
		}
		rw_exit(&rtable4[index].r_lock);
	}
}

/*
 * Flush all vnodes in this (or every) vfs.
 * Used by nfs_sync and by nfs_unmount.
 */
void
r4flush(struct vfs *vfsp, cred_t *cr)
{
	int index;
	rnode4_t *rp;
	vnode_t *vp, **vplist;
	long num, cnt;

	/*
	 * Check to see whether there is anything to do.
	 */
	num = rnode4_new;
	if (num == 0)
		return;

	/*
	 * Allocate a slot for all currently active rnodes on the
	 * supposition that they all may need flushing.
	 */
	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
	cnt = 0;

	/*
	 * Walk the hash queues looking for rnodes with page
	 * lists associated with them.  Make a list of these
	 * files.
	 */
	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {
			vp = RTOV4(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If flushing all mounted file systems or
			 * the vnode belongs to this vfs, has pages
			 * and is marked as either dirty or mmap'd,
			 * hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
			    nfs4_has_pages(vp) &&
			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					rw_exit(&rtable4[index].r_lock);
					goto toomany;
				}
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
toomany:

	/*
	 * Flush and release all of the files on the list.
	 */
	while (cnt-- > 0) {
		vp = vplist[cnt];
		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr);
		VN_RELE(vp);
	}

	/*
	 * Free the space allocated to hold the list.
	 */
	kmem_free(vplist, num * sizeof (*vplist));
}

int
nfs4_free_data_reclaim(rnode4_t *rp)
{
	char *contents;
	vnode_t *xattr;
	int size;
	vsecattr_t *vsp;
	int freed;
	bool_t rdc = FALSE;

	/*
	 * Free any held caches which may
	 * be associated with this rnode.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_dir != NULL)
		rdc = TRUE;
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	vsp = rp->r_secattr;
	rp->r_secattr = NULL;
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Free the access cache entries.
	 */
	freed = nfs4_access_purge_rp(rp);

	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
		return (freed);

	/*
	 * Free the readdir cache entries, incompletely if we can't block.
	 */
	nfs4_purge_rddir_cache(RTOV4(rp));

	/*
	 * Free the symbolic link cache.
	 */
	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	/*
	 * Free any cached ACL.
	 */
	if (vsp != NULL)
		nfs4_acl_free_cache(vsp);

	/*
	 * Release the xattr directory vnode
	 */
	if (xattr != NULL)
		VN_RELE(xattr);

	return (1);
}

static int
nfs4_active_data_reclaim(rnode4_t *rp)
{
	char *contents;
	vnode_t *xattr;
	int size;
	vsecattr_t *vsp;
	int freed;
	bool_t rdc = FALSE;

	/*
	 * Free any held credentials and caches which
	 * may be associated with this rnode.
	 */
	if (!mutex_tryenter(&rp->r_statelock))
		return (0);
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	vsp = rp->r_secattr;
	rp->r_secattr = NULL;
	if (rp->r_dir != NULL)
		rdc = TRUE;
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Free the access cache entries.
	 */
	freed = nfs4_access_purge_rp(rp);

	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
		return (freed);

	/*
	 * Free the symbolic link cache.
	 */
	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	/*
	 * Free any cached ACL.
	 */
	if (vsp != NULL)
		nfs4_acl_free_cache(vsp);

	nfs4_purge_rddir_cache(RTOV4(rp));

	/*
	 * Release the xattr directory vnode
	 */
	if (xattr != NULL)
		VN_RELE(xattr);

	return (1);
}

static int
nfs4_free_reclaim(void)
{
	int freed;
	rnode4_t *rp;

#ifdef DEBUG
	clstat4_debug.f_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rp4freelist_lock);
	rp = rp4freelist;
	if (rp != NULL) {
		do {
			if (nfs4_free_data_reclaim(rp))
				freed = 1;
		} while ((rp = rp->r_freef) != rp4freelist);
	}
	mutex_exit(&rp4freelist_lock);
	return (freed);
}

static int
nfs4_active_reclaim(void)
{
	int freed;
	int index;
	rnode4_t *rp;

#ifdef DEBUG
	clstat4_debug.a_reclaim.value.ui64++;
#endif
	freed = 0;
	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {
			if (nfs4_active_data_reclaim(rp))
				freed = 1;
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (freed);
}

static int
nfs4_rnode_reclaim(void)
{
	int freed;
	rnode4_t *rp;
	vnode_t *vp;

#ifdef DEBUG
	clstat4_debug.r_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rp4freelist_lock);
	while ((rp = rp4freelist) != NULL) {
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);
		if (rp->r_flags & R4HASHED) {
			vp = RTOV4(rp);
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				mutex_enter(&rp4freelist_lock);
				continue;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}
		/*
		 * This call to rp4_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp4_addfree(rp, CRED());
		mutex_enter(&rp4freelist_lock);
	}
	mutex_exit(&rp4freelist_lock);
	return (freed);
}

/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	if (nfs4_free_reclaim())
		return;

	if (nfs4_active_reclaim())
		return;

	(void) nfs4_rnode_reclaim();
}

/*
 * Returns the clientid4 to use for the given mntinfo4.  Note that the
 * clientid can change if the caller drops mi_recovlock.
 */

clientid4
mi2clientid(mntinfo4_t *mi)
{
	nfs4_server_t *sp;
	clientid4 clientid = 0;

	/* this locks down sp if it is found */
	sp = find_nfs4_server(mi);
	if (sp != NULL) {
		clientid = sp->clientid;
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	}
	return (clientid);
}

/*
 * Return the current lease time for the server associated with the given
 * file.  Note that the lease time could change immediately after this
 * call.
 */

time_t
r2lease_time(rnode4_t *rp)
{
	nfs4_server_t *sp;
	time_t lease_time;
	mntinfo4_t *mi = VTOMI4(RTOV4(rp));

	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);

	/* this locks down sp if it is found */
	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));

	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
		if (sp != NULL) {
			mutex_exit(&sp->s_lock);
			nfs4_server_rele(sp);
		}
		nfs_rw_exit(&mi->mi_recovlock);
		return (1);		/* 1 second */
	}

	ASSERT(sp != NULL);

	lease_time = sp->s_lease_time;

	mutex_exit(&sp->s_lock);
	nfs4_server_rele(sp);
	nfs_rw_exit(&mi->mi_recovlock);

	return (lease_time);
}

/*
 * Return a list with information about all the known open instances for
 * a filesystem.  The caller must call r4releopenlist() when done with the
 * list.
 *
 * It is safe to look at os_valid and os_pending_close across dropping
 * the 'os_sync_lock' to count up the number of open streams and then
 * allocate memory for the osp list due to:
 *	-Looking at os_pending_close is safe since this routine is
 *	only called via recovery, and os_pending_close can only be set via
 *	a non-recovery operation (and those are all blocked when recovery
 *	is active).
 *
 *	-Examining os_valid is safe since non-recovery operations, which
 *	could potentially switch os_valid to 0, are blocked (via
 *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
 *	(which means we are the only recovery thread potentially acting
 *	on this open stream).
 */

nfs4_opinst_t *
r4mkopenlist(mntinfo4_t *mi)
{
	nfs4_opinst_t *reopenlist, *rep;
	rnode4_t *rp;
	vnode_t *vp;
	vfs_t *vfsp = mi->mi_vfsp;
	int numosp;
	nfs4_open_stream_t *osp;
	int index;
	open_delegation_type4 dtype;
	int hold_vnode;

	reopenlist = NULL;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			vp = RTOV4(rp);
			if (vp->v_vfsp != vfsp)
				continue;
			hold_vnode = 0;

			mutex_enter(&rp->r_os_lock);

			/* Count the number of valid open_streams of the file */
			numosp = 0;
			for (osp = list_head(&rp->r_open_streams); osp != NULL;
			    osp = list_next(&rp->r_open_streams, osp)) {
				mutex_enter(&osp->os_sync_lock);
				if (osp->os_valid && !osp->os_pending_close)
					numosp++;
				mutex_exit(&osp->os_sync_lock);
			}

			/* Fill in the valid open streams per vp */
			if (numosp > 0) {
				int j;

				hold_vnode = 1;

				/*
				 * Add a new open instance to the list
				 */
				rep = kmem_zalloc(sizeof (*reopenlist),
				    KM_SLEEP);
				rep->re_next = reopenlist;
				reopenlist = rep;

				rep->re_vp = vp;
				rep->re_osp = kmem_zalloc(
				    numosp * sizeof (*(rep->re_osp)),
				    KM_SLEEP);
				rep->re_numosp = numosp;

				j = 0;
				for (osp = list_head(&rp->r_open_streams);
				    osp != NULL;
				    osp = list_next(&rp->r_open_streams, osp)) {

					mutex_enter(&osp->os_sync_lock);
					if (osp->os_valid &&
					    !osp->os_pending_close) {
						osp->os_ref_count++;
						rep->re_osp[j] = osp;
						j++;
					}
					mutex_exit(&osp->os_sync_lock);
				}
				/*
				 * Assume the valid osp(s) stay valid between
				 * the time we computed numosp and filled in j.
				 */
				ASSERT(j == numosp);
			}

			mutex_exit(&rp->r_os_lock);
			/* do this here to keep v_lock > r_os_lock */
			if (hold_vnode)
				VN_HOLD(vp);
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
				/*
				 * If this rnode holds a delegation,
				 * but if there are no valid open streams,
				 * then just discard the delegation
				 * without doing delegreturn.
				 */
				if (numosp > 0)
					rp->r_deleg_needs_recovery =
					    rp->r_deleg_type;
			}
			/* Save the delegation type for use outside the lock */
			dtype = rp->r_deleg_type;
			mutex_exit(&rp->r_statev4_lock);

			/*
			 * If we have a delegation then get rid of it.
			 * We've set rp->r_deleg_needs_recovery so we have
			 * enough information to recover.
			 */
			if (dtype != OPEN_DELEGATE_NONE) {
				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (reopenlist);
}

/*
 * Release the list of open instance references.
 */

void
r4releopenlist(nfs4_opinst_t *reopenp)
{
	nfs4_opinst_t *rep, *next;
	int i;

	for (rep = reopenp; rep; rep = next) {
		next = rep->re_next;

		for (i = 0; i < rep->re_numosp; i++)
			open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));

		VN_RELE(rep->re_vp);
		kmem_free(rep->re_osp,
		    rep->re_numosp * sizeof (*(rep->re_osp)));

		kmem_free(rep, sizeof (*rep));
	}
}

int
nfs4_rnode_init(void)
{
	ulong_t nrnode4_max;
	int i;

	/*
	 * Compute the size of the rnode4 hash table
	 */
	if (nrnode <= 0)
		nrnode = ncsize;
	nrnode4_max =
	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
		    "setting nrnode to max value of %ld", nrnode4_max);
		nrnode = nrnode4_max;
	}
	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
	rtable4mask = rtable4size - 1;

	/*
	 * Allocate and initialize the hash buckets
	 */
	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
	for (i = 0; i < rtable4size; i++) {
		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
	}

	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);

	return (0);
}

int
nfs4_rnode_fini(void)
{
	int i;

	/*
	 * Deallocate the rnode hash queues
	 */
	kmem_cache_destroy(rnode4_cache);

	for (i = 0; i < rtable4size; i++)
		rw_destroy(&rtable4[i].r_lock);

	kmem_free(rtable4, rtable4size * sizeof (*rtable4));

	return (0);
}

/*
 * Return non-zero if the given filehandle refers to the root filehandle
 * for the given rnode.
 */

static int
isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
{
	int isroot;

	isroot = 0;
	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
		isroot = 1;

	return (isroot);
}

#ifdef DEBUG

/*
 * Look in the rnode table for other rnodes that have the same filehandle.
 * Assume the lock is held for the hash chain of checkrp
 */

static void
r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
{
	rnode4_t *rp;
	vnode_t *tvp;
	nfs4_fhandle_t fh, fh2;
	int index;

	if (!r4_check_for_dups)
		return;

	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));

	sfh4_copyval(checkrp->r_fh, &fh);

	for (index = 0; index < rtable4size; index++) {

		if (&rtable4[index] != checkrp->r_hashq)
			rw_enter(&rtable4[index].r_lock, RW_READER);

		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			if (rp == checkrp)
				continue;

			tvp = RTOV4(rp);
			if (tvp->v_vfsp != vfsp)
				continue;

			sfh4_copyval(rp->r_fh, &fh2);
			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
				cmn_err(CE_PANIC, "rnodes with same fs, fh "
				    "(%p, %p)", (void *)checkrp, (void *)rp);
			}
		}

		if (&rtable4[index] != checkrp->r_hashq)
			rw_exit(&rtable4[index].r_lock);
	}
}

#endif /* DEBUG */