/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All Rights Reserved
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/flock.h>
#include <sys/callb.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/rpcsec_gss.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information that can be reused
 * about the file.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock -> r_statelock
 */
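
/*
 * Illustrative sketch (not compiled code): to move an rnode between a
 * hash bucket and the freelist, a thread is expected to take the locks
 * in the order documented above, for example:
 *
 *	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
 *	mutex_enter(&rp4freelist_lock);
 *	... freelist/hash queue manipulation ...
 *	mutex_exit(&rp4freelist_lock);
 *	rw_exit(&rp->r_hashq->r_lock);
 *
 * Acquiring these locks in any other order risks deadlocking against
 * the lookup and reclaim paths below.
 */
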
r4hashq_t *rtable4;

static kmutex_t rp4freelist_lock;
static rnode4_t *rp4freelist = NULL;
static long rnode4_new = 0;
int rtable4size;
static int rtable4mask;
static struct kmem_cache *rnode4_cache;
static int rnode4_hashlen = 4;

static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void	r4_dup_check(rnode4_t *, vfs_t *);
#endif

/*
 * Free the resources associated with an rnode.
 */
static void
r4inactive(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	char *contents;
	int size;
	vsecattr_t *vsp;
	vnode_t *xattr;
	int error;

	/*
	 * Before freeing anything, wait until all asynchronous
	 * activity is done on this rnode.  This will allow all
	 * asynchronous read ahead and write behind i/o's to
	 * finish.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_count > 0)
		cv_wait(&rp->r_cv, &rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	/*
	 * Flush and invalidate all pages associated with the vnode.
	 */
	vp = RTOV4(rp);
	if (nfs4_has_pages(vp)) {
		ASSERT(vp->v_type != VCHR);
		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr);
			if (error && (error == ENOSPC || error == EDQUOT)) {
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
		}
		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
	}

	/*
	 * Free any held caches which may be
	 * associated with this rnode.
	 */
	mutex_enter(&rp->r_statelock);
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	vsp = rp->r_secattr;
	rp->r_secattr = NULL;
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Free the access cache entries.
	 */
	(void) nfs4_access_purge_rp(rp);

	/*
	 * Free the readdir cache entries.
	 */
	nfs4_purge_rddir_cache(vp);

	/*
	 * Free the symbolic link cache.
	 */
	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	/*
	 * Free any cached ACL.
	 */
	if (vsp != NULL)
		nfs4_acl_free_cache(vsp);

	/*
	 * Release the cached xattr_dir.
	 */
	if (xattr != NULL)
		VN_RELE(xattr);
}

/*
 * We have seen a case where the fh passed in is for "." which
 * should be a VROOT node; however, the fh is different from the
 * root fh stored in the mntinfo4_t.  The invalid fh might be
 * from a misbehaved server and will panic the client system at
 * a later time.  To avoid the panic, we drop the bad fh, use
 * the root fh from mntinfo4_t, and print an error message
 * for attention.
 */
nfs4_sharedfh_t *
badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
    int *wasbad)
{
	char *s;

	*wasbad = 0;
	s = fn_name(nm);
	ASSERT(strcmp(s, "..") != 0);

	if ((s[0] == '.' && s[1] == '\0') && fh &&
	    !SFH4_SAME(mi->mi_rootfh, fh)) {
#ifdef DEBUG
		nfs4_fhandle_t fhandle;

		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
		    "Server %s returns a different "
		    "root filehandle for the path %s:",
		    mi->mi_curr_serv->sv_hostname,
		    mi->mi_curr_serv->sv_path);

		/* print the bad fh */
		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
		    fhandle.fh_len);
		nfs4_printfhandle(&fhandle);

		/* print mi_rootfh */
		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
		    fhandle.fh_len);
		nfs4_printfhandle(&fhandle);
#endif
		/* use mi_rootfh instead; fh will be rele'd by the caller */
		fh = mi->mi_rootfh;
		*wasbad = 1;
	}

	kmem_free(s, MAXNAMELEN);
	return (fh);
}

void
r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
    hrtime_t t, cred_t *cr, int index)
{
	vattr_t *attr;
	/*
	 * Don't add to attrcache if time overflow, but
	 * no need to check because either attr is null or the time
	 * values in it were processed by nfs4_time_ntov(), which checks
	 * for time overflows.
	 */
	attr = garp ? &garp->n4g_va : NULL;

	if (attr) {
		if (!newnode) {
			rw_exit(&rtable4[index].r_lock);
#ifdef DEBUG
			if (vp->v_type != attr->va_type &&
			    vp->v_type != VNON && attr->va_type != VNON) {
				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
				    "makenfs4node: type (%d) doesn't "
				    "match type of found node at %p (%d)",
				    attr->va_type, (void *)vp, vp->v_type);
			}
#endif
			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
		} else {
			rnode4_t *rp = VTOR4(vp);

			vp->v_type = attr->va_type;
			vp->v_rdev = attr->va_rdev;

			/*
			 * Turn this object into a "stub" object if we
			 * crossed an underlying server fs boundary.  To
			 * make this check, during mount we save the
			 * fsid of the server object being mounted.
			 * Here we compare this object's server fsid
			 * with the fsid we saved at mount.  If they
			 * are different, we crossed a server fs boundary.
			 *
			 * The stub flag is set (or not) at rnode
			 * creation time and it never changes for the life
			 * of the rnode.
			 *
			 * We don't bother with taking r_state_lock
			 * to set the R4SRVSTUB flag because this is a new
			 * rnode and we're holding the rtable lock.  No other
			 * thread could have obtained access to this
			 * rnode.
			 */
			if (garp->n4g_fsid_valid) {
				rp->r_srv_fsid = garp->n4g_fsid;

				if (vp->v_type == VDIR) {
					servinfo4_t *svp = rp->r_server;

					(void) nfs_rw_enter_sig(&svp->sv_lock,
					    RW_READER, 0);
					if (!FATTR4_FSID_EQ(&garp->n4g_fsid,
					    &svp->sv_fsid)) {
						rp->r_flags |= R4SRVSTUB;
					}
					nfs_rw_exit(&svp->sv_lock);
				}
			}

			/* Cannot cache partial attrs */
			if (attr->va_mask == AT_ALL)
				nfs4_attrcache_noinval(vp, garp, t);
			else
				PURGE_ATTRCACHE4(vp);

			rw_exit(&rtable4[index].r_lock);
		}
	} else {
		if (newnode) {
			PURGE_ATTRCACHE4(vp);
		}
		rw_exit(&rtable4[index].r_lock);
	}
}

/*
 * Find or create an rnode based primarily on filehandle.  To be
 * used when dvp (vnode for parent directory) is not available;
 * otherwise, makenfs4node() should be used.
 *
 * The nfs4_fname_t argument *npp is consumed and nulled out.
 */

vnode_t *
makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
    nfs4_fname_t **npp, nfs4_ga_res_t *garp,
    mntinfo4_t *mi, cred_t *cr, hrtime_t t)
{
	vfs_t *vfsp = mi->mi_vfsp;
	int newnode = 0;
	vnode_t *vp;
	rnode4_t *rp;
	svnode_t *svp;
	nfs4_fname_t *name;
	int index;

	ASSERT(npp && *npp);
	name = *npp;
	*npp = NULL;

	index = rtable4hash(sfh);
	rw_enter(&rtable4[index].r_lock, RW_READER);

	rp = r4find(&rtable4[index], sfh, vfsp);
	if (rp != NULL) {
		rw_exit(&rtable4[index].r_lock);
		vp = RTOV4(rp);
		fn_rele(&name);
		return (vp);
	}

	vp = make_rnode4(sfh, &rtable4[index], vfsp,
	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
	if (newnode) {
		svp = vtosv(vp);
		svp->sv_forw = svp->sv_back = svp;
		svp->sv_name = name;
		if (psfh != NULL)
			sfh4_hold(psfh);
		svp->sv_dfh = psfh;
	} else {
		fn_rele(&name);
	}

	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
	r4_do_attrcache(vp, garp, newnode, t, cr, index);
	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);

	return (vp);
}

/*
 * Find or create a vnode for the given filehandle, filesystem, parent, and
 * name.  The reference to nm is consumed, so the caller must first do an
 * fn_hold() if it wants to continue using nm after this call.
 */
vnode_t *
makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
    hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
{
	vnode_t *vp;
	int newnode;
	int index;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	int had_badfh = 0;
	rnode4_t *rp;

	ASSERT(dvp != NULL);

	fh = badrootfh_check(fh, nm, mi, &had_badfh);

	index = rtable4hash(fh);
	rw_enter(&rtable4[index].r_lock, RW_READER);

	/*
	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
	 */
	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
	    nfs4_putapage, &newnode, cr);

	rp = VTOR4(vp);
	sv_activate(&vp, dvp, &nm, newnode);
	if (dvp->v_flag & V_XATTRDIR) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4ISXATTR;
		mutex_exit(&rp->r_statelock);
	}

	/* if getting a bad file handle, do not cache the attributes. */
	if (had_badfh) {
		rw_exit(&rtable4[index].r_lock);
		return (vp);
	}

	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
	r4_do_attrcache(vp, garp, newnode, t, cr, index);
	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);

	return (vp);
}

/*
 * Hash on address of filehandle object.
 * XXX totally untuned.
 */

int
rtable4hash(nfs4_sharedfh_t *fh)
{
	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
}
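
/*
 * Illustrative sketch (not compiled code): the usual lookup pattern
 * built on rtable4hash() and r4find() is the one implemented by
 * r4find_unlocked() later in this file:
 *
 *	index = rtable4hash(fh);
 *	rw_enter(&rtable4[index].r_lock, RW_READER);
 *	rp = r4find(&rtable4[index], fh, vfsp);
 *	rw_exit(&rtable4[index].r_lock);
 *
 * r4find() returns the rnode with its vnode held, so the caller is
 * responsible for the eventual VN_RELE().
 */
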
/*
 * Find or create the vnode for the given filehandle and filesystem.
 * *newnode is set to zero if the vnode already existed; non-zero if it had
 * to be created.
 *
 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
 */

static vnode_t *
make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int *newnode, cred_t *cr)
{
	rnode4_t *rp;
	rnode4_t *trp;
	vnode_t *vp;
	mntinfo4_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI4(vfsp);

start:
	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(rp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	mutex_enter(&rp4freelist_lock);

	if (rp4freelist != NULL && rnode4_new >= nrnode) {
		rp = rp4freelist;
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);

		vp = RTOV4(rp);

		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		r4inactive(rp, cr);

		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);

		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		uninit_rnode4(rp);

		/*
		 * Make sure that if the rnode is recycled then
		 * the VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rp4freelist_lock);

		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		atomic_add_long((ulong_t *)&rnode4_new, 1);
#ifdef DEBUG
		clstat4_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
	rp->created_v4 = 0;
	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
	    offsetof(nfs4_open_stream_t, os_node));
	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_flags = R4READDIRWATTR;
	rp->r_fh = fh;
	rp->r_hashq = rhtp;
	sfh4_hold(rp->r_fh);
	rp->r_server = mi->mi_curr_serv;
	rp->r_deleg_type = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);

	rddir4_cache_create(rp);
	rp->r_putapage = putapage;
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	VFS_HOLD(vfsp);
	vp->v_type = VNON;
	if (isrootfh(fh, rp))
		vp->v_flag = VROOT;
	vn_exists(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(trp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp4_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	rp4_addhash(rp);
	*newnode = 1;
	return (vp);
}

static void
uninit_rnode4(rnode4_t *rp)
{
	vnode_t *vp = RTOV4(rp);

	ASSERT(rp != NULL);
	ASSERT(vp != NULL);
	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_mapcnt == 0);
	if (rp->r_flags & R4LODANGLERS) {
		nfs4_flush_lock_owners(rp);
	}
	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
	ASSERT(!(rp->r_flags & R4HASHED));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	nfs4_clear_open_streams(rp);
	list_destroy(&rp->r_open_streams);

	/*
	 * Destroy the rddir cache first since we need to grab the r_statelock.
	 */
	mutex_enter(&rp->r_statelock);
	rddir4_cache_destroy(rp);
	mutex_exit(&rp->r_statelock);
	sv_uninit(&rp->r_svnode);
	sfh4_rele(&rp->r_fh);
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	mutex_destroy(&rp->r_statev4_lock);
	mutex_destroy(&rp->r_os_lock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	nfs_rw_destroy(&rp->r_deleg_recall_lock);
	if (rp->r_flags & R4DELMAPLIST)
		list_destroy(&rp->r_indelmap);
}

/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are no more references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		if (curproc->p_zone != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached; the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.
	 */
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rp4freelist_lock);
	if (rp4freelist == NULL) {
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rp4freelist = rp;
	} else {
		rp->r_freef = rp4freelist;
		rp->r_freeb = rp4freelist->r_freeb;
		rp4freelist->r_freeb->r_freef = rp;
		rp4freelist->r_freeb = rp;
		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
		    rp->r_symlink.contents == NULL &&
		    rp->r_secattr == NULL)
			rp4freelist = rp;
	}
	mutex_exit(&rp4freelist_lock);

	rw_exit(&rp->r_hashq->r_lock);

	if (xattr)
		VN_RELE(xattr);
}

/*
 * Remove an rnode from the free list.
 *
 * The caller must be holding rp4freelist_lock and the rnode
 * must be on the freelist.
 */
static void
rp4_rmfree(rnode4_t *rp)
{

	ASSERT(MUTEX_HELD(&rp4freelist_lock));
	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);

	if (rp == rp4freelist) {
		rp4freelist = rp->r_freef;
		if (rp == rp4freelist)
			rp4freelist = NULL;
	}
	rp->r_freeb->r_freef = rp->r_freef;
	rp->r_freef->r_freeb = rp->r_freeb;

	rp->r_freef = rp->r_freeb = NULL;
}

/*
 * Put an rnode in the hash table.
 *
 * The caller must be holding the exclusive hash queue lock.
 */
void
rp4_addhash(rnode4_t *rp)
{
	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(!(rp->r_flags & R4HASHED));

#ifdef DEBUG
	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
#endif

	rp->r_hashf = rp->r_hashq->r_hashf;
	rp->r_hashq->r_hashf = rp;
	rp->r_hashb = (rnode4_t *)rp->r_hashq;
	rp->r_hashf->r_hashb = rp;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= R4HASHED;
	mutex_exit(&rp->r_statelock);
}

/*
 * Remove an rnode from the hash table.
 *
 * The caller must be holding the hash queue lock.
 */
void
rp4_rmhash_locked(rnode4_t *rp)
{
	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(rp->r_flags & R4HASHED);

	rp->r_hashb->r_hashf = rp->r_hashf;
	rp->r_hashf->r_hashb = rp->r_hashb;

	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4HASHED;
	mutex_exit(&rp->r_statelock);
}

/*
 * Remove an rnode from the hash table.
 *
 * The caller must not be holding the hash queue lock.
 */
void
rp4_rmhash(rnode4_t *rp)
{

	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
	rp4_rmhash_locked(rp);
	rw_exit(&rp->r_hashq->r_lock);
}

/*
 * Look up an rnode by fhandle.  Ignores rnodes that had failed recovery.
 * Returns NULL if no match.  If an rnode is returned, the reference count
 * on the master vnode is incremented.
 *
 * The caller must be holding the hash queue lock, either shared or exclusive.
 */
rnode4_t *
r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
	rnode4_t *rp;
	vnode_t *vp;

	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));

	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
		vp = RTOV4(rp);
		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {

			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & R4RECOVERR) {
				mutex_exit(&rp->r_statelock);
				continue;
			}
			mutex_exit(&rp->r_statelock);
#ifdef DEBUG
			r4_dup_check(rp, vfsp);
#endif
			if (rp->r_freef != NULL) {
				mutex_enter(&rp4freelist_lock);
				/*
				 * If the rnode is on the freelist,
				 * then remove it and use that reference
				 * as the new reference.  Otherwise,
				 * we need to increment the reference count.
				 */
				if (rp->r_freef != NULL) {
					rp4_rmfree(rp);
					mutex_exit(&rp4freelist_lock);
				} else {
					mutex_exit(&rp4freelist_lock);
					VN_HOLD(vp);
				}
			} else
				VN_HOLD(vp);

			/*
			 * if root vnode, set v_flag to indicate that
			 */
			if (isrootfh(fh, rp)) {
				if (!(vp->v_flag & VROOT)) {
					mutex_enter(&vp->v_lock);
					vp->v_flag |= VROOT;
					mutex_exit(&vp->v_lock);
				}
			}
			return (rp);
		}
	}
	return (NULL);
}

/*
 * Look up an rnode by fhandle.  Just a wrapper for r4find()
 * that assumes the caller hasn't already got the lock
 * on the hash bucket.
 */
rnode4_t *
r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
	rnode4_t *rp;
	int index;

	index = rtable4hash(fh);
	rw_enter(&rtable4[index].r_lock, RW_READER);
	rp = r4find(&rtable4[index], fh, vfsp);
	rw_exit(&rtable4[index].r_lock);

	return (rp);
}

/*
 * Return 1 if there is an active vnode belonging to this vfs in the
 * rtable4 cache.
 *
 * Several of these checks are done without holding the usual
 * locks.  This is safe because destroy_rtable4(), rp4_addfree(),
 * etc. will redo the necessary checks before actually destroying
 * any rnodes.
 */
int
check_rtable4(struct vfs *vfsp)
{
	rnode4_t *rp;
	vnode_t *vp;
	char *busy = NULL;
	int index;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);

		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			vp = RTOV4(rp);
			if (vp->v_vfsp == vfsp) {
				if (rp->r_freef == NULL) {
					busy = "not on free list";
				} else if (nfs4_has_pages(vp) &&
				    (rp->r_flags & R4DIRTY)) {
					busy = "dirty pages";
				} else if (rp->r_count > 0) {
					busy = "r_count > 0";
				}

				if (busy != NULL) {
#ifdef DEBUG
					char *path;

					path = fn_path(rp->r_svnode.sv_name);
					NFS4_DEBUG(nfs4_rnode_debug,
					    (CE_NOTE, "check_rtable4: " "%s %s",
					    path, busy));
					kmem_free(path, strlen(path)+1);
#endif
					rw_exit(&rtable4[index].r_lock);
					return (1);
				}
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (0);
}

/*
 * Destroy inactive vnodes from the hash queues which
 * belong to this vfs.  All of the vnodes should be inactive.
 * It is essential that we destroy all rnodes in the forced
 * unmount case as well as in the normal unmount case.
 */

void
destroy_rtable4(struct vfs *vfsp, cred_t *cr)
{
	int index;
	vnode_t *vp;
	rnode4_t *rp, *r_hashf, *rlist;

	rlist = NULL;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_WRITER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = r_hashf) {
			/* save the hash pointer before destroying */
			r_hashf = rp->r_hashf;

			vp = RTOV4(rp);
			if (vp->v_vfsp == vfsp) {
				mutex_enter(&rp4freelist_lock);
				if (rp->r_freef != NULL) {
					rp4_rmfree(rp);
					mutex_exit(&rp4freelist_lock);
					rp4_rmhash_locked(rp);
					rp->r_hashf = rlist;
					rlist = rp;
				} else
					mutex_exit(&rp4freelist_lock);
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}

	for (rp = rlist; rp != NULL; rp = r_hashf) {
		r_hashf = rp->r_hashf;
		/*
		 * This call to rp4_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp4_addfree(rp, cr);
	}
}

/*
 * This routine destroys all the resources of an rnode
 * and finally the rnode itself.
 */
static void
destroy_rnode4(rnode4_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);

	vp = RTOV4(rp);
	vfsp = vp->v_vfsp;

	uninit_rnode4(rp);
	atomic_add_long((ulong_t *)&rnode4_new, -1);
#ifdef DEBUG
	clstat4_debug.nrnode.value.ui64--;
#endif
	kmem_cache_free(rnode4_cache, rp);
	vn_invalid(vp);
	vn_free(vp);
	VFS_RELE(vfsp);
}

/*
 * Invalidate the attributes on all rnodes, forcing the next getattr
 * to go over the wire.  Used to flush stale uid and gid mappings.
 * May be done on a per-vfsp basis, or on all rnodes (vfsp == NULL).
 */
void
nfs4_rnode_invalidate(struct vfs *vfsp)
{
	int index;
	rnode4_t *rp;
	vnode_t *vp;

	/*
	 * Walk the hash queues looking for rnodes.
	 */
	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {
			vp = RTOV4(rp);
			if (vfsp != NULL && vp->v_vfsp != vfsp)
				continue;

			if (!mutex_tryenter(&rp->r_statelock))
				continue;

			/*
			 * Expire the attributes by resetting the change
			 * and attr timeout.
			 */
			rp->r_change = 0;
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
		}
		rw_exit(&rtable4[index].r_lock);
	}
}

/*
 * Flush all vnodes in this (or every) vfs.
 * Used by nfs_sync and by nfs_unmount.
 */
void
r4flush(struct vfs *vfsp, cred_t *cr)
{
	int index;
	rnode4_t *rp;
	vnode_t *vp, **vplist;
	long num, cnt;

	/*
	 * Check to see whether there is anything to do.
	 */
	num = rnode4_new;
	if (num == 0)
		return;

	/*
	 * Allocate a slot for all currently active rnodes on the
	 * supposition that they all may need flushing.
	 */
	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
	cnt = 0;

	/*
	 * Walk the hash queues looking for rnodes with page
	 * lists associated with them.  Make a list of these
	 * files.
	 */
	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {
			vp = RTOV4(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of the virtual swap device or
			 * if the VFS is read-only.
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If flushing all mounted file systems or
			 * the vnode belongs to this vfs, has pages
			 * and is marked as either dirty or mmap'd,
			 * hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
			    nfs4_has_pages(vp) &&
			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					rw_exit(&rtable4[index].r_lock);
					goto toomany;
				}
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
toomany:

	/*
	 * Flush and release all of the files on the list.
	 */
	while (cnt-- > 0) {
		vp = vplist[cnt];
		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr);
		VN_RELE(vp);
	}

	/*
	 * Free the space allocated to hold the list.
	 */
	kmem_free(vplist, num * sizeof (*vplist));
}
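
/*
 * Note (added for orientation): the *_reclaim() routines below are the
 * building blocks for nfs4_reclaim(), the reclaim callback registered
 * with rnode4_cache in nfs4_rnode_init().  They run when the kernel
 * memory allocator asks the NFSv4 client to release cached memory.
 */
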
int
nfs4_free_data_reclaim(rnode4_t *rp)
{
	char *contents;
	vnode_t *xattr;
	int size;
	vsecattr_t *vsp;
	int freed;
	bool_t rdc = FALSE;

	/*
	 * Free any held caches which may
	 * be associated with this rnode.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_dir != NULL)
		rdc = TRUE;
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	vsp = rp->r_secattr;
	rp->r_secattr = NULL;
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Free the access cache entries.
	 */
	freed = nfs4_access_purge_rp(rp);

	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
		return (freed);

	/*
	 * Free the readdir cache entries, incompletely if we can't block.
	 */
	nfs4_purge_rddir_cache(RTOV4(rp));

	/*
	 * Free the symbolic link cache.
	 */
	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	/*
	 * Free any cached ACL.
	 */
	if (vsp != NULL)
		nfs4_acl_free_cache(vsp);

	/*
	 * Release the xattr directory vnode.
	 */
	if (xattr != NULL)
		VN_RELE(xattr);

	return (1);
}

static int
nfs4_active_data_reclaim(rnode4_t *rp)
{
	char *contents;
	vnode_t *xattr;
	int size;
	vsecattr_t *vsp;
	int freed;
	bool_t rdc = FALSE;

	/*
	 * Free any held credentials and caches which
	 * may be associated with this rnode.
	 */
	if (!mutex_tryenter(&rp->r_statelock))
		return (0);
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	vsp = rp->r_secattr;
	rp->r_secattr = NULL;
	if (rp->r_dir != NULL)
		rdc = TRUE;
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Free the access cache entries.
	 */
	freed = nfs4_access_purge_rp(rp);

	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
		return (freed);

	/*
	 * Free the symbolic link cache.
	 */
	if (contents != NULL) {

		kmem_free((void *)contents, size);
	}

	/*
	 * Free any cached ACL.
	 */
	if (vsp != NULL)
		nfs4_acl_free_cache(vsp);

	nfs4_purge_rddir_cache(RTOV4(rp));

	/*
	 * Release the xattr directory vnode.
	 */
	if (xattr != NULL)
		VN_RELE(xattr);

	return (1);
}

static int
nfs4_free_reclaim(void)
{
	int freed;
	rnode4_t *rp;

#ifdef DEBUG
	clstat4_debug.f_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rp4freelist_lock);
	rp = rp4freelist;
	if (rp != NULL) {
		do {
			if (nfs4_free_data_reclaim(rp))
				freed = 1;
		} while ((rp = rp->r_freef) != rp4freelist);
	}
	mutex_exit(&rp4freelist_lock);
	return (freed);
}

static int
nfs4_active_reclaim(void)
{
	int freed;
	int index;
	rnode4_t *rp;

#ifdef DEBUG
	clstat4_debug.a_reclaim.value.ui64++;
#endif
	freed = 0;
	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {
			if (nfs4_active_data_reclaim(rp))
				freed = 1;
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (freed);
}

static int
nfs4_rnode_reclaim(void)
{
	int freed;
	rnode4_t *rp;
	vnode_t *vp;

#ifdef DEBUG
	clstat4_debug.r_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rp4freelist_lock);
	while ((rp = rp4freelist) != NULL) {
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);
		if (rp->r_flags & R4HASHED) {
			vp = RTOV4(rp);
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				mutex_enter(&rp4freelist_lock);
				continue;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}
		/*
		 * This call to rp4_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp4_addfree(rp, CRED());
		mutex_enter(&rp4freelist_lock);
	}
	mutex_exit(&rp4freelist_lock);
	return (freed);
}

/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	if (nfs4_free_reclaim())
		return;

	if (nfs4_active_reclaim())
		return;

	(void) nfs4_rnode_reclaim();
}
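
/*
 * Note (added for orientation): nfs4_reclaim() tries the cheapest relief
 * first, dropping cached data hanging off rnodes already on the freelist,
 * then cached data on active rnodes, and only as a last resort tears down
 * whole rnodes from the freelist.
 */
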
/*
 * Returns the clientid4 to use for the given mntinfo4.  Note that the
 * clientid can change if the caller drops mi_recovlock.
 */

clientid4
mi2clientid(mntinfo4_t *mi)
{
	nfs4_server_t	*sp;
	clientid4	clientid = 0;

	/* this locks down sp if it is found */
	sp = find_nfs4_server(mi);
	if (sp != NULL) {
		clientid = sp->clientid;
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	}
	return (clientid);
}

/*
 * Return the current lease time for the server associated with the given
 * file.  Note that the lease time could change immediately after this
 * call.
 */

time_t
r2lease_time(rnode4_t *rp)
{
	nfs4_server_t	*sp;
	time_t		lease_time;
	mntinfo4_t	*mi = VTOMI4(RTOV4(rp));

	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);

	/* this locks down sp if it is found */
	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));

	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
		if (sp != NULL) {
			mutex_exit(&sp->s_lock);
			nfs4_server_rele(sp);
		}
		nfs_rw_exit(&mi->mi_recovlock);
		return (1);		/* 1 second */
	}

	ASSERT(sp != NULL);

	lease_time = sp->s_lease_time;

	mutex_exit(&sp->s_lock);
	nfs4_server_rele(sp);
	nfs_rw_exit(&mi->mi_recovlock);

	return (lease_time);
}

/*
 * Return a list with information about all the known open instances for
 * a filesystem.  The caller must call r4releopenlist() when done with the
 * list.
 *
 * It is safe to look at os_valid and os_pending_close across dropping
 * the 'os_sync_lock' to count up the number of open streams and then
 * allocate memory for the osp list because:
 *	-Looking at os_pending_close is safe since this routine is
 *	 only called via recovery, and os_pending_close can only be set via
 *	 non-recovery operations (which are all blocked when recovery
 *	 is active).
 *
 *	-Examining os_valid is safe since non-recovery operations, which
 *	 could potentially switch os_valid to 0, are blocked (via
 *	 nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
 *	 (which means we are the only recovery thread potentially acting
 *	 on this open stream).
 */

nfs4_opinst_t *
r4mkopenlist(mntinfo4_t *mi)
{
	nfs4_opinst_t *reopenlist, *rep;
	rnode4_t *rp;
	vnode_t *vp;
	vfs_t *vfsp = mi->mi_vfsp;
	int numosp;
	nfs4_open_stream_t *osp;
	int index;
	open_delegation_type4 dtype;
	int hold_vnode;

	reopenlist = NULL;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			vp = RTOV4(rp);
			if (vp->v_vfsp != vfsp)
				continue;
			hold_vnode = 0;

			mutex_enter(&rp->r_os_lock);

			/* Count the number of valid open_streams of the file */
			numosp = 0;
			for (osp = list_head(&rp->r_open_streams); osp != NULL;
			    osp = list_next(&rp->r_open_streams, osp)) {
				mutex_enter(&osp->os_sync_lock);
				if (osp->os_valid && !osp->os_pending_close)
					numosp++;
				mutex_exit(&osp->os_sync_lock);
			}

			/* Fill in the valid open streams per vp */
			if (numosp > 0) {
				int j;

				hold_vnode = 1;

				/*
				 * Add a new open instance to the list
				 */
				rep = kmem_zalloc(sizeof (*reopenlist),
				    KM_SLEEP);
				rep->re_next = reopenlist;
				reopenlist = rep;

				rep->re_vp = vp;
				rep->re_osp = kmem_zalloc(
				    numosp * sizeof (*(rep->re_osp)),
				    KM_SLEEP);
				rep->re_numosp = numosp;

				j = 0;
				for (osp = list_head(&rp->r_open_streams);
				    osp != NULL;
				    osp = list_next(&rp->r_open_streams, osp)) {

					mutex_enter(&osp->os_sync_lock);
					if (osp->os_valid &&
					    !osp->os_pending_close) {
						osp->os_ref_count++;
						rep->re_osp[j] = osp;
						j++;
					}
					mutex_exit(&osp->os_sync_lock);
				}
				/*
				 * Assume the valid osp(s) stay valid between
				 * the time we obtained j and numosp.
				 */
				ASSERT(j == numosp);
			}

			mutex_exit(&rp->r_os_lock);
			/* do this here to keep v_lock > r_os_lock */
			if (hold_vnode)
				VN_HOLD(vp);
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
				/*
				 * If this rnode holds a delegation
				 * but there are no valid open streams,
				 * then just discard the delegation
				 * without doing delegreturn.
				 */
				if (numosp > 0)
					rp->r_deleg_needs_recovery =
					    rp->r_deleg_type;
			}
			/* Save the delegation type for use outside the lock */
			dtype = rp->r_deleg_type;
			mutex_exit(&rp->r_statev4_lock);

			/*
			 * If we have a delegation then get rid of it.
			 * We've set rp->r_deleg_needs_recovery so we have
			 * enough information to recover.
			 */
			if (dtype != OPEN_DELEGATE_NONE) {
				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (reopenlist);
}
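
/*
 * Illustrative sketch (not compiled code) of the intended call pattern
 * for the r4mkopenlist()/r4releopenlist() pair; the actual recovery code
 * that drives it lives elsewhere in the client:
 *
 *	nfs4_opinst_t *list, *rep;
 *
 *	list = r4mkopenlist(mi);
 *	for (rep = list; rep != NULL; rep = rep->re_next) {
 *		... reopen rep->re_osp[0 .. re_numosp - 1] against rep->re_vp ...
 *	}
 *	r4releopenlist(list);
 */
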
/*
 * Release the list of open instance references.
 */

void
r4releopenlist(nfs4_opinst_t *reopenp)
{
	nfs4_opinst_t *rep, *next;
	int i;

	for (rep = reopenp; rep; rep = next) {
		next = rep->re_next;

		for (i = 0; i < rep->re_numosp; i++)
			open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));

		VN_RELE(rep->re_vp);
		kmem_free(rep->re_osp,
		    rep->re_numosp * sizeof (*(rep->re_osp)));

		kmem_free(rep, sizeof (*rep));
	}
}

int
nfs4_rnode_init(void)
{
	ulong_t nrnode4_max;
	int i;

	/*
	 * Compute the size of the rnode4 hash table
	 */
	if (nrnode <= 0)
		nrnode = ncsize;
	nrnode4_max =
	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
		    "setting nrnode to max value of %ld", nrnode4_max);
		nrnode = nrnode4_max;
	}
	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
	rtable4mask = rtable4size - 1;

	/*
	 * Allocate and initialize the hash buckets
	 */
	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
	for (i = 0; i < rtable4size; i++) {
		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
	}

	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);

	return (0);
}

int
nfs4_rnode_fini(void)
{
	int i;

	/*
	 * Deallocate the rnode hash queues
	 */
	kmem_cache_destroy(rnode4_cache);

	for (i = 0; i < rtable4size; i++)
		rw_destroy(&rtable4[i].r_lock);

	kmem_free(rtable4, rtable4size * sizeof (*rtable4));

	return (0);
}

/*
 * Return non-zero if the given filehandle refers to the root filehandle
 * for the given rnode.
 */

static int
isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
{
	int isroot;

	isroot = 0;
	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
		isroot = 1;

	return (isroot);
}

#ifdef DEBUG

/*
 * Look in the rnode table for other rnodes that have the same filehandle.
 * Assume the lock is held for the hash chain of checkrp.
 */

static void
r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
{
	rnode4_t *rp;
	vnode_t *tvp;
	nfs4_fhandle_t fh, fh2;
	int index;

	if (!r4_check_for_dups)
		return;

	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));

	sfh4_copyval(checkrp->r_fh, &fh);

	for (index = 0; index < rtable4size; index++) {

		if (&rtable4[index] != checkrp->r_hashq)
			rw_enter(&rtable4[index].r_lock, RW_READER);

		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			if (rp == checkrp)
				continue;

			tvp = RTOV4(rp);
			if (tvp->v_vfsp != vfsp)
				continue;

			sfh4_copyval(rp->r_fh, &fh2);
			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
				cmn_err(CE_PANIC, "rnodes with same fs, fh "
				    "(%p, %p)", (void *)checkrp, (void *)rp);
			}
		}

		if (&rtable4[index] != checkrp->r_hashq)
			rw_exit(&rtable4[index].r_lock);
	}
}

#endif /* DEBUG */