1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 
28 * All Rights Reserved 29 */ 30 31 32 #include <sys/param.h> 33 #include <sys/types.h> 34 #include <sys/systm.h> 35 #include <sys/cred.h> 36 #include <sys/proc.h> 37 #include <sys/user.h> 38 #include <sys/time.h> 39 #include <sys/buf.h> 40 #include <sys/vfs.h> 41 #include <sys/vnode.h> 42 #include <sys/socket.h> 43 #include <sys/uio.h> 44 #include <sys/tiuser.h> 45 #include <sys/swap.h> 46 #include <sys/errno.h> 47 #include <sys/debug.h> 48 #include <sys/kmem.h> 49 #include <sys/kstat.h> 50 #include <sys/cmn_err.h> 51 #include <sys/vtrace.h> 52 #include <sys/session.h> 53 #include <sys/dnlc.h> 54 #include <sys/bitmap.h> 55 #include <sys/acl.h> 56 #include <sys/ddi.h> 57 #include <sys/pathname.h> 58 #include <sys/flock.h> 59 #include <sys/dirent.h> 60 #include <sys/flock.h> 61 #include <sys/callb.h> 62 63 #include <rpc/types.h> 64 #include <rpc/xdr.h> 65 #include <rpc/auth.h> 66 #include <rpc/rpcsec_gss.h> 67 #include <rpc/clnt.h> 68 69 #include <nfs/nfs.h> 70 #include <nfs/nfs_clnt.h> 71 #include <nfs/nfs_acl.h> 72 73 #include <nfs/nfs4.h> 74 #include <nfs/rnode4.h> 75 #include <nfs/nfs4_clnt.h> 76 77 /* 78 * The hash queues for the access to active and cached rnodes 79 * are organized as doubly linked lists. A reader/writer lock 80 * for each hash bucket is used to control access and to synchronize 81 * lookups, additions, and deletions from the hash queue. 82 * 83 * The rnode freelist is organized as a doubly linked list with 84 * a head pointer. Additions and deletions are synchronized via 85 * a single mutex. 86 * 87 * In order to add an rnode to the free list, it must be hashed into 88 * a hash queue and the exclusive lock to the hash queue be held. 89 * If an rnode is not hashed into a hash queue, then it is destroyed 90 * because it represents no valuable information that can be reused 91 * about the file. 
The exclusive lock to the hash queue must be 92 * held in order to prevent a lookup in the hash queue from finding 93 * the rnode and using it and assuming that the rnode is not on the 94 * freelist. The lookup in the hash queue will have the hash queue 95 * locked, either exclusive or shared. 96 * 97 * The vnode reference count for each rnode is not allowed to drop 98 * below 1. This prevents external entities, such as the VM 99 * subsystem, from acquiring references to vnodes already on the 100 * freelist and then trying to place them back on the freelist 101 * when their reference is released. This means that when an 102 * rnode is looked up in the hash queues, then either the rnode 103 * is removed from the freelist and that reference is transferred to 104 * the new reference or the vnode reference count must be incremented 105 * accordingly. The mutex for the freelist must be held in order to 106 * accurately test to see if the rnode is on the freelist or not. 107 * The hash queue lock might be held shared and it is possible that 108 * two different threads may race to remove the rnode from the 109 * freelist. This race can be resolved by holding the mutex for the 110 * freelist. Please note that the mutex for the freelist does not 111 * need to be held if the rnode is not on the freelist. It can not be 112 * placed on the freelist due to the requirement that the thread 113 * putting the rnode on the freelist must hold the exclusive lock 114 * to the hash queue and the thread doing the lookup in the hash 115 * queue is holding either a shared or exclusive lock to the hash 116 * queue. 
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock -> r_statelock
 */

/*
 * Array of hash buckets.  Each bucket carries its own r_lock and the
 * head (r_hashf) of its chain; rtable4hash() yields the index.
 */
r4hashq_t *rtable4;

/* Mutex taken around every insertion into / removal from rp4freelist. */
static kmutex_t rp4freelist_lock;
/* Head of the circular, doubly linked (r_freef/r_freeb) rnode freelist. */
static rnode4_t *rp4freelist = NULL;
/*
 * Count of rnodes currently allocated from rnode4_cache: bumped in
 * make_rnode4() on a fresh allocation, dropped in destroy_rnode4().
 */
static long rnode4_new = 0;
/* Number of buckets in rtable4; the table walkers loop up to this bound. */
int rtable4size;
/* Mask applied by rtable4hash() to reduce a hash to a bucket index. */
static int rtable4mask;
/* kmem cache that rnode4_t structures are allocated from and freed to. */
static struct kmem_cache *rnode4_cache;
/* NOTE(review): not referenced in this part of the file -- confirm usage. */
static int rnode4_hashlen = 4;

/* Forward declarations for routines defined later in this file. */
static void r4inactive(rnode4_t *, cred_t *);
static vnode_t *make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
    struct vnodeops *,
    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
    cred_t *),
    int *, cred_t *);
static void rp4_rmfree(rnode4_t *);
int nfs4_free_data_reclaim(rnode4_t *);
static int nfs4_active_data_reclaim(rnode4_t *);
static int nfs4_free_reclaim(void);
static int nfs4_active_reclaim(void);
static int nfs4_rnode_reclaim(void);
static void nfs4_reclaim(void *);
static int isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void uninit_rnode4(rnode4_t *);
static void destroy_rnode4(rnode4_t *);
static void r4_stub_set(rnode4_t *, nfs4_stub_type_t);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
/* Gates the NFS4_DEBUG() message emitted by check_rtable4(). */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void r4_dup_check(rnode4_t *, vfs_t *);
#endif

/*
 * If the vnode has pages, run the list and check for any that are
 * still dangling.  We call this routine before putting an rnode on
 * the free list.
164 */ 165 static int 166 nfs4_dross_pages(vnode_t *vp) 167 { 168 page_t *pp; 169 kmutex_t *vphm; 170 171 vphm = page_vnode_mutex(vp); 172 mutex_enter(vphm); 173 if ((pp = vp->v_pages) != NULL) { 174 do { 175 if (pp->p_fsdata != C_NOCOMMIT) { 176 mutex_exit(vphm); 177 return (1); 178 } 179 } while ((pp = pp->p_vpnext) != vp->v_pages); 180 } 181 mutex_exit(vphm); 182 183 return (0); 184 } 185 186 /* 187 * Flush any pages left on this rnode. 188 */ 189 static void 190 r4flushpages(rnode4_t *rp, cred_t *cr) 191 { 192 vnode_t *vp; 193 int error; 194 195 /* 196 * Before freeing anything, wait until all asynchronous 197 * activity is done on this rnode. This will allow all 198 * asynchronous read ahead and write behind i/o's to 199 * finish. 200 */ 201 mutex_enter(&rp->r_statelock); 202 while (rp->r_count > 0) 203 cv_wait(&rp->r_cv, &rp->r_statelock); 204 mutex_exit(&rp->r_statelock); 205 206 /* 207 * Flush and invalidate all pages associated with the vnode. 208 */ 209 vp = RTOV4(rp); 210 if (nfs4_has_pages(vp)) { 211 ASSERT(vp->v_type != VCHR); 212 if ((rp->r_flags & R4DIRTY) && !rp->r_error) { 213 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 214 if (error && (error == ENOSPC || error == EDQUOT)) { 215 mutex_enter(&rp->r_statelock); 216 if (!rp->r_error) 217 rp->r_error = error; 218 mutex_exit(&rp->r_statelock); 219 } 220 } 221 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 222 } 223 } 224 225 /* 226 * Free the resources associated with an rnode. 227 */ 228 static void 229 r4inactive(rnode4_t *rp, cred_t *cr) 230 { 231 vnode_t *vp; 232 char *contents; 233 int size; 234 vsecattr_t *vsp; 235 vnode_t *xattr; 236 237 r4flushpages(rp, cr); 238 239 vp = RTOV4(rp); 240 241 /* 242 * Free any held caches which may be 243 * associated with this rnode. 
244 */ 245 mutex_enter(&rp->r_statelock); 246 contents = rp->r_symlink.contents; 247 size = rp->r_symlink.size; 248 rp->r_symlink.contents = NULL; 249 vsp = rp->r_secattr; 250 rp->r_secattr = NULL; 251 xattr = rp->r_xattr_dir; 252 rp->r_xattr_dir = NULL; 253 mutex_exit(&rp->r_statelock); 254 255 /* 256 * Free the access cache entries. 257 */ 258 (void) nfs4_access_purge_rp(rp); 259 260 /* 261 * Free the readdir cache entries. 262 */ 263 nfs4_purge_rddir_cache(vp); 264 265 /* 266 * Free the symbolic link cache. 267 */ 268 if (contents != NULL) { 269 270 kmem_free((void *)contents, size); 271 } 272 273 /* 274 * Free any cached ACL. 275 */ 276 if (vsp != NULL) 277 nfs4_acl_free_cache(vsp); 278 279 /* 280 * Release the cached xattr_dir 281 */ 282 if (xattr != NULL) 283 VN_RELE(xattr); 284 } 285 286 /* 287 * We have seen a case that the fh passed in is for "." which 288 * should be a VROOT node, however, the fh is different from the 289 * root fh stored in the mntinfo4_t. The invalid fh might be 290 * from a misbehaved server and will panic the client system at 291 * a later time. To avoid the panic, we drop the bad fh, use 292 * the root fh from mntinfo4_t, and print an error message 293 * for attention. 294 */ 295 nfs4_sharedfh_t * 296 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi, 297 int *wasbad) 298 { 299 char *s; 300 301 *wasbad = 0; 302 s = fn_name(nm); 303 ASSERT(strcmp(s, "..") != 0); 304 305 if ((s[0] == '.' 
&& s[1] == '\0') && fh && 306 !SFH4_SAME(mi->mi_rootfh, fh)) { 307 #ifdef DEBUG 308 nfs4_fhandle_t fhandle; 309 310 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 311 "Server %s returns a different " 312 "root filehandle for the path %s:", 313 mi->mi_curr_serv->sv_hostname, 314 mi->mi_curr_serv->sv_path); 315 316 /* print the bad fh */ 317 fhandle.fh_len = fh->sfh_fh.nfs_fh4_len; 318 bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 319 fhandle.fh_len); 320 nfs4_printfhandle(&fhandle); 321 322 /* print mi_rootfh */ 323 fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len; 324 bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 325 fhandle.fh_len); 326 nfs4_printfhandle(&fhandle); 327 #endif 328 /* use mi_rootfh instead; fh will be rele by the caller */ 329 fh = mi->mi_rootfh; 330 *wasbad = 1; 331 } 332 333 kmem_free(s, MAXNAMELEN); 334 return (fh); 335 } 336 337 void 338 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode, 339 hrtime_t t, cred_t *cr, int index) 340 { 341 int is_stub; 342 vattr_t *attr; 343 /* 344 * Don't add to attrcache if time overflow, but 345 * no need to check because either attr is null or the time 346 * values in it were processed by nfs4_time_ntov(), which checks 347 * for time overflows. 348 */ 349 attr = garp ? &garp->n4g_va : NULL; 350 351 if (attr) { 352 if (!newnode) { 353 rw_exit(&rtable4[index].r_lock); 354 #ifdef DEBUG 355 if (vp->v_type != attr->va_type && 356 vp->v_type != VNON && attr->va_type != VNON) { 357 zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN, 358 "makenfs4node: type (%d) doesn't " 359 "match type of found node at %p (%d)", 360 attr->va_type, (void *)vp, vp->v_type); 361 } 362 #endif 363 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 364 } else { 365 rnode4_t *rp = VTOR4(vp); 366 367 vp->v_type = attr->va_type; 368 vp->v_rdev = attr->va_rdev; 369 370 /* 371 * Turn this object into a "stub" object if we 372 * crossed an underlying server fs boundary. 
373 * To make this check, during mount we save the 374 * fsid of the server object being mounted. 375 * Here we compare this object's server fsid 376 * with the fsid we saved at mount. If they 377 * are different, we crossed server fs boundary. 378 * 379 * The stub type is set (or not) at rnode 380 * creation time and it never changes for life 381 * of the rnode. 382 * 383 * The stub type is also set during RO failover, 384 * nfs4_remap_file(). 385 * 386 * This stub will be for a mirror-mount. 387 * 388 * We don't bother with taking r_state_lock to 389 * set the stub type because this is a new rnode 390 * and we're holding the hash bucket r_lock RW_WRITER. 391 * No other thread could have obtained access 392 * to this rnode. 393 */ 394 is_stub = 0; 395 if (garp->n4g_fsid_valid) { 396 fattr4_fsid ga_fsid = garp->n4g_fsid; 397 servinfo4_t *svp = rp->r_server; 398 399 rp->r_srv_fsid = ga_fsid; 400 401 (void) nfs_rw_enter_sig(&svp->sv_lock, 402 RW_READER, 0); 403 if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid)) 404 is_stub = 1; 405 nfs_rw_exit(&svp->sv_lock); 406 } 407 408 if (is_stub) 409 r4_stub_mirrormount(rp); 410 else 411 r4_stub_none(rp); 412 413 /* Can not cache partial attr */ 414 if (attr->va_mask == AT_ALL) 415 nfs4_attrcache_noinval(vp, garp, t); 416 else 417 PURGE_ATTRCACHE4(vp); 418 419 rw_exit(&rtable4[index].r_lock); 420 } 421 } else { 422 if (newnode) { 423 PURGE_ATTRCACHE4(vp); 424 } 425 rw_exit(&rtable4[index].r_lock); 426 } 427 } 428 429 /* 430 * Find or create an rnode based primarily on filehandle. To be 431 * used when dvp (vnode for parent directory) is not available; 432 * otherwise, makenfs4node() should be used. 433 * 434 * The nfs4_fname_t argument *npp is consumed and nulled out. 
435 */ 436 437 vnode_t * 438 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh, 439 nfs4_fname_t **npp, nfs4_ga_res_t *garp, 440 mntinfo4_t *mi, cred_t *cr, hrtime_t t) 441 { 442 vfs_t *vfsp = mi->mi_vfsp; 443 int newnode = 0; 444 vnode_t *vp; 445 rnode4_t *rp; 446 svnode_t *svp; 447 nfs4_fname_t *name, *svpname; 448 int index; 449 450 ASSERT(npp && *npp); 451 name = *npp; 452 *npp = NULL; 453 454 index = rtable4hash(sfh); 455 rw_enter(&rtable4[index].r_lock, RW_READER); 456 457 vp = make_rnode4(sfh, &rtable4[index], vfsp, 458 nfs4_vnodeops, nfs4_putapage, &newnode, cr); 459 460 svp = VTOSV(vp); 461 rp = VTOR4(vp); 462 if (newnode) { 463 svp->sv_forw = svp->sv_back = svp; 464 svp->sv_name = name; 465 if (psfh != NULL) 466 sfh4_hold(psfh); 467 svp->sv_dfh = psfh; 468 } else { 469 /* 470 * It is possible that due to a server 471 * side rename fnames have changed. 472 * update the fname here. 473 */ 474 mutex_enter(&rp->r_svlock); 475 svpname = svp->sv_name; 476 if (svp->sv_name != name) { 477 svp->sv_name = name; 478 mutex_exit(&rp->r_svlock); 479 fn_rele(&svpname); 480 } else { 481 mutex_exit(&rp->r_svlock); 482 fn_rele(&name); 483 } 484 } 485 486 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 487 r4_do_attrcache(vp, garp, newnode, t, cr, index); 488 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 489 490 return (vp); 491 } 492 493 /* 494 * Find or create a vnode for the given filehandle, filesystem, parent, and 495 * name. The reference to nm is consumed, so the caller must first do an 496 * fn_hold() if it wants to continue using nm after this call. 
497 */ 498 vnode_t * 499 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp, 500 hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm) 501 { 502 vnode_t *vp; 503 int newnode; 504 int index; 505 mntinfo4_t *mi = VFTOMI4(vfsp); 506 int had_badfh = 0; 507 rnode4_t *rp; 508 509 ASSERT(dvp != NULL); 510 511 fh = badrootfh_check(fh, nm, mi, &had_badfh); 512 513 index = rtable4hash(fh); 514 rw_enter(&rtable4[index].r_lock, RW_READER); 515 516 /* 517 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 518 */ 519 vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops, 520 nfs4_putapage, &newnode, cr); 521 522 rp = VTOR4(vp); 523 sv_activate(&vp, dvp, &nm, newnode); 524 if (dvp->v_flag & V_XATTRDIR) { 525 mutex_enter(&rp->r_statelock); 526 rp->r_flags |= R4ISXATTR; 527 mutex_exit(&rp->r_statelock); 528 } 529 530 /* if getting a bad file handle, do not cache the attributes. */ 531 if (had_badfh) { 532 rw_exit(&rtable4[index].r_lock); 533 return (vp); 534 } 535 536 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 537 r4_do_attrcache(vp, garp, newnode, t, cr, index); 538 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 539 540 return (vp); 541 } 542 543 /* 544 * Hash on address of filehandle object. 545 * XXX totally untuned. 546 */ 547 548 int 549 rtable4hash(nfs4_sharedfh_t *fh) 550 { 551 return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask); 552 } 553 554 /* 555 * Find or create the vnode for the given filehandle and filesystem. 556 * *newnode is set to zero if the vnode already existed; non-zero if it had 557 * to be created. 558 * 559 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 
560 */ 561 562 static vnode_t * 563 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp, 564 struct vnodeops *vops, 565 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 566 int *newnode, cred_t *cr) 567 { 568 rnode4_t *rp; 569 rnode4_t *trp; 570 vnode_t *vp; 571 mntinfo4_t *mi; 572 573 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 574 575 mi = VFTOMI4(vfsp); 576 577 start: 578 if ((rp = r4find(rhtp, fh, vfsp)) != NULL) { 579 vp = RTOV4(rp); 580 *newnode = 0; 581 return (vp); 582 } 583 rw_exit(&rhtp->r_lock); 584 585 mutex_enter(&rp4freelist_lock); 586 587 if (rp4freelist != NULL && rnode4_new >= nrnode) { 588 rp = rp4freelist; 589 rp4_rmfree(rp); 590 mutex_exit(&rp4freelist_lock); 591 592 vp = RTOV4(rp); 593 594 if (rp->r_flags & R4HASHED) { 595 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 596 mutex_enter(&vp->v_lock); 597 if (vp->v_count > 1) { 598 vp->v_count--; 599 mutex_exit(&vp->v_lock); 600 rw_exit(&rp->r_hashq->r_lock); 601 rw_enter(&rhtp->r_lock, RW_READER); 602 goto start; 603 } 604 mutex_exit(&vp->v_lock); 605 rp4_rmhash_locked(rp); 606 rw_exit(&rp->r_hashq->r_lock); 607 } 608 609 r4inactive(rp, cr); 610 611 mutex_enter(&vp->v_lock); 612 if (vp->v_count > 1) { 613 vp->v_count--; 614 mutex_exit(&vp->v_lock); 615 rw_enter(&rhtp->r_lock, RW_READER); 616 goto start; 617 } 618 mutex_exit(&vp->v_lock); 619 vn_invalid(vp); 620 621 /* 622 * destroy old locks before bzero'ing and 623 * recreating the locks below. 624 */ 625 uninit_rnode4(rp); 626 627 /* 628 * Make sure that if rnode is recycled then 629 * VFS count is decremented properly before 630 * reuse. 
631 */ 632 VFS_RELE(vp->v_vfsp); 633 vn_reinit(vp); 634 } else { 635 vnode_t *new_vp; 636 637 mutex_exit(&rp4freelist_lock); 638 639 rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP); 640 new_vp = vn_alloc(KM_SLEEP); 641 642 atomic_add_long((ulong_t *)&rnode4_new, 1); 643 #ifdef DEBUG 644 clstat4_debug.nrnode.value.ui64++; 645 #endif 646 vp = new_vp; 647 } 648 649 bzero(rp, sizeof (*rp)); 650 rp->r_vnode = vp; 651 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 652 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 653 mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL); 654 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 655 mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL); 656 mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL); 657 rp->created_v4 = 0; 658 list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t), 659 offsetof(nfs4_open_stream_t, os_node)); 660 rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head; 661 rp->r_lo_head.lo_next_rnode = &rp->r_lo_head; 662 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 663 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 664 rp->r_flags = R4READDIRWATTR; 665 rp->r_fh = fh; 666 rp->r_hashq = rhtp; 667 sfh4_hold(rp->r_fh); 668 rp->r_server = mi->mi_curr_serv; 669 rp->r_deleg_type = OPEN_DELEGATE_NONE; 670 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE; 671 nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL); 672 673 rddir4_cache_create(rp); 674 rp->r_putapage = putapage; 675 vn_setops(vp, vops); 676 vp->v_data = (caddr_t)rp; 677 vp->v_vfsp = vfsp; 678 VFS_HOLD(vfsp); 679 vp->v_type = VNON; 680 if (isrootfh(fh, rp)) 681 vp->v_flag = VROOT; 682 vn_exists(vp); 683 684 /* 685 * There is a race condition if someone else 686 * alloc's the rnode while no locks are held, so we 687 * check again and recover if found. 
688 */ 689 rw_enter(&rhtp->r_lock, RW_WRITER); 690 if ((trp = r4find(rhtp, fh, vfsp)) != NULL) { 691 vp = RTOV4(trp); 692 *newnode = 0; 693 rw_exit(&rhtp->r_lock); 694 rp4_addfree(rp, cr); 695 rw_enter(&rhtp->r_lock, RW_READER); 696 return (vp); 697 } 698 rp4_addhash(rp); 699 *newnode = 1; 700 return (vp); 701 } 702 703 static void 704 uninit_rnode4(rnode4_t *rp) 705 { 706 vnode_t *vp = RTOV4(rp); 707 708 ASSERT(rp != NULL); 709 ASSERT(vp != NULL); 710 ASSERT(vp->v_count == 1); 711 ASSERT(rp->r_count == 0); 712 ASSERT(rp->r_mapcnt == 0); 713 if (rp->r_flags & R4LODANGLERS) { 714 nfs4_flush_lock_owners(rp); 715 } 716 ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head); 717 ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head); 718 ASSERT(!(rp->r_flags & R4HASHED)); 719 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 720 nfs4_clear_open_streams(rp); 721 list_destroy(&rp->r_open_streams); 722 723 /* 724 * Destroy the rddir cache first since we need to grab the r_statelock. 725 */ 726 mutex_enter(&rp->r_statelock); 727 rddir4_cache_destroy(rp); 728 mutex_exit(&rp->r_statelock); 729 sv_uninit(&rp->r_svnode); 730 sfh4_rele(&rp->r_fh); 731 nfs_rw_destroy(&rp->r_rwlock); 732 nfs_rw_destroy(&rp->r_lkserlock); 733 mutex_destroy(&rp->r_statelock); 734 mutex_destroy(&rp->r_statev4_lock); 735 mutex_destroy(&rp->r_os_lock); 736 cv_destroy(&rp->r_cv); 737 cv_destroy(&rp->r_commit.c_cv); 738 nfs_rw_destroy(&rp->r_deleg_recall_lock); 739 if (rp->r_flags & R4DELMAPLIST) 740 list_destroy(&rp->r_indelmap); 741 } 742 743 /* 744 * Put an rnode on the free list. 745 * 746 * Rnodes which were allocated above and beyond the normal limit 747 * are immediately freed. 
 */
/*
 * The caller gives up one reference on the rnode's vnode.  On every
 * path below, if another reference exists (v_count > 1) that reference
 * is simply dropped and the other holder is left to finish the job.
 * Otherwise the rnode is either destroyed or linked onto rp4freelist
 * while still carrying that last reference.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	/* must not already be on the freelist */
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4HASHED) {
			/*
			 * Unhash under the exclusive bucket lock so that
			 * no lookup can find the rnode any more, unless
			 * somebody already did.
			 */
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 *
	 * Each of the cleanup steps below has to drop the hash queue
	 * lock before doing its work, which is why every one of them
	 * comes back here to retake the lock and redo all the checks.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are not anymore references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		/*
		 * From a zone other than the mount's, the streams are
		 * only cleared; in the mount's own zone they are closed.
		 * NOTE(review): presumably because the close has to be
		 * issued from the owning zone -- confirm.
		 */
		if (nfs_zone() != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there are no pages.
	 * If there are, flush and commit of all of the dirty and
	 * uncommitted pages, assuming the file system isn't read only.
	 */
	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
		rw_exit(&rp->r_hashq->r_lock);
		r4flushpages(rp, cr);
		goto again;
	}

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached, the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.
	 *
	 * The hold itself is released at the very end, after all
	 * locks have been dropped.
	 */
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 *
	 * The rnode is always linked in just behind the current head,
	 * i.e. at the tail of the circular list; moving the head
	 * pointer onto it is what turns "tail" into "front".
	 */
	mutex_enter(&rp4freelist_lock);
	if (rp4freelist == NULL) {
		/* first entry: a ring consisting of just this rnode */
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rp4freelist = rp;
	} else {
		rp->r_freef = rp4freelist;
		rp->r_freeb = rp4freelist->r_freeb;
		rp4freelist->r_freeb->r_freef = rp;
		rp4freelist->r_freeb = rp;
		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
			rp4freelist = rp;
	}
	mutex_exit(&rp4freelist_lock);

	rw_exit(&rp->r_hashq->r_lock);

	if (xattr)
		VN_RELE(xattr);
}

/*
 * Remove an rnode from the free list.
 *
 * The caller must be holding rp4freelist_lock and the rnode
 * must be on the freelist.
935 */ 936 static void 937 rp4_rmfree(rnode4_t *rp) 938 { 939 940 ASSERT(MUTEX_HELD(&rp4freelist_lock)); 941 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 942 943 if (rp == rp4freelist) { 944 rp4freelist = rp->r_freef; 945 if (rp == rp4freelist) 946 rp4freelist = NULL; 947 } 948 rp->r_freeb->r_freef = rp->r_freef; 949 rp->r_freef->r_freeb = rp->r_freeb; 950 951 rp->r_freef = rp->r_freeb = NULL; 952 } 953 954 /* 955 * Put a rnode in the hash table. 956 * 957 * The caller must be holding the exclusive hash queue lock 958 */ 959 void 960 rp4_addhash(rnode4_t *rp) 961 { 962 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 963 ASSERT(!(rp->r_flags & R4HASHED)); 964 965 #ifdef DEBUG 966 r4_dup_check(rp, RTOV4(rp)->v_vfsp); 967 #endif 968 969 rp->r_hashf = rp->r_hashq->r_hashf; 970 rp->r_hashq->r_hashf = rp; 971 rp->r_hashb = (rnode4_t *)rp->r_hashq; 972 rp->r_hashf->r_hashb = rp; 973 974 mutex_enter(&rp->r_statelock); 975 rp->r_flags |= R4HASHED; 976 mutex_exit(&rp->r_statelock); 977 } 978 979 /* 980 * Remove a rnode from the hash table. 981 * 982 * The caller must be holding the hash queue lock. 983 */ 984 void 985 rp4_rmhash_locked(rnode4_t *rp) 986 { 987 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 988 ASSERT(rp->r_flags & R4HASHED); 989 990 rp->r_hashb->r_hashf = rp->r_hashf; 991 rp->r_hashf->r_hashb = rp->r_hashb; 992 993 mutex_enter(&rp->r_statelock); 994 rp->r_flags &= ~R4HASHED; 995 mutex_exit(&rp->r_statelock); 996 } 997 998 /* 999 * Remove a rnode from the hash table. 1000 * 1001 * The caller must not be holding the hash queue lock. 1002 */ 1003 void 1004 rp4_rmhash(rnode4_t *rp) 1005 { 1006 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1007 rp4_rmhash_locked(rp); 1008 rw_exit(&rp->r_hashq->r_lock); 1009 } 1010 1011 /* 1012 * Lookup a rnode by fhandle. Ignores rnodes that had failed recovery. 1013 * Returns NULL if no match. If an rnode is returned, the reference count 1014 * on the master vnode is incremented. 
1015 * 1016 * The caller must be holding the hash queue lock, either shared or exclusive. 1017 */ 1018 rnode4_t * 1019 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp) 1020 { 1021 rnode4_t *rp; 1022 vnode_t *vp; 1023 1024 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 1025 1026 for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) { 1027 vp = RTOV4(rp); 1028 if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) { 1029 1030 mutex_enter(&rp->r_statelock); 1031 if (rp->r_flags & R4RECOVERR) { 1032 mutex_exit(&rp->r_statelock); 1033 continue; 1034 } 1035 mutex_exit(&rp->r_statelock); 1036 #ifdef DEBUG 1037 r4_dup_check(rp, vfsp); 1038 #endif 1039 if (rp->r_freef != NULL) { 1040 mutex_enter(&rp4freelist_lock); 1041 /* 1042 * If the rnode is on the freelist, 1043 * then remove it and use that reference 1044 * as the new reference. Otherwise, 1045 * need to increment the reference count. 1046 */ 1047 if (rp->r_freef != NULL) { 1048 rp4_rmfree(rp); 1049 mutex_exit(&rp4freelist_lock); 1050 } else { 1051 mutex_exit(&rp4freelist_lock); 1052 VN_HOLD(vp); 1053 } 1054 } else 1055 VN_HOLD(vp); 1056 1057 /* 1058 * if root vnode, set v_flag to indicate that 1059 */ 1060 if (isrootfh(fh, rp)) { 1061 if (!(vp->v_flag & VROOT)) { 1062 mutex_enter(&vp->v_lock); 1063 vp->v_flag |= VROOT; 1064 mutex_exit(&vp->v_lock); 1065 } 1066 } 1067 return (rp); 1068 } 1069 } 1070 return (NULL); 1071 } 1072 1073 /* 1074 * Lookup an rnode by fhandle. Just a wrapper for r4find() 1075 * that assumes the caller hasn't already got the lock 1076 * on the hash bucket. 1077 */ 1078 rnode4_t * 1079 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp) 1080 { 1081 rnode4_t *rp; 1082 int index; 1083 1084 index = rtable4hash(fh); 1085 rw_enter(&rtable4[index].r_lock, RW_READER); 1086 rp = r4find(&rtable4[index], fh, vfsp); 1087 rw_exit(&rtable4[index].r_lock); 1088 1089 return (rp); 1090 } 1091 1092 /* 1093 * Return 1 if there is a active vnode belonging to this vfs in the 1094 * rtable4 cache. 
1095 * 1096 * Several of these checks are done without holding the usual 1097 * locks. This is safe because destroy_rtable(), rp_addfree(), 1098 * etc. will redo the necessary checks before actually destroying 1099 * any rnodes. 1100 */ 1101 int 1102 check_rtable4(struct vfs *vfsp) 1103 { 1104 rnode4_t *rp; 1105 vnode_t *vp; 1106 char *busy = NULL; 1107 int index; 1108 1109 for (index = 0; index < rtable4size; index++) { 1110 rw_enter(&rtable4[index].r_lock, RW_READER); 1111 1112 for (rp = rtable4[index].r_hashf; 1113 rp != (rnode4_t *)(&rtable4[index]); 1114 rp = rp->r_hashf) { 1115 1116 vp = RTOV4(rp); 1117 if (vp->v_vfsp == vfsp) { 1118 if (rp->r_freef == NULL) { 1119 busy = "not on free list"; 1120 } else if (nfs4_has_pages(vp) && 1121 (rp->r_flags & R4DIRTY)) { 1122 busy = "dirty pages"; 1123 } else if (rp->r_count > 0) { 1124 busy = "r_count > 0"; 1125 } 1126 1127 if (busy != NULL) { 1128 #ifdef DEBUG 1129 char *path; 1130 1131 path = fn_path(rp->r_svnode.sv_name); 1132 NFS4_DEBUG(nfs4_rnode_debug, 1133 (CE_NOTE, "check_rtable4: " "%s %s", 1134 path, busy)); 1135 kmem_free(path, strlen(path)+1); 1136 #endif 1137 rw_exit(&rtable4[index].r_lock); 1138 return (1); 1139 } 1140 } 1141 } 1142 rw_exit(&rtable4[index].r_lock); 1143 } 1144 return (0); 1145 } 1146 1147 /* 1148 * Destroy inactive vnodes from the hash queues which 1149 * belong to this vfs. All of the vnodes should be inactive. 1150 * It is essential that we destroy all rnodes in case of 1151 * forced unmount as well as in normal unmount case. 
1152 */ 1153 1154 void 1155 destroy_rtable4(struct vfs *vfsp, cred_t *cr) 1156 { 1157 int index; 1158 vnode_t *vp; 1159 rnode4_t *rp, *r_hashf, *rlist; 1160 1161 rlist = NULL; 1162 1163 for (index = 0; index < rtable4size; index++) { 1164 rw_enter(&rtable4[index].r_lock, RW_WRITER); 1165 for (rp = rtable4[index].r_hashf; 1166 rp != (rnode4_t *)(&rtable4[index]); 1167 rp = r_hashf) { 1168 /* save the hash pointer before destroying */ 1169 r_hashf = rp->r_hashf; 1170 1171 vp = RTOV4(rp); 1172 if (vp->v_vfsp == vfsp) { 1173 mutex_enter(&rp4freelist_lock); 1174 if (rp->r_freef != NULL) { 1175 rp4_rmfree(rp); 1176 mutex_exit(&rp4freelist_lock); 1177 rp4_rmhash_locked(rp); 1178 rp->r_hashf = rlist; 1179 rlist = rp; 1180 } else 1181 mutex_exit(&rp4freelist_lock); 1182 } 1183 } 1184 rw_exit(&rtable4[index].r_lock); 1185 } 1186 1187 for (rp = rlist; rp != NULL; rp = r_hashf) { 1188 r_hashf = rp->r_hashf; 1189 /* 1190 * This call to rp4_addfree will end up destroying the 1191 * rnode, but in a safe way with the appropriate set 1192 * of checks done. 1193 */ 1194 rp4_addfree(rp, cr); 1195 } 1196 } 1197 1198 /* 1199 * This routine destroys all the resources of an rnode 1200 * and finally the rnode itself. 1201 */ 1202 static void 1203 destroy_rnode4(rnode4_t *rp) 1204 { 1205 vnode_t *vp; 1206 vfs_t *vfsp; 1207 1208 ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE); 1209 1210 vp = RTOV4(rp); 1211 vfsp = vp->v_vfsp; 1212 1213 uninit_rnode4(rp); 1214 atomic_add_long((ulong_t *)&rnode4_new, -1); 1215 #ifdef DEBUG 1216 clstat4_debug.nrnode.value.ui64--; 1217 #endif 1218 kmem_cache_free(rnode4_cache, rp); 1219 vn_invalid(vp); 1220 vn_free(vp); 1221 VFS_RELE(vfsp); 1222 } 1223 1224 /* 1225 * Invalidate the attributes on all rnodes forcing the next getattr 1226 * to go over the wire. Used to flush stale uid and gid mappings. 
1227 * Maybe done on a per vfsp, or all rnodes (vfsp == NULL) 1228 */ 1229 void 1230 nfs4_rnode_invalidate(struct vfs *vfsp) 1231 { 1232 int index; 1233 rnode4_t *rp; 1234 vnode_t *vp; 1235 1236 /* 1237 * Walk the hash queues looking for rnodes. 1238 */ 1239 for (index = 0; index < rtable4size; index++) { 1240 rw_enter(&rtable4[index].r_lock, RW_READER); 1241 for (rp = rtable4[index].r_hashf; 1242 rp != (rnode4_t *)(&rtable4[index]); 1243 rp = rp->r_hashf) { 1244 vp = RTOV4(rp); 1245 if (vfsp != NULL && vp->v_vfsp != vfsp) 1246 continue; 1247 1248 if (!mutex_tryenter(&rp->r_statelock)) 1249 continue; 1250 1251 /* 1252 * Expire the attributes by resetting the change 1253 * and attr timeout. 1254 */ 1255 rp->r_change = 0; 1256 PURGE_ATTRCACHE4_LOCKED(rp); 1257 mutex_exit(&rp->r_statelock); 1258 } 1259 rw_exit(&rtable4[index].r_lock); 1260 } 1261 } 1262 1263 /* 1264 * Flush all vnodes in this (or every) vfs. 1265 * Used by nfs_sync and by nfs_unmount. 1266 */ 1267 void 1268 r4flush(struct vfs *vfsp, cred_t *cr) 1269 { 1270 int index; 1271 rnode4_t *rp; 1272 vnode_t *vp, **vplist; 1273 long num, cnt; 1274 1275 /* 1276 * Check to see whether there is anything to do. 1277 */ 1278 num = rnode4_new; 1279 if (num == 0) 1280 return; 1281 1282 /* 1283 * Allocate a slot for all currently active rnodes on the 1284 * supposition that they all may need flushing. 1285 */ 1286 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 1287 cnt = 0; 1288 1289 /* 1290 * Walk the hash queues looking for rnodes with page 1291 * lists associated with them. Make a list of these 1292 * files. 
1293 */ 1294 for (index = 0; index < rtable4size; index++) { 1295 rw_enter(&rtable4[index].r_lock, RW_READER); 1296 for (rp = rtable4[index].r_hashf; 1297 rp != (rnode4_t *)(&rtable4[index]); 1298 rp = rp->r_hashf) { 1299 vp = RTOV4(rp); 1300 /* 1301 * Don't bother sync'ing a vp if it 1302 * is part of virtual swap device or 1303 * if VFS is read-only 1304 */ 1305 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 1306 continue; 1307 /* 1308 * If flushing all mounted file systems or 1309 * the vnode belongs to this vfs, has pages 1310 * and is marked as either dirty or mmap'd, 1311 * hold and add this vnode to the list of 1312 * vnodes to flush. 1313 */ 1314 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 1315 nfs4_has_pages(vp) && 1316 ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) { 1317 VN_HOLD(vp); 1318 vplist[cnt++] = vp; 1319 if (cnt == num) { 1320 rw_exit(&rtable4[index].r_lock); 1321 goto toomany; 1322 } 1323 } 1324 } 1325 rw_exit(&rtable4[index].r_lock); 1326 } 1327 toomany: 1328 1329 /* 1330 * Flush and release all of the files on the list. 1331 */ 1332 while (cnt-- > 0) { 1333 vp = vplist[cnt]; 1334 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 1335 VN_RELE(vp); 1336 } 1337 1338 /* 1339 * Free the space allocated to hold the list. 1340 */ 1341 kmem_free(vplist, num * sizeof (*vplist)); 1342 } 1343 1344 int 1345 nfs4_free_data_reclaim(rnode4_t *rp) 1346 { 1347 char *contents; 1348 vnode_t *xattr; 1349 int size; 1350 vsecattr_t *vsp; 1351 int freed; 1352 bool_t rdc = FALSE; 1353 1354 /* 1355 * Free any held caches which may 1356 * be associated with this rnode. 1357 */ 1358 mutex_enter(&rp->r_statelock); 1359 if (rp->r_dir != NULL) 1360 rdc = TRUE; 1361 contents = rp->r_symlink.contents; 1362 size = rp->r_symlink.size; 1363 rp->r_symlink.contents = NULL; 1364 vsp = rp->r_secattr; 1365 rp->r_secattr = NULL; 1366 xattr = rp->r_xattr_dir; 1367 rp->r_xattr_dir = NULL; 1368 mutex_exit(&rp->r_statelock); 1369 1370 /* 1371 * Free the access cache entries. 
1372 */ 1373 freed = nfs4_access_purge_rp(rp); 1374 1375 if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL) 1376 return (freed); 1377 1378 /* 1379 * Free the readdir cache entries, incompletely if we can't block. 1380 */ 1381 nfs4_purge_rddir_cache(RTOV4(rp)); 1382 1383 /* 1384 * Free the symbolic link cache. 1385 */ 1386 if (contents != NULL) { 1387 1388 kmem_free((void *)contents, size); 1389 } 1390 1391 /* 1392 * Free any cached ACL. 1393 */ 1394 if (vsp != NULL) 1395 nfs4_acl_free_cache(vsp); 1396 1397 /* 1398 * Release the xattr directory vnode 1399 */ 1400 if (xattr != NULL) 1401 VN_RELE(xattr); 1402 1403 return (1); 1404 } 1405 1406 static int 1407 nfs4_active_data_reclaim(rnode4_t *rp) 1408 { 1409 char *contents; 1410 vnode_t *xattr; 1411 int size; 1412 vsecattr_t *vsp; 1413 int freed; 1414 bool_t rdc = FALSE; 1415 1416 /* 1417 * Free any held credentials and caches which 1418 * may be associated with this rnode. 1419 */ 1420 if (!mutex_tryenter(&rp->r_statelock)) 1421 return (0); 1422 contents = rp->r_symlink.contents; 1423 size = rp->r_symlink.size; 1424 rp->r_symlink.contents = NULL; 1425 vsp = rp->r_secattr; 1426 rp->r_secattr = NULL; 1427 if (rp->r_dir != NULL) 1428 rdc = TRUE; 1429 xattr = rp->r_xattr_dir; 1430 rp->r_xattr_dir = NULL; 1431 mutex_exit(&rp->r_statelock); 1432 1433 /* 1434 * Free the access cache entries. 1435 */ 1436 freed = nfs4_access_purge_rp(rp); 1437 1438 if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL) 1439 return (freed); 1440 1441 /* 1442 * Free the symbolic link cache. 1443 */ 1444 if (contents != NULL) { 1445 1446 kmem_free((void *)contents, size); 1447 } 1448 1449 /* 1450 * Free any cached ACL. 
1451 */ 1452 if (vsp != NULL) 1453 nfs4_acl_free_cache(vsp); 1454 1455 nfs4_purge_rddir_cache(RTOV4(rp)); 1456 1457 /* 1458 * Release the xattr directory vnode 1459 */ 1460 if (xattr != NULL) 1461 VN_RELE(xattr); 1462 1463 return (1); 1464 } 1465 1466 static int 1467 nfs4_free_reclaim(void) 1468 { 1469 int freed; 1470 rnode4_t *rp; 1471 1472 #ifdef DEBUG 1473 clstat4_debug.f_reclaim.value.ui64++; 1474 #endif 1475 freed = 0; 1476 mutex_enter(&rp4freelist_lock); 1477 rp = rp4freelist; 1478 if (rp != NULL) { 1479 do { 1480 if (nfs4_free_data_reclaim(rp)) 1481 freed = 1; 1482 } while ((rp = rp->r_freef) != rp4freelist); 1483 } 1484 mutex_exit(&rp4freelist_lock); 1485 return (freed); 1486 } 1487 1488 static int 1489 nfs4_active_reclaim(void) 1490 { 1491 int freed; 1492 int index; 1493 rnode4_t *rp; 1494 1495 #ifdef DEBUG 1496 clstat4_debug.a_reclaim.value.ui64++; 1497 #endif 1498 freed = 0; 1499 for (index = 0; index < rtable4size; index++) { 1500 rw_enter(&rtable4[index].r_lock, RW_READER); 1501 for (rp = rtable4[index].r_hashf; 1502 rp != (rnode4_t *)(&rtable4[index]); 1503 rp = rp->r_hashf) { 1504 if (nfs4_active_data_reclaim(rp)) 1505 freed = 1; 1506 } 1507 rw_exit(&rtable4[index].r_lock); 1508 } 1509 return (freed); 1510 } 1511 1512 static int 1513 nfs4_rnode_reclaim(void) 1514 { 1515 int freed; 1516 rnode4_t *rp; 1517 vnode_t *vp; 1518 1519 #ifdef DEBUG 1520 clstat4_debug.r_reclaim.value.ui64++; 1521 #endif 1522 freed = 0; 1523 mutex_enter(&rp4freelist_lock); 1524 while ((rp = rp4freelist) != NULL) { 1525 rp4_rmfree(rp); 1526 mutex_exit(&rp4freelist_lock); 1527 if (rp->r_flags & R4HASHED) { 1528 vp = RTOV4(rp); 1529 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1530 mutex_enter(&vp->v_lock); 1531 if (vp->v_count > 1) { 1532 vp->v_count--; 1533 mutex_exit(&vp->v_lock); 1534 rw_exit(&rp->r_hashq->r_lock); 1535 mutex_enter(&rp4freelist_lock); 1536 continue; 1537 } 1538 mutex_exit(&vp->v_lock); 1539 rp4_rmhash_locked(rp); 1540 rw_exit(&rp->r_hashq->r_lock); 1541 } 1542 
/* 1543 * This call to rp_addfree will end up destroying the 1544 * rnode, but in a safe way with the appropriate set 1545 * of checks done. 1546 */ 1547 rp4_addfree(rp, CRED()); 1548 mutex_enter(&rp4freelist_lock); 1549 } 1550 mutex_exit(&rp4freelist_lock); 1551 return (freed); 1552 } 1553 1554 /*ARGSUSED*/ 1555 static void 1556 nfs4_reclaim(void *cdrarg) 1557 { 1558 #ifdef DEBUG 1559 clstat4_debug.reclaim.value.ui64++; 1560 #endif 1561 if (nfs4_free_reclaim()) 1562 return; 1563 1564 if (nfs4_active_reclaim()) 1565 return; 1566 1567 (void) nfs4_rnode_reclaim(); 1568 } 1569 1570 /* 1571 * Returns the clientid4 to use for the given mntinfo4. Note that the 1572 * clientid can change if the caller drops mi_recovlock. 1573 */ 1574 1575 clientid4 1576 mi2clientid(mntinfo4_t *mi) 1577 { 1578 nfs4_server_t *sp; 1579 clientid4 clientid = 0; 1580 1581 /* this locks down sp if it is found */ 1582 sp = find_nfs4_server(mi); 1583 if (sp != NULL) { 1584 clientid = sp->clientid; 1585 mutex_exit(&sp->s_lock); 1586 nfs4_server_rele(sp); 1587 } 1588 return (clientid); 1589 } 1590 1591 /* 1592 * Return the current lease time for the server associated with the given 1593 * file. Note that the lease time could change immediately after this 1594 * call. 
1595 */ 1596 1597 time_t 1598 r2lease_time(rnode4_t *rp) 1599 { 1600 nfs4_server_t *sp; 1601 time_t lease_time; 1602 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 1603 1604 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0); 1605 1606 /* this locks down sp if it is found */ 1607 sp = find_nfs4_server(VTOMI4(RTOV4(rp))); 1608 1609 if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1610 if (sp != NULL) { 1611 mutex_exit(&sp->s_lock); 1612 nfs4_server_rele(sp); 1613 } 1614 nfs_rw_exit(&mi->mi_recovlock); 1615 return (1); /* 1 second */ 1616 } 1617 1618 ASSERT(sp != NULL); 1619 1620 lease_time = sp->s_lease_time; 1621 1622 mutex_exit(&sp->s_lock); 1623 nfs4_server_rele(sp); 1624 nfs_rw_exit(&mi->mi_recovlock); 1625 1626 return (lease_time); 1627 } 1628 1629 /* 1630 * Return a list with information about all the known open instances for 1631 * a filesystem. The caller must call r4releopenlist() when done with the 1632 * list. 1633 * 1634 * We are safe at looking at os_valid and os_pending_close across dropping 1635 * the 'os_sync_lock' to count up the number of open streams and then 1636 * allocate memory for the osp list due to: 1637 * -Looking at os_pending_close is safe since this routine is 1638 * only called via recovery, and os_pending_close can only be set via 1639 * a non-recovery operation (which are all blocked when recovery 1640 * is active). 1641 * 1642 * -Examining os_valid is safe since non-recovery operations, which 1643 * could potentially switch os_valid to 0, are blocked (via 1644 * nfs4_start_fop) and recovery is single-threaded per mntinfo4_t 1645 * (which means we are the only recovery thread potentially acting 1646 * on this open stream). 
1647 */ 1648 1649 nfs4_opinst_t * 1650 r4mkopenlist(mntinfo4_t *mi) 1651 { 1652 nfs4_opinst_t *reopenlist, *rep; 1653 rnode4_t *rp; 1654 vnode_t *vp; 1655 vfs_t *vfsp = mi->mi_vfsp; 1656 int numosp; 1657 nfs4_open_stream_t *osp; 1658 int index; 1659 open_delegation_type4 dtype; 1660 int hold_vnode; 1661 1662 reopenlist = NULL; 1663 1664 for (index = 0; index < rtable4size; index++) { 1665 rw_enter(&rtable4[index].r_lock, RW_READER); 1666 for (rp = rtable4[index].r_hashf; 1667 rp != (rnode4_t *)(&rtable4[index]); 1668 rp = rp->r_hashf) { 1669 1670 vp = RTOV4(rp); 1671 if (vp->v_vfsp != vfsp) 1672 continue; 1673 hold_vnode = 0; 1674 1675 mutex_enter(&rp->r_os_lock); 1676 1677 /* Count the number of valid open_streams of the file */ 1678 numosp = 0; 1679 for (osp = list_head(&rp->r_open_streams); osp != NULL; 1680 osp = list_next(&rp->r_open_streams, osp)) { 1681 mutex_enter(&osp->os_sync_lock); 1682 if (osp->os_valid && !osp->os_pending_close) 1683 numosp++; 1684 mutex_exit(&osp->os_sync_lock); 1685 } 1686 1687 /* Fill in the valid open streams per vp */ 1688 if (numosp > 0) { 1689 int j; 1690 1691 hold_vnode = 1; 1692 1693 /* 1694 * Add a new open instance to the list 1695 */ 1696 rep = kmem_zalloc(sizeof (*reopenlist), 1697 KM_SLEEP); 1698 rep->re_next = reopenlist; 1699 reopenlist = rep; 1700 1701 rep->re_vp = vp; 1702 rep->re_osp = kmem_zalloc( 1703 numosp * sizeof (*(rep->re_osp)), 1704 KM_SLEEP); 1705 rep->re_numosp = numosp; 1706 1707 j = 0; 1708 for (osp = list_head(&rp->r_open_streams); 1709 osp != NULL; 1710 osp = list_next(&rp->r_open_streams, osp)) { 1711 1712 mutex_enter(&osp->os_sync_lock); 1713 if (osp->os_valid && 1714 !osp->os_pending_close) { 1715 osp->os_ref_count++; 1716 rep->re_osp[j] = osp; 1717 j++; 1718 } 1719 mutex_exit(&osp->os_sync_lock); 1720 } 1721 /* 1722 * Assuming valid osp(s) stays valid between 1723 * the time obtaining j and numosp. 
1724 */ 1725 ASSERT(j == numosp); 1726 } 1727 1728 mutex_exit(&rp->r_os_lock); 1729 /* do this here to keep v_lock > r_os_lock */ 1730 if (hold_vnode) 1731 VN_HOLD(vp); 1732 mutex_enter(&rp->r_statev4_lock); 1733 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 1734 /* 1735 * If this rnode holds a delegation, 1736 * but if there are no valid open streams, 1737 * then just discard the delegation 1738 * without doing delegreturn. 1739 */ 1740 if (numosp > 0) 1741 rp->r_deleg_needs_recovery = 1742 rp->r_deleg_type; 1743 } 1744 /* Save the delegation type for use outside the lock */ 1745 dtype = rp->r_deleg_type; 1746 mutex_exit(&rp->r_statev4_lock); 1747 1748 /* 1749 * If we have a delegation then get rid of it. 1750 * We've set rp->r_deleg_needs_recovery so we have 1751 * enough information to recover. 1752 */ 1753 if (dtype != OPEN_DELEGATE_NONE) { 1754 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 1755 } 1756 } 1757 rw_exit(&rtable4[index].r_lock); 1758 } 1759 return (reopenlist); 1760 } 1761 1762 /* 1763 * Release the list of open instance references. 
1764 */ 1765 1766 void 1767 r4releopenlist(nfs4_opinst_t *reopenp) 1768 { 1769 nfs4_opinst_t *rep, *next; 1770 int i; 1771 1772 for (rep = reopenp; rep; rep = next) { 1773 next = rep->re_next; 1774 1775 for (i = 0; i < rep->re_numosp; i++) 1776 open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp)); 1777 1778 VN_RELE(rep->re_vp); 1779 kmem_free(rep->re_osp, 1780 rep->re_numosp * sizeof (*(rep->re_osp))); 1781 1782 kmem_free(rep, sizeof (*rep)); 1783 } 1784 } 1785 1786 int 1787 nfs4_rnode_init(void) 1788 { 1789 ulong_t nrnode4_max; 1790 int i; 1791 1792 /* 1793 * Compute the size of the rnode4 hash table 1794 */ 1795 if (nrnode <= 0) 1796 nrnode = ncsize; 1797 nrnode4_max = 1798 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4)); 1799 if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) { 1800 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 1801 "setting nrnode to max value of %ld", nrnode4_max); 1802 nrnode = nrnode4_max; 1803 } 1804 rtable4size = 1 << highbit(nrnode / rnode4_hashlen); 1805 rtable4mask = rtable4size - 1; 1806 1807 /* 1808 * Allocate and initialize the hash buckets 1809 */ 1810 rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP); 1811 for (i = 0; i < rtable4size; i++) { 1812 rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]); 1813 rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]); 1814 rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL); 1815 } 1816 1817 rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t), 1818 0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0); 1819 1820 return (0); 1821 } 1822 1823 int 1824 nfs4_rnode_fini(void) 1825 { 1826 int i; 1827 1828 /* 1829 * Deallocate the rnode hash queues 1830 */ 1831 kmem_cache_destroy(rnode4_cache); 1832 1833 for (i = 0; i < rtable4size; i++) 1834 rw_destroy(&rtable4[i].r_lock); 1835 1836 kmem_free(rtable4, rtable4size * sizeof (*rtable4)); 1837 1838 return (0); 1839 } 1840 1841 /* 1842 * Return non-zero if the given filehandle refers to the root filehandle 1843 * for the given rnode. 
1844 */ 1845 1846 static int 1847 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp) 1848 { 1849 int isroot; 1850 1851 isroot = 0; 1852 if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh)) 1853 isroot = 1; 1854 1855 return (isroot); 1856 } 1857 1858 /* 1859 * The r4_stub_* routines assume that the rnode is newly activated, and 1860 * that the caller either holds the hash bucket r_lock for this rnode as 1861 * RW_WRITER, or holds r_statelock. 1862 */ 1863 static void 1864 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type) 1865 { 1866 vnode_t *vp = RTOV4(rp); 1867 krwlock_t *hash_lock = &rp->r_hashq->r_lock; 1868 1869 ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock)); 1870 1871 rp->r_stub_type = type; 1872 1873 /* 1874 * Safely switch this vnode to the trigger vnodeops. 1875 * 1876 * Currently, we don't ever switch a trigger vnode back to using 1877 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that 1878 * a new v4 object is not a trigger, and it will already have the 1879 * correct v4 vnodeops by default. So, no "else" case required here. 1880 */ 1881 if (type != NFS4_STUB_NONE) 1882 vn_setops(vp, nfs4_trigger_vnodeops); 1883 } 1884 1885 void 1886 r4_stub_mirrormount(rnode4_t *rp) 1887 { 1888 r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT); 1889 } 1890 1891 void 1892 r4_stub_none(rnode4_t *rp) 1893 { 1894 r4_stub_set(rp, NFS4_STUB_NONE); 1895 } 1896 1897 #ifdef DEBUG 1898 1899 /* 1900 * Look in the rnode table for other rnodes that have the same filehandle. 
1901 * Assume the lock is held for the hash chain of checkrp 1902 */ 1903 1904 static void 1905 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp) 1906 { 1907 rnode4_t *rp; 1908 vnode_t *tvp; 1909 nfs4_fhandle_t fh, fh2; 1910 int index; 1911 1912 if (!r4_check_for_dups) 1913 return; 1914 1915 ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock)); 1916 1917 sfh4_copyval(checkrp->r_fh, &fh); 1918 1919 for (index = 0; index < rtable4size; index++) { 1920 1921 if (&rtable4[index] != checkrp->r_hashq) 1922 rw_enter(&rtable4[index].r_lock, RW_READER); 1923 1924 for (rp = rtable4[index].r_hashf; 1925 rp != (rnode4_t *)(&rtable4[index]); 1926 rp = rp->r_hashf) { 1927 1928 if (rp == checkrp) 1929 continue; 1930 1931 tvp = RTOV4(rp); 1932 if (tvp->v_vfsp != vfsp) 1933 continue; 1934 1935 sfh4_copyval(rp->r_fh, &fh2); 1936 if (nfs4cmpfhandle(&fh, &fh2) == 0) { 1937 cmn_err(CE_PANIC, "rnodes with same fs, fh " 1938 "(%p, %p)", (void *)checkrp, (void *)rp); 1939 } 1940 } 1941 1942 if (&rtable4[index] != checkrp->r_hashq) 1943 rw_exit(&rtable4[index].r_lock); 1944 } 1945 } 1946 1947 #endif /* DEBUG */ 1948