1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 
28 * All Rights Reserved 29 */ 30 31 32 #include <sys/param.h> 33 #include <sys/types.h> 34 #include <sys/systm.h> 35 #include <sys/cred.h> 36 #include <sys/proc.h> 37 #include <sys/user.h> 38 #include <sys/time.h> 39 #include <sys/buf.h> 40 #include <sys/vfs.h> 41 #include <sys/vnode.h> 42 #include <sys/socket.h> 43 #include <sys/uio.h> 44 #include <sys/tiuser.h> 45 #include <sys/swap.h> 46 #include <sys/errno.h> 47 #include <sys/debug.h> 48 #include <sys/kmem.h> 49 #include <sys/kstat.h> 50 #include <sys/cmn_err.h> 51 #include <sys/vtrace.h> 52 #include <sys/session.h> 53 #include <sys/dnlc.h> 54 #include <sys/bitmap.h> 55 #include <sys/acl.h> 56 #include <sys/ddi.h> 57 #include <sys/pathname.h> 58 #include <sys/flock.h> 59 #include <sys/dirent.h> 60 #include <sys/flock.h> 61 #include <sys/callb.h> 62 #include <sys/sdt.h> 63 64 #include <vm/pvn.h> 65 66 #include <rpc/types.h> 67 #include <rpc/xdr.h> 68 #include <rpc/auth.h> 69 #include <rpc/rpcsec_gss.h> 70 #include <rpc/clnt.h> 71 72 #include <nfs/nfs.h> 73 #include <nfs/nfs_clnt.h> 74 #include <nfs/nfs_acl.h> 75 76 #include <nfs/nfs4.h> 77 #include <nfs/rnode4.h> 78 #include <nfs/nfs4_clnt.h> 79 80 /* 81 * The hash queues for the access to active and cached rnodes 82 * are organized as doubly linked lists. A reader/writer lock 83 * for each hash bucket is used to control access and to synchronize 84 * lookups, additions, and deletions from the hash queue. 85 * 86 * The rnode freelist is organized as a doubly linked list with 87 * a head pointer. Additions and deletions are synchronized via 88 * a single mutex. 89 * 90 * In order to add an rnode to the free list, it must be hashed into 91 * a hash queue and the exclusive lock to the hash queue be held. 92 * If an rnode is not hashed into a hash queue, then it is destroyed 93 * because it represents no valuable information that can be reused 94 * about the file. 
The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
*
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock -> r_statelock
 */

/* Array of hash buckets; indexed by the value rtable4hash() returns. */
r4hashq_t *rtable4;

/* Held by rp4_rmfree() callers and by rp4_addfree() around freelist edits. */
static kmutex_t rp4freelist_lock;
/* Head of the circular rnode freelist (linked via r_freef/r_freeb). */
static rnode4_t *rp4freelist = NULL;
/*
 * Number of rnodes currently allocated: bumped when make_rnode4()
 * allocates a fresh rnode, dropped by destroy_rnode4(), and compared
 * against nrnode to decide between recycling and allocating.
 */
static long rnode4_new = 0;
/* Number of buckets in rtable4; upper bound of every bucket walk. */
int rtable4size;
/* Mask applied by rtable4hash() to turn a hash value into an index. */
static int rtable4mask;
/* kmem cache that rnode4_t structures are allocated from and freed to. */
static struct kmem_cache *rnode4_cache;
/* NOTE(review): not referenced in the visible code; presumably a tunable. */
static int rnode4_hashlen = 4;

/*
 * Forward declarations.  Several of these (the *_reclaim routines,
 * isrootfh(), r4_stub_set()) are defined outside the portion of the
 * file shown here.
 */
static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);
static void	r4_stub_set(rnode4_t *, nfs4_stub_type_t);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void	r4_dup_check(rnode4_t *, vfs_t *);
#endif

/*
 * If the vnode has pages, run the list and check for any that are
 * still dangling.  We call this routine before putting an rnode on
 * the free list.
167 */ 168 static int 169 nfs4_dross_pages(vnode_t *vp) 170 { 171 page_t *pp; 172 kmutex_t *vphm; 173 174 vphm = page_vnode_mutex(vp); 175 mutex_enter(vphm); 176 if ((pp = vp->v_pages) != NULL) { 177 do { 178 if (pp->p_hash != PVN_VPLIST_HASH_TAG && 179 pp->p_fsdata != C_NOCOMMIT) { 180 mutex_exit(vphm); 181 return (1); 182 } 183 } while ((pp = pp->p_vpnext) != vp->v_pages); 184 } 185 mutex_exit(vphm); 186 187 return (0); 188 } 189 190 /* 191 * Flush any pages left on this rnode. 192 */ 193 static void 194 r4flushpages(rnode4_t *rp, cred_t *cr) 195 { 196 vnode_t *vp; 197 int error; 198 199 /* 200 * Before freeing anything, wait until all asynchronous 201 * activity is done on this rnode. This will allow all 202 * asynchronous read ahead and write behind i/o's to 203 * finish. 204 */ 205 mutex_enter(&rp->r_statelock); 206 while (rp->r_count > 0) 207 cv_wait(&rp->r_cv, &rp->r_statelock); 208 mutex_exit(&rp->r_statelock); 209 210 /* 211 * Flush and invalidate all pages associated with the vnode. 212 */ 213 vp = RTOV4(rp); 214 if (nfs4_has_pages(vp)) { 215 ASSERT(vp->v_type != VCHR); 216 if ((rp->r_flags & R4DIRTY) && !rp->r_error) { 217 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 218 if (error && (error == ENOSPC || error == EDQUOT)) { 219 mutex_enter(&rp->r_statelock); 220 if (!rp->r_error) 221 rp->r_error = error; 222 mutex_exit(&rp->r_statelock); 223 } 224 } 225 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 226 } 227 } 228 229 /* 230 * Free the resources associated with an rnode. 231 */ 232 static void 233 r4inactive(rnode4_t *rp, cred_t *cr) 234 { 235 vnode_t *vp; 236 char *contents; 237 int size; 238 vsecattr_t *vsp; 239 vnode_t *xattr; 240 241 r4flushpages(rp, cr); 242 243 vp = RTOV4(rp); 244 245 /* 246 * Free any held caches which may be 247 * associated with this rnode. 
248 */ 249 mutex_enter(&rp->r_statelock); 250 contents = rp->r_symlink.contents; 251 size = rp->r_symlink.size; 252 rp->r_symlink.contents = NULL; 253 vsp = rp->r_secattr; 254 rp->r_secattr = NULL; 255 xattr = rp->r_xattr_dir; 256 rp->r_xattr_dir = NULL; 257 mutex_exit(&rp->r_statelock); 258 259 /* 260 * Free the access cache entries. 261 */ 262 (void) nfs4_access_purge_rp(rp); 263 264 /* 265 * Free the readdir cache entries. 266 */ 267 nfs4_purge_rddir_cache(vp); 268 269 /* 270 * Free the symbolic link cache. 271 */ 272 if (contents != NULL) { 273 274 kmem_free((void *)contents, size); 275 } 276 277 /* 278 * Free any cached ACL. 279 */ 280 if (vsp != NULL) 281 nfs4_acl_free_cache(vsp); 282 283 /* 284 * Release the cached xattr_dir 285 */ 286 if (xattr != NULL) 287 VN_RELE(xattr); 288 } 289 290 /* 291 * We have seen a case that the fh passed in is for "." which 292 * should be a VROOT node, however, the fh is different from the 293 * root fh stored in the mntinfo4_t. The invalid fh might be 294 * from a misbehaved server and will panic the client system at 295 * a later time. To avoid the panic, we drop the bad fh, use 296 * the root fh from mntinfo4_t, and print an error message 297 * for attention. 298 */ 299 nfs4_sharedfh_t * 300 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi, 301 int *wasbad) 302 { 303 char *s; 304 305 *wasbad = 0; 306 s = fn_name(nm); 307 ASSERT(strcmp(s, "..") != 0); 308 309 if ((s[0] == '.' 
&& s[1] == '\0') && fh && 310 !SFH4_SAME(mi->mi_rootfh, fh)) { 311 #ifdef DEBUG 312 nfs4_fhandle_t fhandle; 313 314 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 315 "Server %s returns a different " 316 "root filehandle for the path %s:", 317 mi->mi_curr_serv->sv_hostname, 318 mi->mi_curr_serv->sv_path); 319 320 /* print the bad fh */ 321 fhandle.fh_len = fh->sfh_fh.nfs_fh4_len; 322 bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 323 fhandle.fh_len); 324 nfs4_printfhandle(&fhandle); 325 326 /* print mi_rootfh */ 327 fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len; 328 bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 329 fhandle.fh_len); 330 nfs4_printfhandle(&fhandle); 331 #endif 332 /* use mi_rootfh instead; fh will be rele by the caller */ 333 fh = mi->mi_rootfh; 334 *wasbad = 1; 335 } 336 337 kmem_free(s, MAXNAMELEN); 338 return (fh); 339 } 340 341 void 342 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode, 343 hrtime_t t, cred_t *cr, int index) 344 { 345 int is_stub; 346 vattr_t *attr; 347 /* 348 * Don't add to attrcache if time overflow, but 349 * no need to check because either attr is null or the time 350 * values in it were processed by nfs4_time_ntov(), which checks 351 * for time overflows. 352 */ 353 attr = garp ? &garp->n4g_va : NULL; 354 355 if (attr) { 356 if (!newnode) { 357 rw_exit(&rtable4[index].r_lock); 358 #ifdef DEBUG 359 if (vp->v_type != attr->va_type && 360 vp->v_type != VNON && attr->va_type != VNON) { 361 zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN, 362 "makenfs4node: type (%d) doesn't " 363 "match type of found node at %p (%d)", 364 attr->va_type, (void *)vp, vp->v_type); 365 } 366 #endif 367 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 368 } else { 369 rnode4_t *rp = VTOR4(vp); 370 371 vp->v_type = attr->va_type; 372 vp->v_rdev = attr->va_rdev; 373 374 /* 375 * Turn this object into a "stub" object if we 376 * crossed an underlying server fs boundary. 
377 * To make this check, during mount we save the 378 * fsid of the server object being mounted. 379 * Here we compare this object's server fsid 380 * with the fsid we saved at mount. If they 381 * are different, we crossed server fs boundary. 382 * 383 * The stub type is set (or not) at rnode 384 * creation time and it never changes for life 385 * of the rnode. 386 * 387 * This stub will be for a mirror-mount, rather than 388 * a referral (the latter also sets R4SRVSTUB). 389 * 390 * The stub type is also set during RO failover, 391 * nfs4_remap_file(). 392 * 393 * We don't bother with taking r_state_lock to 394 * set the stub type because this is a new rnode 395 * and we're holding the hash bucket r_lock RW_WRITER. 396 * No other thread could have obtained access 397 * to this rnode. 398 */ 399 is_stub = 0; 400 if (garp->n4g_fsid_valid) { 401 fattr4_fsid ga_fsid = garp->n4g_fsid; 402 servinfo4_t *svp = rp->r_server; 403 404 rp->r_srv_fsid = ga_fsid; 405 406 (void) nfs_rw_enter_sig(&svp->sv_lock, 407 RW_READER, 0); 408 if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid)) 409 is_stub = 1; 410 nfs_rw_exit(&svp->sv_lock); 411 } 412 413 if (is_stub) 414 r4_stub_mirrormount(rp); 415 else 416 r4_stub_none(rp); 417 418 /* Can not cache partial attr */ 419 if (attr->va_mask == AT_ALL) 420 nfs4_attrcache_noinval(vp, garp, t); 421 else 422 PURGE_ATTRCACHE4(vp); 423 424 rw_exit(&rtable4[index].r_lock); 425 } 426 } else { 427 if (newnode) { 428 PURGE_ATTRCACHE4(vp); 429 } 430 rw_exit(&rtable4[index].r_lock); 431 } 432 } 433 434 /* 435 * Find or create an rnode based primarily on filehandle. To be 436 * used when dvp (vnode for parent directory) is not available; 437 * otherwise, makenfs4node() should be used. 438 * 439 * The nfs4_fname_t argument *npp is consumed and nulled out. 
440 */ 441 442 vnode_t * 443 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh, 444 nfs4_fname_t **npp, nfs4_ga_res_t *garp, 445 mntinfo4_t *mi, cred_t *cr, hrtime_t t) 446 { 447 vfs_t *vfsp = mi->mi_vfsp; 448 int newnode = 0; 449 vnode_t *vp; 450 rnode4_t *rp; 451 svnode_t *svp; 452 nfs4_fname_t *name, *svpname; 453 int index; 454 455 ASSERT(npp && *npp); 456 name = *npp; 457 *npp = NULL; 458 459 index = rtable4hash(sfh); 460 rw_enter(&rtable4[index].r_lock, RW_READER); 461 462 vp = make_rnode4(sfh, &rtable4[index], vfsp, 463 nfs4_vnodeops, nfs4_putapage, &newnode, cr); 464 465 svp = VTOSV(vp); 466 rp = VTOR4(vp); 467 if (newnode) { 468 svp->sv_forw = svp->sv_back = svp; 469 svp->sv_name = name; 470 if (psfh != NULL) 471 sfh4_hold(psfh); 472 svp->sv_dfh = psfh; 473 } else { 474 /* 475 * It is possible that due to a server 476 * side rename fnames have changed. 477 * update the fname here. 478 */ 479 mutex_enter(&rp->r_svlock); 480 svpname = svp->sv_name; 481 if (svp->sv_name != name) { 482 svp->sv_name = name; 483 mutex_exit(&rp->r_svlock); 484 fn_rele(&svpname); 485 } else { 486 mutex_exit(&rp->r_svlock); 487 fn_rele(&name); 488 } 489 } 490 491 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 492 r4_do_attrcache(vp, garp, newnode, t, cr, index); 493 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 494 495 return (vp); 496 } 497 498 /* 499 * Find or create a vnode for the given filehandle, filesystem, parent, and 500 * name. The reference to nm is consumed, so the caller must first do an 501 * fn_hold() if it wants to continue using nm after this call. 
502 */ 503 vnode_t * 504 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp, 505 hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm) 506 { 507 vnode_t *vp; 508 int newnode; 509 int index; 510 mntinfo4_t *mi = VFTOMI4(vfsp); 511 int had_badfh = 0; 512 rnode4_t *rp; 513 514 ASSERT(dvp != NULL); 515 516 fh = badrootfh_check(fh, nm, mi, &had_badfh); 517 518 index = rtable4hash(fh); 519 rw_enter(&rtable4[index].r_lock, RW_READER); 520 521 /* 522 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 523 */ 524 vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops, 525 nfs4_putapage, &newnode, cr); 526 527 rp = VTOR4(vp); 528 sv_activate(&vp, dvp, &nm, newnode); 529 if (dvp->v_flag & V_XATTRDIR) { 530 mutex_enter(&rp->r_statelock); 531 rp->r_flags |= R4ISXATTR; 532 mutex_exit(&rp->r_statelock); 533 } 534 535 /* if getting a bad file handle, do not cache the attributes. */ 536 if (had_badfh) { 537 rw_exit(&rtable4[index].r_lock); 538 return (vp); 539 } 540 541 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 542 r4_do_attrcache(vp, garp, newnode, t, cr, index); 543 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 544 545 return (vp); 546 } 547 548 /* 549 * Hash on address of filehandle object. 550 * XXX totally untuned. 551 */ 552 553 int 554 rtable4hash(nfs4_sharedfh_t *fh) 555 { 556 return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask); 557 } 558 559 /* 560 * Find or create the vnode for the given filehandle and filesystem. 561 * *newnode is set to zero if the vnode already existed; non-zero if it had 562 * to be created. 563 * 564 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 
565 */ 566 567 static vnode_t * 568 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp, 569 struct vnodeops *vops, 570 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 571 int *newnode, cred_t *cr) 572 { 573 rnode4_t *rp; 574 rnode4_t *trp; 575 vnode_t *vp; 576 mntinfo4_t *mi; 577 578 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 579 580 mi = VFTOMI4(vfsp); 581 582 start: 583 if ((rp = r4find(rhtp, fh, vfsp)) != NULL) { 584 vp = RTOV4(rp); 585 *newnode = 0; 586 return (vp); 587 } 588 rw_exit(&rhtp->r_lock); 589 590 mutex_enter(&rp4freelist_lock); 591 592 if (rp4freelist != NULL && rnode4_new >= nrnode) { 593 rp = rp4freelist; 594 rp4_rmfree(rp); 595 mutex_exit(&rp4freelist_lock); 596 597 vp = RTOV4(rp); 598 599 if (rp->r_flags & R4HASHED) { 600 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 601 mutex_enter(&vp->v_lock); 602 if (vp->v_count > 1) { 603 vp->v_count--; 604 mutex_exit(&vp->v_lock); 605 rw_exit(&rp->r_hashq->r_lock); 606 rw_enter(&rhtp->r_lock, RW_READER); 607 goto start; 608 } 609 mutex_exit(&vp->v_lock); 610 rp4_rmhash_locked(rp); 611 rw_exit(&rp->r_hashq->r_lock); 612 } 613 614 r4inactive(rp, cr); 615 616 mutex_enter(&vp->v_lock); 617 if (vp->v_count > 1) { 618 vp->v_count--; 619 mutex_exit(&vp->v_lock); 620 rw_enter(&rhtp->r_lock, RW_READER); 621 goto start; 622 } 623 mutex_exit(&vp->v_lock); 624 vn_invalid(vp); 625 626 /* 627 * destroy old locks before bzero'ing and 628 * recreating the locks below. 629 */ 630 uninit_rnode4(rp); 631 632 /* 633 * Make sure that if rnode is recycled then 634 * VFS count is decremented properly before 635 * reuse. 
636 */ 637 VFS_RELE(vp->v_vfsp); 638 vn_reinit(vp); 639 } else { 640 vnode_t *new_vp; 641 642 mutex_exit(&rp4freelist_lock); 643 644 rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP); 645 new_vp = vn_alloc(KM_SLEEP); 646 647 atomic_add_long((ulong_t *)&rnode4_new, 1); 648 #ifdef DEBUG 649 clstat4_debug.nrnode.value.ui64++; 650 #endif 651 vp = new_vp; 652 } 653 654 bzero(rp, sizeof (*rp)); 655 rp->r_vnode = vp; 656 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 657 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 658 mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL); 659 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 660 mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL); 661 mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL); 662 rp->created_v4 = 0; 663 list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t), 664 offsetof(nfs4_open_stream_t, os_node)); 665 rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head; 666 rp->r_lo_head.lo_next_rnode = &rp->r_lo_head; 667 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 668 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 669 rp->r_flags = R4READDIRWATTR; 670 rp->r_fh = fh; 671 rp->r_hashq = rhtp; 672 sfh4_hold(rp->r_fh); 673 rp->r_server = mi->mi_curr_serv; 674 rp->r_deleg_type = OPEN_DELEGATE_NONE; 675 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE; 676 nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL); 677 678 rddir4_cache_create(rp); 679 rp->r_putapage = putapage; 680 vn_setops(vp, vops); 681 vp->v_data = (caddr_t)rp; 682 vp->v_vfsp = vfsp; 683 VFS_HOLD(vfsp); 684 vp->v_type = VNON; 685 vp->v_flag |= VMODSORT; 686 if (isrootfh(fh, rp)) 687 vp->v_flag = VROOT; 688 vn_exists(vp); 689 690 /* 691 * There is a race condition if someone else 692 * alloc's the rnode while no locks are held, so we 693 * check again and recover if found. 
694 */ 695 rw_enter(&rhtp->r_lock, RW_WRITER); 696 if ((trp = r4find(rhtp, fh, vfsp)) != NULL) { 697 vp = RTOV4(trp); 698 *newnode = 0; 699 rw_exit(&rhtp->r_lock); 700 rp4_addfree(rp, cr); 701 rw_enter(&rhtp->r_lock, RW_READER); 702 return (vp); 703 } 704 rp4_addhash(rp); 705 *newnode = 1; 706 return (vp); 707 } 708 709 static void 710 uninit_rnode4(rnode4_t *rp) 711 { 712 vnode_t *vp = RTOV4(rp); 713 714 ASSERT(rp != NULL); 715 ASSERT(vp != NULL); 716 ASSERT(vp->v_count == 1); 717 ASSERT(rp->r_count == 0); 718 ASSERT(rp->r_mapcnt == 0); 719 if (rp->r_flags & R4LODANGLERS) { 720 nfs4_flush_lock_owners(rp); 721 } 722 ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head); 723 ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head); 724 ASSERT(!(rp->r_flags & R4HASHED)); 725 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 726 nfs4_clear_open_streams(rp); 727 list_destroy(&rp->r_open_streams); 728 729 /* 730 * Destroy the rddir cache first since we need to grab the r_statelock. 731 */ 732 mutex_enter(&rp->r_statelock); 733 rddir4_cache_destroy(rp); 734 mutex_exit(&rp->r_statelock); 735 sv_uninit(&rp->r_svnode); 736 sfh4_rele(&rp->r_fh); 737 nfs_rw_destroy(&rp->r_rwlock); 738 nfs_rw_destroy(&rp->r_lkserlock); 739 mutex_destroy(&rp->r_statelock); 740 mutex_destroy(&rp->r_statev4_lock); 741 mutex_destroy(&rp->r_os_lock); 742 cv_destroy(&rp->r_cv); 743 cv_destroy(&rp->r_commit.c_cv); 744 nfs_rw_destroy(&rp->r_deleg_recall_lock); 745 if (rp->r_flags & R4DELMAPLIST) 746 list_destroy(&rp->r_indelmap); 747 } 748 749 /* 750 * Put an rnode on the free list. 751 * 752 * Rnodes which were allocated above and beyond the normal limit 753 * are immediately freed. 
*/
/*
 * The caller gives up one vnode reference by calling this function.
 * Depending on the state of the rnode it is either destroyed outright
 * or linked onto the freelist (keeping that reference).  Whenever
 * another reference is found to exist, this one is simply dropped
 * and the function returns, leaving the remaining holder to finish
 * the job.
 *
 * The rnode must not already be on the freelist, and the caller must
 * not hold the hash bucket lock: it is acquired as writer here.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 *
	 * Note the grouping: the destroy path is taken only when
	 * r_count is zero AND at least one of the other conditions
	 * (over the limit, unhashed, error, failed recovery, or
	 * unmounted vfs) holds.
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4HASHED) {
			/*
			 * Unhash under the exclusive bucket lock, but
			 * only if ours is the last reference.
			 */
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 *
	 * Each of the checks below that has to drop the bucket lock
	 * to do its work jumps back here afterwards, so all checks
	 * are repeated from the top with the lock freshly held.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are not anymore references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		/*
		 * When running in a zone other than the mount's,
		 * the open streams are only cleared locally;
		 * otherwise everything is closed via nfs4close_all().
		 */
		if (nfs_zone() != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there are no pages.
	 * If there are, flush and commit all of the dirty and
	 * uncommitted pages, assuming the file system isn't read only.
	 */
	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
		rw_exit(&rp->r_hashq->r_lock);
		r4flushpages(rp, cr);
		goto again;
	}

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached, the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.
	 *
	 * The hold is only detached here; it is released at the very
	 * end, after the bucket lock has been dropped.
	 */
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 *
	 * The rnode is first linked in just before the current head,
	 * i.e. at the tail of the circular list; making it the new
	 * head is what moves it to the front.
	 */
	mutex_enter(&rp4freelist_lock);
	if (rp4freelist == NULL) {
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rp4freelist = rp;
	} else {
		rp->r_freef = rp4freelist;
		rp->r_freeb = rp4freelist->r_freeb;
		rp4freelist->r_freeb->r_freef = rp;
		rp4freelist->r_freeb = rp;
		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
			rp4freelist = rp;
	}
	mutex_exit(&rp4freelist_lock);

	rw_exit(&rp->r_hashq->r_lock);

	if (xattr)
		VN_RELE(xattr);
}

/*
 * Remove an rnode from the free list.
 *
 * The caller must be holding rp4freelist_lock and the rnode
 * must be on the freelist.
941 */ 942 static void 943 rp4_rmfree(rnode4_t *rp) 944 { 945 946 ASSERT(MUTEX_HELD(&rp4freelist_lock)); 947 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 948 949 if (rp == rp4freelist) { 950 rp4freelist = rp->r_freef; 951 if (rp == rp4freelist) 952 rp4freelist = NULL; 953 } 954 rp->r_freeb->r_freef = rp->r_freef; 955 rp->r_freef->r_freeb = rp->r_freeb; 956 957 rp->r_freef = rp->r_freeb = NULL; 958 } 959 960 /* 961 * Put a rnode in the hash table. 962 * 963 * The caller must be holding the exclusive hash queue lock 964 */ 965 void 966 rp4_addhash(rnode4_t *rp) 967 { 968 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 969 ASSERT(!(rp->r_flags & R4HASHED)); 970 971 #ifdef DEBUG 972 r4_dup_check(rp, RTOV4(rp)->v_vfsp); 973 #endif 974 975 rp->r_hashf = rp->r_hashq->r_hashf; 976 rp->r_hashq->r_hashf = rp; 977 rp->r_hashb = (rnode4_t *)rp->r_hashq; 978 rp->r_hashf->r_hashb = rp; 979 980 mutex_enter(&rp->r_statelock); 981 rp->r_flags |= R4HASHED; 982 mutex_exit(&rp->r_statelock); 983 } 984 985 /* 986 * Remove a rnode from the hash table. 987 * 988 * The caller must be holding the hash queue lock. 989 */ 990 void 991 rp4_rmhash_locked(rnode4_t *rp) 992 { 993 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 994 ASSERT(rp->r_flags & R4HASHED); 995 996 rp->r_hashb->r_hashf = rp->r_hashf; 997 rp->r_hashf->r_hashb = rp->r_hashb; 998 999 mutex_enter(&rp->r_statelock); 1000 rp->r_flags &= ~R4HASHED; 1001 mutex_exit(&rp->r_statelock); 1002 } 1003 1004 /* 1005 * Remove a rnode from the hash table. 1006 * 1007 * The caller must not be holding the hash queue lock. 1008 */ 1009 void 1010 rp4_rmhash(rnode4_t *rp) 1011 { 1012 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1013 rp4_rmhash_locked(rp); 1014 rw_exit(&rp->r_hashq->r_lock); 1015 } 1016 1017 /* 1018 * Lookup a rnode by fhandle. Ignores rnodes that had failed recovery. 1019 * Returns NULL if no match. If an rnode is returned, the reference count 1020 * on the master vnode is incremented. 
1021 * 1022 * The caller must be holding the hash queue lock, either shared or exclusive. 1023 */ 1024 rnode4_t * 1025 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp) 1026 { 1027 rnode4_t *rp; 1028 vnode_t *vp; 1029 1030 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 1031 1032 for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) { 1033 vp = RTOV4(rp); 1034 if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) { 1035 1036 mutex_enter(&rp->r_statelock); 1037 if (rp->r_flags & R4RECOVERR) { 1038 mutex_exit(&rp->r_statelock); 1039 continue; 1040 } 1041 mutex_exit(&rp->r_statelock); 1042 #ifdef DEBUG 1043 r4_dup_check(rp, vfsp); 1044 #endif 1045 if (rp->r_freef != NULL) { 1046 mutex_enter(&rp4freelist_lock); 1047 /* 1048 * If the rnode is on the freelist, 1049 * then remove it and use that reference 1050 * as the new reference. Otherwise, 1051 * need to increment the reference count. 1052 */ 1053 if (rp->r_freef != NULL) { 1054 rp4_rmfree(rp); 1055 mutex_exit(&rp4freelist_lock); 1056 } else { 1057 mutex_exit(&rp4freelist_lock); 1058 VN_HOLD(vp); 1059 } 1060 } else 1061 VN_HOLD(vp); 1062 1063 /* 1064 * if root vnode, set v_flag to indicate that 1065 */ 1066 if (isrootfh(fh, rp)) { 1067 if (!(vp->v_flag & VROOT)) { 1068 mutex_enter(&vp->v_lock); 1069 vp->v_flag |= VROOT; 1070 mutex_exit(&vp->v_lock); 1071 } 1072 } 1073 return (rp); 1074 } 1075 } 1076 return (NULL); 1077 } 1078 1079 /* 1080 * Lookup an rnode by fhandle. Just a wrapper for r4find() 1081 * that assumes the caller hasn't already got the lock 1082 * on the hash bucket. 1083 */ 1084 rnode4_t * 1085 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp) 1086 { 1087 rnode4_t *rp; 1088 int index; 1089 1090 index = rtable4hash(fh); 1091 rw_enter(&rtable4[index].r_lock, RW_READER); 1092 rp = r4find(&rtable4[index], fh, vfsp); 1093 rw_exit(&rtable4[index].r_lock); 1094 1095 return (rp); 1096 } 1097 1098 /* 1099 * Return >0 if there is a active vnode belonging to this vfs in the 1100 * rtable4 cache. 
1101 * 1102 * Several of these checks are done without holding the usual 1103 * locks. This is safe because destroy_rtable(), rp_addfree(), 1104 * etc. will redo the necessary checks before actually destroying 1105 * any rnodes. 1106 */ 1107 int 1108 check_rtable4(struct vfs *vfsp) 1109 { 1110 rnode4_t *rp; 1111 vnode_t *vp; 1112 int busy = NFSV4_RTABLE4_OK; 1113 int index; 1114 1115 for (index = 0; index < rtable4size; index++) { 1116 rw_enter(&rtable4[index].r_lock, RW_READER); 1117 1118 for (rp = rtable4[index].r_hashf; 1119 rp != (rnode4_t *)(&rtable4[index]); 1120 rp = rp->r_hashf) { 1121 1122 vp = RTOV4(rp); 1123 if (vp->v_vfsp == vfsp) { 1124 if (rp->r_freef == NULL) { 1125 busy = NFSV4_RTABLE4_NOT_FREE_LIST; 1126 } else if (nfs4_has_pages(vp) && 1127 (rp->r_flags & R4DIRTY)) { 1128 busy = NFSV4_RTABLE4_DIRTY_PAGES; 1129 } else if (rp->r_count > 0) { 1130 busy = NFSV4_RTABLE4_POS_R_COUNT; 1131 } 1132 1133 if (busy != NFSV4_RTABLE4_OK) { 1134 #ifdef DEBUG 1135 char *path; 1136 1137 path = fn_path(rp->r_svnode.sv_name); 1138 DTRACE_NFSV4_3(rnode__e__debug, 1139 int, busy, char *, path, 1140 rnode4_t *, rp); 1141 kmem_free(path, strlen(path)+1); 1142 #endif 1143 rw_exit(&rtable4[index].r_lock); 1144 return (busy); 1145 } 1146 } 1147 } 1148 rw_exit(&rtable4[index].r_lock); 1149 } 1150 return (busy); 1151 } 1152 1153 /* 1154 * Destroy inactive vnodes from the hash queues which 1155 * belong to this vfs. All of the vnodes should be inactive. 1156 * It is essential that we destroy all rnodes in case of 1157 * forced unmount as well as in normal unmount case. 
1158 */ 1159 1160 void 1161 destroy_rtable4(struct vfs *vfsp, cred_t *cr) 1162 { 1163 int index; 1164 vnode_t *vp; 1165 rnode4_t *rp, *r_hashf, *rlist; 1166 1167 rlist = NULL; 1168 1169 for (index = 0; index < rtable4size; index++) { 1170 rw_enter(&rtable4[index].r_lock, RW_WRITER); 1171 for (rp = rtable4[index].r_hashf; 1172 rp != (rnode4_t *)(&rtable4[index]); 1173 rp = r_hashf) { 1174 /* save the hash pointer before destroying */ 1175 r_hashf = rp->r_hashf; 1176 1177 vp = RTOV4(rp); 1178 if (vp->v_vfsp == vfsp) { 1179 mutex_enter(&rp4freelist_lock); 1180 if (rp->r_freef != NULL) { 1181 rp4_rmfree(rp); 1182 mutex_exit(&rp4freelist_lock); 1183 rp4_rmhash_locked(rp); 1184 rp->r_hashf = rlist; 1185 rlist = rp; 1186 } else 1187 mutex_exit(&rp4freelist_lock); 1188 } 1189 } 1190 rw_exit(&rtable4[index].r_lock); 1191 } 1192 1193 for (rp = rlist; rp != NULL; rp = r_hashf) { 1194 r_hashf = rp->r_hashf; 1195 /* 1196 * This call to rp4_addfree will end up destroying the 1197 * rnode, but in a safe way with the appropriate set 1198 * of checks done. 1199 */ 1200 rp4_addfree(rp, cr); 1201 } 1202 } 1203 1204 /* 1205 * This routine destroys all the resources of an rnode 1206 * and finally the rnode itself. 1207 */ 1208 static void 1209 destroy_rnode4(rnode4_t *rp) 1210 { 1211 vnode_t *vp; 1212 vfs_t *vfsp; 1213 1214 ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE); 1215 1216 vp = RTOV4(rp); 1217 vfsp = vp->v_vfsp; 1218 1219 uninit_rnode4(rp); 1220 atomic_add_long((ulong_t *)&rnode4_new, -1); 1221 #ifdef DEBUG 1222 clstat4_debug.nrnode.value.ui64--; 1223 #endif 1224 kmem_cache_free(rnode4_cache, rp); 1225 vn_invalid(vp); 1226 vn_free(vp); 1227 VFS_RELE(vfsp); 1228 } 1229 1230 /* 1231 * Invalidate the attributes on all rnodes forcing the next getattr 1232 * to go over the wire. Used to flush stale uid and gid mappings. 
1233 * Maybe done on a per vfsp, or all rnodes (vfsp == NULL) 1234 */ 1235 void 1236 nfs4_rnode_invalidate(struct vfs *vfsp) 1237 { 1238 int index; 1239 rnode4_t *rp; 1240 vnode_t *vp; 1241 1242 /* 1243 * Walk the hash queues looking for rnodes. 1244 */ 1245 for (index = 0; index < rtable4size; index++) { 1246 rw_enter(&rtable4[index].r_lock, RW_READER); 1247 for (rp = rtable4[index].r_hashf; 1248 rp != (rnode4_t *)(&rtable4[index]); 1249 rp = rp->r_hashf) { 1250 vp = RTOV4(rp); 1251 if (vfsp != NULL && vp->v_vfsp != vfsp) 1252 continue; 1253 1254 if (!mutex_tryenter(&rp->r_statelock)) 1255 continue; 1256 1257 /* 1258 * Expire the attributes by resetting the change 1259 * and attr timeout. 1260 */ 1261 rp->r_change = 0; 1262 PURGE_ATTRCACHE4_LOCKED(rp); 1263 mutex_exit(&rp->r_statelock); 1264 } 1265 rw_exit(&rtable4[index].r_lock); 1266 } 1267 } 1268 1269 /* 1270 * Flush all vnodes in this (or every) vfs. 1271 * Used by nfs_sync and by nfs_unmount. 1272 */ 1273 void 1274 r4flush(struct vfs *vfsp, cred_t *cr) 1275 { 1276 int index; 1277 rnode4_t *rp; 1278 vnode_t *vp, **vplist; 1279 long num, cnt; 1280 1281 /* 1282 * Check to see whether there is anything to do. 1283 */ 1284 num = rnode4_new; 1285 if (num == 0) 1286 return; 1287 1288 /* 1289 * Allocate a slot for all currently active rnodes on the 1290 * supposition that they all may need flushing. 1291 */ 1292 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 1293 cnt = 0; 1294 1295 /* 1296 * Walk the hash queues looking for rnodes with page 1297 * lists associated with them. Make a list of these 1298 * files. 
1299 */ 1300 for (index = 0; index < rtable4size; index++) { 1301 rw_enter(&rtable4[index].r_lock, RW_READER); 1302 for (rp = rtable4[index].r_hashf; 1303 rp != (rnode4_t *)(&rtable4[index]); 1304 rp = rp->r_hashf) { 1305 vp = RTOV4(rp); 1306 /* 1307 * Don't bother sync'ing a vp if it 1308 * is part of virtual swap device or 1309 * if VFS is read-only 1310 */ 1311 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 1312 continue; 1313 /* 1314 * If flushing all mounted file systems or 1315 * the vnode belongs to this vfs, has pages 1316 * and is marked as either dirty or mmap'd, 1317 * hold and add this vnode to the list of 1318 * vnodes to flush. 1319 */ 1320 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 1321 nfs4_has_pages(vp) && 1322 ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) { 1323 VN_HOLD(vp); 1324 vplist[cnt++] = vp; 1325 if (cnt == num) { 1326 rw_exit(&rtable4[index].r_lock); 1327 goto toomany; 1328 } 1329 } 1330 } 1331 rw_exit(&rtable4[index].r_lock); 1332 } 1333 toomany: 1334 1335 /* 1336 * Flush and release all of the files on the list. 1337 */ 1338 while (cnt-- > 0) { 1339 vp = vplist[cnt]; 1340 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 1341 VN_RELE(vp); 1342 } 1343 1344 /* 1345 * Free the space allocated to hold the list. 1346 */ 1347 kmem_free(vplist, num * sizeof (*vplist)); 1348 } 1349 1350 int 1351 nfs4_free_data_reclaim(rnode4_t *rp) 1352 { 1353 char *contents; 1354 vnode_t *xattr; 1355 int size; 1356 vsecattr_t *vsp; 1357 int freed; 1358 bool_t rdc = FALSE; 1359 1360 /* 1361 * Free any held caches which may 1362 * be associated with this rnode. 1363 */ 1364 mutex_enter(&rp->r_statelock); 1365 if (rp->r_dir != NULL) 1366 rdc = TRUE; 1367 contents = rp->r_symlink.contents; 1368 size = rp->r_symlink.size; 1369 rp->r_symlink.contents = NULL; 1370 vsp = rp->r_secattr; 1371 rp->r_secattr = NULL; 1372 xattr = rp->r_xattr_dir; 1373 rp->r_xattr_dir = NULL; 1374 mutex_exit(&rp->r_statelock); 1375 1376 /* 1377 * Free the access cache entries. 
1378 */ 1379 freed = nfs4_access_purge_rp(rp); 1380 1381 if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL) 1382 return (freed); 1383 1384 /* 1385 * Free the readdir cache entries, incompletely if we can't block. 1386 */ 1387 nfs4_purge_rddir_cache(RTOV4(rp)); 1388 1389 /* 1390 * Free the symbolic link cache. 1391 */ 1392 if (contents != NULL) { 1393 1394 kmem_free((void *)contents, size); 1395 } 1396 1397 /* 1398 * Free any cached ACL. 1399 */ 1400 if (vsp != NULL) 1401 nfs4_acl_free_cache(vsp); 1402 1403 /* 1404 * Release the xattr directory vnode 1405 */ 1406 if (xattr != NULL) 1407 VN_RELE(xattr); 1408 1409 return (1); 1410 } 1411 1412 static int 1413 nfs4_active_data_reclaim(rnode4_t *rp) 1414 { 1415 char *contents; 1416 vnode_t *xattr = NULL; 1417 int size; 1418 vsecattr_t *vsp; 1419 int freed; 1420 bool_t rdc = FALSE; 1421 1422 /* 1423 * Free any held credentials and caches which 1424 * may be associated with this rnode. 1425 */ 1426 if (!mutex_tryenter(&rp->r_statelock)) 1427 return (0); 1428 contents = rp->r_symlink.contents; 1429 size = rp->r_symlink.size; 1430 rp->r_symlink.contents = NULL; 1431 vsp = rp->r_secattr; 1432 rp->r_secattr = NULL; 1433 if (rp->r_dir != NULL) 1434 rdc = TRUE; 1435 /* 1436 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed 1437 * on the same r_hashq queue. We are not mandated to free all caches. 1438 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the 1439 * rnode 'rp' is freed or put on the free list. 1440 * 1441 * We will retain NFS4_XATTR_DIR_NOTSUPP because: 1442 * - it has no associated rnode4_t (its v_data is NULL), 1443 * - it is preallocated statically and will never go away, 1444 * so we cannot save anything by releasing it. 
1445 */ 1446 if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP && 1447 VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) { 1448 xattr = rp->r_xattr_dir; 1449 rp->r_xattr_dir = NULL; 1450 } 1451 mutex_exit(&rp->r_statelock); 1452 1453 /* 1454 * Free the access cache entries. 1455 */ 1456 freed = nfs4_access_purge_rp(rp); 1457 1458 if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL) 1459 return (freed); 1460 1461 /* 1462 * Free the symbolic link cache. 1463 */ 1464 if (contents != NULL) { 1465 1466 kmem_free((void *)contents, size); 1467 } 1468 1469 /* 1470 * Free any cached ACL. 1471 */ 1472 if (vsp != NULL) 1473 nfs4_acl_free_cache(vsp); 1474 1475 nfs4_purge_rddir_cache(RTOV4(rp)); 1476 1477 /* 1478 * Release the xattr directory vnode 1479 */ 1480 if (xattr != NULL) 1481 VN_RELE(xattr); 1482 1483 return (1); 1484 } 1485 1486 static int 1487 nfs4_free_reclaim(void) 1488 { 1489 int freed; 1490 rnode4_t *rp; 1491 1492 #ifdef DEBUG 1493 clstat4_debug.f_reclaim.value.ui64++; 1494 #endif 1495 freed = 0; 1496 mutex_enter(&rp4freelist_lock); 1497 rp = rp4freelist; 1498 if (rp != NULL) { 1499 do { 1500 if (nfs4_free_data_reclaim(rp)) 1501 freed = 1; 1502 } while ((rp = rp->r_freef) != rp4freelist); 1503 } 1504 mutex_exit(&rp4freelist_lock); 1505 return (freed); 1506 } 1507 1508 static int 1509 nfs4_active_reclaim(void) 1510 { 1511 int freed; 1512 int index; 1513 rnode4_t *rp; 1514 1515 #ifdef DEBUG 1516 clstat4_debug.a_reclaim.value.ui64++; 1517 #endif 1518 freed = 0; 1519 for (index = 0; index < rtable4size; index++) { 1520 rw_enter(&rtable4[index].r_lock, RW_READER); 1521 for (rp = rtable4[index].r_hashf; 1522 rp != (rnode4_t *)(&rtable4[index]); 1523 rp = rp->r_hashf) { 1524 if (nfs4_active_data_reclaim(rp)) 1525 freed = 1; 1526 } 1527 rw_exit(&rtable4[index].r_lock); 1528 } 1529 return (freed); 1530 } 1531 1532 static int 1533 nfs4_rnode_reclaim(void) 1534 { 1535 int freed; 1536 rnode4_t *rp; 1537 vnode_t *vp; 1538 1539 #ifdef DEBUG 1540 
clstat4_debug.r_reclaim.value.ui64++; 1541 #endif 1542 freed = 0; 1543 mutex_enter(&rp4freelist_lock); 1544 while ((rp = rp4freelist) != NULL) { 1545 rp4_rmfree(rp); 1546 mutex_exit(&rp4freelist_lock); 1547 if (rp->r_flags & R4HASHED) { 1548 vp = RTOV4(rp); 1549 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1550 mutex_enter(&vp->v_lock); 1551 if (vp->v_count > 1) { 1552 vp->v_count--; 1553 mutex_exit(&vp->v_lock); 1554 rw_exit(&rp->r_hashq->r_lock); 1555 mutex_enter(&rp4freelist_lock); 1556 continue; 1557 } 1558 mutex_exit(&vp->v_lock); 1559 rp4_rmhash_locked(rp); 1560 rw_exit(&rp->r_hashq->r_lock); 1561 } 1562 /* 1563 * This call to rp_addfree will end up destroying the 1564 * rnode, but in a safe way with the appropriate set 1565 * of checks done. 1566 */ 1567 rp4_addfree(rp, CRED()); 1568 mutex_enter(&rp4freelist_lock); 1569 } 1570 mutex_exit(&rp4freelist_lock); 1571 return (freed); 1572 } 1573 1574 /*ARGSUSED*/ 1575 static void 1576 nfs4_reclaim(void *cdrarg) 1577 { 1578 #ifdef DEBUG 1579 clstat4_debug.reclaim.value.ui64++; 1580 #endif 1581 if (nfs4_free_reclaim()) 1582 return; 1583 1584 if (nfs4_active_reclaim()) 1585 return; 1586 1587 (void) nfs4_rnode_reclaim(); 1588 } 1589 1590 /* 1591 * Returns the clientid4 to use for the given mntinfo4. Note that the 1592 * clientid can change if the caller drops mi_recovlock. 1593 */ 1594 1595 clientid4 1596 mi2clientid(mntinfo4_t *mi) 1597 { 1598 nfs4_server_t *sp; 1599 clientid4 clientid = 0; 1600 1601 /* this locks down sp if it is found */ 1602 sp = find_nfs4_server(mi); 1603 if (sp != NULL) { 1604 clientid = sp->clientid; 1605 mutex_exit(&sp->s_lock); 1606 nfs4_server_rele(sp); 1607 } 1608 return (clientid); 1609 } 1610 1611 /* 1612 * Return the current lease time for the server associated with the given 1613 * file. Note that the lease time could change immediately after this 1614 * call. 
1615 */ 1616 1617 time_t 1618 r2lease_time(rnode4_t *rp) 1619 { 1620 nfs4_server_t *sp; 1621 time_t lease_time; 1622 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 1623 1624 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0); 1625 1626 /* this locks down sp if it is found */ 1627 sp = find_nfs4_server(VTOMI4(RTOV4(rp))); 1628 1629 if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1630 if (sp != NULL) { 1631 mutex_exit(&sp->s_lock); 1632 nfs4_server_rele(sp); 1633 } 1634 nfs_rw_exit(&mi->mi_recovlock); 1635 return (1); /* 1 second */ 1636 } 1637 1638 ASSERT(sp != NULL); 1639 1640 lease_time = sp->s_lease_time; 1641 1642 mutex_exit(&sp->s_lock); 1643 nfs4_server_rele(sp); 1644 nfs_rw_exit(&mi->mi_recovlock); 1645 1646 return (lease_time); 1647 } 1648 1649 /* 1650 * Return a list with information about all the known open instances for 1651 * a filesystem. The caller must call r4releopenlist() when done with the 1652 * list. 1653 * 1654 * We are safe at looking at os_valid and os_pending_close across dropping 1655 * the 'os_sync_lock' to count up the number of open streams and then 1656 * allocate memory for the osp list due to: 1657 * -Looking at os_pending_close is safe since this routine is 1658 * only called via recovery, and os_pending_close can only be set via 1659 * a non-recovery operation (which are all blocked when recovery 1660 * is active). 1661 * 1662 * -Examining os_valid is safe since non-recovery operations, which 1663 * could potentially switch os_valid to 0, are blocked (via 1664 * nfs4_start_fop) and recovery is single-threaded per mntinfo4_t 1665 * (which means we are the only recovery thread potentially acting 1666 * on this open stream). 
1667 */ 1668 1669 nfs4_opinst_t * 1670 r4mkopenlist(mntinfo4_t *mi) 1671 { 1672 nfs4_opinst_t *reopenlist, *rep; 1673 rnode4_t *rp; 1674 vnode_t *vp; 1675 vfs_t *vfsp = mi->mi_vfsp; 1676 int numosp; 1677 nfs4_open_stream_t *osp; 1678 int index; 1679 open_delegation_type4 dtype; 1680 int hold_vnode; 1681 1682 reopenlist = NULL; 1683 1684 for (index = 0; index < rtable4size; index++) { 1685 rw_enter(&rtable4[index].r_lock, RW_READER); 1686 for (rp = rtable4[index].r_hashf; 1687 rp != (rnode4_t *)(&rtable4[index]); 1688 rp = rp->r_hashf) { 1689 1690 vp = RTOV4(rp); 1691 if (vp->v_vfsp != vfsp) 1692 continue; 1693 hold_vnode = 0; 1694 1695 mutex_enter(&rp->r_os_lock); 1696 1697 /* Count the number of valid open_streams of the file */ 1698 numosp = 0; 1699 for (osp = list_head(&rp->r_open_streams); osp != NULL; 1700 osp = list_next(&rp->r_open_streams, osp)) { 1701 mutex_enter(&osp->os_sync_lock); 1702 if (osp->os_valid && !osp->os_pending_close) 1703 numosp++; 1704 mutex_exit(&osp->os_sync_lock); 1705 } 1706 1707 /* Fill in the valid open streams per vp */ 1708 if (numosp > 0) { 1709 int j; 1710 1711 hold_vnode = 1; 1712 1713 /* 1714 * Add a new open instance to the list 1715 */ 1716 rep = kmem_zalloc(sizeof (*reopenlist), 1717 KM_SLEEP); 1718 rep->re_next = reopenlist; 1719 reopenlist = rep; 1720 1721 rep->re_vp = vp; 1722 rep->re_osp = kmem_zalloc( 1723 numosp * sizeof (*(rep->re_osp)), 1724 KM_SLEEP); 1725 rep->re_numosp = numosp; 1726 1727 j = 0; 1728 for (osp = list_head(&rp->r_open_streams); 1729 osp != NULL; 1730 osp = list_next(&rp->r_open_streams, osp)) { 1731 1732 mutex_enter(&osp->os_sync_lock); 1733 if (osp->os_valid && 1734 !osp->os_pending_close) { 1735 osp->os_ref_count++; 1736 rep->re_osp[j] = osp; 1737 j++; 1738 } 1739 mutex_exit(&osp->os_sync_lock); 1740 } 1741 /* 1742 * Assuming valid osp(s) stays valid between 1743 * the time obtaining j and numosp. 
1744 */ 1745 ASSERT(j == numosp); 1746 } 1747 1748 mutex_exit(&rp->r_os_lock); 1749 /* do this here to keep v_lock > r_os_lock */ 1750 if (hold_vnode) 1751 VN_HOLD(vp); 1752 mutex_enter(&rp->r_statev4_lock); 1753 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 1754 /* 1755 * If this rnode holds a delegation, 1756 * but if there are no valid open streams, 1757 * then just discard the delegation 1758 * without doing delegreturn. 1759 */ 1760 if (numosp > 0) 1761 rp->r_deleg_needs_recovery = 1762 rp->r_deleg_type; 1763 } 1764 /* Save the delegation type for use outside the lock */ 1765 dtype = rp->r_deleg_type; 1766 mutex_exit(&rp->r_statev4_lock); 1767 1768 /* 1769 * If we have a delegation then get rid of it. 1770 * We've set rp->r_deleg_needs_recovery so we have 1771 * enough information to recover. 1772 */ 1773 if (dtype != OPEN_DELEGATE_NONE) { 1774 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 1775 } 1776 } 1777 rw_exit(&rtable4[index].r_lock); 1778 } 1779 return (reopenlist); 1780 } 1781 1782 /* 1783 * Given a filesystem id, check to see if any rnodes 1784 * within this fsid reside in the rnode cache, other 1785 * than one we know about. 1786 * 1787 * Return 1 if an rnode is found, 0 otherwise 1788 */ 1789 int 1790 r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid) 1791 { 1792 rnode4_t *rp; 1793 vnode_t *vp; 1794 vfs_t *vfsp = mi->mi_vfsp; 1795 fattr4_fsid *fsid; 1796 int index, found = 0; 1797 1798 for (index = 0; index < rtable4size; index++) { 1799 rw_enter(&rtable4[index].r_lock, RW_READER); 1800 for (rp = rtable4[index].r_hashf; 1801 rp != (rnode4_t *)(&rtable4[index]); 1802 rp = rp->r_hashf) { 1803 1804 vp = RTOV4(rp); 1805 if (vp->v_vfsp != vfsp) 1806 continue; 1807 1808 /* 1809 * XXX there might be a case where a 1810 * replicated fs may have the same fsid 1811 * across two different servers. 
This 1812 * check isn't good enough in that case 1813 */ 1814 fsid = &rp->r_srv_fsid; 1815 if (FATTR4_FSID_EQ(moved_fsid, fsid)) { 1816 found = 1; 1817 break; 1818 } 1819 } 1820 rw_exit(&rtable4[index].r_lock); 1821 1822 if (found) 1823 break; 1824 } 1825 return (found); 1826 } 1827 1828 /* 1829 * Release the list of open instance references. 1830 */ 1831 1832 void 1833 r4releopenlist(nfs4_opinst_t *reopenp) 1834 { 1835 nfs4_opinst_t *rep, *next; 1836 int i; 1837 1838 for (rep = reopenp; rep; rep = next) { 1839 next = rep->re_next; 1840 1841 for (i = 0; i < rep->re_numosp; i++) 1842 open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp)); 1843 1844 VN_RELE(rep->re_vp); 1845 kmem_free(rep->re_osp, 1846 rep->re_numosp * sizeof (*(rep->re_osp))); 1847 1848 kmem_free(rep, sizeof (*rep)); 1849 } 1850 } 1851 1852 int 1853 nfs4_rnode_init(void) 1854 { 1855 ulong_t nrnode4_max; 1856 int i; 1857 1858 /* 1859 * Compute the size of the rnode4 hash table 1860 */ 1861 if (nrnode <= 0) 1862 nrnode = ncsize; 1863 nrnode4_max = 1864 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4)); 1865 if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) { 1866 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 1867 "setting nrnode to max value of %ld", nrnode4_max); 1868 nrnode = nrnode4_max; 1869 } 1870 rtable4size = 1 << highbit(nrnode / rnode4_hashlen); 1871 rtable4mask = rtable4size - 1; 1872 1873 /* 1874 * Allocate and initialize the hash buckets 1875 */ 1876 rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP); 1877 for (i = 0; i < rtable4size; i++) { 1878 rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]); 1879 rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]); 1880 rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL); 1881 } 1882 1883 rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t), 1884 0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0); 1885 1886 return (0); 1887 } 1888 1889 int 1890 nfs4_rnode_fini(void) 1891 { 1892 int i; 1893 1894 /* 1895 * Deallocate the rnode hash queues 
1896 */ 1897 kmem_cache_destroy(rnode4_cache); 1898 1899 for (i = 0; i < rtable4size; i++) 1900 rw_destroy(&rtable4[i].r_lock); 1901 1902 kmem_free(rtable4, rtable4size * sizeof (*rtable4)); 1903 1904 return (0); 1905 } 1906 1907 /* 1908 * Return non-zero if the given filehandle refers to the root filehandle 1909 * for the given rnode. 1910 */ 1911 1912 static int 1913 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp) 1914 { 1915 int isroot; 1916 1917 isroot = 0; 1918 if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh)) 1919 isroot = 1; 1920 1921 return (isroot); 1922 } 1923 1924 /* 1925 * The r4_stub_* routines assume that the rnode is newly activated, and 1926 * that the caller either holds the hash bucket r_lock for this rnode as 1927 * RW_WRITER, or holds r_statelock. 1928 */ 1929 static void 1930 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type) 1931 { 1932 vnode_t *vp = RTOV4(rp); 1933 krwlock_t *hash_lock = &rp->r_hashq->r_lock; 1934 1935 ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock)); 1936 1937 rp->r_stub_type = type; 1938 1939 /* 1940 * Safely switch this vnode to the trigger vnodeops. 1941 * 1942 * Currently, we don't ever switch a trigger vnode back to using 1943 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that 1944 * a new v4 object is not a trigger, and it will already have the 1945 * correct v4 vnodeops by default. So, no "else" case required here. 
1946 */ 1947 if (type != NFS4_STUB_NONE) 1948 vn_setops(vp, nfs4_trigger_vnodeops); 1949 } 1950 1951 void 1952 r4_stub_mirrormount(rnode4_t *rp) 1953 { 1954 r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT); 1955 } 1956 1957 void 1958 r4_stub_referral(rnode4_t *rp) 1959 { 1960 DTRACE_PROBE1(nfs4clnt__func__referral__moved, 1961 vnode_t *, RTOV4(rp)); 1962 r4_stub_set(rp, NFS4_STUB_REFERRAL); 1963 } 1964 1965 void 1966 r4_stub_none(rnode4_t *rp) 1967 { 1968 r4_stub_set(rp, NFS4_STUB_NONE); 1969 } 1970 1971 #ifdef DEBUG 1972 1973 /* 1974 * Look in the rnode table for other rnodes that have the same filehandle. 1975 * Assume the lock is held for the hash chain of checkrp 1976 */ 1977 1978 static void 1979 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp) 1980 { 1981 rnode4_t *rp; 1982 vnode_t *tvp; 1983 nfs4_fhandle_t fh, fh2; 1984 int index; 1985 1986 if (!r4_check_for_dups) 1987 return; 1988 1989 ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock)); 1990 1991 sfh4_copyval(checkrp->r_fh, &fh); 1992 1993 for (index = 0; index < rtable4size; index++) { 1994 1995 if (&rtable4[index] != checkrp->r_hashq) 1996 rw_enter(&rtable4[index].r_lock, RW_READER); 1997 1998 for (rp = rtable4[index].r_hashf; 1999 rp != (rnode4_t *)(&rtable4[index]); 2000 rp = rp->r_hashf) { 2001 2002 if (rp == checkrp) 2003 continue; 2004 2005 tvp = RTOV4(rp); 2006 if (tvp->v_vfsp != vfsp) 2007 continue; 2008 2009 sfh4_copyval(rp->r_fh, &fh2); 2010 if (nfs4cmpfhandle(&fh, &fh2) == 0) { 2011 cmn_err(CE_PANIC, "rnodes with same fs, fh " 2012 "(%p, %p)", (void *)checkrp, (void *)rp); 2013 } 2014 } 2015 2016 if (&rtable4[index] != checkrp->r_hashq) 2017 rw_exit(&rtable4[index].r_lock); 2018 } 2019 } 2020 2021 #endif /* DEBUG */ 2022