/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All Rights Reserved
 */

/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/callb.h>
#include <sys/sdt.h>

#include <vm/pvn.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/rpcsec_gss.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock for the hash queue must be
 * held.  If an rnode is not hashed into a hash queue, then it is
 * destroyed because it represents no valuable information about the
 * file that could be reused.  The exclusive lock for the hash queue
 * must be held in order to prevent a lookup in the hash queue from
 * finding the rnode, using it, and assuming that the rnode is not on
 * the freelist.  The lookup in the hash queue will have the hash
 * queue locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.
 * This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock -> r_statelock
 */
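
/*
 * Illustrative sketch only (nothing below depends on it): a thread
 * that needed all three locks for an rnode rp would, per the ordering
 * above, acquire and release them as follows:
 *
 *	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
 *	mutex_enter(&rp4freelist_lock);
 *	mutex_enter(&rp->r_statelock);
 *	...
 *	mutex_exit(&rp->r_statelock);
 *	mutex_exit(&rp4freelist_lock);
 *	rw_exit(&rp->r_hashq->r_lock);
 */
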
r4hashq_t *rtable4;

static kmutex_t rp4freelist_lock;
static rnode4_t *rp4freelist = NULL;
static long rnode4_new = 0;
int rtable4size;
static int rtable4mask;
static struct kmem_cache *rnode4_cache;
static int rnode4_hashlen = 4;

static void r4inactive(rnode4_t *, cred_t *);
static vnode_t *make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
    struct vnodeops *,
    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
    cred_t *),
    int *, cred_t *);
static void rp4_rmfree(rnode4_t *);
int nfs4_free_data_reclaim(rnode4_t *);
static int nfs4_active_data_reclaim(rnode4_t *);
static int nfs4_free_reclaim(void);
static int nfs4_active_reclaim(void);
static int nfs4_rnode_reclaim(void);
static void nfs4_reclaim(void *);
static int isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void uninit_rnode4(rnode4_t *);
static void destroy_rnode4(rnode4_t *);
static void r4_stub_set(rnode4_t *, nfs4_stub_type_t);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void r4_dup_check(rnode4_t *, vfs_t *);
#endif

/*
 * If the vnode has pages, run the list and check for any that are
 * still dangling.  We call this routine before putting an rnode on
 * the free list.
 */
static int
nfs4_dross_pages(vnode_t *vp)
{
	page_t *pp;
	kmutex_t *vphm;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if ((pp = vp->v_pages) != NULL) {
		do {
			if (pp->p_hash != PVN_VPLIST_HASH_TAG &&
			    pp->p_fsdata != C_NOCOMMIT) {
				mutex_exit(vphm);
				return (1);
			}
		} while ((pp = pp->p_vpnext) != vp->v_pages);
	}
	mutex_exit(vphm);

	return (0);
}

/*
 * Flush any pages left on this rnode.
 */
static void
r4flushpages(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	int error;

	/*
	 * Before freeing anything, wait until all asynchronous
	 * activity is done on this rnode.  This will allow all
	 * asynchronous read ahead and write behind i/o's to
	 * finish.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_count > 0)
		cv_wait(&rp->r_cv, &rp->r_statelock);
	mutex_exit(&rp->r_statelock);

	/*
	 * Flush and invalidate all pages associated with the vnode.
	 */
	vp = RTOV4(rp);
	if (nfs4_has_pages(vp)) {
		ASSERT(vp->v_type != VCHR);
		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
			if (error && (error == ENOSPC || error == EDQUOT)) {
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
		}
		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
	}
}

/*
 * Free the resources associated with an rnode.
 */
static void
r4inactive(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	char *contents;
	int size;
	vsecattr_t *vsp;
	vnode_t *xattr;

	r4flushpages(rp, cr);

	vp = RTOV4(rp);

	/*
	 * Free any held caches which may be
	 * associated with this rnode.
	 */
	mutex_enter(&rp->r_statelock);
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	vsp = rp->r_secattr;
	rp->r_secattr = NULL;
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Free the access cache entries.
	 */
	(void) nfs4_access_purge_rp(rp);

	/*
	 * Free the readdir cache entries.
	 */
	nfs4_purge_rddir_cache(vp);

	/*
	 * Free the symbolic link cache.
	 */
	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}

	/*
	 * Free any cached ACL.
	 */
	if (vsp != NULL)
		nfs4_acl_free_cache(vsp);

	/*
	 * Release the cached xattr_dir.
	 */
	if (xattr != NULL)
		VN_RELE(xattr);
}

/*
 * We have seen a case where the fh passed in is for "." which
 * should be a VROOT node; however, the fh is different from the
 * root fh stored in the mntinfo4_t.  The invalid fh might be
 * from a misbehaved server and will panic the client system at
 * a later time.  To avoid the panic, we drop the bad fh, use
 * the root fh from mntinfo4_t, and print an error message
 * for attention.
 */
nfs4_sharedfh_t *
badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
    int *wasbad)
{
	char *s;

	*wasbad = 0;
	s = fn_name(nm);
	ASSERT(strcmp(s, "..") != 0);

	if ((s[0] == '.' && s[1] == '\0') && fh &&
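
/*
 * Cache the attributes (if any) returned along with a newly made or
 * newly found rnode.  For a preexisting node the attributes are merged
 * into the cache via nfs4_attr_cache().  For a new node the vnode type
 * and rdev are set, the stub type is established, and complete
 * attributes are cached; partial attributes purge the attr cache
 * instead.  The caller must hold rtable4[index].r_lock; it is always
 * dropped before returning.
 */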
	    !SFH4_SAME(mi->mi_rootfh, fh)) {
#ifdef DEBUG
		nfs4_fhandle_t fhandle;

		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
		    "Server %s returns a different "
		    "root filehandle for the path %s:",
		    mi->mi_curr_serv->sv_hostname,
		    mi->mi_curr_serv->sv_path);

		/* print the bad fh */
		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
		    fhandle.fh_len);
		nfs4_printfhandle(&fhandle);

		/* print mi_rootfh */
		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
		    fhandle.fh_len);
		nfs4_printfhandle(&fhandle);
#endif
		/* use mi_rootfh instead; fh will be rele'd by the caller */
		fh = mi->mi_rootfh;
		*wasbad = 1;
	}

	kmem_free(s, MAXNAMELEN);
	return (fh);
}

void
r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
    hrtime_t t, cred_t *cr, int index)
{
	int is_stub;
	vattr_t *attr;

	/*
	 * Don't add to attrcache if time overflow, but
	 * no need to check because either attr is null or the time
	 * values in it were processed by nfs4_time_ntov(), which checks
	 * for time overflows.
	 */
	attr = garp ? &garp->n4g_va : NULL;

	if (attr) {
		if (!newnode) {
			rw_exit(&rtable4[index].r_lock);
#ifdef DEBUG
			if (vp->v_type != attr->va_type &&
			    vp->v_type != VNON && attr->va_type != VNON) {
				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
				    "makenfs4node: type (%d) doesn't "
				    "match type of found node at %p (%d)",
				    attr->va_type, (void *)vp, vp->v_type);
			}
#endif
			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
		} else {
			rnode4_t *rp = VTOR4(vp);

			vp->v_type = attr->va_type;
			vp->v_rdev = attr->va_rdev;

			/*
			 * Turn this object into a "stub" object if we
			 * crossed an underlying server fs boundary.
			 * To make this check, during mount we save the
			 * fsid of the server object being mounted.
			 * Here we compare this object's server fsid
			 * with the fsid we saved at mount.  If they
			 * are different, we crossed server fs boundary.
			 *
			 * The stub type is set (or not) at rnode
			 * creation time and it never changes for life
			 * of the rnode.
			 *
			 * This stub will be for a mirror-mount, rather than
			 * a referral (the latter also sets R4SRVSTUB).
			 *
			 * The stub type is also set during RO failover,
			 * nfs4_remap_file().
			 *
			 * We don't bother with taking r_state_lock to
			 * set the stub type because this is a new rnode
			 * and we're holding the hash bucket r_lock RW_WRITER.
			 * No other thread could have obtained access
			 * to this rnode.
			 */
			is_stub = 0;
			if (garp->n4g_fsid_valid) {
				fattr4_fsid ga_fsid = garp->n4g_fsid;
				servinfo4_t *svp = rp->r_server;

				rp->r_srv_fsid = ga_fsid;

				(void) nfs_rw_enter_sig(&svp->sv_lock,
				    RW_READER, 0);
				if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
					is_stub = 1;
				nfs_rw_exit(&svp->sv_lock);
			}

			if (is_stub)
				r4_stub_mirrormount(rp);
			else
				r4_stub_none(rp);

			/* Can not cache partial attr */
			if (attr->va_mask == AT_ALL)
				nfs4_attrcache_noinval(vp, garp, t);
			else
				PURGE_ATTRCACHE4(vp);

			rw_exit(&rtable4[index].r_lock);
		}
	} else {
		if (newnode) {
			PURGE_ATTRCACHE4(vp);
		}
		rw_exit(&rtable4[index].r_lock);
	}
}

/*
 * Find or create an rnode based primarily on filehandle.  To be
 * used when dvp (vnode for parent directory) is not available;
 * otherwise, makenfs4node() should be used.
 *
 * The nfs4_fname_t argument *npp is consumed and nulled out.
 */

vnode_t *
makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
    nfs4_fname_t **npp, nfs4_ga_res_t *garp,
    mntinfo4_t *mi, cred_t *cr, hrtime_t t)
{
	vfs_t *vfsp = mi->mi_vfsp;
	int newnode = 0;
	vnode_t *vp;
	rnode4_t *rp;
	svnode_t *svp;
	nfs4_fname_t *name, *svpname;
	int index;

	ASSERT(npp && *npp);
	name = *npp;
	*npp = NULL;

	index = rtable4hash(sfh);
	rw_enter(&rtable4[index].r_lock, RW_READER);

	vp = make_rnode4(sfh, &rtable4[index], vfsp,
	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);

	svp = VTOSV(vp);
	rp = VTOR4(vp);
	if (newnode) {
		svp->sv_forw = svp->sv_back = svp;
		svp->sv_name = name;
		if (psfh != NULL)
			sfh4_hold(psfh);
		svp->sv_dfh = psfh;
	} else {
		/*
		 * It is possible that due to a server-side
		 * rename the fnames have changed.
		 * Update the fname here.
		 */
		mutex_enter(&rp->r_svlock);
		svpname = svp->sv_name;
		if (svp->sv_name != name) {
			svp->sv_name = name;
			mutex_exit(&rp->r_svlock);
			fn_rele(&svpname);
		} else {
			mutex_exit(&rp->r_svlock);
			fn_rele(&name);
		}
	}

	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
	r4_do_attrcache(vp, garp, newnode, t, cr, index);
	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);

	return (vp);
}

/*
 * Find or create a vnode for the given filehandle, filesystem, parent, and
 * name.  The reference to nm is consumed, so the caller must first do an
 * fn_hold() if it wants to continue using nm after this call.
 */
vnode_t *
makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
    hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
{
	vnode_t *vp;
	int newnode;
	int index;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	int had_badfh = 0;
	rnode4_t *rp;

	ASSERT(dvp != NULL);

	fh = badrootfh_check(fh, nm, mi, &had_badfh);

	index = rtable4hash(fh);
	rw_enter(&rtable4[index].r_lock, RW_READER);

	/*
	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
	 */
	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
	    nfs4_putapage, &newnode, cr);

	rp = VTOR4(vp);
	sv_activate(&vp, dvp, &nm, newnode);
	if (dvp->v_flag & V_XATTRDIR) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4ISXATTR;
		mutex_exit(&rp->r_statelock);
	}

	/* if we got a bad filehandle, do not cache the attributes. */
	if (had_badfh) {
		rw_exit(&rtable4[index].r_lock);
		return (vp);
	}

	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
	r4_do_attrcache(vp, garp, newnode, t, cr, index);
	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);

	return (vp);
}

/*
 * Hash on address of filehandle object.
 * XXX totally untuned.
 */

int
rtable4hash(nfs4_sharedfh_t *fh)
{
	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
}
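
/*
 * A sketch of the typical lookup pattern built on rtable4hash();
 * r4find_unlocked() below does exactly this:
 *
 *	index = rtable4hash(sfh);
 *	rw_enter(&rtable4[index].r_lock, RW_READER);
 *	rp = r4find(&rtable4[index], sfh, vfsp);
 *	rw_exit(&rtable4[index].r_lock);
 */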

/*
 * Find or create the vnode for the given filehandle and filesystem.
 * *newnode is set to zero if the vnode already existed; non-zero if it had
 * to be created.
 *
 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
 */

static vnode_t *
make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int *newnode, cred_t *cr)
{
	rnode4_t *rp;
	rnode4_t *trp;
	vnode_t *vp;
	mntinfo4_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI4(vfsp);

start:
	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(rp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	mutex_enter(&rp4freelist_lock);

	if (rp4freelist != NULL && rnode4_new >= nrnode) {
		rp = rp4freelist;
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);

		vp = RTOV4(rp);

		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		r4inactive(rp, cr);

		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);

		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		uninit_rnode4(rp);

		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rp4freelist_lock);

		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		atomic_inc_ulong((ulong_t *)&rnode4_new);
#ifdef DEBUG
		clstat4_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
	rp->created_v4 = 0;
	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
	    offsetof(nfs4_open_stream_t, os_node));
	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_flags = R4READDIRWATTR;
	rp->r_fh = fh;
	rp->r_hashq = rhtp;
	sfh4_hold(rp->r_fh);
	rp->r_server = mi->mi_curr_serv;
	rp->r_deleg_type = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);

	rddir4_cache_create(rp);
	rp->r_putapage = putapage;
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	VFS_HOLD(vfsp);
	vp->v_type = VNON;
	vp->v_flag |= VMODSORT;
	if (isrootfh(fh, rp))
		vp->v_flag |= VROOT;
	vn_exists(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(trp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp4_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	rp4_addhash(rp);
	*newnode = 1;
	return (vp);
}
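
/*
 * Undo the initialization done by make_rnode4(): flush any dangling
 * lock owners, tear down the open stream list and the readdir cache,
 * release the shared filehandle, and destroy the locks and condition
 * variables.  As the ASSERTs below spell out, the rnode must be
 * quiescent: a sole vnode reference, no r_count or r_mapcnt holds,
 * and neither hashed nor on the freelist.
 */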
static void
uninit_rnode4(rnode4_t *rp)
{
	vnode_t *vp = RTOV4(rp);

	ASSERT(rp != NULL);
	ASSERT(vp != NULL);
	ASSERT(vp->v_count == 1);
	ASSERT(rp->r_count == 0);
	ASSERT(rp->r_mapcnt == 0);
	if (rp->r_flags & R4LODANGLERS) {
		nfs4_flush_lock_owners(rp);
	}
	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
	ASSERT(!(rp->r_flags & R4HASHED));
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
	nfs4_clear_open_streams(rp);
	list_destroy(&rp->r_open_streams);

	/*
	 * Destroy the rddir cache first since we need to grab the r_statelock.
	 */
	mutex_enter(&rp->r_statelock);
	rddir4_cache_destroy(rp);
	mutex_exit(&rp->r_statelock);
	sv_uninit(&rp->r_svnode);
	sfh4_rele(&rp->r_fh);
	nfs_rw_destroy(&rp->r_rwlock);
	nfs_rw_destroy(&rp->r_lkserlock);
	mutex_destroy(&rp->r_statelock);
	mutex_destroy(&rp->r_statev4_lock);
	mutex_destroy(&rp->r_os_lock);
	cv_destroy(&rp->r_cv);
	cv_destroy(&rp->r_commit.c_cv);
	nfs_rw_destroy(&rp->r_deleg_recall_lock);
	if (rp->r_flags & R4DELMAPLIST)
		list_destroy(&rp->r_indelmap);
}

/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are not anymore references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		if (nfs_zone() != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there are no pages.
	 * If there are, flush and commit all of the dirty and
	 * uncommitted pages, assuming the file system isn't read only.
	 */
	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
		rw_exit(&rp->r_hashq->r_lock);
		r4flushpages(rp, cr);
		goto again;
	}

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached; the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.
	 */
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rp4freelist_lock);
	if (rp4freelist == NULL) {
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rp4freelist = rp;
	} else {
		rp->r_freef = rp4freelist;
		rp->r_freeb = rp4freelist->r_freeb;
		rp4freelist->r_freeb->r_freef = rp;
		rp4freelist->r_freeb = rp;
		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
			rp4freelist = rp;
	}
	mutex_exit(&rp4freelist_lock);

	rw_exit(&rp->r_hashq->r_lock);

	if (xattr)
		VN_RELE(xattr);
}

/*
 * Remove an rnode from the free list.
 *
 * The caller must be holding rp4freelist_lock and the rnode
 * must be on the freelist.
 */
static void
rp4_rmfree(rnode4_t *rp)
{

	ASSERT(MUTEX_HELD(&rp4freelist_lock));
	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);

	if (rp == rp4freelist) {
		rp4freelist = rp->r_freef;
		if (rp == rp4freelist)
			rp4freelist = NULL;
	}
	rp->r_freeb->r_freef = rp->r_freef;
	rp->r_freef->r_freeb = rp->r_freeb;

	rp->r_freef = rp->r_freeb = NULL;
}

/*
 * Put an rnode in the hash table.
 *
 * The caller must be holding the exclusive hash queue lock.
 */
void
rp4_addhash(rnode4_t *rp)
{
	mntinfo4_t *mi;

	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(!(rp->r_flags & R4HASHED));

#ifdef DEBUG
	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
#endif

	rp->r_hashf = rp->r_hashq->r_hashf;
	rp->r_hashq->r_hashf = rp;
	rp->r_hashb = (rnode4_t *)rp->r_hashq;
	rp->r_hashf->r_hashb = rp;

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= R4HASHED;
	mutex_exit(&rp->r_statelock);

	mi = VTOMI4(RTOV4(rp));
	mutex_enter(&mi->mi_rnodes_lock);
	list_insert_tail(&mi->mi_rnodes, rp);
	mutex_exit(&mi->mi_rnodes_lock);
}

/*
 * Remove an rnode from the hash table.
 *
 * The caller must be holding the hash queue lock.
 */
void
rp4_rmhash_locked(rnode4_t *rp)
{
	mntinfo4_t *mi;

	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
	ASSERT(rp->r_flags & R4HASHED);

	rp->r_hashb->r_hashf = rp->r_hashf;
	rp->r_hashf->r_hashb = rp->r_hashb;

	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4HASHED;
	mutex_exit(&rp->r_statelock);

	mi = VTOMI4(RTOV4(rp));
	mutex_enter(&mi->mi_rnodes_lock);
	if (list_link_active(&rp->r_mi_link))
		list_remove(&mi->mi_rnodes, rp);
	mutex_exit(&mi->mi_rnodes_lock);
}

/*
 * Remove an rnode from the hash table.
 *
 * The caller must not be holding the hash queue lock.
 */
void
rp4_rmhash(rnode4_t *rp)
{
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
	rp4_rmhash_locked(rp);
	rw_exit(&rp->r_hashq->r_lock);
}
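
/*
 * Note that rp4_addhash() and rp4_rmhash_locked() above also maintain
 * the per-filesystem mi_rnodes list.  That list lets routines such as
 * check_rtable4(), destroy_rtable4() and r4flush() visit only the
 * rnodes of a single mntinfo4_t instead of scanning all of rtable4.
 */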

/*
 * Look up an rnode by fhandle.  Ignores rnodes that had failed recovery.
 * Returns NULL if no match.  If an rnode is returned, the reference count
 * on the master vnode is incremented.
 *
 * The caller must be holding the hash queue lock, either shared or exclusive.
 */
rnode4_t *
r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
	rnode4_t *rp;
	vnode_t *vp;

	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));

	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
		vp = RTOV4(rp);
		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {

			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & R4RECOVERR) {
				mutex_exit(&rp->r_statelock);
				continue;
			}
			mutex_exit(&rp->r_statelock);
#ifdef DEBUG
			r4_dup_check(rp, vfsp);
#endif
			if (rp->r_freef != NULL) {
				mutex_enter(&rp4freelist_lock);
				/*
				 * If the rnode is on the freelist,
				 * then remove it and use that reference
				 * as the new reference.  Otherwise, we
				 * need to increment the reference count.
				 */
				if (rp->r_freef != NULL) {
					rp4_rmfree(rp);
					mutex_exit(&rp4freelist_lock);
				} else {
					mutex_exit(&rp4freelist_lock);
					VN_HOLD(vp);
				}
			} else
				VN_HOLD(vp);

			/*
			 * If this is the root vnode, set v_flag
			 * to indicate that.
			 */
			if (isrootfh(fh, rp)) {
				if (!(vp->v_flag & VROOT)) {
					mutex_enter(&vp->v_lock);
					vp->v_flag |= VROOT;
					mutex_exit(&vp->v_lock);
				}
			}
			return (rp);
		}
	}
	return (NULL);
}

/*
 * Look up an rnode by fhandle.  Just a wrapper for r4find()
 * that assumes the caller hasn't already got the lock
 * on the hash bucket.
 */
rnode4_t *
r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
	rnode4_t *rp;
	int index;

	index = rtable4hash(fh);
	rw_enter(&rtable4[index].r_lock, RW_READER);
	rp = r4find(&rtable4[index], fh, vfsp);
	rw_exit(&rtable4[index].r_lock);

	return (rp);
}

/*
 * Return 1 if there is an active vnode belonging to this vfs in the
 * rtable4 cache.
 *
 * Several of these checks are done without holding the usual
 * locks.  This is safe because destroy_rtable4(), rp4_addfree(),
 * etc. will redo the necessary checks before actually destroying
 * any rnodes.
 */
int
check_rtable4(struct vfs *vfsp)
{
	rnode4_t *rp;
	vnode_t *vp;
	mntinfo4_t *mi;

	ASSERT(vfsp != NULL);
	mi = VFTOMI4(vfsp);

	mutex_enter(&mi->mi_rnodes_lock);
	for (rp = list_head(&mi->mi_rnodes); rp != NULL;
	    rp = list_next(&mi->mi_rnodes, rp)) {
		vp = RTOV4(rp);

		if (rp->r_freef == NULL ||
		    (nfs4_has_pages(vp) && (rp->r_flags & R4DIRTY)) ||
		    rp->r_count > 0) {
			mutex_exit(&mi->mi_rnodes_lock);
			return (1);
		}
	}
	mutex_exit(&mi->mi_rnodes_lock);

	return (0);
}

/*
 * Destroy inactive vnodes from the hash queues which
 * belong to this vfs.  All of the vnodes should be inactive.
 * It is essential that we destroy all rnodes in case of
 * forced unmount as well as in normal unmount case.
 */

void
destroy_rtable4(struct vfs *vfsp, cred_t *cr)
{
	rnode4_t *rp;
	mntinfo4_t *mi;

	ASSERT(vfsp != NULL);

	mi = VFTOMI4(vfsp);

	mutex_enter(&rp4freelist_lock);
	mutex_enter(&mi->mi_rnodes_lock);
	while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
		/*
		 * If the rnode is no longer on the freelist it is not
		 * ours and it will be handled by some other thread, so
		 * skip it.
		 */
		if (rp->r_freef == NULL)
			continue;
		mutex_exit(&mi->mi_rnodes_lock);

		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);

		rp4_rmhash(rp);

		/*
		 * This call to rp4_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp4_addfree(rp, cr);

		mutex_enter(&rp4freelist_lock);
		mutex_enter(&mi->mi_rnodes_lock);
	}
	mutex_exit(&mi->mi_rnodes_lock);
	mutex_exit(&rp4freelist_lock);
}

/*
 * This routine destroys all the resources of an rnode
 * and finally the rnode itself.
 */
static void
destroy_rnode4(rnode4_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);

	vp = RTOV4(rp);
	vfsp = vp->v_vfsp;

	uninit_rnode4(rp);
	atomic_dec_ulong((ulong_t *)&rnode4_new);
#ifdef DEBUG
	clstat4_debug.nrnode.value.ui64--;
#endif
	kmem_cache_free(rnode4_cache, rp);
	vn_invalid(vp);
	vn_free(vp);
	VFS_RELE(vfsp);
}

/*
 * Invalidate the attributes on all rnodes forcing the next getattr
 * to go over the wire.  Used to flush stale uid and gid mappings.
 * May be done on a per-vfsp basis, or on all rnodes (vfsp == NULL).
 */
void
nfs4_rnode_invalidate(struct vfs *vfsp)
{
	int index;
	rnode4_t *rp;
	vnode_t *vp;

	/*
	 * Walk the hash queues looking for rnodes.
	 */
	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {
			vp = RTOV4(rp);
			if (vfsp != NULL && vp->v_vfsp != vfsp)
				continue;

			if (!mutex_tryenter(&rp->r_statelock))
				continue;

			/*
			 * Expire the attributes by resetting the change
			 * and attr timeout.
			 */
			rp->r_change = 0;
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
		}
		rw_exit(&rtable4[index].r_lock);
	}
}

/*
 * Flush all vnodes in this (or every) vfs.
 * Used by nfs_sync and by nfs_unmount.
 */
void
r4flush(struct vfs *vfsp, cred_t *cr)
{
	int index;
	rnode4_t *rp;
	vnode_t *vp, **vplist;
	long num, cnt;

	/*
	 * Check to see whether there is anything to do.
	 */
	num = rnode4_new;
	if (num == 0)
		return;

	/*
	 * Allocate a slot for all currently active rnodes on the
	 * supposition that they all may need flushing.
	 */
	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
	cnt = 0;

	/*
	 * If the vfs is known, we can take the fast path and iterate over
	 * just the rnodes that belong to this vfs.  This is much faster
	 * than the traditional way of iterating rtable4 (below) when many
	 * of the rnodes do not belong to our vfs.
	 */
	if (vfsp != NULL) {
		mntinfo4_t *mi = VFTOMI4(vfsp);

		mutex_enter(&mi->mi_rnodes_lock);
		for (rp = list_head(&mi->mi_rnodes); rp != NULL;
		    rp = list_next(&mi->mi_rnodes, rp)) {
			vp = RTOV4(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If the vnode has pages and is marked as either dirty
			 * or mmap'd, hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			ASSERT(vp->v_vfsp == vfsp);
			if (nfs4_has_pages(vp) &&
			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					/*
					 * The vplist is full because there
					 * are too many rnodes.  We are done
					 * for now.
					 */
					break;
				}
			}
		}
		mutex_exit(&mi->mi_rnodes_lock);

		goto done;
	}

	ASSERT(vfsp == NULL);

	/*
	 * Walk the hash queues looking for rnodes with page
	 * lists associated with them.  Make a list of these
	 * files.
	 */
	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {
			vp = RTOV4(rp);
			/*
			 * Don't bother sync'ing a vp if it
			 * is part of virtual swap device or
			 * if VFS is read-only
			 */
			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
				continue;
			/*
			 * If the vnode has pages and is marked as either dirty
			 * or mmap'd, hold and add this vnode to the list of
			 * vnodes to flush.
			 */
			if (nfs4_has_pages(vp) &&
			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
				VN_HOLD(vp);
				vplist[cnt++] = vp;
				if (cnt == num) {
					rw_exit(&rtable4[index].r_lock);
					/*
					 * The vplist is full because there
					 * are too many rnodes.  We are done
					 * for now.
					 */
					goto done;
				}
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}

done:

	/*
	 * Flush and release all of the files on the list.
	 */
	while (cnt-- > 0) {
		vp = vplist[cnt];
		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
		VN_RELE(vp);
	}

	/*
	 * Free the space allocated to hold the list.
	 */
	kmem_free(vplist, num * sizeof (*vplist));
}

int
nfs4_free_data_reclaim(rnode4_t *rp)
{
	char *contents;
	vnode_t *xattr;
	int size;
	vsecattr_t *vsp;
	int freed;
	bool_t rdc = FALSE;

	/*
	 * Free any held caches which may
	 * be associated with this rnode.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_dir != NULL)
		rdc = TRUE;
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	vsp = rp->r_secattr;
	rp->r_secattr = NULL;
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * Free the access cache entries.
	 */
	freed = nfs4_access_purge_rp(rp);

	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
		return (freed);

	/*
	 * Free the readdir cache entries, incompletely if we can't block.
	 */
	nfs4_purge_rddir_cache(RTOV4(rp));

	/*
	 * Free the symbolic link cache.
	 */
	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}

	/*
	 * Free any cached ACL.
	 */
	if (vsp != NULL)
		nfs4_acl_free_cache(vsp);

	/*
	 * Release the xattr directory vnode.
	 */
	if (xattr != NULL)
		VN_RELE(xattr);

	return (1);
}
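
/*
 * Like nfs4_free_data_reclaim(), but for rnodes which may still be
 * active.  Because it is invoked from the kmem reclaim path it avoids
 * blocking: r_statelock is taken with mutex_tryenter(), and an rnode
 * whose lock is busy is simply skipped.
 */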
static int
nfs4_active_data_reclaim(rnode4_t *rp)
{
	char *contents;
	vnode_t *xattr = NULL;
	int size;
	vsecattr_t *vsp;
	int freed;
	bool_t rdc = FALSE;

	/*
	 * Free any held credentials and caches which
	 * may be associated with this rnode.
	 */
	if (!mutex_tryenter(&rp->r_statelock))
		return (0);
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	vsp = rp->r_secattr;
	rp->r_secattr = NULL;
	if (rp->r_dir != NULL)
		rdc = TRUE;
	/*
	 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
	 * on the same r_hashq queue.  We are not mandated to free all caches.
	 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
	 * rnode 'rp' is freed or put on the free list.
	 *
	 * We will retain NFS4_XATTR_DIR_NOTSUPP because:
	 * - it has no associated rnode4_t (its v_data is NULL),
	 * - it is preallocated statically and will never go away,
	 * so we cannot save anything by releasing it.
	 */
	if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP &&
	    VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) {
		xattr = rp->r_xattr_dir;
		rp->r_xattr_dir = NULL;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Free the access cache entries.
	 */
	freed = nfs4_access_purge_rp(rp);

	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
		return (freed);

	/*
	 * Free the symbolic link cache.
	 */
	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}

	/*
	 * Free any cached ACL.
	 */
	if (vsp != NULL)
		nfs4_acl_free_cache(vsp);

	nfs4_purge_rddir_cache(RTOV4(rp));

	/*
	 * Release the xattr directory vnode.
	 */
	if (xattr != NULL)
		VN_RELE(xattr);

	return (1);
}

static int
nfs4_free_reclaim(void)
{
	int freed;
	rnode4_t *rp;

#ifdef DEBUG
	clstat4_debug.f_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rp4freelist_lock);
	rp = rp4freelist;
	if (rp != NULL) {
		do {
			if (nfs4_free_data_reclaim(rp))
				freed = 1;
		} while ((rp = rp->r_freef) != rp4freelist);
	}
	mutex_exit(&rp4freelist_lock);
	return (freed);
}

static int
nfs4_active_reclaim(void)
{
	int freed;
	int index;
	rnode4_t *rp;

#ifdef DEBUG
	clstat4_debug.a_reclaim.value.ui64++;
#endif
	freed = 0;
	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {
			if (nfs4_active_data_reclaim(rp))
				freed = 1;
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (freed);
}

static int
nfs4_rnode_reclaim(void)
{
	int freed;
	rnode4_t *rp;
	vnode_t *vp;

#ifdef DEBUG
	clstat4_debug.r_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rp4freelist_lock);
	while ((rp = rp4freelist) != NULL) {
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);
		if (rp->r_flags & R4HASHED) {
			vp = RTOV4(rp);
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				mutex_enter(&rp4freelist_lock);
				continue;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}
		/*
		 * This call to rp4_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp4_addfree(rp, CRED());
		mutex_enter(&rp4freelist_lock);
	}
	mutex_exit(&rp4freelist_lock);
	return (freed);
}
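
/*
 * nfs4_reclaim() is the reclaim callback registered for rnode4_cache
 * in nfs4_rnode_init().  It tries progressively more aggressive
 * strategies: first freeing the data cached on free rnodes, then the
 * data cached on active rnodes, and finally destroying free rnodes
 * outright.
 */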
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	if (nfs4_free_reclaim())
		return;

	if (nfs4_active_reclaim())
		return;

	(void) nfs4_rnode_reclaim();
}

/*
 * Returns the clientid4 to use for the given mntinfo4.  Note that the
 * clientid can change if the caller drops mi_recovlock.
 */

clientid4
mi2clientid(mntinfo4_t *mi)
{
	nfs4_server_t *sp;
	clientid4 clientid = 0;

	/* this locks down sp if it is found */
	sp = find_nfs4_server(mi);
	if (sp != NULL) {
		clientid = sp->clientid;
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	}
	return (clientid);
}

/*
 * Return a list with information about all the known open instances for
 * a filesystem.  The caller must call r4releopenlist() when done with the
 * list.
 *
 * We are safe in looking at os_valid and os_pending_close across dropping
 * the 'os_sync_lock' to count up the number of open streams and then
 * allocate memory for the osp list due to:
 *	-Looking at os_pending_close is safe since this routine is
 *	only called via recovery, and os_pending_close can only be set via
 *	a non-recovery operation (which are all blocked when recovery
 *	is active).
 *
 *	-Examining os_valid is safe since non-recovery operations, which
 *	could potentially switch os_valid to 0, are blocked (via
 *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
 *	(which means we are the only recovery thread potentially acting
 *	on this open stream).
 */

nfs4_opinst_t *
r4mkopenlist(mntinfo4_t *mi)
{
	nfs4_opinst_t *reopenlist, *rep;
	rnode4_t *rp;
	vnode_t *vp;
	vfs_t *vfsp = mi->mi_vfsp;
	int numosp;
	nfs4_open_stream_t *osp;
	int index;
	open_delegation_type4 dtype;
	int hold_vnode;

	reopenlist = NULL;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			vp = RTOV4(rp);
			if (vp->v_vfsp != vfsp)
				continue;
			hold_vnode = 0;

			mutex_enter(&rp->r_os_lock);

			/* Count the number of valid open_streams of the file */
			numosp = 0;
			for (osp = list_head(&rp->r_open_streams); osp != NULL;
			    osp = list_next(&rp->r_open_streams, osp)) {
				mutex_enter(&osp->os_sync_lock);
				if (osp->os_valid && !osp->os_pending_close)
					numosp++;
				mutex_exit(&osp->os_sync_lock);
			}

			/* Fill in the valid open streams per vp */
			if (numosp > 0) {
				int j;

				hold_vnode = 1;

				/*
				 * Add a new open instance to the list
				 */
				rep = kmem_zalloc(sizeof (*reopenlist),
				    KM_SLEEP);
				rep->re_next = reopenlist;
				reopenlist = rep;

				rep->re_vp = vp;
				rep->re_osp = kmem_zalloc(
				    numosp * sizeof (*(rep->re_osp)),
				    KM_SLEEP);
				rep->re_numosp = numosp;

				j = 0;
				for (osp = list_head(&rp->r_open_streams);
				    osp != NULL;
				    osp = list_next(&rp->r_open_streams, osp)) {

					mutex_enter(&osp->os_sync_lock);
					if (osp->os_valid &&
					    !osp->os_pending_close) {
						osp->os_ref_count++;
						rep->re_osp[j] = osp;
						j++;
					}
					mutex_exit(&osp->os_sync_lock);
				}
				/*
				 * Assuming valid osp(s) stays valid between
				 * the time obtaining j and numosp.
				 */
				ASSERT(j == numosp);
			}

			mutex_exit(&rp->r_os_lock);
			/* do this here to keep v_lock > r_os_lock */
			if (hold_vnode)
				VN_HOLD(vp);
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
				/*
				 * If this rnode holds a delegation,
				 * but if there are no valid open streams,
				 * then just discard the delegation
				 * without doing delegreturn.
				 */
				if (numosp > 0)
					rp->r_deleg_needs_recovery =
					    rp->r_deleg_type;
			}
			/* Save the delegation type for use outside the lock */
			dtype = rp->r_deleg_type;
			mutex_exit(&rp->r_statev4_lock);

			/*
			 * If we have a delegation then get rid of it.
			 * We've set rp->r_deleg_needs_recovery so we have
			 * enough information to recover.
			 */
			if (dtype != OPEN_DELEGATE_NONE) {
				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (reopenlist);
}

/*
 * Given a filesystem id, check to see if any rnodes
 * within this fsid reside in the rnode cache, other
 * than one we know about.
 *
 * Return 1 if an rnode is found, 0 otherwise
 */
int
r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid)
{
	rnode4_t *rp;
	vnode_t *vp;
	vfs_t *vfsp = mi->mi_vfsp;
	fattr4_fsid *fsid;
	int index, found = 0;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			vp = RTOV4(rp);
			if (vp->v_vfsp != vfsp)
				continue;

			/*
			 * XXX there might be a case where a
			 * replicated fs may have the same fsid
			 * across two different servers.  This
			 * check isn't good enough in that case
			 */
			fsid = &rp->r_srv_fsid;
			if (FATTR4_FSID_EQ(moved_fsid, fsid)) {
				found = 1;
				break;
			}
		}
		rw_exit(&rtable4[index].r_lock);

		if (found)
			break;
	}
	return (found);
}

/*
 * Release the list of open instance references.
 */

void
r4releopenlist(nfs4_opinst_t *reopenp)
{
	nfs4_opinst_t *rep, *next;
	int i;

	for (rep = reopenp; rep; rep = next) {
		next = rep->re_next;

		for (i = 0; i < rep->re_numosp; i++)
			open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));

		VN_RELE(rep->re_vp);
		kmem_free(rep->re_osp,
		    rep->re_numosp * sizeof (*(rep->re_osp)));

		kmem_free(rep, sizeof (*rep));
	}
}

int
nfs4_rnode_init(void)
{
	ulong_t nrnode4_max;
	int i;

	/*
	 * Compute the size of the rnode4 hash table
	 */
	if (nrnode <= 0)
		nrnode = ncsize;
	nrnode4_max =
	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
		    "!setting nrnode to max value of %ld", nrnode4_max);
		nrnode = nrnode4_max;
	}
	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
	rtable4mask = rtable4size - 1;

	/*
	 * Allocate and initialize the hash buckets
	 */
	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
	for (i = 0; i < rtable4size; i++) {
		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
	}

	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);

	return (0);
}

int
nfs4_rnode_fini(void)
{
	int i;

	/*
	 * Deallocate the rnode hash queues
	 */
	kmem_cache_destroy(rnode4_cache);

	for (i = 0; i < rtable4size; i++)
		rw_destroy(&rtable4[i].r_lock);

	kmem_free(rtable4, rtable4size * sizeof (*rtable4));

	return (0);
}

/*
 * Return non-zero if the given filehandle refers to the root filehandle
 * for the given rnode.
 */

static int
isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
{
	int isroot;

	isroot = 0;
	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
		isroot = 1;

	return (isroot);
}

/*
 * The r4_stub_* routines assume that the rnode is newly activated, and
 * that the caller either holds the hash bucket r_lock for this rnode as
 * RW_WRITER, or holds r_statelock.
 */
static void
r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
{
	vnode_t *vp = RTOV4(rp);
	krwlock_t *hash_lock = &rp->r_hashq->r_lock;

	ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));

	rp->r_stub_type = type;

	/*
	 * Safely switch this vnode to the trigger vnodeops.
	 *
	 * Currently, we don't ever switch a trigger vnode back to using
	 * "regular" v4 vnodeops.  NFS4_STUB_NONE is only used to note that
	 * a new v4 object is not a trigger, and it will already have the
	 * correct v4 vnodeops by default.  So, no "else" case required here.
	 */
	if (type != NFS4_STUB_NONE)
		vn_setops(vp, nfs4_trigger_vnodeops);
}

void
r4_stub_mirrormount(rnode4_t *rp)
{
	r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
}

void
r4_stub_referral(rnode4_t *rp)
{
	DTRACE_PROBE1(nfs4clnt__func__referral__moved,
	    vnode_t *, RTOV4(rp));
	r4_stub_set(rp, NFS4_STUB_REFERRAL);
}

void
r4_stub_none(rnode4_t *rp)
{
	r4_stub_set(rp, NFS4_STUB_NONE);
}

#ifdef DEBUG

/*
 * Look in the rnode table for other rnodes that have the same filehandle.
 * Assume the lock is held for the hash chain of checkrp.
 */

static void
r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
{
	rnode4_t *rp;
	vnode_t *tvp;
	nfs4_fhandle_t fh, fh2;
	int index;

	if (!r4_check_for_dups)
		return;

	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));

	sfh4_copyval(checkrp->r_fh, &fh);

	for (index = 0; index < rtable4size; index++) {

		if (&rtable4[index] != checkrp->r_hashq)
			rw_enter(&rtable4[index].r_lock, RW_READER);

		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			if (rp == checkrp)
				continue;

			tvp = RTOV4(rp);
			if (tvp->v_vfsp != vfsp)
				continue;

			sfh4_copyval(rp->r_fh, &fh2);
			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
				cmn_err(CE_PANIC, "rnodes with same fs, fh "
				    "(%p, %p)", (void *)checkrp, (void *)rp);
			}
		}

		if (&rtable4[index] != checkrp->r_hashq)
			rw_exit(&rtable4[index].r_lock);
	}
}

#endif /* DEBUG */