1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 
28 * All Rights Reserved 29 */ 30 31 32 #include <sys/param.h> 33 #include <sys/types.h> 34 #include <sys/systm.h> 35 #include <sys/cred.h> 36 #include <sys/proc.h> 37 #include <sys/user.h> 38 #include <sys/time.h> 39 #include <sys/buf.h> 40 #include <sys/vfs.h> 41 #include <sys/vnode.h> 42 #include <sys/socket.h> 43 #include <sys/uio.h> 44 #include <sys/tiuser.h> 45 #include <sys/swap.h> 46 #include <sys/errno.h> 47 #include <sys/debug.h> 48 #include <sys/kmem.h> 49 #include <sys/kstat.h> 50 #include <sys/cmn_err.h> 51 #include <sys/vtrace.h> 52 #include <sys/session.h> 53 #include <sys/dnlc.h> 54 #include <sys/bitmap.h> 55 #include <sys/acl.h> 56 #include <sys/ddi.h> 57 #include <sys/pathname.h> 58 #include <sys/flock.h> 59 #include <sys/dirent.h> 60 #include <sys/flock.h> 61 #include <sys/callb.h> 62 #include <sys/sdt.h> 63 64 #include <rpc/types.h> 65 #include <rpc/xdr.h> 66 #include <rpc/auth.h> 67 #include <rpc/rpcsec_gss.h> 68 #include <rpc/clnt.h> 69 70 #include <nfs/nfs.h> 71 #include <nfs/nfs_clnt.h> 72 #include <nfs/nfs_acl.h> 73 74 #include <nfs/nfs4.h> 75 #include <nfs/rnode4.h> 76 #include <nfs/nfs4_clnt.h> 77 78 /* 79 * The hash queues for the access to active and cached rnodes 80 * are organized as doubly linked lists. A reader/writer lock 81 * for each hash bucket is used to control access and to synchronize 82 * lookups, additions, and deletions from the hash queue. 83 * 84 * The rnode freelist is organized as a doubly linked list with 85 * a head pointer. Additions and deletions are synchronized via 86 * a single mutex. 87 * 88 * In order to add an rnode to the free list, it must be hashed into 89 * a hash queue and the exclusive lock to the hash queue be held. 90 * If an rnode is not hashed into a hash queue, then it is destroyed 91 * because it represents no valuable information that can be reused 92 * about the file. 
 * The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock -> r_statelock
 */
/* Array of rnode hash buckets; indexed by the value of rtable4hash(). */
r4hashq_t *rtable4;

/* Serializes insertions into and removals from rp4freelist. */
static kmutex_t rp4freelist_lock;
/* Head of the circular, doubly linked rnode freelist; NULL when empty. */
static rnode4_t *rp4freelist = NULL;
/*
 * Count of rnodes currently allocated from rnode4_cache.  Bumped in
 * make_rnode4() and dropped in destroy_rnode4(), both atomically.
 */
static long rnode4_new = 0;
/* Number of buckets in rtable4 (loop bound for every table walk). */
int rtable4size;
/* Mask applied by rtable4hash() to reduce a hash to a bucket index. */
static int rtable4mask;
/* kmem cache that backs rnode4_t allocations. */
static struct kmem_cache *rnode4_cache;
/*
 * NOTE(review): presumably the desired average hash chain length used
 * when the table is sized; the consumer is not in this part of the
 * file -- confirm.
 */
static int rnode4_hashlen = 4;

static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);
static void	r4_stub_set(rnode4_t *, nfs4_stub_type_t);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void	r4_dup_check(rnode4_t *, vfs_t *);
#endif

/*
 * If the vnode has pages, run the list and check for any that are
 * still dangling.  We call this routine before putting an rnode on
 * the free list.
165 */ 166 static int 167 nfs4_dross_pages(vnode_t *vp) 168 { 169 page_t *pp; 170 kmutex_t *vphm; 171 172 vphm = page_vnode_mutex(vp); 173 mutex_enter(vphm); 174 if ((pp = vp->v_pages) != NULL) { 175 do { 176 if (pp->p_fsdata != C_NOCOMMIT) { 177 mutex_exit(vphm); 178 return (1); 179 } 180 } while ((pp = pp->p_vpnext) != vp->v_pages); 181 } 182 mutex_exit(vphm); 183 184 return (0); 185 } 186 187 /* 188 * Flush any pages left on this rnode. 189 */ 190 static void 191 r4flushpages(rnode4_t *rp, cred_t *cr) 192 { 193 vnode_t *vp; 194 int error; 195 196 /* 197 * Before freeing anything, wait until all asynchronous 198 * activity is done on this rnode. This will allow all 199 * asynchronous read ahead and write behind i/o's to 200 * finish. 201 */ 202 mutex_enter(&rp->r_statelock); 203 while (rp->r_count > 0) 204 cv_wait(&rp->r_cv, &rp->r_statelock); 205 mutex_exit(&rp->r_statelock); 206 207 /* 208 * Flush and invalidate all pages associated with the vnode. 209 */ 210 vp = RTOV4(rp); 211 if (nfs4_has_pages(vp)) { 212 ASSERT(vp->v_type != VCHR); 213 if ((rp->r_flags & R4DIRTY) && !rp->r_error) { 214 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 215 if (error && (error == ENOSPC || error == EDQUOT)) { 216 mutex_enter(&rp->r_statelock); 217 if (!rp->r_error) 218 rp->r_error = error; 219 mutex_exit(&rp->r_statelock); 220 } 221 } 222 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 223 } 224 } 225 226 /* 227 * Free the resources associated with an rnode. 228 */ 229 static void 230 r4inactive(rnode4_t *rp, cred_t *cr) 231 { 232 vnode_t *vp; 233 char *contents; 234 int size; 235 vsecattr_t *vsp; 236 vnode_t *xattr; 237 238 r4flushpages(rp, cr); 239 240 vp = RTOV4(rp); 241 242 /* 243 * Free any held caches which may be 244 * associated with this rnode. 
245 */ 246 mutex_enter(&rp->r_statelock); 247 contents = rp->r_symlink.contents; 248 size = rp->r_symlink.size; 249 rp->r_symlink.contents = NULL; 250 vsp = rp->r_secattr; 251 rp->r_secattr = NULL; 252 xattr = rp->r_xattr_dir; 253 rp->r_xattr_dir = NULL; 254 mutex_exit(&rp->r_statelock); 255 256 /* 257 * Free the access cache entries. 258 */ 259 (void) nfs4_access_purge_rp(rp); 260 261 /* 262 * Free the readdir cache entries. 263 */ 264 nfs4_purge_rddir_cache(vp); 265 266 /* 267 * Free the symbolic link cache. 268 */ 269 if (contents != NULL) { 270 271 kmem_free((void *)contents, size); 272 } 273 274 /* 275 * Free any cached ACL. 276 */ 277 if (vsp != NULL) 278 nfs4_acl_free_cache(vsp); 279 280 /* 281 * Release the cached xattr_dir 282 */ 283 if (xattr != NULL) 284 VN_RELE(xattr); 285 } 286 287 /* 288 * We have seen a case that the fh passed in is for "." which 289 * should be a VROOT node, however, the fh is different from the 290 * root fh stored in the mntinfo4_t. The invalid fh might be 291 * from a misbehaved server and will panic the client system at 292 * a later time. To avoid the panic, we drop the bad fh, use 293 * the root fh from mntinfo4_t, and print an error message 294 * for attention. 295 */ 296 nfs4_sharedfh_t * 297 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi, 298 int *wasbad) 299 { 300 char *s; 301 302 *wasbad = 0; 303 s = fn_name(nm); 304 ASSERT(strcmp(s, "..") != 0); 305 306 if ((s[0] == '.' 
&& s[1] == '\0') && fh && 307 !SFH4_SAME(mi->mi_rootfh, fh)) { 308 #ifdef DEBUG 309 nfs4_fhandle_t fhandle; 310 311 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 312 "Server %s returns a different " 313 "root filehandle for the path %s:", 314 mi->mi_curr_serv->sv_hostname, 315 mi->mi_curr_serv->sv_path); 316 317 /* print the bad fh */ 318 fhandle.fh_len = fh->sfh_fh.nfs_fh4_len; 319 bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 320 fhandle.fh_len); 321 nfs4_printfhandle(&fhandle); 322 323 /* print mi_rootfh */ 324 fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len; 325 bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 326 fhandle.fh_len); 327 nfs4_printfhandle(&fhandle); 328 #endif 329 /* use mi_rootfh instead; fh will be rele by the caller */ 330 fh = mi->mi_rootfh; 331 *wasbad = 1; 332 } 333 334 kmem_free(s, MAXNAMELEN); 335 return (fh); 336 } 337 338 void 339 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode, 340 hrtime_t t, cred_t *cr, int index) 341 { 342 int is_stub; 343 vattr_t *attr; 344 /* 345 * Don't add to attrcache if time overflow, but 346 * no need to check because either attr is null or the time 347 * values in it were processed by nfs4_time_ntov(), which checks 348 * for time overflows. 349 */ 350 attr = garp ? &garp->n4g_va : NULL; 351 352 if (attr) { 353 if (!newnode) { 354 rw_exit(&rtable4[index].r_lock); 355 #ifdef DEBUG 356 if (vp->v_type != attr->va_type && 357 vp->v_type != VNON && attr->va_type != VNON) { 358 zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN, 359 "makenfs4node: type (%d) doesn't " 360 "match type of found node at %p (%d)", 361 attr->va_type, (void *)vp, vp->v_type); 362 } 363 #endif 364 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 365 } else { 366 rnode4_t *rp = VTOR4(vp); 367 368 vp->v_type = attr->va_type; 369 vp->v_rdev = attr->va_rdev; 370 371 /* 372 * Turn this object into a "stub" object if we 373 * crossed an underlying server fs boundary. 
374 * To make this check, during mount we save the 375 * fsid of the server object being mounted. 376 * Here we compare this object's server fsid 377 * with the fsid we saved at mount. If they 378 * are different, we crossed server fs boundary. 379 * 380 * The stub type is set (or not) at rnode 381 * creation time and it never changes for life 382 * of the rnode. 383 * 384 * The stub type is also set during RO failover, 385 * nfs4_remap_file(). 386 * 387 * This stub will be for a mirror-mount. 388 * 389 * We don't bother with taking r_state_lock to 390 * set the stub type because this is a new rnode 391 * and we're holding the hash bucket r_lock RW_WRITER. 392 * No other thread could have obtained access 393 * to this rnode. 394 */ 395 is_stub = 0; 396 if (garp->n4g_fsid_valid) { 397 fattr4_fsid ga_fsid = garp->n4g_fsid; 398 servinfo4_t *svp = rp->r_server; 399 400 rp->r_srv_fsid = ga_fsid; 401 402 (void) nfs_rw_enter_sig(&svp->sv_lock, 403 RW_READER, 0); 404 if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid)) 405 is_stub = 1; 406 nfs_rw_exit(&svp->sv_lock); 407 } 408 409 if (is_stub) 410 r4_stub_mirrormount(rp); 411 else 412 r4_stub_none(rp); 413 414 /* Can not cache partial attr */ 415 if (attr->va_mask == AT_ALL) 416 nfs4_attrcache_noinval(vp, garp, t); 417 else 418 PURGE_ATTRCACHE4(vp); 419 420 rw_exit(&rtable4[index].r_lock); 421 } 422 } else { 423 if (newnode) { 424 PURGE_ATTRCACHE4(vp); 425 } 426 rw_exit(&rtable4[index].r_lock); 427 } 428 } 429 430 /* 431 * Find or create an rnode based primarily on filehandle. To be 432 * used when dvp (vnode for parent directory) is not available; 433 * otherwise, makenfs4node() should be used. 434 * 435 * The nfs4_fname_t argument *npp is consumed and nulled out. 
436 */ 437 438 vnode_t * 439 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh, 440 nfs4_fname_t **npp, nfs4_ga_res_t *garp, 441 mntinfo4_t *mi, cred_t *cr, hrtime_t t) 442 { 443 vfs_t *vfsp = mi->mi_vfsp; 444 int newnode = 0; 445 vnode_t *vp; 446 rnode4_t *rp; 447 svnode_t *svp; 448 nfs4_fname_t *name, *svpname; 449 int index; 450 451 ASSERT(npp && *npp); 452 name = *npp; 453 *npp = NULL; 454 455 index = rtable4hash(sfh); 456 rw_enter(&rtable4[index].r_lock, RW_READER); 457 458 vp = make_rnode4(sfh, &rtable4[index], vfsp, 459 nfs4_vnodeops, nfs4_putapage, &newnode, cr); 460 461 svp = VTOSV(vp); 462 rp = VTOR4(vp); 463 if (newnode) { 464 svp->sv_forw = svp->sv_back = svp; 465 svp->sv_name = name; 466 if (psfh != NULL) 467 sfh4_hold(psfh); 468 svp->sv_dfh = psfh; 469 } else { 470 /* 471 * It is possible that due to a server 472 * side rename fnames have changed. 473 * update the fname here. 474 */ 475 mutex_enter(&rp->r_svlock); 476 svpname = svp->sv_name; 477 if (svp->sv_name != name) { 478 svp->sv_name = name; 479 mutex_exit(&rp->r_svlock); 480 fn_rele(&svpname); 481 } else { 482 mutex_exit(&rp->r_svlock); 483 fn_rele(&name); 484 } 485 } 486 487 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 488 r4_do_attrcache(vp, garp, newnode, t, cr, index); 489 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 490 491 return (vp); 492 } 493 494 /* 495 * Find or create a vnode for the given filehandle, filesystem, parent, and 496 * name. The reference to nm is consumed, so the caller must first do an 497 * fn_hold() if it wants to continue using nm after this call. 
498 */ 499 vnode_t * 500 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp, 501 hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm) 502 { 503 vnode_t *vp; 504 int newnode; 505 int index; 506 mntinfo4_t *mi = VFTOMI4(vfsp); 507 int had_badfh = 0; 508 rnode4_t *rp; 509 510 ASSERT(dvp != NULL); 511 512 fh = badrootfh_check(fh, nm, mi, &had_badfh); 513 514 index = rtable4hash(fh); 515 rw_enter(&rtable4[index].r_lock, RW_READER); 516 517 /* 518 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 519 */ 520 vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops, 521 nfs4_putapage, &newnode, cr); 522 523 rp = VTOR4(vp); 524 sv_activate(&vp, dvp, &nm, newnode); 525 if (dvp->v_flag & V_XATTRDIR) { 526 mutex_enter(&rp->r_statelock); 527 rp->r_flags |= R4ISXATTR; 528 mutex_exit(&rp->r_statelock); 529 } 530 531 /* if getting a bad file handle, do not cache the attributes. */ 532 if (had_badfh) { 533 rw_exit(&rtable4[index].r_lock); 534 return (vp); 535 } 536 537 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 538 r4_do_attrcache(vp, garp, newnode, t, cr, index); 539 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 540 541 return (vp); 542 } 543 544 /* 545 * Hash on address of filehandle object. 546 * XXX totally untuned. 547 */ 548 549 int 550 rtable4hash(nfs4_sharedfh_t *fh) 551 { 552 return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask); 553 } 554 555 /* 556 * Find or create the vnode for the given filehandle and filesystem. 557 * *newnode is set to zero if the vnode already existed; non-zero if it had 558 * to be created. 559 * 560 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 
561 */ 562 563 static vnode_t * 564 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp, 565 struct vnodeops *vops, 566 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 567 int *newnode, cred_t *cr) 568 { 569 rnode4_t *rp; 570 rnode4_t *trp; 571 vnode_t *vp; 572 mntinfo4_t *mi; 573 574 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 575 576 mi = VFTOMI4(vfsp); 577 578 start: 579 if ((rp = r4find(rhtp, fh, vfsp)) != NULL) { 580 vp = RTOV4(rp); 581 *newnode = 0; 582 return (vp); 583 } 584 rw_exit(&rhtp->r_lock); 585 586 mutex_enter(&rp4freelist_lock); 587 588 if (rp4freelist != NULL && rnode4_new >= nrnode) { 589 rp = rp4freelist; 590 rp4_rmfree(rp); 591 mutex_exit(&rp4freelist_lock); 592 593 vp = RTOV4(rp); 594 595 if (rp->r_flags & R4HASHED) { 596 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 597 mutex_enter(&vp->v_lock); 598 if (vp->v_count > 1) { 599 vp->v_count--; 600 mutex_exit(&vp->v_lock); 601 rw_exit(&rp->r_hashq->r_lock); 602 rw_enter(&rhtp->r_lock, RW_READER); 603 goto start; 604 } 605 mutex_exit(&vp->v_lock); 606 rp4_rmhash_locked(rp); 607 rw_exit(&rp->r_hashq->r_lock); 608 } 609 610 r4inactive(rp, cr); 611 612 mutex_enter(&vp->v_lock); 613 if (vp->v_count > 1) { 614 vp->v_count--; 615 mutex_exit(&vp->v_lock); 616 rw_enter(&rhtp->r_lock, RW_READER); 617 goto start; 618 } 619 mutex_exit(&vp->v_lock); 620 vn_invalid(vp); 621 622 /* 623 * destroy old locks before bzero'ing and 624 * recreating the locks below. 625 */ 626 uninit_rnode4(rp); 627 628 /* 629 * Make sure that if rnode is recycled then 630 * VFS count is decremented properly before 631 * reuse. 
632 */ 633 VFS_RELE(vp->v_vfsp); 634 vn_reinit(vp); 635 } else { 636 vnode_t *new_vp; 637 638 mutex_exit(&rp4freelist_lock); 639 640 rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP); 641 new_vp = vn_alloc(KM_SLEEP); 642 643 atomic_add_long((ulong_t *)&rnode4_new, 1); 644 #ifdef DEBUG 645 clstat4_debug.nrnode.value.ui64++; 646 #endif 647 vp = new_vp; 648 } 649 650 bzero(rp, sizeof (*rp)); 651 rp->r_vnode = vp; 652 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 653 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 654 mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL); 655 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 656 mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL); 657 mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL); 658 rp->created_v4 = 0; 659 list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t), 660 offsetof(nfs4_open_stream_t, os_node)); 661 rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head; 662 rp->r_lo_head.lo_next_rnode = &rp->r_lo_head; 663 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 664 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 665 rp->r_flags = R4READDIRWATTR; 666 rp->r_fh = fh; 667 rp->r_hashq = rhtp; 668 sfh4_hold(rp->r_fh); 669 rp->r_server = mi->mi_curr_serv; 670 rp->r_deleg_type = OPEN_DELEGATE_NONE; 671 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE; 672 nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL); 673 674 rddir4_cache_create(rp); 675 rp->r_putapage = putapage; 676 vn_setops(vp, vops); 677 vp->v_data = (caddr_t)rp; 678 vp->v_vfsp = vfsp; 679 VFS_HOLD(vfsp); 680 vp->v_type = VNON; 681 if (isrootfh(fh, rp)) 682 vp->v_flag = VROOT; 683 vn_exists(vp); 684 685 /* 686 * There is a race condition if someone else 687 * alloc's the rnode while no locks are held, so we 688 * check again and recover if found. 
689 */ 690 rw_enter(&rhtp->r_lock, RW_WRITER); 691 if ((trp = r4find(rhtp, fh, vfsp)) != NULL) { 692 vp = RTOV4(trp); 693 *newnode = 0; 694 rw_exit(&rhtp->r_lock); 695 rp4_addfree(rp, cr); 696 rw_enter(&rhtp->r_lock, RW_READER); 697 return (vp); 698 } 699 rp4_addhash(rp); 700 *newnode = 1; 701 return (vp); 702 } 703 704 static void 705 uninit_rnode4(rnode4_t *rp) 706 { 707 vnode_t *vp = RTOV4(rp); 708 709 ASSERT(rp != NULL); 710 ASSERT(vp != NULL); 711 ASSERT(vp->v_count == 1); 712 ASSERT(rp->r_count == 0); 713 ASSERT(rp->r_mapcnt == 0); 714 if (rp->r_flags & R4LODANGLERS) { 715 nfs4_flush_lock_owners(rp); 716 } 717 ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head); 718 ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head); 719 ASSERT(!(rp->r_flags & R4HASHED)); 720 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 721 nfs4_clear_open_streams(rp); 722 list_destroy(&rp->r_open_streams); 723 724 /* 725 * Destroy the rddir cache first since we need to grab the r_statelock. 726 */ 727 mutex_enter(&rp->r_statelock); 728 rddir4_cache_destroy(rp); 729 mutex_exit(&rp->r_statelock); 730 sv_uninit(&rp->r_svnode); 731 sfh4_rele(&rp->r_fh); 732 nfs_rw_destroy(&rp->r_rwlock); 733 nfs_rw_destroy(&rp->r_lkserlock); 734 mutex_destroy(&rp->r_statelock); 735 mutex_destroy(&rp->r_statev4_lock); 736 mutex_destroy(&rp->r_os_lock); 737 cv_destroy(&rp->r_cv); 738 cv_destroy(&rp->r_commit.c_cv); 739 nfs_rw_destroy(&rp->r_deleg_recall_lock); 740 if (rp->r_flags & R4DELMAPLIST) 741 list_destroy(&rp->r_indelmap); 742 } 743 744 /* 745 * Put an rnode on the free list. 746 * 747 * Rnodes which were allocated above and beyond the normal limit 748 * are immediately freed. 
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.  The same applies when recovery failed for the
	 * rnode (R4RECOVERR) or the filesystem has been unmounted.
	 * In every case the rnode must have no i/o outstanding
	 * (r_count == 0).
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Another reference exists: drop ours and leave
			 * the rnode to that holder.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 *
	 * Each check below that has to drop the hash queue lock
	 * comes back here so the count is re-examined under the lock.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are no more references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.  Open streams are closed over the
	 * wire only when running in the zone that owns the mount;
	 * otherwise they are just cleared locally.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		if (nfs_zone() != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there are no pages.
	 * If there are, flush and commit all of the dirty and
	 * uncommitted pages, assuming the file system isn't read only.
	 */
	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
		rw_exit(&rp->r_hashq->r_lock);
		r4flushpages(rp, cr);
		goto again;
	}

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached, the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.  The hold is released at the very end, after
	 * all locks have been dropped.
	 */
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * The rnode is linked in just before the current head, i.e.
	 * at the tail of the circular list.  If there is no cached
	 * data or metadata for this file, the head is then moved to
	 * the rnode itself, which puts it on the front of the freelist
	 * so that it will be reused before other rnodes which may have
	 * cached data or metadata associated with them.
	 */
	mutex_enter(&rp4freelist_lock);
	if (rp4freelist == NULL) {
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rp4freelist = rp;
	} else {
		rp->r_freef = rp4freelist;
		rp->r_freeb = rp4freelist->r_freeb;
		rp4freelist->r_freeb->r_freef = rp;
		rp4freelist->r_freeb = rp;
		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
			rp4freelist = rp;
	}
	mutex_exit(&rp4freelist_lock);

	rw_exit(&rp->r_hashq->r_lock);

	if (xattr)
		VN_RELE(xattr);
}

/*
 * Remove an rnode from the free list.
 *
 * The caller must be holding rp4freelist_lock and the rnode
 * must be on the freelist.
936 */ 937 static void 938 rp4_rmfree(rnode4_t *rp) 939 { 940 941 ASSERT(MUTEX_HELD(&rp4freelist_lock)); 942 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 943 944 if (rp == rp4freelist) { 945 rp4freelist = rp->r_freef; 946 if (rp == rp4freelist) 947 rp4freelist = NULL; 948 } 949 rp->r_freeb->r_freef = rp->r_freef; 950 rp->r_freef->r_freeb = rp->r_freeb; 951 952 rp->r_freef = rp->r_freeb = NULL; 953 } 954 955 /* 956 * Put a rnode in the hash table. 957 * 958 * The caller must be holding the exclusive hash queue lock 959 */ 960 void 961 rp4_addhash(rnode4_t *rp) 962 { 963 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 964 ASSERT(!(rp->r_flags & R4HASHED)); 965 966 #ifdef DEBUG 967 r4_dup_check(rp, RTOV4(rp)->v_vfsp); 968 #endif 969 970 rp->r_hashf = rp->r_hashq->r_hashf; 971 rp->r_hashq->r_hashf = rp; 972 rp->r_hashb = (rnode4_t *)rp->r_hashq; 973 rp->r_hashf->r_hashb = rp; 974 975 mutex_enter(&rp->r_statelock); 976 rp->r_flags |= R4HASHED; 977 mutex_exit(&rp->r_statelock); 978 } 979 980 /* 981 * Remove a rnode from the hash table. 982 * 983 * The caller must be holding the hash queue lock. 984 */ 985 void 986 rp4_rmhash_locked(rnode4_t *rp) 987 { 988 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 989 ASSERT(rp->r_flags & R4HASHED); 990 991 rp->r_hashb->r_hashf = rp->r_hashf; 992 rp->r_hashf->r_hashb = rp->r_hashb; 993 994 mutex_enter(&rp->r_statelock); 995 rp->r_flags &= ~R4HASHED; 996 mutex_exit(&rp->r_statelock); 997 } 998 999 /* 1000 * Remove a rnode from the hash table. 1001 * 1002 * The caller must not be holding the hash queue lock. 1003 */ 1004 void 1005 rp4_rmhash(rnode4_t *rp) 1006 { 1007 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1008 rp4_rmhash_locked(rp); 1009 rw_exit(&rp->r_hashq->r_lock); 1010 } 1011 1012 /* 1013 * Lookup a rnode by fhandle. Ignores rnodes that had failed recovery. 1014 * Returns NULL if no match. If an rnode is returned, the reference count 1015 * on the master vnode is incremented. 
1016 * 1017 * The caller must be holding the hash queue lock, either shared or exclusive. 1018 */ 1019 rnode4_t * 1020 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp) 1021 { 1022 rnode4_t *rp; 1023 vnode_t *vp; 1024 1025 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 1026 1027 for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) { 1028 vp = RTOV4(rp); 1029 if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) { 1030 1031 mutex_enter(&rp->r_statelock); 1032 if (rp->r_flags & R4RECOVERR) { 1033 mutex_exit(&rp->r_statelock); 1034 continue; 1035 } 1036 mutex_exit(&rp->r_statelock); 1037 #ifdef DEBUG 1038 r4_dup_check(rp, vfsp); 1039 #endif 1040 if (rp->r_freef != NULL) { 1041 mutex_enter(&rp4freelist_lock); 1042 /* 1043 * If the rnode is on the freelist, 1044 * then remove it and use that reference 1045 * as the new reference. Otherwise, 1046 * need to increment the reference count. 1047 */ 1048 if (rp->r_freef != NULL) { 1049 rp4_rmfree(rp); 1050 mutex_exit(&rp4freelist_lock); 1051 } else { 1052 mutex_exit(&rp4freelist_lock); 1053 VN_HOLD(vp); 1054 } 1055 } else 1056 VN_HOLD(vp); 1057 1058 /* 1059 * if root vnode, set v_flag to indicate that 1060 */ 1061 if (isrootfh(fh, rp)) { 1062 if (!(vp->v_flag & VROOT)) { 1063 mutex_enter(&vp->v_lock); 1064 vp->v_flag |= VROOT; 1065 mutex_exit(&vp->v_lock); 1066 } 1067 } 1068 return (rp); 1069 } 1070 } 1071 return (NULL); 1072 } 1073 1074 /* 1075 * Lookup an rnode by fhandle. Just a wrapper for r4find() 1076 * that assumes the caller hasn't already got the lock 1077 * on the hash bucket. 1078 */ 1079 rnode4_t * 1080 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp) 1081 { 1082 rnode4_t *rp; 1083 int index; 1084 1085 index = rtable4hash(fh); 1086 rw_enter(&rtable4[index].r_lock, RW_READER); 1087 rp = r4find(&rtable4[index], fh, vfsp); 1088 rw_exit(&rtable4[index].r_lock); 1089 1090 return (rp); 1091 } 1092 1093 /* 1094 * Return >0 if there is a active vnode belonging to this vfs in the 1095 * rtable4 cache. 
1096 * 1097 * Several of these checks are done without holding the usual 1098 * locks. This is safe because destroy_rtable(), rp_addfree(), 1099 * etc. will redo the necessary checks before actually destroying 1100 * any rnodes. 1101 */ 1102 int 1103 check_rtable4(struct vfs *vfsp) 1104 { 1105 rnode4_t *rp; 1106 vnode_t *vp; 1107 int busy = NFSV4_RTABLE4_OK; 1108 int index; 1109 1110 for (index = 0; index < rtable4size; index++) { 1111 rw_enter(&rtable4[index].r_lock, RW_READER); 1112 1113 for (rp = rtable4[index].r_hashf; 1114 rp != (rnode4_t *)(&rtable4[index]); 1115 rp = rp->r_hashf) { 1116 1117 vp = RTOV4(rp); 1118 if (vp->v_vfsp == vfsp) { 1119 if (rp->r_freef == NULL) { 1120 busy = NFSV4_RTABLE4_NOT_FREE_LIST; 1121 } else if (nfs4_has_pages(vp) && 1122 (rp->r_flags & R4DIRTY)) { 1123 busy = NFSV4_RTABLE4_DIRTY_PAGES; 1124 } else if (rp->r_count > 0) { 1125 busy = NFSV4_RTABLE4_POS_R_COUNT; 1126 } 1127 1128 if (busy != NFSV4_RTABLE4_OK) { 1129 #ifdef DEBUG 1130 char *path; 1131 1132 path = fn_path(rp->r_svnode.sv_name); 1133 DTRACE_NFSV4_3(rnode__e__debug, 1134 int, busy, char *, path, 1135 rnode4_t *, rp); 1136 kmem_free(path, strlen(path)+1); 1137 #endif 1138 rw_exit(&rtable4[index].r_lock); 1139 return (busy); 1140 } 1141 } 1142 } 1143 rw_exit(&rtable4[index].r_lock); 1144 } 1145 return (busy); 1146 } 1147 1148 /* 1149 * Destroy inactive vnodes from the hash queues which 1150 * belong to this vfs. All of the vnodes should be inactive. 1151 * It is essential that we destroy all rnodes in case of 1152 * forced unmount as well as in normal unmount case. 
1153 */ 1154 1155 void 1156 destroy_rtable4(struct vfs *vfsp, cred_t *cr) 1157 { 1158 int index; 1159 vnode_t *vp; 1160 rnode4_t *rp, *r_hashf, *rlist; 1161 1162 rlist = NULL; 1163 1164 for (index = 0; index < rtable4size; index++) { 1165 rw_enter(&rtable4[index].r_lock, RW_WRITER); 1166 for (rp = rtable4[index].r_hashf; 1167 rp != (rnode4_t *)(&rtable4[index]); 1168 rp = r_hashf) { 1169 /* save the hash pointer before destroying */ 1170 r_hashf = rp->r_hashf; 1171 1172 vp = RTOV4(rp); 1173 if (vp->v_vfsp == vfsp) { 1174 mutex_enter(&rp4freelist_lock); 1175 if (rp->r_freef != NULL) { 1176 rp4_rmfree(rp); 1177 mutex_exit(&rp4freelist_lock); 1178 rp4_rmhash_locked(rp); 1179 rp->r_hashf = rlist; 1180 rlist = rp; 1181 } else 1182 mutex_exit(&rp4freelist_lock); 1183 } 1184 } 1185 rw_exit(&rtable4[index].r_lock); 1186 } 1187 1188 for (rp = rlist; rp != NULL; rp = r_hashf) { 1189 r_hashf = rp->r_hashf; 1190 /* 1191 * This call to rp4_addfree will end up destroying the 1192 * rnode, but in a safe way with the appropriate set 1193 * of checks done. 1194 */ 1195 rp4_addfree(rp, cr); 1196 } 1197 } 1198 1199 /* 1200 * This routine destroys all the resources of an rnode 1201 * and finally the rnode itself. 1202 */ 1203 static void 1204 destroy_rnode4(rnode4_t *rp) 1205 { 1206 vnode_t *vp; 1207 vfs_t *vfsp; 1208 1209 ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE); 1210 1211 vp = RTOV4(rp); 1212 vfsp = vp->v_vfsp; 1213 1214 uninit_rnode4(rp); 1215 atomic_add_long((ulong_t *)&rnode4_new, -1); 1216 #ifdef DEBUG 1217 clstat4_debug.nrnode.value.ui64--; 1218 #endif 1219 kmem_cache_free(rnode4_cache, rp); 1220 vn_invalid(vp); 1221 vn_free(vp); 1222 VFS_RELE(vfsp); 1223 } 1224 1225 /* 1226 * Invalidate the attributes on all rnodes forcing the next getattr 1227 * to go over the wire. Used to flush stale uid and gid mappings. 
1228 * Maybe done on a per vfsp, or all rnodes (vfsp == NULL) 1229 */ 1230 void 1231 nfs4_rnode_invalidate(struct vfs *vfsp) 1232 { 1233 int index; 1234 rnode4_t *rp; 1235 vnode_t *vp; 1236 1237 /* 1238 * Walk the hash queues looking for rnodes. 1239 */ 1240 for (index = 0; index < rtable4size; index++) { 1241 rw_enter(&rtable4[index].r_lock, RW_READER); 1242 for (rp = rtable4[index].r_hashf; 1243 rp != (rnode4_t *)(&rtable4[index]); 1244 rp = rp->r_hashf) { 1245 vp = RTOV4(rp); 1246 if (vfsp != NULL && vp->v_vfsp != vfsp) 1247 continue; 1248 1249 if (!mutex_tryenter(&rp->r_statelock)) 1250 continue; 1251 1252 /* 1253 * Expire the attributes by resetting the change 1254 * and attr timeout. 1255 */ 1256 rp->r_change = 0; 1257 PURGE_ATTRCACHE4_LOCKED(rp); 1258 mutex_exit(&rp->r_statelock); 1259 } 1260 rw_exit(&rtable4[index].r_lock); 1261 } 1262 } 1263 1264 /* 1265 * Flush all vnodes in this (or every) vfs. 1266 * Used by nfs_sync and by nfs_unmount. 1267 */ 1268 void 1269 r4flush(struct vfs *vfsp, cred_t *cr) 1270 { 1271 int index; 1272 rnode4_t *rp; 1273 vnode_t *vp, **vplist; 1274 long num, cnt; 1275 1276 /* 1277 * Check to see whether there is anything to do. 1278 */ 1279 num = rnode4_new; 1280 if (num == 0) 1281 return; 1282 1283 /* 1284 * Allocate a slot for all currently active rnodes on the 1285 * supposition that they all may need flushing. 1286 */ 1287 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 1288 cnt = 0; 1289 1290 /* 1291 * Walk the hash queues looking for rnodes with page 1292 * lists associated with them. Make a list of these 1293 * files. 
1294 */ 1295 for (index = 0; index < rtable4size; index++) { 1296 rw_enter(&rtable4[index].r_lock, RW_READER); 1297 for (rp = rtable4[index].r_hashf; 1298 rp != (rnode4_t *)(&rtable4[index]); 1299 rp = rp->r_hashf) { 1300 vp = RTOV4(rp); 1301 /* 1302 * Don't bother sync'ing a vp if it 1303 * is part of virtual swap device or 1304 * if VFS is read-only 1305 */ 1306 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 1307 continue; 1308 /* 1309 * If flushing all mounted file systems or 1310 * the vnode belongs to this vfs, has pages 1311 * and is marked as either dirty or mmap'd, 1312 * hold and add this vnode to the list of 1313 * vnodes to flush. 1314 */ 1315 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 1316 nfs4_has_pages(vp) && 1317 ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) { 1318 VN_HOLD(vp); 1319 vplist[cnt++] = vp; 1320 if (cnt == num) { 1321 rw_exit(&rtable4[index].r_lock); 1322 goto toomany; 1323 } 1324 } 1325 } 1326 rw_exit(&rtable4[index].r_lock); 1327 } 1328 toomany: 1329 1330 /* 1331 * Flush and release all of the files on the list. 1332 */ 1333 while (cnt-- > 0) { 1334 vp = vplist[cnt]; 1335 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 1336 VN_RELE(vp); 1337 } 1338 1339 /* 1340 * Free the space allocated to hold the list. 1341 */ 1342 kmem_free(vplist, num * sizeof (*vplist)); 1343 } 1344 1345 int 1346 nfs4_free_data_reclaim(rnode4_t *rp) 1347 { 1348 char *contents; 1349 vnode_t *xattr; 1350 int size; 1351 vsecattr_t *vsp; 1352 int freed; 1353 bool_t rdc = FALSE; 1354 1355 /* 1356 * Free any held caches which may 1357 * be associated with this rnode. 1358 */ 1359 mutex_enter(&rp->r_statelock); 1360 if (rp->r_dir != NULL) 1361 rdc = TRUE; 1362 contents = rp->r_symlink.contents; 1363 size = rp->r_symlink.size; 1364 rp->r_symlink.contents = NULL; 1365 vsp = rp->r_secattr; 1366 rp->r_secattr = NULL; 1367 xattr = rp->r_xattr_dir; 1368 rp->r_xattr_dir = NULL; 1369 mutex_exit(&rp->r_statelock); 1370 1371 /* 1372 * Free the access cache entries. 
1373 */ 1374 freed = nfs4_access_purge_rp(rp); 1375 1376 if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL) 1377 return (freed); 1378 1379 /* 1380 * Free the readdir cache entries, incompletely if we can't block. 1381 */ 1382 nfs4_purge_rddir_cache(RTOV4(rp)); 1383 1384 /* 1385 * Free the symbolic link cache. 1386 */ 1387 if (contents != NULL) { 1388 1389 kmem_free((void *)contents, size); 1390 } 1391 1392 /* 1393 * Free any cached ACL. 1394 */ 1395 if (vsp != NULL) 1396 nfs4_acl_free_cache(vsp); 1397 1398 /* 1399 * Release the xattr directory vnode 1400 */ 1401 if (xattr != NULL) 1402 VN_RELE(xattr); 1403 1404 return (1); 1405 } 1406 1407 static int 1408 nfs4_active_data_reclaim(rnode4_t *rp) 1409 { 1410 char *contents; 1411 vnode_t *xattr = NULL; 1412 int size; 1413 vsecattr_t *vsp; 1414 int freed; 1415 bool_t rdc = FALSE; 1416 1417 /* 1418 * Free any held credentials and caches which 1419 * may be associated with this rnode. 1420 */ 1421 if (!mutex_tryenter(&rp->r_statelock)) 1422 return (0); 1423 contents = rp->r_symlink.contents; 1424 size = rp->r_symlink.size; 1425 rp->r_symlink.contents = NULL; 1426 vsp = rp->r_secattr; 1427 rp->r_secattr = NULL; 1428 if (rp->r_dir != NULL) 1429 rdc = TRUE; 1430 /* 1431 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed 1432 * on the same r_hashq queue. We are not mandated to free all caches. 1433 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the 1434 * rnode 'rp' is freed or put on the free list. 1435 */ 1436 if (rp->r_xattr_dir && VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) { 1437 xattr = rp->r_xattr_dir; 1438 rp->r_xattr_dir = NULL; 1439 } 1440 mutex_exit(&rp->r_statelock); 1441 1442 /* 1443 * Free the access cache entries. 1444 */ 1445 freed = nfs4_access_purge_rp(rp); 1446 1447 if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL) 1448 return (freed); 1449 1450 /* 1451 * Free the symbolic link cache. 
1452 */ 1453 if (contents != NULL) { 1454 1455 kmem_free((void *)contents, size); 1456 } 1457 1458 /* 1459 * Free any cached ACL. 1460 */ 1461 if (vsp != NULL) 1462 nfs4_acl_free_cache(vsp); 1463 1464 nfs4_purge_rddir_cache(RTOV4(rp)); 1465 1466 /* 1467 * Release the xattr directory vnode 1468 */ 1469 if (xattr != NULL) 1470 VN_RELE(xattr); 1471 1472 return (1); 1473 } 1474 1475 static int 1476 nfs4_free_reclaim(void) 1477 { 1478 int freed; 1479 rnode4_t *rp; 1480 1481 #ifdef DEBUG 1482 clstat4_debug.f_reclaim.value.ui64++; 1483 #endif 1484 freed = 0; 1485 mutex_enter(&rp4freelist_lock); 1486 rp = rp4freelist; 1487 if (rp != NULL) { 1488 do { 1489 if (nfs4_free_data_reclaim(rp)) 1490 freed = 1; 1491 } while ((rp = rp->r_freef) != rp4freelist); 1492 } 1493 mutex_exit(&rp4freelist_lock); 1494 return (freed); 1495 } 1496 1497 static int 1498 nfs4_active_reclaim(void) 1499 { 1500 int freed; 1501 int index; 1502 rnode4_t *rp; 1503 1504 #ifdef DEBUG 1505 clstat4_debug.a_reclaim.value.ui64++; 1506 #endif 1507 freed = 0; 1508 for (index = 0; index < rtable4size; index++) { 1509 rw_enter(&rtable4[index].r_lock, RW_READER); 1510 for (rp = rtable4[index].r_hashf; 1511 rp != (rnode4_t *)(&rtable4[index]); 1512 rp = rp->r_hashf) { 1513 if (nfs4_active_data_reclaim(rp)) 1514 freed = 1; 1515 } 1516 rw_exit(&rtable4[index].r_lock); 1517 } 1518 return (freed); 1519 } 1520 1521 static int 1522 nfs4_rnode_reclaim(void) 1523 { 1524 int freed; 1525 rnode4_t *rp; 1526 vnode_t *vp; 1527 1528 #ifdef DEBUG 1529 clstat4_debug.r_reclaim.value.ui64++; 1530 #endif 1531 freed = 0; 1532 mutex_enter(&rp4freelist_lock); 1533 while ((rp = rp4freelist) != NULL) { 1534 rp4_rmfree(rp); 1535 mutex_exit(&rp4freelist_lock); 1536 if (rp->r_flags & R4HASHED) { 1537 vp = RTOV4(rp); 1538 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1539 mutex_enter(&vp->v_lock); 1540 if (vp->v_count > 1) { 1541 vp->v_count--; 1542 mutex_exit(&vp->v_lock); 1543 rw_exit(&rp->r_hashq->r_lock); 1544 mutex_enter(&rp4freelist_lock); 
1545 continue; 1546 } 1547 mutex_exit(&vp->v_lock); 1548 rp4_rmhash_locked(rp); 1549 rw_exit(&rp->r_hashq->r_lock); 1550 } 1551 /* 1552 * This call to rp_addfree will end up destroying the 1553 * rnode, but in a safe way with the appropriate set 1554 * of checks done. 1555 */ 1556 rp4_addfree(rp, CRED()); 1557 mutex_enter(&rp4freelist_lock); 1558 } 1559 mutex_exit(&rp4freelist_lock); 1560 return (freed); 1561 } 1562 1563 /*ARGSUSED*/ 1564 static void 1565 nfs4_reclaim(void *cdrarg) 1566 { 1567 #ifdef DEBUG 1568 clstat4_debug.reclaim.value.ui64++; 1569 #endif 1570 if (nfs4_free_reclaim()) 1571 return; 1572 1573 if (nfs4_active_reclaim()) 1574 return; 1575 1576 (void) nfs4_rnode_reclaim(); 1577 } 1578 1579 /* 1580 * Returns the clientid4 to use for the given mntinfo4. Note that the 1581 * clientid can change if the caller drops mi_recovlock. 1582 */ 1583 1584 clientid4 1585 mi2clientid(mntinfo4_t *mi) 1586 { 1587 nfs4_server_t *sp; 1588 clientid4 clientid = 0; 1589 1590 /* this locks down sp if it is found */ 1591 sp = find_nfs4_server(mi); 1592 if (sp != NULL) { 1593 clientid = sp->clientid; 1594 mutex_exit(&sp->s_lock); 1595 nfs4_server_rele(sp); 1596 } 1597 return (clientid); 1598 } 1599 1600 /* 1601 * Return the current lease time for the server associated with the given 1602 * file. Note that the lease time could change immediately after this 1603 * call. 
1604 */ 1605 1606 time_t 1607 r2lease_time(rnode4_t *rp) 1608 { 1609 nfs4_server_t *sp; 1610 time_t lease_time; 1611 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 1612 1613 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0); 1614 1615 /* this locks down sp if it is found */ 1616 sp = find_nfs4_server(VTOMI4(RTOV4(rp))); 1617 1618 if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1619 if (sp != NULL) { 1620 mutex_exit(&sp->s_lock); 1621 nfs4_server_rele(sp); 1622 } 1623 nfs_rw_exit(&mi->mi_recovlock); 1624 return (1); /* 1 second */ 1625 } 1626 1627 ASSERT(sp != NULL); 1628 1629 lease_time = sp->s_lease_time; 1630 1631 mutex_exit(&sp->s_lock); 1632 nfs4_server_rele(sp); 1633 nfs_rw_exit(&mi->mi_recovlock); 1634 1635 return (lease_time); 1636 } 1637 1638 /* 1639 * Return a list with information about all the known open instances for 1640 * a filesystem. The caller must call r4releopenlist() when done with the 1641 * list. 1642 * 1643 * We are safe at looking at os_valid and os_pending_close across dropping 1644 * the 'os_sync_lock' to count up the number of open streams and then 1645 * allocate memory for the osp list due to: 1646 * -Looking at os_pending_close is safe since this routine is 1647 * only called via recovery, and os_pending_close can only be set via 1648 * a non-recovery operation (which are all blocked when recovery 1649 * is active). 1650 * 1651 * -Examining os_valid is safe since non-recovery operations, which 1652 * could potentially switch os_valid to 0, are blocked (via 1653 * nfs4_start_fop) and recovery is single-threaded per mntinfo4_t 1654 * (which means we are the only recovery thread potentially acting 1655 * on this open stream). 
1656 */ 1657 1658 nfs4_opinst_t * 1659 r4mkopenlist(mntinfo4_t *mi) 1660 { 1661 nfs4_opinst_t *reopenlist, *rep; 1662 rnode4_t *rp; 1663 vnode_t *vp; 1664 vfs_t *vfsp = mi->mi_vfsp; 1665 int numosp; 1666 nfs4_open_stream_t *osp; 1667 int index; 1668 open_delegation_type4 dtype; 1669 int hold_vnode; 1670 1671 reopenlist = NULL; 1672 1673 for (index = 0; index < rtable4size; index++) { 1674 rw_enter(&rtable4[index].r_lock, RW_READER); 1675 for (rp = rtable4[index].r_hashf; 1676 rp != (rnode4_t *)(&rtable4[index]); 1677 rp = rp->r_hashf) { 1678 1679 vp = RTOV4(rp); 1680 if (vp->v_vfsp != vfsp) 1681 continue; 1682 hold_vnode = 0; 1683 1684 mutex_enter(&rp->r_os_lock); 1685 1686 /* Count the number of valid open_streams of the file */ 1687 numosp = 0; 1688 for (osp = list_head(&rp->r_open_streams); osp != NULL; 1689 osp = list_next(&rp->r_open_streams, osp)) { 1690 mutex_enter(&osp->os_sync_lock); 1691 if (osp->os_valid && !osp->os_pending_close) 1692 numosp++; 1693 mutex_exit(&osp->os_sync_lock); 1694 } 1695 1696 /* Fill in the valid open streams per vp */ 1697 if (numosp > 0) { 1698 int j; 1699 1700 hold_vnode = 1; 1701 1702 /* 1703 * Add a new open instance to the list 1704 */ 1705 rep = kmem_zalloc(sizeof (*reopenlist), 1706 KM_SLEEP); 1707 rep->re_next = reopenlist; 1708 reopenlist = rep; 1709 1710 rep->re_vp = vp; 1711 rep->re_osp = kmem_zalloc( 1712 numosp * sizeof (*(rep->re_osp)), 1713 KM_SLEEP); 1714 rep->re_numosp = numosp; 1715 1716 j = 0; 1717 for (osp = list_head(&rp->r_open_streams); 1718 osp != NULL; 1719 osp = list_next(&rp->r_open_streams, osp)) { 1720 1721 mutex_enter(&osp->os_sync_lock); 1722 if (osp->os_valid && 1723 !osp->os_pending_close) { 1724 osp->os_ref_count++; 1725 rep->re_osp[j] = osp; 1726 j++; 1727 } 1728 mutex_exit(&osp->os_sync_lock); 1729 } 1730 /* 1731 * Assuming valid osp(s) stays valid between 1732 * the time obtaining j and numosp. 
1733 */ 1734 ASSERT(j == numosp); 1735 } 1736 1737 mutex_exit(&rp->r_os_lock); 1738 /* do this here to keep v_lock > r_os_lock */ 1739 if (hold_vnode) 1740 VN_HOLD(vp); 1741 mutex_enter(&rp->r_statev4_lock); 1742 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 1743 /* 1744 * If this rnode holds a delegation, 1745 * but if there are no valid open streams, 1746 * then just discard the delegation 1747 * without doing delegreturn. 1748 */ 1749 if (numosp > 0) 1750 rp->r_deleg_needs_recovery = 1751 rp->r_deleg_type; 1752 } 1753 /* Save the delegation type for use outside the lock */ 1754 dtype = rp->r_deleg_type; 1755 mutex_exit(&rp->r_statev4_lock); 1756 1757 /* 1758 * If we have a delegation then get rid of it. 1759 * We've set rp->r_deleg_needs_recovery so we have 1760 * enough information to recover. 1761 */ 1762 if (dtype != OPEN_DELEGATE_NONE) { 1763 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 1764 } 1765 } 1766 rw_exit(&rtable4[index].r_lock); 1767 } 1768 return (reopenlist); 1769 } 1770 1771 /* 1772 * Release the list of open instance references. 
1773 */ 1774 1775 void 1776 r4releopenlist(nfs4_opinst_t *reopenp) 1777 { 1778 nfs4_opinst_t *rep, *next; 1779 int i; 1780 1781 for (rep = reopenp; rep; rep = next) { 1782 next = rep->re_next; 1783 1784 for (i = 0; i < rep->re_numosp; i++) 1785 open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp)); 1786 1787 VN_RELE(rep->re_vp); 1788 kmem_free(rep->re_osp, 1789 rep->re_numosp * sizeof (*(rep->re_osp))); 1790 1791 kmem_free(rep, sizeof (*rep)); 1792 } 1793 } 1794 1795 int 1796 nfs4_rnode_init(void) 1797 { 1798 ulong_t nrnode4_max; 1799 int i; 1800 1801 /* 1802 * Compute the size of the rnode4 hash table 1803 */ 1804 if (nrnode <= 0) 1805 nrnode = ncsize; 1806 nrnode4_max = 1807 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4)); 1808 if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) { 1809 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 1810 "setting nrnode to max value of %ld", nrnode4_max); 1811 nrnode = nrnode4_max; 1812 } 1813 rtable4size = 1 << highbit(nrnode / rnode4_hashlen); 1814 rtable4mask = rtable4size - 1; 1815 1816 /* 1817 * Allocate and initialize the hash buckets 1818 */ 1819 rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP); 1820 for (i = 0; i < rtable4size; i++) { 1821 rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]); 1822 rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]); 1823 rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL); 1824 } 1825 1826 rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t), 1827 0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0); 1828 1829 return (0); 1830 } 1831 1832 int 1833 nfs4_rnode_fini(void) 1834 { 1835 int i; 1836 1837 /* 1838 * Deallocate the rnode hash queues 1839 */ 1840 kmem_cache_destroy(rnode4_cache); 1841 1842 for (i = 0; i < rtable4size; i++) 1843 rw_destroy(&rtable4[i].r_lock); 1844 1845 kmem_free(rtable4, rtable4size * sizeof (*rtable4)); 1846 1847 return (0); 1848 } 1849 1850 /* 1851 * Return non-zero if the given filehandle refers to the root filehandle 1852 * for the given rnode. 
1853 */ 1854 1855 static int 1856 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp) 1857 { 1858 int isroot; 1859 1860 isroot = 0; 1861 if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh)) 1862 isroot = 1; 1863 1864 return (isroot); 1865 } 1866 1867 /* 1868 * The r4_stub_* routines assume that the rnode is newly activated, and 1869 * that the caller either holds the hash bucket r_lock for this rnode as 1870 * RW_WRITER, or holds r_statelock. 1871 */ 1872 static void 1873 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type) 1874 { 1875 vnode_t *vp = RTOV4(rp); 1876 krwlock_t *hash_lock = &rp->r_hashq->r_lock; 1877 1878 ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock)); 1879 1880 rp->r_stub_type = type; 1881 1882 /* 1883 * Safely switch this vnode to the trigger vnodeops. 1884 * 1885 * Currently, we don't ever switch a trigger vnode back to using 1886 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that 1887 * a new v4 object is not a trigger, and it will already have the 1888 * correct v4 vnodeops by default. So, no "else" case required here. 1889 */ 1890 if (type != NFS4_STUB_NONE) 1891 vn_setops(vp, nfs4_trigger_vnodeops); 1892 } 1893 1894 void 1895 r4_stub_mirrormount(rnode4_t *rp) 1896 { 1897 r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT); 1898 } 1899 1900 void 1901 r4_stub_none(rnode4_t *rp) 1902 { 1903 r4_stub_set(rp, NFS4_STUB_NONE); 1904 } 1905 1906 #ifdef DEBUG 1907 1908 /* 1909 * Look in the rnode table for other rnodes that have the same filehandle. 
1910 * Assume the lock is held for the hash chain of checkrp 1911 */ 1912 1913 static void 1914 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp) 1915 { 1916 rnode4_t *rp; 1917 vnode_t *tvp; 1918 nfs4_fhandle_t fh, fh2; 1919 int index; 1920 1921 if (!r4_check_for_dups) 1922 return; 1923 1924 ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock)); 1925 1926 sfh4_copyval(checkrp->r_fh, &fh); 1927 1928 for (index = 0; index < rtable4size; index++) { 1929 1930 if (&rtable4[index] != checkrp->r_hashq) 1931 rw_enter(&rtable4[index].r_lock, RW_READER); 1932 1933 for (rp = rtable4[index].r_hashf; 1934 rp != (rnode4_t *)(&rtable4[index]); 1935 rp = rp->r_hashf) { 1936 1937 if (rp == checkrp) 1938 continue; 1939 1940 tvp = RTOV4(rp); 1941 if (tvp->v_vfsp != vfsp) 1942 continue; 1943 1944 sfh4_copyval(rp->r_fh, &fh2); 1945 if (nfs4cmpfhandle(&fh, &fh2) == 0) { 1946 cmn_err(CE_PANIC, "rnodes with same fs, fh " 1947 "(%p, %p)", (void *)checkrp, (void *)rp); 1948 } 1949 } 1950 1951 if (&rtable4[index] != checkrp->r_hashq) 1952 rw_exit(&rtable4[index].r_lock); 1953 } 1954 } 1955 1956 #endif /* DEBUG */ 1957