1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 
28 * All Rights Reserved 29 */ 30 31 32 #include <sys/param.h> 33 #include <sys/types.h> 34 #include <sys/systm.h> 35 #include <sys/cred.h> 36 #include <sys/proc.h> 37 #include <sys/user.h> 38 #include <sys/time.h> 39 #include <sys/buf.h> 40 #include <sys/vfs.h> 41 #include <sys/vnode.h> 42 #include <sys/socket.h> 43 #include <sys/uio.h> 44 #include <sys/tiuser.h> 45 #include <sys/swap.h> 46 #include <sys/errno.h> 47 #include <sys/debug.h> 48 #include <sys/kmem.h> 49 #include <sys/kstat.h> 50 #include <sys/cmn_err.h> 51 #include <sys/vtrace.h> 52 #include <sys/session.h> 53 #include <sys/dnlc.h> 54 #include <sys/bitmap.h> 55 #include <sys/acl.h> 56 #include <sys/ddi.h> 57 #include <sys/pathname.h> 58 #include <sys/flock.h> 59 #include <sys/dirent.h> 60 #include <sys/flock.h> 61 #include <sys/callb.h> 62 #include <sys/sdt.h> 63 64 #include <rpc/types.h> 65 #include <rpc/xdr.h> 66 #include <rpc/auth.h> 67 #include <rpc/rpcsec_gss.h> 68 #include <rpc/clnt.h> 69 70 #include <nfs/nfs.h> 71 #include <nfs/nfs_clnt.h> 72 #include <nfs/nfs_acl.h> 73 74 #include <nfs/nfs4.h> 75 #include <nfs/rnode4.h> 76 #include <nfs/nfs4_clnt.h> 77 78 /* 79 * The hash queues for the access to active and cached rnodes 80 * are organized as doubly linked lists. A reader/writer lock 81 * for each hash bucket is used to control access and to synchronize 82 * lookups, additions, and deletions from the hash queue. 83 * 84 * The rnode freelist is organized as a doubly linked list with 85 * a head pointer. Additions and deletions are synchronized via 86 * a single mutex. 87 * 88 * In order to add an rnode to the free list, it must be hashed into 89 * a hash queue and the exclusive lock to the hash queue be held. 90 * If an rnode is not hashed into a hash queue, then it is destroyed 91 * because it represents no valuable information that can be reused 92 * about the file. 
 * The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
118 * 119 * The lock ordering is: 120 * 121 * hash bucket lock -> vnode lock 122 * hash bucket lock -> freelist lock -> r_statelock 123 */ 124 r4hashq_t *rtable4; 125 126 static kmutex_t rp4freelist_lock; 127 static rnode4_t *rp4freelist = NULL; 128 static long rnode4_new = 0; 129 int rtable4size; 130 static int rtable4mask; 131 static struct kmem_cache *rnode4_cache; 132 static int rnode4_hashlen = 4; 133 134 static void r4inactive(rnode4_t *, cred_t *); 135 static vnode_t *make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *, 136 struct vnodeops *, 137 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 138 cred_t *), 139 int *, cred_t *); 140 static void rp4_rmfree(rnode4_t *); 141 int nfs4_free_data_reclaim(rnode4_t *); 142 static int nfs4_active_data_reclaim(rnode4_t *); 143 static int nfs4_free_reclaim(void); 144 static int nfs4_active_reclaim(void); 145 static int nfs4_rnode_reclaim(void); 146 static void nfs4_reclaim(void *); 147 static int isrootfh(nfs4_sharedfh_t *, rnode4_t *); 148 static void uninit_rnode4(rnode4_t *); 149 static void destroy_rnode4(rnode4_t *); 150 static void r4_stub_set(rnode4_t *, nfs4_stub_type_t); 151 152 #ifdef DEBUG 153 static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */ 154 static int nfs4_rnode_debug = 0; 155 /* if nonzero, kmem_cache_free() rnodes rather than place on freelist */ 156 static int nfs4_rnode_nofreelist = 0; 157 /* give messages on colliding shared filehandles */ 158 static void r4_dup_check(rnode4_t *, vfs_t *); 159 #endif 160 161 /* 162 * If the vnode has pages, run the list and check for any that are 163 * still dangling. We call this routine before putting an rnode on 164 * the free list. 
165 */ 166 static int 167 nfs4_dross_pages(vnode_t *vp) 168 { 169 page_t *pp; 170 kmutex_t *vphm; 171 172 vphm = page_vnode_mutex(vp); 173 mutex_enter(vphm); 174 if ((pp = vp->v_pages) != NULL) { 175 do { 176 if (pp->p_fsdata != C_NOCOMMIT) { 177 mutex_exit(vphm); 178 return (1); 179 } 180 } while ((pp = pp->p_vpnext) != vp->v_pages); 181 } 182 mutex_exit(vphm); 183 184 return (0); 185 } 186 187 /* 188 * Flush any pages left on this rnode. 189 */ 190 static void 191 r4flushpages(rnode4_t *rp, cred_t *cr) 192 { 193 vnode_t *vp; 194 int error; 195 196 /* 197 * Before freeing anything, wait until all asynchronous 198 * activity is done on this rnode. This will allow all 199 * asynchronous read ahead and write behind i/o's to 200 * finish. 201 */ 202 mutex_enter(&rp->r_statelock); 203 while (rp->r_count > 0) 204 cv_wait(&rp->r_cv, &rp->r_statelock); 205 mutex_exit(&rp->r_statelock); 206 207 /* 208 * Flush and invalidate all pages associated with the vnode. 209 */ 210 vp = RTOV4(rp); 211 if (nfs4_has_pages(vp)) { 212 ASSERT(vp->v_type != VCHR); 213 if ((rp->r_flags & R4DIRTY) && !rp->r_error) { 214 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 215 if (error && (error == ENOSPC || error == EDQUOT)) { 216 mutex_enter(&rp->r_statelock); 217 if (!rp->r_error) 218 rp->r_error = error; 219 mutex_exit(&rp->r_statelock); 220 } 221 } 222 nfs4_invalidate_pages(vp, (u_offset_t)0, cr); 223 } 224 } 225 226 /* 227 * Free the resources associated with an rnode. 228 */ 229 static void 230 r4inactive(rnode4_t *rp, cred_t *cr) 231 { 232 vnode_t *vp; 233 char *contents; 234 int size; 235 vsecattr_t *vsp; 236 vnode_t *xattr; 237 238 r4flushpages(rp, cr); 239 240 vp = RTOV4(rp); 241 242 /* 243 * Free any held caches which may be 244 * associated with this rnode. 
245 */ 246 mutex_enter(&rp->r_statelock); 247 contents = rp->r_symlink.contents; 248 size = rp->r_symlink.size; 249 rp->r_symlink.contents = NULL; 250 vsp = rp->r_secattr; 251 rp->r_secattr = NULL; 252 xattr = rp->r_xattr_dir; 253 rp->r_xattr_dir = NULL; 254 mutex_exit(&rp->r_statelock); 255 256 /* 257 * Free the access cache entries. 258 */ 259 (void) nfs4_access_purge_rp(rp); 260 261 /* 262 * Free the readdir cache entries. 263 */ 264 nfs4_purge_rddir_cache(vp); 265 266 /* 267 * Free the symbolic link cache. 268 */ 269 if (contents != NULL) { 270 271 kmem_free((void *)contents, size); 272 } 273 274 /* 275 * Free any cached ACL. 276 */ 277 if (vsp != NULL) 278 nfs4_acl_free_cache(vsp); 279 280 /* 281 * Release the cached xattr_dir 282 */ 283 if (xattr != NULL) 284 VN_RELE(xattr); 285 } 286 287 /* 288 * We have seen a case that the fh passed in is for "." which 289 * should be a VROOT node, however, the fh is different from the 290 * root fh stored in the mntinfo4_t. The invalid fh might be 291 * from a misbehaved server and will panic the client system at 292 * a later time. To avoid the panic, we drop the bad fh, use 293 * the root fh from mntinfo4_t, and print an error message 294 * for attention. 295 */ 296 nfs4_sharedfh_t * 297 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi, 298 int *wasbad) 299 { 300 char *s; 301 302 *wasbad = 0; 303 s = fn_name(nm); 304 ASSERT(strcmp(s, "..") != 0); 305 306 if ((s[0] == '.' 
&& s[1] == '\0') && fh && 307 !SFH4_SAME(mi->mi_rootfh, fh)) { 308 #ifdef DEBUG 309 nfs4_fhandle_t fhandle; 310 311 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 312 "Server %s returns a different " 313 "root filehandle for the path %s:", 314 mi->mi_curr_serv->sv_hostname, 315 mi->mi_curr_serv->sv_path); 316 317 /* print the bad fh */ 318 fhandle.fh_len = fh->sfh_fh.nfs_fh4_len; 319 bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 320 fhandle.fh_len); 321 nfs4_printfhandle(&fhandle); 322 323 /* print mi_rootfh */ 324 fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len; 325 bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf, 326 fhandle.fh_len); 327 nfs4_printfhandle(&fhandle); 328 #endif 329 /* use mi_rootfh instead; fh will be rele by the caller */ 330 fh = mi->mi_rootfh; 331 *wasbad = 1; 332 } 333 334 kmem_free(s, MAXNAMELEN); 335 return (fh); 336 } 337 338 void 339 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode, 340 hrtime_t t, cred_t *cr, int index) 341 { 342 int is_stub; 343 vattr_t *attr; 344 /* 345 * Don't add to attrcache if time overflow, but 346 * no need to check because either attr is null or the time 347 * values in it were processed by nfs4_time_ntov(), which checks 348 * for time overflows. 349 */ 350 attr = garp ? &garp->n4g_va : NULL; 351 352 if (attr) { 353 if (!newnode) { 354 rw_exit(&rtable4[index].r_lock); 355 #ifdef DEBUG 356 if (vp->v_type != attr->va_type && 357 vp->v_type != VNON && attr->va_type != VNON) { 358 zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN, 359 "makenfs4node: type (%d) doesn't " 360 "match type of found node at %p (%d)", 361 attr->va_type, (void *)vp, vp->v_type); 362 } 363 #endif 364 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 365 } else { 366 rnode4_t *rp = VTOR4(vp); 367 368 vp->v_type = attr->va_type; 369 vp->v_rdev = attr->va_rdev; 370 371 /* 372 * Turn this object into a "stub" object if we 373 * crossed an underlying server fs boundary. 
374 * To make this check, during mount we save the 375 * fsid of the server object being mounted. 376 * Here we compare this object's server fsid 377 * with the fsid we saved at mount. If they 378 * are different, we crossed server fs boundary. 379 * 380 * The stub type is set (or not) at rnode 381 * creation time and it never changes for life 382 * of the rnode. 383 * 384 * This stub will be for a mirror-mount, rather than 385 * a referral (the latter also sets R4SRVSTUB). 386 * 387 * The stub type is also set during RO failover, 388 * nfs4_remap_file(). 389 * 390 * We don't bother with taking r_state_lock to 391 * set the stub type because this is a new rnode 392 * and we're holding the hash bucket r_lock RW_WRITER. 393 * No other thread could have obtained access 394 * to this rnode. 395 */ 396 is_stub = 0; 397 if (garp->n4g_fsid_valid) { 398 fattr4_fsid ga_fsid = garp->n4g_fsid; 399 servinfo4_t *svp = rp->r_server; 400 401 rp->r_srv_fsid = ga_fsid; 402 403 (void) nfs_rw_enter_sig(&svp->sv_lock, 404 RW_READER, 0); 405 if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid)) 406 is_stub = 1; 407 nfs_rw_exit(&svp->sv_lock); 408 } 409 410 if (is_stub) 411 r4_stub_mirrormount(rp); 412 else 413 r4_stub_none(rp); 414 415 /* Can not cache partial attr */ 416 if (attr->va_mask == AT_ALL) 417 nfs4_attrcache_noinval(vp, garp, t); 418 else 419 PURGE_ATTRCACHE4(vp); 420 421 rw_exit(&rtable4[index].r_lock); 422 } 423 } else { 424 if (newnode) { 425 PURGE_ATTRCACHE4(vp); 426 } 427 rw_exit(&rtable4[index].r_lock); 428 } 429 } 430 431 /* 432 * Find or create an rnode based primarily on filehandle. To be 433 * used when dvp (vnode for parent directory) is not available; 434 * otherwise, makenfs4node() should be used. 435 * 436 * The nfs4_fname_t argument *npp is consumed and nulled out. 
437 */ 438 439 vnode_t * 440 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh, 441 nfs4_fname_t **npp, nfs4_ga_res_t *garp, 442 mntinfo4_t *mi, cred_t *cr, hrtime_t t) 443 { 444 vfs_t *vfsp = mi->mi_vfsp; 445 int newnode = 0; 446 vnode_t *vp; 447 rnode4_t *rp; 448 svnode_t *svp; 449 nfs4_fname_t *name, *svpname; 450 int index; 451 452 ASSERT(npp && *npp); 453 name = *npp; 454 *npp = NULL; 455 456 index = rtable4hash(sfh); 457 rw_enter(&rtable4[index].r_lock, RW_READER); 458 459 vp = make_rnode4(sfh, &rtable4[index], vfsp, 460 nfs4_vnodeops, nfs4_putapage, &newnode, cr); 461 462 svp = VTOSV(vp); 463 rp = VTOR4(vp); 464 if (newnode) { 465 svp->sv_forw = svp->sv_back = svp; 466 svp->sv_name = name; 467 if (psfh != NULL) 468 sfh4_hold(psfh); 469 svp->sv_dfh = psfh; 470 } else { 471 /* 472 * It is possible that due to a server 473 * side rename fnames have changed. 474 * update the fname here. 475 */ 476 mutex_enter(&rp->r_svlock); 477 svpname = svp->sv_name; 478 if (svp->sv_name != name) { 479 svp->sv_name = name; 480 mutex_exit(&rp->r_svlock); 481 fn_rele(&svpname); 482 } else { 483 mutex_exit(&rp->r_svlock); 484 fn_rele(&name); 485 } 486 } 487 488 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 489 r4_do_attrcache(vp, garp, newnode, t, cr, index); 490 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 491 492 return (vp); 493 } 494 495 /* 496 * Find or create a vnode for the given filehandle, filesystem, parent, and 497 * name. The reference to nm is consumed, so the caller must first do an 498 * fn_hold() if it wants to continue using nm after this call. 
499 */ 500 vnode_t * 501 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp, 502 hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm) 503 { 504 vnode_t *vp; 505 int newnode; 506 int index; 507 mntinfo4_t *mi = VFTOMI4(vfsp); 508 int had_badfh = 0; 509 rnode4_t *rp; 510 511 ASSERT(dvp != NULL); 512 513 fh = badrootfh_check(fh, nm, mi, &had_badfh); 514 515 index = rtable4hash(fh); 516 rw_enter(&rtable4[index].r_lock, RW_READER); 517 518 /* 519 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 520 */ 521 vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops, 522 nfs4_putapage, &newnode, cr); 523 524 rp = VTOR4(vp); 525 sv_activate(&vp, dvp, &nm, newnode); 526 if (dvp->v_flag & V_XATTRDIR) { 527 mutex_enter(&rp->r_statelock); 528 rp->r_flags |= R4ISXATTR; 529 mutex_exit(&rp->r_statelock); 530 } 531 532 /* if getting a bad file handle, do not cache the attributes. */ 533 if (had_badfh) { 534 rw_exit(&rtable4[index].r_lock); 535 return (vp); 536 } 537 538 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock)); 539 r4_do_attrcache(vp, garp, newnode, t, cr, index); 540 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread); 541 542 return (vp); 543 } 544 545 /* 546 * Hash on address of filehandle object. 547 * XXX totally untuned. 548 */ 549 550 int 551 rtable4hash(nfs4_sharedfh_t *fh) 552 { 553 return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask); 554 } 555 556 /* 557 * Find or create the vnode for the given filehandle and filesystem. 558 * *newnode is set to zero if the vnode already existed; non-zero if it had 559 * to be created. 560 * 561 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive. 
562 */ 563 564 static vnode_t * 565 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp, 566 struct vnodeops *vops, 567 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 568 int *newnode, cred_t *cr) 569 { 570 rnode4_t *rp; 571 rnode4_t *trp; 572 vnode_t *vp; 573 mntinfo4_t *mi; 574 575 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 576 577 mi = VFTOMI4(vfsp); 578 579 start: 580 if ((rp = r4find(rhtp, fh, vfsp)) != NULL) { 581 vp = RTOV4(rp); 582 *newnode = 0; 583 return (vp); 584 } 585 rw_exit(&rhtp->r_lock); 586 587 mutex_enter(&rp4freelist_lock); 588 589 if (rp4freelist != NULL && rnode4_new >= nrnode) { 590 rp = rp4freelist; 591 rp4_rmfree(rp); 592 mutex_exit(&rp4freelist_lock); 593 594 vp = RTOV4(rp); 595 596 if (rp->r_flags & R4HASHED) { 597 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 598 mutex_enter(&vp->v_lock); 599 if (vp->v_count > 1) { 600 vp->v_count--; 601 mutex_exit(&vp->v_lock); 602 rw_exit(&rp->r_hashq->r_lock); 603 rw_enter(&rhtp->r_lock, RW_READER); 604 goto start; 605 } 606 mutex_exit(&vp->v_lock); 607 rp4_rmhash_locked(rp); 608 rw_exit(&rp->r_hashq->r_lock); 609 } 610 611 r4inactive(rp, cr); 612 613 mutex_enter(&vp->v_lock); 614 if (vp->v_count > 1) { 615 vp->v_count--; 616 mutex_exit(&vp->v_lock); 617 rw_enter(&rhtp->r_lock, RW_READER); 618 goto start; 619 } 620 mutex_exit(&vp->v_lock); 621 vn_invalid(vp); 622 623 /* 624 * destroy old locks before bzero'ing and 625 * recreating the locks below. 626 */ 627 uninit_rnode4(rp); 628 629 /* 630 * Make sure that if rnode is recycled then 631 * VFS count is decremented properly before 632 * reuse. 
633 */ 634 VFS_RELE(vp->v_vfsp); 635 vn_reinit(vp); 636 } else { 637 vnode_t *new_vp; 638 639 mutex_exit(&rp4freelist_lock); 640 641 rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP); 642 new_vp = vn_alloc(KM_SLEEP); 643 644 atomic_add_long((ulong_t *)&rnode4_new, 1); 645 #ifdef DEBUG 646 clstat4_debug.nrnode.value.ui64++; 647 #endif 648 vp = new_vp; 649 } 650 651 bzero(rp, sizeof (*rp)); 652 rp->r_vnode = vp; 653 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 654 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 655 mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL); 656 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 657 mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL); 658 mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL); 659 rp->created_v4 = 0; 660 list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t), 661 offsetof(nfs4_open_stream_t, os_node)); 662 rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head; 663 rp->r_lo_head.lo_next_rnode = &rp->r_lo_head; 664 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 665 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 666 rp->r_flags = R4READDIRWATTR; 667 rp->r_fh = fh; 668 rp->r_hashq = rhtp; 669 sfh4_hold(rp->r_fh); 670 rp->r_server = mi->mi_curr_serv; 671 rp->r_deleg_type = OPEN_DELEGATE_NONE; 672 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE; 673 nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL); 674 675 rddir4_cache_create(rp); 676 rp->r_putapage = putapage; 677 vn_setops(vp, vops); 678 vp->v_data = (caddr_t)rp; 679 vp->v_vfsp = vfsp; 680 VFS_HOLD(vfsp); 681 vp->v_type = VNON; 682 if (isrootfh(fh, rp)) 683 vp->v_flag = VROOT; 684 vn_exists(vp); 685 686 /* 687 * There is a race condition if someone else 688 * alloc's the rnode while no locks are held, so we 689 * check again and recover if found. 
690 */ 691 rw_enter(&rhtp->r_lock, RW_WRITER); 692 if ((trp = r4find(rhtp, fh, vfsp)) != NULL) { 693 vp = RTOV4(trp); 694 *newnode = 0; 695 rw_exit(&rhtp->r_lock); 696 rp4_addfree(rp, cr); 697 rw_enter(&rhtp->r_lock, RW_READER); 698 return (vp); 699 } 700 rp4_addhash(rp); 701 *newnode = 1; 702 return (vp); 703 } 704 705 static void 706 uninit_rnode4(rnode4_t *rp) 707 { 708 vnode_t *vp = RTOV4(rp); 709 710 ASSERT(rp != NULL); 711 ASSERT(vp != NULL); 712 ASSERT(vp->v_count == 1); 713 ASSERT(rp->r_count == 0); 714 ASSERT(rp->r_mapcnt == 0); 715 if (rp->r_flags & R4LODANGLERS) { 716 nfs4_flush_lock_owners(rp); 717 } 718 ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head); 719 ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head); 720 ASSERT(!(rp->r_flags & R4HASHED)); 721 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 722 nfs4_clear_open_streams(rp); 723 list_destroy(&rp->r_open_streams); 724 725 /* 726 * Destroy the rddir cache first since we need to grab the r_statelock. 727 */ 728 mutex_enter(&rp->r_statelock); 729 rddir4_cache_destroy(rp); 730 mutex_exit(&rp->r_statelock); 731 sv_uninit(&rp->r_svnode); 732 sfh4_rele(&rp->r_fh); 733 nfs_rw_destroy(&rp->r_rwlock); 734 nfs_rw_destroy(&rp->r_lkserlock); 735 mutex_destroy(&rp->r_statelock); 736 mutex_destroy(&rp->r_statev4_lock); 737 mutex_destroy(&rp->r_os_lock); 738 cv_destroy(&rp->r_cv); 739 cv_destroy(&rp->r_commit.c_cv); 740 nfs_rw_destroy(&rp->r_deleg_recall_lock); 741 if (rp->r_flags & R4DELMAPLIST) 742 list_destroy(&rp->r_indelmap); 743 } 744 745 /* 746 * Put an rnode on the free list. 747 * 748 * Rnodes which were allocated above and beyond the normal limit 749 * are immediately freed. 
750 */ 751 void 752 rp4_addfree(rnode4_t *rp, cred_t *cr) 753 { 754 vnode_t *vp; 755 vnode_t *xattr; 756 struct vfs *vfsp; 757 758 vp = RTOV4(rp); 759 ASSERT(vp->v_count >= 1); 760 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 761 762 /* 763 * If we have too many rnodes allocated and there are no 764 * references to this rnode, or if the rnode is no longer 765 * accessible by it does not reside in the hash queues, 766 * or if an i/o error occurred while writing to the file, 767 * then just free it instead of putting it on the rnode 768 * freelist. 769 */ 770 vfsp = vp->v_vfsp; 771 if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) || 772 #ifdef DEBUG 773 (nfs4_rnode_nofreelist != 0) || 774 #endif 775 rp->r_error || (rp->r_flags & R4RECOVERR) || 776 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) { 777 if (rp->r_flags & R4HASHED) { 778 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 779 mutex_enter(&vp->v_lock); 780 if (vp->v_count > 1) { 781 vp->v_count--; 782 mutex_exit(&vp->v_lock); 783 rw_exit(&rp->r_hashq->r_lock); 784 return; 785 } 786 mutex_exit(&vp->v_lock); 787 rp4_rmhash_locked(rp); 788 rw_exit(&rp->r_hashq->r_lock); 789 } 790 791 /* 792 * Make sure we don't have a delegation on this rnode 793 * before destroying it. 794 */ 795 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 796 (void) nfs4delegreturn(rp, 797 NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN); 798 } 799 800 r4inactive(rp, cr); 801 802 /* 803 * Recheck the vnode reference count. We need to 804 * make sure that another reference has not been 805 * acquired while we were not holding v_lock. The 806 * rnode is not in the rnode hash queues; one 807 * way for a reference to have been acquired 808 * is for a VOP_PUTPAGE because the rnode was marked 809 * with R4DIRTY or for a modified page. This 810 * reference may have been acquired before our call 811 * to r4inactive. 
The i/o may have been completed, 812 * thus allowing r4inactive to complete, but the 813 * reference to the vnode may not have been released 814 * yet. In any case, the rnode can not be destroyed 815 * until the other references to this vnode have been 816 * released. The other references will take care of 817 * either destroying the rnode or placing it on the 818 * rnode freelist. If there are no other references, 819 * then the rnode may be safely destroyed. 820 */ 821 mutex_enter(&vp->v_lock); 822 if (vp->v_count > 1) { 823 vp->v_count--; 824 mutex_exit(&vp->v_lock); 825 return; 826 } 827 mutex_exit(&vp->v_lock); 828 829 destroy_rnode4(rp); 830 return; 831 } 832 833 /* 834 * Lock the hash queue and then recheck the reference count 835 * to ensure that no other threads have acquired a reference 836 * to indicate that the rnode should not be placed on the 837 * freelist. If another reference has been acquired, then 838 * just release this one and let the other thread complete 839 * the processing of adding this rnode to the freelist. 840 */ 841 again: 842 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 843 844 mutex_enter(&vp->v_lock); 845 if (vp->v_count > 1) { 846 vp->v_count--; 847 mutex_exit(&vp->v_lock); 848 rw_exit(&rp->r_hashq->r_lock); 849 return; 850 } 851 mutex_exit(&vp->v_lock); 852 853 /* 854 * Make sure we don't put an rnode with a delegation 855 * on the free list. 856 */ 857 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 858 rw_exit(&rp->r_hashq->r_lock); 859 (void) nfs4delegreturn(rp, 860 NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN); 861 goto again; 862 } 863 864 /* 865 * Now that we have the hash queue lock, and we know there 866 * are not anymore references on the vnode, check to make 867 * sure there aren't any open streams still on the rnode. 868 * If so, drop the hash queue lock, remove the open streams, 869 * and recheck the v_count. 
870 */ 871 mutex_enter(&rp->r_os_lock); 872 if (list_head(&rp->r_open_streams) != NULL) { 873 mutex_exit(&rp->r_os_lock); 874 rw_exit(&rp->r_hashq->r_lock); 875 if (nfs_zone() != VTOMI4(vp)->mi_zone) 876 nfs4_clear_open_streams(rp); 877 else 878 (void) nfs4close_all(vp, cr); 879 goto again; 880 } 881 mutex_exit(&rp->r_os_lock); 882 883 /* 884 * Before we put it on the freelist, make sure there are no pages. 885 * If there are, flush and commit of all of the dirty and 886 * uncommitted pages, assuming the file system isn't read only. 887 */ 888 if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) { 889 rw_exit(&rp->r_hashq->r_lock); 890 r4flushpages(rp, cr); 891 goto again; 892 } 893 894 /* 895 * Before we put it on the freelist, make sure there is no 896 * active xattr directory cached, the freelist will not 897 * have its entries r4inactive'd if there is still an active 898 * rnode, thus nothing in the freelist can hold another 899 * rnode active. 900 */ 901 xattr = rp->r_xattr_dir; 902 rp->r_xattr_dir = NULL; 903 904 /* 905 * If there is no cached data or metadata for this file, then 906 * put the rnode on the front of the freelist so that it will 907 * be reused before other rnodes which may have cached data or 908 * metadata associated with them. 909 */ 910 mutex_enter(&rp4freelist_lock); 911 if (rp4freelist == NULL) { 912 rp->r_freef = rp; 913 rp->r_freeb = rp; 914 rp4freelist = rp; 915 } else { 916 rp->r_freef = rp4freelist; 917 rp->r_freeb = rp4freelist->r_freeb; 918 rp4freelist->r_freeb->r_freef = rp; 919 rp4freelist->r_freeb = rp; 920 if (!nfs4_has_pages(vp) && rp->r_dir == NULL && 921 rp->r_symlink.contents == NULL && rp->r_secattr == NULL) 922 rp4freelist = rp; 923 } 924 mutex_exit(&rp4freelist_lock); 925 926 rw_exit(&rp->r_hashq->r_lock); 927 928 if (xattr) 929 VN_RELE(xattr); 930 } 931 932 /* 933 * Remove an rnode from the free list. 934 * 935 * The caller must be holding rp4freelist_lock and the rnode 936 * must be on the freelist. 
937 */ 938 static void 939 rp4_rmfree(rnode4_t *rp) 940 { 941 942 ASSERT(MUTEX_HELD(&rp4freelist_lock)); 943 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 944 945 if (rp == rp4freelist) { 946 rp4freelist = rp->r_freef; 947 if (rp == rp4freelist) 948 rp4freelist = NULL; 949 } 950 rp->r_freeb->r_freef = rp->r_freef; 951 rp->r_freef->r_freeb = rp->r_freeb; 952 953 rp->r_freef = rp->r_freeb = NULL; 954 } 955 956 /* 957 * Put a rnode in the hash table. 958 * 959 * The caller must be holding the exclusive hash queue lock 960 */ 961 void 962 rp4_addhash(rnode4_t *rp) 963 { 964 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 965 ASSERT(!(rp->r_flags & R4HASHED)); 966 967 #ifdef DEBUG 968 r4_dup_check(rp, RTOV4(rp)->v_vfsp); 969 #endif 970 971 rp->r_hashf = rp->r_hashq->r_hashf; 972 rp->r_hashq->r_hashf = rp; 973 rp->r_hashb = (rnode4_t *)rp->r_hashq; 974 rp->r_hashf->r_hashb = rp; 975 976 mutex_enter(&rp->r_statelock); 977 rp->r_flags |= R4HASHED; 978 mutex_exit(&rp->r_statelock); 979 } 980 981 /* 982 * Remove a rnode from the hash table. 983 * 984 * The caller must be holding the hash queue lock. 985 */ 986 void 987 rp4_rmhash_locked(rnode4_t *rp) 988 { 989 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 990 ASSERT(rp->r_flags & R4HASHED); 991 992 rp->r_hashb->r_hashf = rp->r_hashf; 993 rp->r_hashf->r_hashb = rp->r_hashb; 994 995 mutex_enter(&rp->r_statelock); 996 rp->r_flags &= ~R4HASHED; 997 mutex_exit(&rp->r_statelock); 998 } 999 1000 /* 1001 * Remove a rnode from the hash table. 1002 * 1003 * The caller must not be holding the hash queue lock. 1004 */ 1005 void 1006 rp4_rmhash(rnode4_t *rp) 1007 { 1008 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1009 rp4_rmhash_locked(rp); 1010 rw_exit(&rp->r_hashq->r_lock); 1011 } 1012 1013 /* 1014 * Lookup a rnode by fhandle. Ignores rnodes that had failed recovery. 1015 * Returns NULL if no match. If an rnode is returned, the reference count 1016 * on the master vnode is incremented. 
1017 * 1018 * The caller must be holding the hash queue lock, either shared or exclusive. 1019 */ 1020 rnode4_t * 1021 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp) 1022 { 1023 rnode4_t *rp; 1024 vnode_t *vp; 1025 1026 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 1027 1028 for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) { 1029 vp = RTOV4(rp); 1030 if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) { 1031 1032 mutex_enter(&rp->r_statelock); 1033 if (rp->r_flags & R4RECOVERR) { 1034 mutex_exit(&rp->r_statelock); 1035 continue; 1036 } 1037 mutex_exit(&rp->r_statelock); 1038 #ifdef DEBUG 1039 r4_dup_check(rp, vfsp); 1040 #endif 1041 if (rp->r_freef != NULL) { 1042 mutex_enter(&rp4freelist_lock); 1043 /* 1044 * If the rnode is on the freelist, 1045 * then remove it and use that reference 1046 * as the new reference. Otherwise, 1047 * need to increment the reference count. 1048 */ 1049 if (rp->r_freef != NULL) { 1050 rp4_rmfree(rp); 1051 mutex_exit(&rp4freelist_lock); 1052 } else { 1053 mutex_exit(&rp4freelist_lock); 1054 VN_HOLD(vp); 1055 } 1056 } else 1057 VN_HOLD(vp); 1058 1059 /* 1060 * if root vnode, set v_flag to indicate that 1061 */ 1062 if (isrootfh(fh, rp)) { 1063 if (!(vp->v_flag & VROOT)) { 1064 mutex_enter(&vp->v_lock); 1065 vp->v_flag |= VROOT; 1066 mutex_exit(&vp->v_lock); 1067 } 1068 } 1069 return (rp); 1070 } 1071 } 1072 return (NULL); 1073 } 1074 1075 /* 1076 * Lookup an rnode by fhandle. Just a wrapper for r4find() 1077 * that assumes the caller hasn't already got the lock 1078 * on the hash bucket. 1079 */ 1080 rnode4_t * 1081 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp) 1082 { 1083 rnode4_t *rp; 1084 int index; 1085 1086 index = rtable4hash(fh); 1087 rw_enter(&rtable4[index].r_lock, RW_READER); 1088 rp = r4find(&rtable4[index], fh, vfsp); 1089 rw_exit(&rtable4[index].r_lock); 1090 1091 return (rp); 1092 } 1093 1094 /* 1095 * Return >0 if there is a active vnode belonging to this vfs in the 1096 * rtable4 cache. 
1097 * 1098 * Several of these checks are done without holding the usual 1099 * locks. This is safe because destroy_rtable(), rp_addfree(), 1100 * etc. will redo the necessary checks before actually destroying 1101 * any rnodes. 1102 */ 1103 int 1104 check_rtable4(struct vfs *vfsp) 1105 { 1106 rnode4_t *rp; 1107 vnode_t *vp; 1108 int busy = NFSV4_RTABLE4_OK; 1109 int index; 1110 1111 for (index = 0; index < rtable4size; index++) { 1112 rw_enter(&rtable4[index].r_lock, RW_READER); 1113 1114 for (rp = rtable4[index].r_hashf; 1115 rp != (rnode4_t *)(&rtable4[index]); 1116 rp = rp->r_hashf) { 1117 1118 vp = RTOV4(rp); 1119 if (vp->v_vfsp == vfsp) { 1120 if (rp->r_freef == NULL) { 1121 busy = NFSV4_RTABLE4_NOT_FREE_LIST; 1122 } else if (nfs4_has_pages(vp) && 1123 (rp->r_flags & R4DIRTY)) { 1124 busy = NFSV4_RTABLE4_DIRTY_PAGES; 1125 } else if (rp->r_count > 0) { 1126 busy = NFSV4_RTABLE4_POS_R_COUNT; 1127 } 1128 1129 if (busy != NFSV4_RTABLE4_OK) { 1130 #ifdef DEBUG 1131 char *path; 1132 1133 path = fn_path(rp->r_svnode.sv_name); 1134 DTRACE_NFSV4_3(rnode__e__debug, 1135 int, busy, char *, path, 1136 rnode4_t *, rp); 1137 kmem_free(path, strlen(path)+1); 1138 #endif 1139 rw_exit(&rtable4[index].r_lock); 1140 return (busy); 1141 } 1142 } 1143 } 1144 rw_exit(&rtable4[index].r_lock); 1145 } 1146 return (busy); 1147 } 1148 1149 /* 1150 * Destroy inactive vnodes from the hash queues which 1151 * belong to this vfs. All of the vnodes should be inactive. 1152 * It is essential that we destroy all rnodes in case of 1153 * forced unmount as well as in normal unmount case. 
1154 */ 1155 1156 void 1157 destroy_rtable4(struct vfs *vfsp, cred_t *cr) 1158 { 1159 int index; 1160 vnode_t *vp; 1161 rnode4_t *rp, *r_hashf, *rlist; 1162 1163 rlist = NULL; 1164 1165 for (index = 0; index < rtable4size; index++) { 1166 rw_enter(&rtable4[index].r_lock, RW_WRITER); 1167 for (rp = rtable4[index].r_hashf; 1168 rp != (rnode4_t *)(&rtable4[index]); 1169 rp = r_hashf) { 1170 /* save the hash pointer before destroying */ 1171 r_hashf = rp->r_hashf; 1172 1173 vp = RTOV4(rp); 1174 if (vp->v_vfsp == vfsp) { 1175 mutex_enter(&rp4freelist_lock); 1176 if (rp->r_freef != NULL) { 1177 rp4_rmfree(rp); 1178 mutex_exit(&rp4freelist_lock); 1179 rp4_rmhash_locked(rp); 1180 rp->r_hashf = rlist; 1181 rlist = rp; 1182 } else 1183 mutex_exit(&rp4freelist_lock); 1184 } 1185 } 1186 rw_exit(&rtable4[index].r_lock); 1187 } 1188 1189 for (rp = rlist; rp != NULL; rp = r_hashf) { 1190 r_hashf = rp->r_hashf; 1191 /* 1192 * This call to rp4_addfree will end up destroying the 1193 * rnode, but in a safe way with the appropriate set 1194 * of checks done. 1195 */ 1196 rp4_addfree(rp, cr); 1197 } 1198 } 1199 1200 /* 1201 * This routine destroys all the resources of an rnode 1202 * and finally the rnode itself. 1203 */ 1204 static void 1205 destroy_rnode4(rnode4_t *rp) 1206 { 1207 vnode_t *vp; 1208 vfs_t *vfsp; 1209 1210 ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE); 1211 1212 vp = RTOV4(rp); 1213 vfsp = vp->v_vfsp; 1214 1215 uninit_rnode4(rp); 1216 atomic_add_long((ulong_t *)&rnode4_new, -1); 1217 #ifdef DEBUG 1218 clstat4_debug.nrnode.value.ui64--; 1219 #endif 1220 kmem_cache_free(rnode4_cache, rp); 1221 vn_invalid(vp); 1222 vn_free(vp); 1223 VFS_RELE(vfsp); 1224 } 1225 1226 /* 1227 * Invalidate the attributes on all rnodes forcing the next getattr 1228 * to go over the wire. Used to flush stale uid and gid mappings. 
1229 * Maybe done on a per vfsp, or all rnodes (vfsp == NULL) 1230 */ 1231 void 1232 nfs4_rnode_invalidate(struct vfs *vfsp) 1233 { 1234 int index; 1235 rnode4_t *rp; 1236 vnode_t *vp; 1237 1238 /* 1239 * Walk the hash queues looking for rnodes. 1240 */ 1241 for (index = 0; index < rtable4size; index++) { 1242 rw_enter(&rtable4[index].r_lock, RW_READER); 1243 for (rp = rtable4[index].r_hashf; 1244 rp != (rnode4_t *)(&rtable4[index]); 1245 rp = rp->r_hashf) { 1246 vp = RTOV4(rp); 1247 if (vfsp != NULL && vp->v_vfsp != vfsp) 1248 continue; 1249 1250 if (!mutex_tryenter(&rp->r_statelock)) 1251 continue; 1252 1253 /* 1254 * Expire the attributes by resetting the change 1255 * and attr timeout. 1256 */ 1257 rp->r_change = 0; 1258 PURGE_ATTRCACHE4_LOCKED(rp); 1259 mutex_exit(&rp->r_statelock); 1260 } 1261 rw_exit(&rtable4[index].r_lock); 1262 } 1263 } 1264 1265 /* 1266 * Flush all vnodes in this (or every) vfs. 1267 * Used by nfs_sync and by nfs_unmount. 1268 */ 1269 void 1270 r4flush(struct vfs *vfsp, cred_t *cr) 1271 { 1272 int index; 1273 rnode4_t *rp; 1274 vnode_t *vp, **vplist; 1275 long num, cnt; 1276 1277 /* 1278 * Check to see whether there is anything to do. 1279 */ 1280 num = rnode4_new; 1281 if (num == 0) 1282 return; 1283 1284 /* 1285 * Allocate a slot for all currently active rnodes on the 1286 * supposition that they all may need flushing. 1287 */ 1288 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 1289 cnt = 0; 1290 1291 /* 1292 * Walk the hash queues looking for rnodes with page 1293 * lists associated with them. Make a list of these 1294 * files. 
1295 */ 1296 for (index = 0; index < rtable4size; index++) { 1297 rw_enter(&rtable4[index].r_lock, RW_READER); 1298 for (rp = rtable4[index].r_hashf; 1299 rp != (rnode4_t *)(&rtable4[index]); 1300 rp = rp->r_hashf) { 1301 vp = RTOV4(rp); 1302 /* 1303 * Don't bother sync'ing a vp if it 1304 * is part of virtual swap device or 1305 * if VFS is read-only 1306 */ 1307 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 1308 continue; 1309 /* 1310 * If flushing all mounted file systems or 1311 * the vnode belongs to this vfs, has pages 1312 * and is marked as either dirty or mmap'd, 1313 * hold and add this vnode to the list of 1314 * vnodes to flush. 1315 */ 1316 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 1317 nfs4_has_pages(vp) && 1318 ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) { 1319 VN_HOLD(vp); 1320 vplist[cnt++] = vp; 1321 if (cnt == num) { 1322 rw_exit(&rtable4[index].r_lock); 1323 goto toomany; 1324 } 1325 } 1326 } 1327 rw_exit(&rtable4[index].r_lock); 1328 } 1329 toomany: 1330 1331 /* 1332 * Flush and release all of the files on the list. 1333 */ 1334 while (cnt-- > 0) { 1335 vp = vplist[cnt]; 1336 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 1337 VN_RELE(vp); 1338 } 1339 1340 /* 1341 * Free the space allocated to hold the list. 1342 */ 1343 kmem_free(vplist, num * sizeof (*vplist)); 1344 } 1345 1346 int 1347 nfs4_free_data_reclaim(rnode4_t *rp) 1348 { 1349 char *contents; 1350 vnode_t *xattr; 1351 int size; 1352 vsecattr_t *vsp; 1353 int freed; 1354 bool_t rdc = FALSE; 1355 1356 /* 1357 * Free any held caches which may 1358 * be associated with this rnode. 1359 */ 1360 mutex_enter(&rp->r_statelock); 1361 if (rp->r_dir != NULL) 1362 rdc = TRUE; 1363 contents = rp->r_symlink.contents; 1364 size = rp->r_symlink.size; 1365 rp->r_symlink.contents = NULL; 1366 vsp = rp->r_secattr; 1367 rp->r_secattr = NULL; 1368 xattr = rp->r_xattr_dir; 1369 rp->r_xattr_dir = NULL; 1370 mutex_exit(&rp->r_statelock); 1371 1372 /* 1373 * Free the access cache entries. 
1374 */ 1375 freed = nfs4_access_purge_rp(rp); 1376 1377 if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL) 1378 return (freed); 1379 1380 /* 1381 * Free the readdir cache entries, incompletely if we can't block. 1382 */ 1383 nfs4_purge_rddir_cache(RTOV4(rp)); 1384 1385 /* 1386 * Free the symbolic link cache. 1387 */ 1388 if (contents != NULL) { 1389 1390 kmem_free((void *)contents, size); 1391 } 1392 1393 /* 1394 * Free any cached ACL. 1395 */ 1396 if (vsp != NULL) 1397 nfs4_acl_free_cache(vsp); 1398 1399 /* 1400 * Release the xattr directory vnode 1401 */ 1402 if (xattr != NULL) 1403 VN_RELE(xattr); 1404 1405 return (1); 1406 } 1407 1408 static int 1409 nfs4_active_data_reclaim(rnode4_t *rp) 1410 { 1411 char *contents; 1412 vnode_t *xattr = NULL; 1413 int size; 1414 vsecattr_t *vsp; 1415 int freed; 1416 bool_t rdc = FALSE; 1417 1418 /* 1419 * Free any held credentials and caches which 1420 * may be associated with this rnode. 1421 */ 1422 if (!mutex_tryenter(&rp->r_statelock)) 1423 return (0); 1424 contents = rp->r_symlink.contents; 1425 size = rp->r_symlink.size; 1426 rp->r_symlink.contents = NULL; 1427 vsp = rp->r_secattr; 1428 rp->r_secattr = NULL; 1429 if (rp->r_dir != NULL) 1430 rdc = TRUE; 1431 /* 1432 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed 1433 * on the same r_hashq queue. We are not mandated to free all caches. 1434 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the 1435 * rnode 'rp' is freed or put on the free list. 1436 */ 1437 if (rp->r_xattr_dir && VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) { 1438 xattr = rp->r_xattr_dir; 1439 rp->r_xattr_dir = NULL; 1440 } 1441 mutex_exit(&rp->r_statelock); 1442 1443 /* 1444 * Free the access cache entries. 1445 */ 1446 freed = nfs4_access_purge_rp(rp); 1447 1448 if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL) 1449 return (freed); 1450 1451 /* 1452 * Free the symbolic link cache. 
1453 */ 1454 if (contents != NULL) { 1455 1456 kmem_free((void *)contents, size); 1457 } 1458 1459 /* 1460 * Free any cached ACL. 1461 */ 1462 if (vsp != NULL) 1463 nfs4_acl_free_cache(vsp); 1464 1465 nfs4_purge_rddir_cache(RTOV4(rp)); 1466 1467 /* 1468 * Release the xattr directory vnode 1469 */ 1470 if (xattr != NULL) 1471 VN_RELE(xattr); 1472 1473 return (1); 1474 } 1475 1476 static int 1477 nfs4_free_reclaim(void) 1478 { 1479 int freed; 1480 rnode4_t *rp; 1481 1482 #ifdef DEBUG 1483 clstat4_debug.f_reclaim.value.ui64++; 1484 #endif 1485 freed = 0; 1486 mutex_enter(&rp4freelist_lock); 1487 rp = rp4freelist; 1488 if (rp != NULL) { 1489 do { 1490 if (nfs4_free_data_reclaim(rp)) 1491 freed = 1; 1492 } while ((rp = rp->r_freef) != rp4freelist); 1493 } 1494 mutex_exit(&rp4freelist_lock); 1495 return (freed); 1496 } 1497 1498 static int 1499 nfs4_active_reclaim(void) 1500 { 1501 int freed; 1502 int index; 1503 rnode4_t *rp; 1504 1505 #ifdef DEBUG 1506 clstat4_debug.a_reclaim.value.ui64++; 1507 #endif 1508 freed = 0; 1509 for (index = 0; index < rtable4size; index++) { 1510 rw_enter(&rtable4[index].r_lock, RW_READER); 1511 for (rp = rtable4[index].r_hashf; 1512 rp != (rnode4_t *)(&rtable4[index]); 1513 rp = rp->r_hashf) { 1514 if (nfs4_active_data_reclaim(rp)) 1515 freed = 1; 1516 } 1517 rw_exit(&rtable4[index].r_lock); 1518 } 1519 return (freed); 1520 } 1521 1522 static int 1523 nfs4_rnode_reclaim(void) 1524 { 1525 int freed; 1526 rnode4_t *rp; 1527 vnode_t *vp; 1528 1529 #ifdef DEBUG 1530 clstat4_debug.r_reclaim.value.ui64++; 1531 #endif 1532 freed = 0; 1533 mutex_enter(&rp4freelist_lock); 1534 while ((rp = rp4freelist) != NULL) { 1535 rp4_rmfree(rp); 1536 mutex_exit(&rp4freelist_lock); 1537 if (rp->r_flags & R4HASHED) { 1538 vp = RTOV4(rp); 1539 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 1540 mutex_enter(&vp->v_lock); 1541 if (vp->v_count > 1) { 1542 vp->v_count--; 1543 mutex_exit(&vp->v_lock); 1544 rw_exit(&rp->r_hashq->r_lock); 1545 mutex_enter(&rp4freelist_lock); 
1546 continue; 1547 } 1548 mutex_exit(&vp->v_lock); 1549 rp4_rmhash_locked(rp); 1550 rw_exit(&rp->r_hashq->r_lock); 1551 } 1552 /* 1553 * This call to rp_addfree will end up destroying the 1554 * rnode, but in a safe way with the appropriate set 1555 * of checks done. 1556 */ 1557 rp4_addfree(rp, CRED()); 1558 mutex_enter(&rp4freelist_lock); 1559 } 1560 mutex_exit(&rp4freelist_lock); 1561 return (freed); 1562 } 1563 1564 /*ARGSUSED*/ 1565 static void 1566 nfs4_reclaim(void *cdrarg) 1567 { 1568 #ifdef DEBUG 1569 clstat4_debug.reclaim.value.ui64++; 1570 #endif 1571 if (nfs4_free_reclaim()) 1572 return; 1573 1574 if (nfs4_active_reclaim()) 1575 return; 1576 1577 (void) nfs4_rnode_reclaim(); 1578 } 1579 1580 /* 1581 * Returns the clientid4 to use for the given mntinfo4. Note that the 1582 * clientid can change if the caller drops mi_recovlock. 1583 */ 1584 1585 clientid4 1586 mi2clientid(mntinfo4_t *mi) 1587 { 1588 nfs4_server_t *sp; 1589 clientid4 clientid = 0; 1590 1591 /* this locks down sp if it is found */ 1592 sp = find_nfs4_server(mi); 1593 if (sp != NULL) { 1594 clientid = sp->clientid; 1595 mutex_exit(&sp->s_lock); 1596 nfs4_server_rele(sp); 1597 } 1598 return (clientid); 1599 } 1600 1601 /* 1602 * Return the current lease time for the server associated with the given 1603 * file. Note that the lease time could change immediately after this 1604 * call. 
1605 */ 1606 1607 time_t 1608 r2lease_time(rnode4_t *rp) 1609 { 1610 nfs4_server_t *sp; 1611 time_t lease_time; 1612 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 1613 1614 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0); 1615 1616 /* this locks down sp if it is found */ 1617 sp = find_nfs4_server(VTOMI4(RTOV4(rp))); 1618 1619 if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1620 if (sp != NULL) { 1621 mutex_exit(&sp->s_lock); 1622 nfs4_server_rele(sp); 1623 } 1624 nfs_rw_exit(&mi->mi_recovlock); 1625 return (1); /* 1 second */ 1626 } 1627 1628 ASSERT(sp != NULL); 1629 1630 lease_time = sp->s_lease_time; 1631 1632 mutex_exit(&sp->s_lock); 1633 nfs4_server_rele(sp); 1634 nfs_rw_exit(&mi->mi_recovlock); 1635 1636 return (lease_time); 1637 } 1638 1639 /* 1640 * Return a list with information about all the known open instances for 1641 * a filesystem. The caller must call r4releopenlist() when done with the 1642 * list. 1643 * 1644 * We are safe at looking at os_valid and os_pending_close across dropping 1645 * the 'os_sync_lock' to count up the number of open streams and then 1646 * allocate memory for the osp list due to: 1647 * -Looking at os_pending_close is safe since this routine is 1648 * only called via recovery, and os_pending_close can only be set via 1649 * a non-recovery operation (which are all blocked when recovery 1650 * is active). 1651 * 1652 * -Examining os_valid is safe since non-recovery operations, which 1653 * could potentially switch os_valid to 0, are blocked (via 1654 * nfs4_start_fop) and recovery is single-threaded per mntinfo4_t 1655 * (which means we are the only recovery thread potentially acting 1656 * on this open stream). 
1657 */ 1658 1659 nfs4_opinst_t * 1660 r4mkopenlist(mntinfo4_t *mi) 1661 { 1662 nfs4_opinst_t *reopenlist, *rep; 1663 rnode4_t *rp; 1664 vnode_t *vp; 1665 vfs_t *vfsp = mi->mi_vfsp; 1666 int numosp; 1667 nfs4_open_stream_t *osp; 1668 int index; 1669 open_delegation_type4 dtype; 1670 int hold_vnode; 1671 1672 reopenlist = NULL; 1673 1674 for (index = 0; index < rtable4size; index++) { 1675 rw_enter(&rtable4[index].r_lock, RW_READER); 1676 for (rp = rtable4[index].r_hashf; 1677 rp != (rnode4_t *)(&rtable4[index]); 1678 rp = rp->r_hashf) { 1679 1680 vp = RTOV4(rp); 1681 if (vp->v_vfsp != vfsp) 1682 continue; 1683 hold_vnode = 0; 1684 1685 mutex_enter(&rp->r_os_lock); 1686 1687 /* Count the number of valid open_streams of the file */ 1688 numosp = 0; 1689 for (osp = list_head(&rp->r_open_streams); osp != NULL; 1690 osp = list_next(&rp->r_open_streams, osp)) { 1691 mutex_enter(&osp->os_sync_lock); 1692 if (osp->os_valid && !osp->os_pending_close) 1693 numosp++; 1694 mutex_exit(&osp->os_sync_lock); 1695 } 1696 1697 /* Fill in the valid open streams per vp */ 1698 if (numosp > 0) { 1699 int j; 1700 1701 hold_vnode = 1; 1702 1703 /* 1704 * Add a new open instance to the list 1705 */ 1706 rep = kmem_zalloc(sizeof (*reopenlist), 1707 KM_SLEEP); 1708 rep->re_next = reopenlist; 1709 reopenlist = rep; 1710 1711 rep->re_vp = vp; 1712 rep->re_osp = kmem_zalloc( 1713 numosp * sizeof (*(rep->re_osp)), 1714 KM_SLEEP); 1715 rep->re_numosp = numosp; 1716 1717 j = 0; 1718 for (osp = list_head(&rp->r_open_streams); 1719 osp != NULL; 1720 osp = list_next(&rp->r_open_streams, osp)) { 1721 1722 mutex_enter(&osp->os_sync_lock); 1723 if (osp->os_valid && 1724 !osp->os_pending_close) { 1725 osp->os_ref_count++; 1726 rep->re_osp[j] = osp; 1727 j++; 1728 } 1729 mutex_exit(&osp->os_sync_lock); 1730 } 1731 /* 1732 * Assuming valid osp(s) stays valid between 1733 * the time obtaining j and numosp. 
1734 */ 1735 ASSERT(j == numosp); 1736 } 1737 1738 mutex_exit(&rp->r_os_lock); 1739 /* do this here to keep v_lock > r_os_lock */ 1740 if (hold_vnode) 1741 VN_HOLD(vp); 1742 mutex_enter(&rp->r_statev4_lock); 1743 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) { 1744 /* 1745 * If this rnode holds a delegation, 1746 * but if there are no valid open streams, 1747 * then just discard the delegation 1748 * without doing delegreturn. 1749 */ 1750 if (numosp > 0) 1751 rp->r_deleg_needs_recovery = 1752 rp->r_deleg_type; 1753 } 1754 /* Save the delegation type for use outside the lock */ 1755 dtype = rp->r_deleg_type; 1756 mutex_exit(&rp->r_statev4_lock); 1757 1758 /* 1759 * If we have a delegation then get rid of it. 1760 * We've set rp->r_deleg_needs_recovery so we have 1761 * enough information to recover. 1762 */ 1763 if (dtype != OPEN_DELEGATE_NONE) { 1764 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD); 1765 } 1766 } 1767 rw_exit(&rtable4[index].r_lock); 1768 } 1769 return (reopenlist); 1770 } 1771 1772 /* 1773 * Given a filesystem id, check to see if any rnodes 1774 * within this fsid reside in the rnode cache, other 1775 * than one we know about. 1776 * 1777 * Return 1 if an rnode is found, 0 otherwise 1778 */ 1779 int 1780 r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid) 1781 { 1782 rnode4_t *rp; 1783 vnode_t *vp; 1784 vfs_t *vfsp = mi->mi_vfsp; 1785 fattr4_fsid *fsid; 1786 int index, found = 0; 1787 1788 for (index = 0; index < rtable4size; index++) { 1789 rw_enter(&rtable4[index].r_lock, RW_READER); 1790 for (rp = rtable4[index].r_hashf; 1791 rp != (rnode4_t *)(&rtable4[index]); 1792 rp = rp->r_hashf) { 1793 1794 vp = RTOV4(rp); 1795 if (vp->v_vfsp != vfsp) 1796 continue; 1797 1798 /* 1799 * XXX there might be a case where a 1800 * replicated fs may have the same fsid 1801 * across two different servers. 
This 1802 * check isn't good enough in that case 1803 */ 1804 fsid = &rp->r_srv_fsid; 1805 if (FATTR4_FSID_EQ(moved_fsid, fsid)) { 1806 found = 1; 1807 break; 1808 } 1809 } 1810 rw_exit(&rtable4[index].r_lock); 1811 1812 if (found) 1813 break; 1814 } 1815 return (found); 1816 } 1817 1818 /* 1819 * Release the list of open instance references. 1820 */ 1821 1822 void 1823 r4releopenlist(nfs4_opinst_t *reopenp) 1824 { 1825 nfs4_opinst_t *rep, *next; 1826 int i; 1827 1828 for (rep = reopenp; rep; rep = next) { 1829 next = rep->re_next; 1830 1831 for (i = 0; i < rep->re_numosp; i++) 1832 open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp)); 1833 1834 VN_RELE(rep->re_vp); 1835 kmem_free(rep->re_osp, 1836 rep->re_numosp * sizeof (*(rep->re_osp))); 1837 1838 kmem_free(rep, sizeof (*rep)); 1839 } 1840 } 1841 1842 int 1843 nfs4_rnode_init(void) 1844 { 1845 ulong_t nrnode4_max; 1846 int i; 1847 1848 /* 1849 * Compute the size of the rnode4 hash table 1850 */ 1851 if (nrnode <= 0) 1852 nrnode = ncsize; 1853 nrnode4_max = 1854 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4)); 1855 if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) { 1856 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 1857 "setting nrnode to max value of %ld", nrnode4_max); 1858 nrnode = nrnode4_max; 1859 } 1860 rtable4size = 1 << highbit(nrnode / rnode4_hashlen); 1861 rtable4mask = rtable4size - 1; 1862 1863 /* 1864 * Allocate and initialize the hash buckets 1865 */ 1866 rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP); 1867 for (i = 0; i < rtable4size; i++) { 1868 rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]); 1869 rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]); 1870 rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL); 1871 } 1872 1873 rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t), 1874 0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0); 1875 1876 return (0); 1877 } 1878 1879 int 1880 nfs4_rnode_fini(void) 1881 { 1882 int i; 1883 1884 /* 1885 * Deallocate the rnode hash queues 
1886 */ 1887 kmem_cache_destroy(rnode4_cache); 1888 1889 for (i = 0; i < rtable4size; i++) 1890 rw_destroy(&rtable4[i].r_lock); 1891 1892 kmem_free(rtable4, rtable4size * sizeof (*rtable4)); 1893 1894 return (0); 1895 } 1896 1897 /* 1898 * Return non-zero if the given filehandle refers to the root filehandle 1899 * for the given rnode. 1900 */ 1901 1902 static int 1903 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp) 1904 { 1905 int isroot; 1906 1907 isroot = 0; 1908 if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh)) 1909 isroot = 1; 1910 1911 return (isroot); 1912 } 1913 1914 /* 1915 * The r4_stub_* routines assume that the rnode is newly activated, and 1916 * that the caller either holds the hash bucket r_lock for this rnode as 1917 * RW_WRITER, or holds r_statelock. 1918 */ 1919 static void 1920 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type) 1921 { 1922 vnode_t *vp = RTOV4(rp); 1923 krwlock_t *hash_lock = &rp->r_hashq->r_lock; 1924 1925 ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock)); 1926 1927 rp->r_stub_type = type; 1928 1929 /* 1930 * Safely switch this vnode to the trigger vnodeops. 1931 * 1932 * Currently, we don't ever switch a trigger vnode back to using 1933 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that 1934 * a new v4 object is not a trigger, and it will already have the 1935 * correct v4 vnodeops by default. So, no "else" case required here. 
1936 */ 1937 if (type != NFS4_STUB_NONE) 1938 vn_setops(vp, nfs4_trigger_vnodeops); 1939 } 1940 1941 void 1942 r4_stub_mirrormount(rnode4_t *rp) 1943 { 1944 r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT); 1945 } 1946 1947 void 1948 r4_stub_referral(rnode4_t *rp) 1949 { 1950 DTRACE_PROBE1(nfs4clnt__func__referral__moved, 1951 vnode_t *, RTOV4(rp)); 1952 r4_stub_set(rp, NFS4_STUB_REFERRAL); 1953 } 1954 1955 void 1956 r4_stub_none(rnode4_t *rp) 1957 { 1958 r4_stub_set(rp, NFS4_STUB_NONE); 1959 } 1960 1961 #ifdef DEBUG 1962 1963 /* 1964 * Look in the rnode table for other rnodes that have the same filehandle. 1965 * Assume the lock is held for the hash chain of checkrp 1966 */ 1967 1968 static void 1969 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp) 1970 { 1971 rnode4_t *rp; 1972 vnode_t *tvp; 1973 nfs4_fhandle_t fh, fh2; 1974 int index; 1975 1976 if (!r4_check_for_dups) 1977 return; 1978 1979 ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock)); 1980 1981 sfh4_copyval(checkrp->r_fh, &fh); 1982 1983 for (index = 0; index < rtable4size; index++) { 1984 1985 if (&rtable4[index] != checkrp->r_hashq) 1986 rw_enter(&rtable4[index].r_lock, RW_READER); 1987 1988 for (rp = rtable4[index].r_hashf; 1989 rp != (rnode4_t *)(&rtable4[index]); 1990 rp = rp->r_hashf) { 1991 1992 if (rp == checkrp) 1993 continue; 1994 1995 tvp = RTOV4(rp); 1996 if (tvp->v_vfsp != vfsp) 1997 continue; 1998 1999 sfh4_copyval(rp->r_fh, &fh2); 2000 if (nfs4cmpfhandle(&fh, &fh2) == 0) { 2001 cmn_err(CE_PANIC, "rnodes with same fs, fh " 2002 "(%p, %p)", (void *)checkrp, (void *)rp); 2003 } 2004 } 2005 2006 if (&rtable4[index] != checkrp->r_hashq) 2007 rw_exit(&rtable4[index].r_lock); 2008 } 2009 } 2010 2011 #endif /* DEBUG */ 2012