xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs4_rnode.c (revision 96d9f183facd90dbbc2268c9a51689be0b6a0b46)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/proc.h>
37 #include <sys/user.h>
38 #include <sys/time.h>
39 #include <sys/buf.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/socket.h>
43 #include <sys/uio.h>
44 #include <sys/tiuser.h>
45 #include <sys/swap.h>
46 #include <sys/errno.h>
47 #include <sys/debug.h>
48 #include <sys/kmem.h>
49 #include <sys/kstat.h>
50 #include <sys/cmn_err.h>
51 #include <sys/vtrace.h>
52 #include <sys/session.h>
53 #include <sys/dnlc.h>
54 #include <sys/bitmap.h>
55 #include <sys/acl.h>
56 #include <sys/ddi.h>
57 #include <sys/pathname.h>
58 #include <sys/flock.h>
59 #include <sys/dirent.h>
60 #include <sys/flock.h>
61 #include <sys/callb.h>
62 #include <sys/sdt.h>
63 
64 #include <rpc/types.h>
65 #include <rpc/xdr.h>
66 #include <rpc/auth.h>
67 #include <rpc/rpcsec_gss.h>
68 #include <rpc/clnt.h>
69 
70 #include <nfs/nfs.h>
71 #include <nfs/nfs_clnt.h>
72 #include <nfs/nfs_acl.h>
73 
74 #include <nfs/nfs4.h>
75 #include <nfs/rnode4.h>
76 #include <nfs/nfs4_clnt.h>
77 
78 /*
79  * The hash queues for the access to active and cached rnodes
80  * are organized as doubly linked lists.  A reader/writer lock
81  * for each hash bucket is used to control access and to synchronize
82  * lookups, additions, and deletions from the hash queue.
83  *
84  * The rnode freelist is organized as a doubly linked list with
85  * a head pointer.  Additions and deletions are synchronized via
86  * a single mutex.
87  *
88  * In order to add an rnode to the free list, it must be hashed into
89  * a hash queue and the exclusive lock to the hash queue be held.
90  * If an rnode is not hashed into a hash queue, then it is destroyed
91  * because it represents no valuable information that can be reused
92  * about the file.  The exclusive lock to the hash queue must be
93  * held in order to prevent a lookup in the hash queue from finding
94  * the rnode and using it and assuming that the rnode is not on the
95  * freelist.  The lookup in the hash queue will have the hash queue
96  * locked, either exclusive or shared.
97  *
98  * The vnode reference count for each rnode is not allowed to drop
99  * below 1.  This prevents external entities, such as the VM
100  * subsystem, from acquiring references to vnodes already on the
101  * freelist and then trying to place them back on the freelist
102  * when their reference is released.  This means that when an
103  * rnode is looked up in the hash queues, then either the rnode
104  * is removed from the freelist and that reference is transferred to
105  * the new reference or the vnode reference count must be incremented
106  * accordingly.  The mutex for the freelist must be held in order to
107  * accurately test to see if the rnode is on the freelist or not.
108  * The hash queue lock might be held shared and it is possible that
109  * two different threads may race to remove the rnode from the
110  * freelist.  This race can be resolved by holding the mutex for the
111  * freelist.  Please note that the mutex for the freelist does not
112  * need to be held if the rnode is not on the freelist.  It can not be
113  * placed on the freelist due to the requirement that the thread
114  * putting the rnode on the freelist must hold the exclusive lock
115  * to the hash queue and the thread doing the lookup in the hash
116  * queue is holding either a shared or exclusive lock to the hash
117  * queue.
118  *
119  * The lock ordering is:
120  *
121  *	hash bucket lock -> vnode lock
122  *	hash bucket lock -> freelist lock -> r_statelock
123  */
124 r4hashq_t *rtable4;
125 
126 static kmutex_t rp4freelist_lock;
127 static rnode4_t *rp4freelist = NULL;
128 static long rnode4_new = 0;
129 int rtable4size;
130 static int rtable4mask;
131 static struct kmem_cache *rnode4_cache;
132 static int rnode4_hashlen = 4;
133 
134 static void	r4inactive(rnode4_t *, cred_t *);
135 static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
136 		    struct vnodeops *,
137 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
138 		    cred_t *),
139 		    int *, cred_t *);
140 static void	rp4_rmfree(rnode4_t *);
141 int		nfs4_free_data_reclaim(rnode4_t *);
142 static int	nfs4_active_data_reclaim(rnode4_t *);
143 static int	nfs4_free_reclaim(void);
144 static int	nfs4_active_reclaim(void);
145 static int	nfs4_rnode_reclaim(void);
146 static void	nfs4_reclaim(void *);
147 static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
148 static void	uninit_rnode4(rnode4_t *);
149 static void	destroy_rnode4(rnode4_t *);
150 static void	r4_stub_set(rnode4_t *, nfs4_stub_type_t);
151 
152 #ifdef DEBUG
153 static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
154 static int nfs4_rnode_debug = 0;
155 /* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
156 static int nfs4_rnode_nofreelist = 0;
157 /* give messages on colliding shared filehandles */
158 static void	r4_dup_check(rnode4_t *, vfs_t *);
159 #endif
160 
161 /*
162  * If the vnode has pages, run the list and check for any that are
163  * still dangling.  We call this routine before putting an rnode on
164  * the free list.
165  */
166 static int
167 nfs4_dross_pages(vnode_t *vp)
168 {
169 	page_t *pp;
170 	kmutex_t *vphm;
171 
172 	vphm = page_vnode_mutex(vp);
173 	mutex_enter(vphm);
174 	if ((pp = vp->v_pages) != NULL) {
175 		do {
176 			if (pp->p_fsdata != C_NOCOMMIT) {
177 				mutex_exit(vphm);
178 				return (1);
179 			}
180 		} while ((pp = pp->p_vpnext) != vp->v_pages);
181 	}
182 	mutex_exit(vphm);
183 
184 	return (0);
185 }
186 
187 /*
188  * Flush any pages left on this rnode.
189  */
190 static void
191 r4flushpages(rnode4_t *rp, cred_t *cr)
192 {
193 	vnode_t *vp;
194 	int error;
195 
196 	/*
197 	 * Before freeing anything, wait until all asynchronous
198 	 * activity is done on this rnode.  This will allow all
199 	 * asynchronous read ahead and write behind i/o's to
200 	 * finish.
201 	 */
202 	mutex_enter(&rp->r_statelock);
203 	while (rp->r_count > 0)
204 		cv_wait(&rp->r_cv, &rp->r_statelock);
205 	mutex_exit(&rp->r_statelock);
206 
207 	/*
208 	 * Flush and invalidate all pages associated with the vnode.
209 	 */
210 	vp = RTOV4(rp);
211 	if (nfs4_has_pages(vp)) {
212 		ASSERT(vp->v_type != VCHR);
213 		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
214 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
215 			if (error && (error == ENOSPC || error == EDQUOT)) {
216 				mutex_enter(&rp->r_statelock);
217 				if (!rp->r_error)
218 					rp->r_error = error;
219 				mutex_exit(&rp->r_statelock);
220 			}
221 		}
222 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
223 	}
224 }
225 
226 /*
227  * Free the resources associated with an rnode.
228  */
229 static void
230 r4inactive(rnode4_t *rp, cred_t *cr)
231 {
232 	vnode_t *vp;
233 	char *contents;
234 	int size;
235 	vsecattr_t *vsp;
236 	vnode_t *xattr;
237 
238 	r4flushpages(rp, cr);
239 
240 	vp = RTOV4(rp);
241 
242 	/*
243 	 * Free any held caches which may be
244 	 * associated with this rnode.
245 	 */
246 	mutex_enter(&rp->r_statelock);
247 	contents = rp->r_symlink.contents;
248 	size = rp->r_symlink.size;
249 	rp->r_symlink.contents = NULL;
250 	vsp = rp->r_secattr;
251 	rp->r_secattr = NULL;
252 	xattr = rp->r_xattr_dir;
253 	rp->r_xattr_dir = NULL;
254 	mutex_exit(&rp->r_statelock);
255 
256 	/*
257 	 * Free the access cache entries.
258 	 */
259 	(void) nfs4_access_purge_rp(rp);
260 
261 	/*
262 	 * Free the readdir cache entries.
263 	 */
264 	nfs4_purge_rddir_cache(vp);
265 
266 	/*
267 	 * Free the symbolic link cache.
268 	 */
269 	if (contents != NULL) {
270 
271 		kmem_free((void *)contents, size);
272 	}
273 
274 	/*
275 	 * Free any cached ACL.
276 	 */
277 	if (vsp != NULL)
278 		nfs4_acl_free_cache(vsp);
279 
280 	/*
281 	 * Release the cached xattr_dir
282 	 */
283 	if (xattr != NULL)
284 		VN_RELE(xattr);
285 }
286 
287 /*
288  * We have seen a case that the fh passed in is for "." which
289  * should be a VROOT node, however, the fh is different from the
290  * root fh stored in the mntinfo4_t. The invalid fh might be
291  * from a misbehaved server and will panic the client system at
292  * a later time. To avoid the panic, we drop the bad fh, use
293  * the root fh from mntinfo4_t, and print an error message
294  * for attention.
295  */
296 nfs4_sharedfh_t *
297 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
298     int *wasbad)
299 {
300 	char *s;
301 
302 	*wasbad = 0;
303 	s = fn_name(nm);
304 	ASSERT(strcmp(s, "..") != 0);
305 
306 	if ((s[0] == '.' && s[1] == '\0') && fh &&
307 	    !SFH4_SAME(mi->mi_rootfh, fh)) {
308 #ifdef DEBUG
309 		nfs4_fhandle_t fhandle;
310 
311 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
312 		    "Server %s returns a different "
313 		    "root filehandle for the path %s:",
314 		    mi->mi_curr_serv->sv_hostname,
315 		    mi->mi_curr_serv->sv_path);
316 
317 		/* print the bad fh */
318 		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
319 		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
320 		    fhandle.fh_len);
321 		nfs4_printfhandle(&fhandle);
322 
323 		/* print mi_rootfh */
324 		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
325 		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
326 		    fhandle.fh_len);
327 		nfs4_printfhandle(&fhandle);
328 #endif
329 		/* use mi_rootfh instead; fh will be rele by the caller */
330 		fh = mi->mi_rootfh;
331 		*wasbad = 1;
332 	}
333 
334 	kmem_free(s, MAXNAMELEN);
335 	return (fh);
336 }
337 
338 void
339 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
340     hrtime_t t, cred_t *cr, int index)
341 {
342 	int is_stub;
343 	vattr_t *attr;
344 	/*
345 	 * Don't add to attrcache if time overflow, but
346 	 * no need to check because either attr is null or the time
347 	 * values in it were processed by nfs4_time_ntov(), which checks
348 	 * for time overflows.
349 	 */
350 	attr = garp ? &garp->n4g_va : NULL;
351 
352 	if (attr) {
353 		if (!newnode) {
354 			rw_exit(&rtable4[index].r_lock);
355 #ifdef DEBUG
356 			if (vp->v_type != attr->va_type &&
357 			    vp->v_type != VNON && attr->va_type != VNON) {
358 				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
359 				    "makenfs4node: type (%d) doesn't "
360 				    "match type of found node at %p (%d)",
361 				    attr->va_type, (void *)vp, vp->v_type);
362 			}
363 #endif
364 			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
365 		} else {
366 			rnode4_t *rp = VTOR4(vp);
367 
368 			vp->v_type = attr->va_type;
369 			vp->v_rdev = attr->va_rdev;
370 
371 			/*
372 			 * Turn this object into a "stub" object if we
373 			 * crossed an underlying server fs boundary.
374 			 * To make this check, during mount we save the
375 			 * fsid of the server object being mounted.
376 			 * Here we compare this object's server fsid
377 			 * with the fsid we saved at mount.  If they
378 			 * are different, we crossed server fs boundary.
379 			 *
380 			 * The stub type is set (or not) at rnode
381 			 * creation time and it never changes for life
382 			 * of the rnode.
383 			 *
384 			 * This stub will be for a mirror-mount, rather than
385 			 * a referral (the latter also sets R4SRVSTUB).
386 			 *
387 			 * The stub type is also set during RO failover,
388 			 * nfs4_remap_file().
389 			 *
390 			 * We don't bother with taking r_state_lock to
391 			 * set the stub type because this is a new rnode
392 			 * and we're holding the hash bucket r_lock RW_WRITER.
393 			 * No other thread could have obtained access
394 			 * to this rnode.
395 			 */
396 			is_stub = 0;
397 			if (garp->n4g_fsid_valid) {
398 				fattr4_fsid ga_fsid = garp->n4g_fsid;
399 				servinfo4_t *svp = rp->r_server;
400 
401 				rp->r_srv_fsid = ga_fsid;
402 
403 				(void) nfs_rw_enter_sig(&svp->sv_lock,
404 				    RW_READER, 0);
405 				if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
406 					is_stub = 1;
407 				nfs_rw_exit(&svp->sv_lock);
408 			}
409 
410 			if (is_stub)
411 				r4_stub_mirrormount(rp);
412 			else
413 				r4_stub_none(rp);
414 
415 			/* Can not cache partial attr */
416 			if (attr->va_mask == AT_ALL)
417 				nfs4_attrcache_noinval(vp, garp, t);
418 			else
419 				PURGE_ATTRCACHE4(vp);
420 
421 			rw_exit(&rtable4[index].r_lock);
422 		}
423 	} else {
424 		if (newnode) {
425 			PURGE_ATTRCACHE4(vp);
426 		}
427 		rw_exit(&rtable4[index].r_lock);
428 	}
429 }
430 
431 /*
432  * Find or create an rnode based primarily on filehandle.  To be
433  * used when dvp (vnode for parent directory) is not available;
434  * otherwise, makenfs4node() should be used.
435  *
436  * The nfs4_fname_t argument *npp is consumed and nulled out.
437  */
438 
439 vnode_t *
440 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
441     nfs4_fname_t **npp, nfs4_ga_res_t *garp,
442     mntinfo4_t *mi, cred_t *cr, hrtime_t t)
443 {
444 	vfs_t *vfsp = mi->mi_vfsp;
445 	int newnode = 0;
446 	vnode_t *vp;
447 	rnode4_t *rp;
448 	svnode_t *svp;
449 	nfs4_fname_t *name, *svpname;
450 	int index;
451 
452 	ASSERT(npp && *npp);
453 	name = *npp;
454 	*npp = NULL;
455 
456 	index = rtable4hash(sfh);
457 	rw_enter(&rtable4[index].r_lock, RW_READER);
458 
459 	vp = make_rnode4(sfh, &rtable4[index], vfsp,
460 	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
461 
462 	svp = VTOSV(vp);
463 	rp = VTOR4(vp);
464 	if (newnode) {
465 		svp->sv_forw = svp->sv_back = svp;
466 		svp->sv_name = name;
467 		if (psfh != NULL)
468 			sfh4_hold(psfh);
469 		svp->sv_dfh = psfh;
470 	} else {
471 		/*
472 		 * It is possible that due to a server
473 		 * side rename fnames have changed.
474 		 * update the fname here.
475 		 */
476 		mutex_enter(&rp->r_svlock);
477 		svpname = svp->sv_name;
478 		if (svp->sv_name != name) {
479 			svp->sv_name = name;
480 			mutex_exit(&rp->r_svlock);
481 			fn_rele(&svpname);
482 		} else {
483 			mutex_exit(&rp->r_svlock);
484 			fn_rele(&name);
485 		}
486 	}
487 
488 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
489 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
490 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
491 
492 	return (vp);
493 }
494 
495 /*
496  * Find or create a vnode for the given filehandle, filesystem, parent, and
497  * name.  The reference to nm is consumed, so the caller must first do an
498  * fn_hold() if it wants to continue using nm after this call.
499  */
500 vnode_t *
501 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
502     hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
503 {
504 	vnode_t *vp;
505 	int newnode;
506 	int index;
507 	mntinfo4_t *mi = VFTOMI4(vfsp);
508 	int had_badfh = 0;
509 	rnode4_t *rp;
510 
511 	ASSERT(dvp != NULL);
512 
513 	fh = badrootfh_check(fh, nm, mi, &had_badfh);
514 
515 	index = rtable4hash(fh);
516 	rw_enter(&rtable4[index].r_lock, RW_READER);
517 
518 	/*
519 	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
520 	 */
521 	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
522 	    nfs4_putapage, &newnode, cr);
523 
524 	rp = VTOR4(vp);
525 	sv_activate(&vp, dvp, &nm, newnode);
526 	if (dvp->v_flag & V_XATTRDIR) {
527 		mutex_enter(&rp->r_statelock);
528 		rp->r_flags |= R4ISXATTR;
529 		mutex_exit(&rp->r_statelock);
530 	}
531 
532 	/* if getting a bad file handle, do not cache the attributes. */
533 	if (had_badfh) {
534 		rw_exit(&rtable4[index].r_lock);
535 		return (vp);
536 	}
537 
538 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
539 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
540 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
541 
542 	return (vp);
543 }
544 
545 /*
546  * Hash on address of filehandle object.
547  * XXX totally untuned.
548  */
549 
550 int
551 rtable4hash(nfs4_sharedfh_t *fh)
552 {
553 	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
554 }
555 
556 /*
557  * Find or create the vnode for the given filehandle and filesystem.
558  * *newnode is set to zero if the vnode already existed; non-zero if it had
559  * to be created.
560  *
561  * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
562  */
563 
564 static vnode_t *
565 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
566     struct vnodeops *vops,
567     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
568     int *newnode, cred_t *cr)
569 {
570 	rnode4_t *rp;
571 	rnode4_t *trp;
572 	vnode_t *vp;
573 	mntinfo4_t *mi;
574 
575 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
576 
577 	mi = VFTOMI4(vfsp);
578 
579 start:
580 	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
581 		vp = RTOV4(rp);
582 		*newnode = 0;
583 		return (vp);
584 	}
585 	rw_exit(&rhtp->r_lock);
586 
587 	mutex_enter(&rp4freelist_lock);
588 
589 	if (rp4freelist != NULL && rnode4_new >= nrnode) {
590 		rp = rp4freelist;
591 		rp4_rmfree(rp);
592 		mutex_exit(&rp4freelist_lock);
593 
594 		vp = RTOV4(rp);
595 
596 		if (rp->r_flags & R4HASHED) {
597 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
598 			mutex_enter(&vp->v_lock);
599 			if (vp->v_count > 1) {
600 				vp->v_count--;
601 				mutex_exit(&vp->v_lock);
602 				rw_exit(&rp->r_hashq->r_lock);
603 				rw_enter(&rhtp->r_lock, RW_READER);
604 				goto start;
605 			}
606 			mutex_exit(&vp->v_lock);
607 			rp4_rmhash_locked(rp);
608 			rw_exit(&rp->r_hashq->r_lock);
609 		}
610 
611 		r4inactive(rp, cr);
612 
613 		mutex_enter(&vp->v_lock);
614 		if (vp->v_count > 1) {
615 			vp->v_count--;
616 			mutex_exit(&vp->v_lock);
617 			rw_enter(&rhtp->r_lock, RW_READER);
618 			goto start;
619 		}
620 		mutex_exit(&vp->v_lock);
621 		vn_invalid(vp);
622 
623 		/*
624 		 * destroy old locks before bzero'ing and
625 		 * recreating the locks below.
626 		 */
627 		uninit_rnode4(rp);
628 
629 		/*
630 		 * Make sure that if rnode is recycled then
631 		 * VFS count is decremented properly before
632 		 * reuse.
633 		 */
634 		VFS_RELE(vp->v_vfsp);
635 		vn_reinit(vp);
636 	} else {
637 		vnode_t *new_vp;
638 
639 		mutex_exit(&rp4freelist_lock);
640 
641 		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
642 		new_vp = vn_alloc(KM_SLEEP);
643 
644 		atomic_add_long((ulong_t *)&rnode4_new, 1);
645 #ifdef DEBUG
646 		clstat4_debug.nrnode.value.ui64++;
647 #endif
648 		vp = new_vp;
649 	}
650 
651 	bzero(rp, sizeof (*rp));
652 	rp->r_vnode = vp;
653 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
654 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
655 	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
656 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
657 	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
658 	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
659 	rp->created_v4 = 0;
660 	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
661 	    offsetof(nfs4_open_stream_t, os_node));
662 	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
663 	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
664 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
665 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
666 	rp->r_flags = R4READDIRWATTR;
667 	rp->r_fh = fh;
668 	rp->r_hashq = rhtp;
669 	sfh4_hold(rp->r_fh);
670 	rp->r_server = mi->mi_curr_serv;
671 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
672 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
673 	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
674 
675 	rddir4_cache_create(rp);
676 	rp->r_putapage = putapage;
677 	vn_setops(vp, vops);
678 	vp->v_data = (caddr_t)rp;
679 	vp->v_vfsp = vfsp;
680 	VFS_HOLD(vfsp);
681 	vp->v_type = VNON;
682 	if (isrootfh(fh, rp))
683 		vp->v_flag = VROOT;
684 	vn_exists(vp);
685 
686 	/*
687 	 * There is a race condition if someone else
688 	 * alloc's the rnode while no locks are held, so we
689 	 * check again and recover if found.
690 	 */
691 	rw_enter(&rhtp->r_lock, RW_WRITER);
692 	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
693 		vp = RTOV4(trp);
694 		*newnode = 0;
695 		rw_exit(&rhtp->r_lock);
696 		rp4_addfree(rp, cr);
697 		rw_enter(&rhtp->r_lock, RW_READER);
698 		return (vp);
699 	}
700 	rp4_addhash(rp);
701 	*newnode = 1;
702 	return (vp);
703 }
704 
705 static void
706 uninit_rnode4(rnode4_t *rp)
707 {
708 	vnode_t *vp = RTOV4(rp);
709 
710 	ASSERT(rp != NULL);
711 	ASSERT(vp != NULL);
712 	ASSERT(vp->v_count == 1);
713 	ASSERT(rp->r_count == 0);
714 	ASSERT(rp->r_mapcnt == 0);
715 	if (rp->r_flags & R4LODANGLERS) {
716 		nfs4_flush_lock_owners(rp);
717 	}
718 	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
719 	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
720 	ASSERT(!(rp->r_flags & R4HASHED));
721 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
722 	nfs4_clear_open_streams(rp);
723 	list_destroy(&rp->r_open_streams);
724 
725 	/*
726 	 * Destroy the rddir cache first since we need to grab the r_statelock.
727 	 */
728 	mutex_enter(&rp->r_statelock);
729 	rddir4_cache_destroy(rp);
730 	mutex_exit(&rp->r_statelock);
731 	sv_uninit(&rp->r_svnode);
732 	sfh4_rele(&rp->r_fh);
733 	nfs_rw_destroy(&rp->r_rwlock);
734 	nfs_rw_destroy(&rp->r_lkserlock);
735 	mutex_destroy(&rp->r_statelock);
736 	mutex_destroy(&rp->r_statev4_lock);
737 	mutex_destroy(&rp->r_os_lock);
738 	cv_destroy(&rp->r_cv);
739 	cv_destroy(&rp->r_commit.c_cv);
740 	nfs_rw_destroy(&rp->r_deleg_recall_lock);
741 	if (rp->r_flags & R4DELMAPLIST)
742 		list_destroy(&rp->r_indelmap);
743 }
744 
745 /*
746  * Put an rnode on the free list.
747  *
748  * Rnodes which were allocated above and beyond the normal limit
749  * are immediately freed.
750  */
751 void
752 rp4_addfree(rnode4_t *rp, cred_t *cr)
753 {
754 	vnode_t *vp;
755 	vnode_t *xattr;
756 	struct vfs *vfsp;
757 
758 	vp = RTOV4(rp);
759 	ASSERT(vp->v_count >= 1);
760 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
761 
762 	/*
763 	 * If we have too many rnodes allocated and there are no
764 	 * references to this rnode, or if the rnode is no longer
765 	 * accessible by it does not reside in the hash queues,
766 	 * or if an i/o error occurred while writing to the file,
767 	 * then just free it instead of putting it on the rnode
768 	 * freelist.
769 	 */
770 	vfsp = vp->v_vfsp;
771 	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
772 #ifdef DEBUG
773 	    (nfs4_rnode_nofreelist != 0) ||
774 #endif
775 	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
776 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
777 		if (rp->r_flags & R4HASHED) {
778 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
779 			mutex_enter(&vp->v_lock);
780 			if (vp->v_count > 1) {
781 				vp->v_count--;
782 				mutex_exit(&vp->v_lock);
783 				rw_exit(&rp->r_hashq->r_lock);
784 				return;
785 			}
786 			mutex_exit(&vp->v_lock);
787 			rp4_rmhash_locked(rp);
788 			rw_exit(&rp->r_hashq->r_lock);
789 		}
790 
791 		/*
792 		 * Make sure we don't have a delegation on this rnode
793 		 * before destroying it.
794 		 */
795 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
796 			(void) nfs4delegreturn(rp,
797 			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
798 		}
799 
800 		r4inactive(rp, cr);
801 
802 		/*
803 		 * Recheck the vnode reference count.  We need to
804 		 * make sure that another reference has not been
805 		 * acquired while we were not holding v_lock.  The
806 		 * rnode is not in the rnode hash queues; one
807 		 * way for a reference to have been acquired
808 		 * is for a VOP_PUTPAGE because the rnode was marked
809 		 * with R4DIRTY or for a modified page.  This
810 		 * reference may have been acquired before our call
811 		 * to r4inactive.  The i/o may have been completed,
812 		 * thus allowing r4inactive to complete, but the
813 		 * reference to the vnode may not have been released
814 		 * yet.  In any case, the rnode can not be destroyed
815 		 * until the other references to this vnode have been
816 		 * released.  The other references will take care of
817 		 * either destroying the rnode or placing it on the
818 		 * rnode freelist.  If there are no other references,
819 		 * then the rnode may be safely destroyed.
820 		 */
821 		mutex_enter(&vp->v_lock);
822 		if (vp->v_count > 1) {
823 			vp->v_count--;
824 			mutex_exit(&vp->v_lock);
825 			return;
826 		}
827 		mutex_exit(&vp->v_lock);
828 
829 		destroy_rnode4(rp);
830 		return;
831 	}
832 
833 	/*
834 	 * Lock the hash queue and then recheck the reference count
835 	 * to ensure that no other threads have acquired a reference
836 	 * to indicate that the rnode should not be placed on the
837 	 * freelist.  If another reference has been acquired, then
838 	 * just release this one and let the other thread complete
839 	 * the processing of adding this rnode to the freelist.
840 	 */
841 again:
842 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
843 
844 	mutex_enter(&vp->v_lock);
845 	if (vp->v_count > 1) {
846 		vp->v_count--;
847 		mutex_exit(&vp->v_lock);
848 		rw_exit(&rp->r_hashq->r_lock);
849 		return;
850 	}
851 	mutex_exit(&vp->v_lock);
852 
853 	/*
854 	 * Make sure we don't put an rnode with a delegation
855 	 * on the free list.
856 	 */
857 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
858 		rw_exit(&rp->r_hashq->r_lock);
859 		(void) nfs4delegreturn(rp,
860 		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
861 		goto again;
862 	}
863 
864 	/*
865 	 * Now that we have the hash queue lock, and we know there
866 	 * are not anymore references on the vnode, check to make
867 	 * sure there aren't any open streams still on the rnode.
868 	 * If so, drop the hash queue lock, remove the open streams,
869 	 * and recheck the v_count.
870 	 */
871 	mutex_enter(&rp->r_os_lock);
872 	if (list_head(&rp->r_open_streams) != NULL) {
873 		mutex_exit(&rp->r_os_lock);
874 		rw_exit(&rp->r_hashq->r_lock);
875 		if (nfs_zone() != VTOMI4(vp)->mi_zone)
876 			nfs4_clear_open_streams(rp);
877 		else
878 			(void) nfs4close_all(vp, cr);
879 		goto again;
880 	}
881 	mutex_exit(&rp->r_os_lock);
882 
883 	/*
884 	 * Before we put it on the freelist, make sure there are no pages.
885 	 * If there are, flush and commit of all of the dirty and
886 	 * uncommitted pages, assuming the file system isn't read only.
887 	 */
888 	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
889 		rw_exit(&rp->r_hashq->r_lock);
890 		r4flushpages(rp, cr);
891 		goto again;
892 	}
893 
894 	/*
895 	 * Before we put it on the freelist, make sure there is no
896 	 * active xattr directory cached, the freelist will not
897 	 * have its entries r4inactive'd if there is still an active
898 	 * rnode, thus nothing in the freelist can hold another
899 	 * rnode active.
900 	 */
901 	xattr = rp->r_xattr_dir;
902 	rp->r_xattr_dir = NULL;
903 
904 	/*
905 	 * If there is no cached data or metadata for this file, then
906 	 * put the rnode on the front of the freelist so that it will
907 	 * be reused before other rnodes which may have cached data or
908 	 * metadata associated with them.
909 	 */
910 	mutex_enter(&rp4freelist_lock);
911 	if (rp4freelist == NULL) {
912 		rp->r_freef = rp;
913 		rp->r_freeb = rp;
914 		rp4freelist = rp;
915 	} else {
916 		rp->r_freef = rp4freelist;
917 		rp->r_freeb = rp4freelist->r_freeb;
918 		rp4freelist->r_freeb->r_freef = rp;
919 		rp4freelist->r_freeb = rp;
920 		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
921 		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
922 			rp4freelist = rp;
923 	}
924 	mutex_exit(&rp4freelist_lock);
925 
926 	rw_exit(&rp->r_hashq->r_lock);
927 
928 	if (xattr)
929 		VN_RELE(xattr);
930 }
931 
932 /*
933  * Remove an rnode from the free list.
934  *
935  * The caller must be holding rp4freelist_lock and the rnode
936  * must be on the freelist.
937  */
938 static void
939 rp4_rmfree(rnode4_t *rp)
940 {
941 
942 	ASSERT(MUTEX_HELD(&rp4freelist_lock));
943 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
944 
945 	if (rp == rp4freelist) {
946 		rp4freelist = rp->r_freef;
947 		if (rp == rp4freelist)
948 			rp4freelist = NULL;
949 	}
950 	rp->r_freeb->r_freef = rp->r_freef;
951 	rp->r_freef->r_freeb = rp->r_freeb;
952 
953 	rp->r_freef = rp->r_freeb = NULL;
954 }
955 
956 /*
957  * Put a rnode in the hash table.
958  *
959  * The caller must be holding the exclusive hash queue lock
960  */
961 void
962 rp4_addhash(rnode4_t *rp)
963 {
964 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
965 	ASSERT(!(rp->r_flags & R4HASHED));
966 
967 #ifdef DEBUG
968 	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
969 #endif
970 
971 	rp->r_hashf = rp->r_hashq->r_hashf;
972 	rp->r_hashq->r_hashf = rp;
973 	rp->r_hashb = (rnode4_t *)rp->r_hashq;
974 	rp->r_hashf->r_hashb = rp;
975 
976 	mutex_enter(&rp->r_statelock);
977 	rp->r_flags |= R4HASHED;
978 	mutex_exit(&rp->r_statelock);
979 }
980 
981 /*
982  * Remove a rnode from the hash table.
983  *
984  * The caller must be holding the hash queue lock.
985  */
986 void
987 rp4_rmhash_locked(rnode4_t *rp)
988 {
989 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
990 	ASSERT(rp->r_flags & R4HASHED);
991 
992 	rp->r_hashb->r_hashf = rp->r_hashf;
993 	rp->r_hashf->r_hashb = rp->r_hashb;
994 
995 	mutex_enter(&rp->r_statelock);
996 	rp->r_flags &= ~R4HASHED;
997 	mutex_exit(&rp->r_statelock);
998 }
999 
1000 /*
1001  * Remove a rnode from the hash table.
1002  *
1003  * The caller must not be holding the hash queue lock.
1004  */
1005 void
1006 rp4_rmhash(rnode4_t *rp)
1007 {
1008 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1009 	rp4_rmhash_locked(rp);
1010 	rw_exit(&rp->r_hashq->r_lock);
1011 }
1012 
1013 /*
1014  * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
1015  * Returns NULL if no match.  If an rnode is returned, the reference count
1016  * on the master vnode is incremented.
1017  *
1018  * The caller must be holding the hash queue lock, either shared or exclusive.
1019  */
1020 rnode4_t *
1021 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
1022 {
1023 	rnode4_t *rp;
1024 	vnode_t *vp;
1025 
1026 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
1027 
1028 	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
1029 		vp = RTOV4(rp);
1030 		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
1031 
1032 			mutex_enter(&rp->r_statelock);
1033 			if (rp->r_flags & R4RECOVERR) {
1034 				mutex_exit(&rp->r_statelock);
1035 				continue;
1036 			}
1037 			mutex_exit(&rp->r_statelock);
1038 #ifdef DEBUG
1039 			r4_dup_check(rp, vfsp);
1040 #endif
1041 			if (rp->r_freef != NULL) {
1042 				mutex_enter(&rp4freelist_lock);
1043 				/*
1044 				 * If the rnode is on the freelist,
1045 				 * then remove it and use that reference
1046 				 * as the new reference.  Otherwise,
1047 				 * need to increment the reference count.
1048 				 */
1049 				if (rp->r_freef != NULL) {
1050 					rp4_rmfree(rp);
1051 					mutex_exit(&rp4freelist_lock);
1052 				} else {
1053 					mutex_exit(&rp4freelist_lock);
1054 					VN_HOLD(vp);
1055 				}
1056 			} else
1057 				VN_HOLD(vp);
1058 
1059 			/*
1060 			 * if root vnode, set v_flag to indicate that
1061 			 */
1062 			if (isrootfh(fh, rp)) {
1063 				if (!(vp->v_flag & VROOT)) {
1064 					mutex_enter(&vp->v_lock);
1065 					vp->v_flag |= VROOT;
1066 					mutex_exit(&vp->v_lock);
1067 				}
1068 			}
1069 			return (rp);
1070 		}
1071 	}
1072 	return (NULL);
1073 }
1074 
1075 /*
1076  * Lookup an rnode by fhandle. Just a wrapper for r4find()
1077  * that assumes the caller hasn't already got the lock
1078  * on the hash bucket.
1079  */
1080 rnode4_t *
1081 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1082 {
1083 	rnode4_t *rp;
1084 	int index;
1085 
1086 	index = rtable4hash(fh);
1087 	rw_enter(&rtable4[index].r_lock, RW_READER);
1088 	rp = r4find(&rtable4[index], fh, vfsp);
1089 	rw_exit(&rtable4[index].r_lock);
1090 
1091 	return (rp);
1092 }
1093 
1094 /*
1095  * Return >0 if there is a active vnode belonging to this vfs in the
1096  * rtable4 cache.
1097  *
1098  * Several of these checks are done without holding the usual
1099  * locks.  This is safe because destroy_rtable(), rp_addfree(),
1100  * etc. will redo the necessary checks before actually destroying
1101  * any rnodes.
1102  */
1103 int
1104 check_rtable4(struct vfs *vfsp)
1105 {
1106 	rnode4_t *rp;
1107 	vnode_t *vp;
1108 	int busy = NFSV4_RTABLE4_OK;
1109 	int index;
1110 
1111 	for (index = 0; index < rtable4size; index++) {
1112 		rw_enter(&rtable4[index].r_lock, RW_READER);
1113 
1114 		for (rp = rtable4[index].r_hashf;
1115 		    rp != (rnode4_t *)(&rtable4[index]);
1116 		    rp = rp->r_hashf) {
1117 
1118 			vp = RTOV4(rp);
1119 			if (vp->v_vfsp == vfsp) {
1120 				if (rp->r_freef == NULL) {
1121 					busy = NFSV4_RTABLE4_NOT_FREE_LIST;
1122 				} else if (nfs4_has_pages(vp) &&
1123 				    (rp->r_flags & R4DIRTY)) {
1124 					busy = NFSV4_RTABLE4_DIRTY_PAGES;
1125 				} else if (rp->r_count > 0) {
1126 					busy = NFSV4_RTABLE4_POS_R_COUNT;
1127 				}
1128 
1129 				if (busy != NFSV4_RTABLE4_OK) {
1130 #ifdef DEBUG
1131 					char *path;
1132 
1133 					path = fn_path(rp->r_svnode.sv_name);
1134 					DTRACE_NFSV4_3(rnode__e__debug,
1135 					    int, busy, char *, path,
1136 					    rnode4_t *, rp);
1137 					kmem_free(path, strlen(path)+1);
1138 #endif
1139 					rw_exit(&rtable4[index].r_lock);
1140 					return (busy);
1141 				}
1142 			}
1143 		}
1144 		rw_exit(&rtable4[index].r_lock);
1145 	}
1146 	return (busy);
1147 }
1148 
1149 /*
1150  * Destroy inactive vnodes from the hash queues which
1151  * belong to this vfs. All of the vnodes should be inactive.
1152  * It is essential that we destroy all rnodes in case of
1153  * forced unmount as well as in normal unmount case.
1154  */
1155 
1156 void
1157 destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1158 {
1159 	int index;
1160 	vnode_t *vp;
1161 	rnode4_t *rp, *r_hashf, *rlist;
1162 
1163 	rlist = NULL;
1164 
1165 	for (index = 0; index < rtable4size; index++) {
1166 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1167 		for (rp = rtable4[index].r_hashf;
1168 		    rp != (rnode4_t *)(&rtable4[index]);
1169 		    rp = r_hashf) {
1170 			/* save the hash pointer before destroying */
1171 			r_hashf = rp->r_hashf;
1172 
1173 			vp = RTOV4(rp);
1174 			if (vp->v_vfsp == vfsp) {
1175 				mutex_enter(&rp4freelist_lock);
1176 				if (rp->r_freef != NULL) {
1177 					rp4_rmfree(rp);
1178 					mutex_exit(&rp4freelist_lock);
1179 					rp4_rmhash_locked(rp);
1180 					rp->r_hashf = rlist;
1181 					rlist = rp;
1182 				} else
1183 					mutex_exit(&rp4freelist_lock);
1184 			}
1185 		}
1186 		rw_exit(&rtable4[index].r_lock);
1187 	}
1188 
1189 	for (rp = rlist; rp != NULL; rp = r_hashf) {
1190 		r_hashf = rp->r_hashf;
1191 		/*
1192 		 * This call to rp4_addfree will end up destroying the
1193 		 * rnode, but in a safe way with the appropriate set
1194 		 * of checks done.
1195 		 */
1196 		rp4_addfree(rp, cr);
1197 	}
1198 }
1199 
1200 /*
1201  * This routine destroys all the resources of an rnode
1202  * and finally the rnode itself.
1203  */
1204 static void
1205 destroy_rnode4(rnode4_t *rp)
1206 {
1207 	vnode_t *vp;
1208 	vfs_t *vfsp;
1209 
1210 	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1211 
1212 	vp = RTOV4(rp);
1213 	vfsp = vp->v_vfsp;
1214 
1215 	uninit_rnode4(rp);
1216 	atomic_add_long((ulong_t *)&rnode4_new, -1);
1217 #ifdef DEBUG
1218 	clstat4_debug.nrnode.value.ui64--;
1219 #endif
1220 	kmem_cache_free(rnode4_cache, rp);
1221 	vn_invalid(vp);
1222 	vn_free(vp);
1223 	VFS_RELE(vfsp);
1224 }
1225 
1226 /*
1227  * Invalidate the attributes on all rnodes forcing the next getattr
1228  * to go over the wire.  Used to flush stale uid and gid mappings.
1229  * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1230  */
1231 void
1232 nfs4_rnode_invalidate(struct vfs *vfsp)
1233 {
1234 	int index;
1235 	rnode4_t *rp;
1236 	vnode_t *vp;
1237 
1238 	/*
1239 	 * Walk the hash queues looking for rnodes.
1240 	 */
1241 	for (index = 0; index < rtable4size; index++) {
1242 		rw_enter(&rtable4[index].r_lock, RW_READER);
1243 		for (rp = rtable4[index].r_hashf;
1244 		    rp != (rnode4_t *)(&rtable4[index]);
1245 		    rp = rp->r_hashf) {
1246 			vp = RTOV4(rp);
1247 			if (vfsp != NULL && vp->v_vfsp != vfsp)
1248 				continue;
1249 
1250 			if (!mutex_tryenter(&rp->r_statelock))
1251 				continue;
1252 
1253 			/*
1254 			 * Expire the attributes by resetting the change
1255 			 * and attr timeout.
1256 			 */
1257 			rp->r_change = 0;
1258 			PURGE_ATTRCACHE4_LOCKED(rp);
1259 			mutex_exit(&rp->r_statelock);
1260 		}
1261 		rw_exit(&rtable4[index].r_lock);
1262 	}
1263 }
1264 
1265 /*
1266  * Flush all vnodes in this (or every) vfs.
1267  * Used by nfs_sync and by nfs_unmount.
1268  */
1269 void
1270 r4flush(struct vfs *vfsp, cred_t *cr)
1271 {
1272 	int index;
1273 	rnode4_t *rp;
1274 	vnode_t *vp, **vplist;
1275 	long num, cnt;
1276 
1277 	/*
1278 	 * Check to see whether there is anything to do.
1279 	 */
1280 	num = rnode4_new;
1281 	if (num == 0)
1282 		return;
1283 
1284 	/*
1285 	 * Allocate a slot for all currently active rnodes on the
1286 	 * supposition that they all may need flushing.
1287 	 */
1288 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1289 	cnt = 0;
1290 
1291 	/*
1292 	 * Walk the hash queues looking for rnodes with page
1293 	 * lists associated with them.  Make a list of these
1294 	 * files.
1295 	 */
1296 	for (index = 0; index < rtable4size; index++) {
1297 		rw_enter(&rtable4[index].r_lock, RW_READER);
1298 		for (rp = rtable4[index].r_hashf;
1299 		    rp != (rnode4_t *)(&rtable4[index]);
1300 		    rp = rp->r_hashf) {
1301 			vp = RTOV4(rp);
1302 			/*
1303 			 * Don't bother sync'ing a vp if it
1304 			 * is part of virtual swap device or
1305 			 * if VFS is read-only
1306 			 */
1307 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1308 				continue;
1309 			/*
1310 			 * If flushing all mounted file systems or
1311 			 * the vnode belongs to this vfs, has pages
1312 			 * and is marked as either dirty or mmap'd,
1313 			 * hold and add this vnode to the list of
1314 			 * vnodes to flush.
1315 			 */
1316 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
1317 			    nfs4_has_pages(vp) &&
1318 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1319 				VN_HOLD(vp);
1320 				vplist[cnt++] = vp;
1321 				if (cnt == num) {
1322 					rw_exit(&rtable4[index].r_lock);
1323 					goto toomany;
1324 				}
1325 			}
1326 		}
1327 		rw_exit(&rtable4[index].r_lock);
1328 	}
1329 toomany:
1330 
1331 	/*
1332 	 * Flush and release all of the files on the list.
1333 	 */
1334 	while (cnt-- > 0) {
1335 		vp = vplist[cnt];
1336 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
1337 		VN_RELE(vp);
1338 	}
1339 
1340 	/*
1341 	 * Free the space allocated to hold the list.
1342 	 */
1343 	kmem_free(vplist, num * sizeof (*vplist));
1344 }
1345 
1346 int
1347 nfs4_free_data_reclaim(rnode4_t *rp)
1348 {
1349 	char *contents;
1350 	vnode_t *xattr;
1351 	int size;
1352 	vsecattr_t *vsp;
1353 	int freed;
1354 	bool_t rdc = FALSE;
1355 
1356 	/*
1357 	 * Free any held caches which may
1358 	 * be associated with this rnode.
1359 	 */
1360 	mutex_enter(&rp->r_statelock);
1361 	if (rp->r_dir != NULL)
1362 		rdc = TRUE;
1363 	contents = rp->r_symlink.contents;
1364 	size = rp->r_symlink.size;
1365 	rp->r_symlink.contents = NULL;
1366 	vsp = rp->r_secattr;
1367 	rp->r_secattr = NULL;
1368 	xattr = rp->r_xattr_dir;
1369 	rp->r_xattr_dir = NULL;
1370 	mutex_exit(&rp->r_statelock);
1371 
1372 	/*
1373 	 * Free the access cache entries.
1374 	 */
1375 	freed = nfs4_access_purge_rp(rp);
1376 
1377 	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1378 		return (freed);
1379 
1380 	/*
1381 	 * Free the readdir cache entries, incompletely if we can't block.
1382 	 */
1383 	nfs4_purge_rddir_cache(RTOV4(rp));
1384 
1385 	/*
1386 	 * Free the symbolic link cache.
1387 	 */
1388 	if (contents != NULL) {
1389 
1390 		kmem_free((void *)contents, size);
1391 	}
1392 
1393 	/*
1394 	 * Free any cached ACL.
1395 	 */
1396 	if (vsp != NULL)
1397 		nfs4_acl_free_cache(vsp);
1398 
1399 	/*
1400 	 * Release the xattr directory vnode
1401 	 */
1402 	if (xattr != NULL)
1403 		VN_RELE(xattr);
1404 
1405 	return (1);
1406 }
1407 
1408 static int
1409 nfs4_active_data_reclaim(rnode4_t *rp)
1410 {
1411 	char *contents;
1412 	vnode_t *xattr = NULL;
1413 	int size;
1414 	vsecattr_t *vsp;
1415 	int freed;
1416 	bool_t rdc = FALSE;
1417 
1418 	/*
1419 	 * Free any held credentials and caches which
1420 	 * may be associated with this rnode.
1421 	 */
1422 	if (!mutex_tryenter(&rp->r_statelock))
1423 		return (0);
1424 	contents = rp->r_symlink.contents;
1425 	size = rp->r_symlink.size;
1426 	rp->r_symlink.contents = NULL;
1427 	vsp = rp->r_secattr;
1428 	rp->r_secattr = NULL;
1429 	if (rp->r_dir != NULL)
1430 		rdc = TRUE;
1431 	/*
1432 	 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
1433 	 * on the same r_hashq queue. We are not mandated to free all caches.
1434 	 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
1435 	 * rnode 'rp' is freed or put on the free list.
1436 	 *
1437 	 * We will retain NFS4_XATTR_DIR_NOTSUPP because:
1438 	 * - it has no associated rnode4_t (its v_data is NULL),
1439 	 * - it is preallocated statically and will never go away,
1440 	 * so we cannot save anything by releasing it.
1441 	 */
1442 	if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP &&
1443 	    VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) {
1444 		xattr = rp->r_xattr_dir;
1445 		rp->r_xattr_dir = NULL;
1446 	}
1447 	mutex_exit(&rp->r_statelock);
1448 
1449 	/*
1450 	 * Free the access cache entries.
1451 	 */
1452 	freed = nfs4_access_purge_rp(rp);
1453 
1454 	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1455 		return (freed);
1456 
1457 	/*
1458 	 * Free the symbolic link cache.
1459 	 */
1460 	if (contents != NULL) {
1461 
1462 		kmem_free((void *)contents, size);
1463 	}
1464 
1465 	/*
1466 	 * Free any cached ACL.
1467 	 */
1468 	if (vsp != NULL)
1469 		nfs4_acl_free_cache(vsp);
1470 
1471 	nfs4_purge_rddir_cache(RTOV4(rp));
1472 
1473 	/*
1474 	 * Release the xattr directory vnode
1475 	 */
1476 	if (xattr != NULL)
1477 		VN_RELE(xattr);
1478 
1479 	return (1);
1480 }
1481 
1482 static int
1483 nfs4_free_reclaim(void)
1484 {
1485 	int freed;
1486 	rnode4_t *rp;
1487 
1488 #ifdef DEBUG
1489 	clstat4_debug.f_reclaim.value.ui64++;
1490 #endif
1491 	freed = 0;
1492 	mutex_enter(&rp4freelist_lock);
1493 	rp = rp4freelist;
1494 	if (rp != NULL) {
1495 		do {
1496 			if (nfs4_free_data_reclaim(rp))
1497 				freed = 1;
1498 		} while ((rp = rp->r_freef) != rp4freelist);
1499 	}
1500 	mutex_exit(&rp4freelist_lock);
1501 	return (freed);
1502 }
1503 
1504 static int
1505 nfs4_active_reclaim(void)
1506 {
1507 	int freed;
1508 	int index;
1509 	rnode4_t *rp;
1510 
1511 #ifdef DEBUG
1512 	clstat4_debug.a_reclaim.value.ui64++;
1513 #endif
1514 	freed = 0;
1515 	for (index = 0; index < rtable4size; index++) {
1516 		rw_enter(&rtable4[index].r_lock, RW_READER);
1517 		for (rp = rtable4[index].r_hashf;
1518 		    rp != (rnode4_t *)(&rtable4[index]);
1519 		    rp = rp->r_hashf) {
1520 			if (nfs4_active_data_reclaim(rp))
1521 				freed = 1;
1522 		}
1523 		rw_exit(&rtable4[index].r_lock);
1524 	}
1525 	return (freed);
1526 }
1527 
1528 static int
1529 nfs4_rnode_reclaim(void)
1530 {
1531 	int freed;
1532 	rnode4_t *rp;
1533 	vnode_t *vp;
1534 
1535 #ifdef DEBUG
1536 	clstat4_debug.r_reclaim.value.ui64++;
1537 #endif
1538 	freed = 0;
1539 	mutex_enter(&rp4freelist_lock);
1540 	while ((rp = rp4freelist) != NULL) {
1541 		rp4_rmfree(rp);
1542 		mutex_exit(&rp4freelist_lock);
1543 		if (rp->r_flags & R4HASHED) {
1544 			vp = RTOV4(rp);
1545 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1546 			mutex_enter(&vp->v_lock);
1547 			if (vp->v_count > 1) {
1548 				vp->v_count--;
1549 				mutex_exit(&vp->v_lock);
1550 				rw_exit(&rp->r_hashq->r_lock);
1551 				mutex_enter(&rp4freelist_lock);
1552 				continue;
1553 			}
1554 			mutex_exit(&vp->v_lock);
1555 			rp4_rmhash_locked(rp);
1556 			rw_exit(&rp->r_hashq->r_lock);
1557 		}
1558 		/*
1559 		 * This call to rp_addfree will end up destroying the
1560 		 * rnode, but in a safe way with the appropriate set
1561 		 * of checks done.
1562 		 */
1563 		rp4_addfree(rp, CRED());
1564 		mutex_enter(&rp4freelist_lock);
1565 	}
1566 	mutex_exit(&rp4freelist_lock);
1567 	return (freed);
1568 }
1569 
/*
 * Reclaim callback registered for rnode4_cache by nfs4_rnode_init().
 * The reclaim stages are tried in order and the first one that reports
 * having freed something ends the pass:
 *	1. cached data of rnodes on the free list,
 *	2. cached data of rnodes in the hash table,
 *	3. the rnodes on the free list themselves.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	if (nfs4_free_reclaim() || nfs4_active_reclaim())
		return;

	(void) nfs4_rnode_reclaim();
}
1585 
1586 /*
1587  * Returns the clientid4 to use for the given mntinfo4.  Note that the
1588  * clientid can change if the caller drops mi_recovlock.
1589  */
1590 
1591 clientid4
1592 mi2clientid(mntinfo4_t *mi)
1593 {
1594 	nfs4_server_t	*sp;
1595 	clientid4	clientid = 0;
1596 
1597 	/* this locks down sp if it is found */
1598 	sp = find_nfs4_server(mi);
1599 	if (sp != NULL) {
1600 		clientid = sp->clientid;
1601 		mutex_exit(&sp->s_lock);
1602 		nfs4_server_rele(sp);
1603 	}
1604 	return (clientid);
1605 }
1606 
1607 /*
1608  * Return the current lease time for the server associated with the given
1609  * file.  Note that the lease time could change immediately after this
1610  * call.
1611  */
1612 
1613 time_t
1614 r2lease_time(rnode4_t *rp)
1615 {
1616 	nfs4_server_t	*sp;
1617 	time_t		lease_time;
1618 	mntinfo4_t	*mi = VTOMI4(RTOV4(rp));
1619 
1620 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1621 
1622 	/* this locks down sp if it is found */
1623 	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1624 
1625 	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1626 		if (sp != NULL) {
1627 			mutex_exit(&sp->s_lock);
1628 			nfs4_server_rele(sp);
1629 		}
1630 		nfs_rw_exit(&mi->mi_recovlock);
1631 		return (1);		/* 1 second */
1632 	}
1633 
1634 	ASSERT(sp != NULL);
1635 
1636 	lease_time = sp->s_lease_time;
1637 
1638 	mutex_exit(&sp->s_lock);
1639 	nfs4_server_rele(sp);
1640 	nfs_rw_exit(&mi->mi_recovlock);
1641 
1642 	return (lease_time);
1643 }
1644 
1645 /*
1646  * Return a list with information about all the known open instances for
1647  * a filesystem. The caller must call r4releopenlist() when done with the
1648  * list.
1649  *
1650  * We are safe at looking at os_valid and os_pending_close across dropping
1651  * the 'os_sync_lock' to count up the number of open streams and then
1652  * allocate memory for the osp list due to:
1653  *	-Looking at os_pending_close is safe since this routine is
1654  *	only called via recovery, and os_pending_close can only be set via
1655  *	a non-recovery operation (which are all blocked when recovery
1656  *	is active).
1657  *
1658  *	-Examining os_valid is safe since non-recovery operations, which
1659  *	could potentially switch os_valid to 0, are blocked (via
1660  *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
1661  *	(which means we are the only recovery thread potentially acting
1662  *	on this open stream).
1663  */
1664 
1665 nfs4_opinst_t *
1666 r4mkopenlist(mntinfo4_t *mi)
1667 {
1668 	nfs4_opinst_t *reopenlist, *rep;
1669 	rnode4_t *rp;
1670 	vnode_t *vp;
1671 	vfs_t *vfsp = mi->mi_vfsp;
1672 	int numosp;
1673 	nfs4_open_stream_t *osp;
1674 	int index;
1675 	open_delegation_type4 dtype;
1676 	int hold_vnode;
1677 
1678 	reopenlist = NULL;
1679 
1680 	for (index = 0; index < rtable4size; index++) {
1681 		rw_enter(&rtable4[index].r_lock, RW_READER);
1682 		for (rp = rtable4[index].r_hashf;
1683 		    rp != (rnode4_t *)(&rtable4[index]);
1684 		    rp = rp->r_hashf) {
1685 
1686 			vp = RTOV4(rp);
1687 			if (vp->v_vfsp != vfsp)
1688 				continue;
1689 			hold_vnode = 0;
1690 
1691 			mutex_enter(&rp->r_os_lock);
1692 
1693 			/* Count the number of valid open_streams of the file */
1694 			numosp = 0;
1695 			for (osp = list_head(&rp->r_open_streams); osp != NULL;
1696 			    osp = list_next(&rp->r_open_streams, osp)) {
1697 				mutex_enter(&osp->os_sync_lock);
1698 				if (osp->os_valid && !osp->os_pending_close)
1699 					numosp++;
1700 				mutex_exit(&osp->os_sync_lock);
1701 			}
1702 
1703 			/* Fill in the valid open streams per vp */
1704 			if (numosp > 0) {
1705 				int j;
1706 
1707 				hold_vnode = 1;
1708 
1709 				/*
1710 				 * Add a new open instance to the list
1711 				 */
1712 				rep = kmem_zalloc(sizeof (*reopenlist),
1713 				    KM_SLEEP);
1714 				rep->re_next = reopenlist;
1715 				reopenlist = rep;
1716 
1717 				rep->re_vp = vp;
1718 				rep->re_osp = kmem_zalloc(
1719 				    numosp * sizeof (*(rep->re_osp)),
1720 				    KM_SLEEP);
1721 				rep->re_numosp = numosp;
1722 
1723 				j = 0;
1724 				for (osp = list_head(&rp->r_open_streams);
1725 				    osp != NULL;
1726 				    osp = list_next(&rp->r_open_streams, osp)) {
1727 
1728 					mutex_enter(&osp->os_sync_lock);
1729 					if (osp->os_valid &&
1730 					    !osp->os_pending_close) {
1731 						osp->os_ref_count++;
1732 						rep->re_osp[j] = osp;
1733 						j++;
1734 					}
1735 					mutex_exit(&osp->os_sync_lock);
1736 				}
1737 				/*
1738 				 * Assuming valid osp(s) stays valid between
1739 				 * the time obtaining j and numosp.
1740 				 */
1741 				ASSERT(j == numosp);
1742 			}
1743 
1744 			mutex_exit(&rp->r_os_lock);
1745 			/* do this here to keep v_lock > r_os_lock */
1746 			if (hold_vnode)
1747 				VN_HOLD(vp);
1748 			mutex_enter(&rp->r_statev4_lock);
1749 			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
1750 				/*
1751 				 * If this rnode holds a delegation,
1752 				 * but if there are no valid open streams,
1753 				 * then just discard the delegation
1754 				 * without doing delegreturn.
1755 				 */
1756 				if (numosp > 0)
1757 					rp->r_deleg_needs_recovery =
1758 					    rp->r_deleg_type;
1759 			}
1760 			/* Save the delegation type for use outside the lock */
1761 			dtype = rp->r_deleg_type;
1762 			mutex_exit(&rp->r_statev4_lock);
1763 
1764 			/*
1765 			 * If we have a delegation then get rid of it.
1766 			 * We've set rp->r_deleg_needs_recovery so we have
1767 			 * enough information to recover.
1768 			 */
1769 			if (dtype != OPEN_DELEGATE_NONE) {
1770 				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
1771 			}
1772 		}
1773 		rw_exit(&rtable4[index].r_lock);
1774 	}
1775 	return (reopenlist);
1776 }
1777 
1778 /*
1779  * Given a filesystem id, check to see if any rnodes
1780  * within this fsid reside in the rnode cache, other
1781  * than one we know about.
1782  *
1783  * Return 1 if an rnode is found, 0 otherwise
1784  */
1785 int
1786 r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid)
1787 {
1788 	rnode4_t *rp;
1789 	vnode_t *vp;
1790 	vfs_t *vfsp = mi->mi_vfsp;
1791 	fattr4_fsid *fsid;
1792 	int index, found = 0;
1793 
1794 	for (index = 0; index < rtable4size; index++) {
1795 		rw_enter(&rtable4[index].r_lock, RW_READER);
1796 		for (rp = rtable4[index].r_hashf;
1797 		    rp != (rnode4_t *)(&rtable4[index]);
1798 		    rp = rp->r_hashf) {
1799 
1800 			vp = RTOV4(rp);
1801 			if (vp->v_vfsp != vfsp)
1802 				continue;
1803 
1804 			/*
1805 			 * XXX there might be a case where a
1806 			 * replicated fs may have the same fsid
1807 			 * across two different servers. This
1808 			 * check isn't good enough in that case
1809 			 */
1810 			fsid = &rp->r_srv_fsid;
1811 			if (FATTR4_FSID_EQ(moved_fsid, fsid)) {
1812 				found = 1;
1813 				break;
1814 			}
1815 		}
1816 		rw_exit(&rtable4[index].r_lock);
1817 
1818 		if (found)
1819 			break;
1820 	}
1821 	return (found);
1822 }
1823 
1824 /*
1825  * Release the list of open instance references.
1826  */
1827 
1828 void
1829 r4releopenlist(nfs4_opinst_t *reopenp)
1830 {
1831 	nfs4_opinst_t *rep, *next;
1832 	int i;
1833 
1834 	for (rep = reopenp; rep; rep = next) {
1835 		next = rep->re_next;
1836 
1837 		for (i = 0; i < rep->re_numosp; i++)
1838 			open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1839 
1840 		VN_RELE(rep->re_vp);
1841 		kmem_free(rep->re_osp,
1842 		    rep->re_numosp * sizeof (*(rep->re_osp)));
1843 
1844 		kmem_free(rep, sizeof (*rep));
1845 	}
1846 }
1847 
1848 int
1849 nfs4_rnode_init(void)
1850 {
1851 	ulong_t nrnode4_max;
1852 	int i;
1853 
1854 	/*
1855 	 * Compute the size of the rnode4 hash table
1856 	 */
1857 	if (nrnode <= 0)
1858 		nrnode = ncsize;
1859 	nrnode4_max =
1860 	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1861 	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1862 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1863 		    "setting nrnode to max value of %ld", nrnode4_max);
1864 		nrnode = nrnode4_max;
1865 	}
1866 	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1867 	rtable4mask = rtable4size - 1;
1868 
1869 	/*
1870 	 * Allocate and initialize the hash buckets
1871 	 */
1872 	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1873 	for (i = 0; i < rtable4size; i++) {
1874 		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1875 		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1876 		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1877 	}
1878 
1879 	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1880 	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1881 
1882 	return (0);
1883 }
1884 
1885 int
1886 nfs4_rnode_fini(void)
1887 {
1888 	int i;
1889 
1890 	/*
1891 	 * Deallocate the rnode hash queues
1892 	 */
1893 	kmem_cache_destroy(rnode4_cache);
1894 
1895 	for (i = 0; i < rtable4size; i++)
1896 		rw_destroy(&rtable4[i].r_lock);
1897 
1898 	kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1899 
1900 	return (0);
1901 }
1902 
1903 /*
1904  * Return non-zero if the given filehandle refers to the root filehandle
1905  * for the given rnode.
1906  */
1907 
1908 static int
1909 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1910 {
1911 	int isroot;
1912 
1913 	isroot = 0;
1914 	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1915 		isroot = 1;
1916 
1917 	return (isroot);
1918 }
1919 
1920 /*
1921  * The r4_stub_* routines assume that the rnode is newly activated, and
1922  * that the caller either holds the hash bucket r_lock for this rnode as
1923  * RW_WRITER, or holds r_statelock.
1924  */
1925 static void
1926 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
1927 {
1928 	vnode_t *vp = RTOV4(rp);
1929 	krwlock_t *hash_lock = &rp->r_hashq->r_lock;
1930 
1931 	ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));
1932 
1933 	rp->r_stub_type = type;
1934 
1935 	/*
1936 	 * Safely switch this vnode to the trigger vnodeops.
1937 	 *
1938 	 * Currently, we don't ever switch a trigger vnode back to using
1939 	 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
1940 	 * a new v4 object is not a trigger, and it will already have the
1941 	 * correct v4 vnodeops by default. So, no "else" case required here.
1942 	 */
1943 	if (type != NFS4_STUB_NONE)
1944 		vn_setops(vp, nfs4_trigger_vnodeops);
1945 }
1946 
/*
 * Mark the rnode as a mirror-mount stub (NFS4_STUB_MIRRORMOUNT).
 * The locking requirements are those of r4_stub_set().
 */
void
r4_stub_mirrormount(rnode4_t *rp)
{
	r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
}
1952 
1953 void
1954 r4_stub_referral(rnode4_t *rp)
1955 {
1956 	DTRACE_PROBE1(nfs4clnt__func__referral__moved,
1957 	    vnode_t *, RTOV4(rp));
1958 	r4_stub_set(rp, NFS4_STUB_REFERRAL);
1959 }
1960 
/*
 * Mark the rnode as not being a stub (NFS4_STUB_NONE).
 * The locking requirements are those of r4_stub_set().
 */
void
r4_stub_none(rnode4_t *rp)
{
	r4_stub_set(rp, NFS4_STUB_NONE);
}
1966 
1967 #ifdef DEBUG
1968 
1969 /*
1970  * Look in the rnode table for other rnodes that have the same filehandle.
1971  * Assume the lock is held for the hash chain of checkrp
1972  */
1973 
1974 static void
1975 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
1976 {
1977 	rnode4_t *rp;
1978 	vnode_t *tvp;
1979 	nfs4_fhandle_t fh, fh2;
1980 	int index;
1981 
1982 	if (!r4_check_for_dups)
1983 		return;
1984 
1985 	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));
1986 
1987 	sfh4_copyval(checkrp->r_fh, &fh);
1988 
1989 	for (index = 0; index < rtable4size; index++) {
1990 
1991 		if (&rtable4[index] != checkrp->r_hashq)
1992 			rw_enter(&rtable4[index].r_lock, RW_READER);
1993 
1994 		for (rp = rtable4[index].r_hashf;
1995 		    rp != (rnode4_t *)(&rtable4[index]);
1996 		    rp = rp->r_hashf) {
1997 
1998 			if (rp == checkrp)
1999 				continue;
2000 
2001 			tvp = RTOV4(rp);
2002 			if (tvp->v_vfsp != vfsp)
2003 				continue;
2004 
2005 			sfh4_copyval(rp->r_fh, &fh2);
2006 			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
2007 				cmn_err(CE_PANIC, "rnodes with same fs, fh "
2008 				    "(%p, %p)", (void *)checkrp, (void *)rp);
2009 			}
2010 		}
2011 
2012 		if (&rtable4[index] != checkrp->r_hashq)
2013 			rw_exit(&rtable4[index].r_lock);
2014 	}
2015 }
2016 
2017 #endif /* DEBUG */
2018