xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_rnode.c (revision 2e837a72011f54762249b6612c2a64f171efcd43)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 /*
32  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
33  * Copyright (c) 2017 by Delphix. All rights reserved.
34  */
35 
36 #include <sys/param.h>
37 #include <sys/types.h>
38 #include <sys/systm.h>
39 #include <sys/cred.h>
40 #include <sys/proc.h>
41 #include <sys/user.h>
42 #include <sys/time.h>
43 #include <sys/buf.h>
44 #include <sys/vfs.h>
45 #include <sys/vnode.h>
46 #include <sys/socket.h>
47 #include <sys/uio.h>
48 #include <sys/tiuser.h>
49 #include <sys/swap.h>
50 #include <sys/errno.h>
51 #include <sys/debug.h>
52 #include <sys/kmem.h>
53 #include <sys/kstat.h>
54 #include <sys/cmn_err.h>
55 #include <sys/vtrace.h>
56 #include <sys/session.h>
57 #include <sys/dnlc.h>
58 #include <sys/bitmap.h>
59 #include <sys/acl.h>
60 #include <sys/ddi.h>
61 #include <sys/pathname.h>
62 #include <sys/flock.h>
63 #include <sys/dirent.h>
64 #include <sys/flock.h>
65 #include <sys/callb.h>
66 #include <sys/sdt.h>
67 
68 #include <vm/pvn.h>
69 
70 #include <rpc/types.h>
71 #include <rpc/xdr.h>
72 #include <rpc/auth.h>
73 #include <rpc/rpcsec_gss.h>
74 #include <rpc/clnt.h>
75 
76 #include <nfs/nfs.h>
77 #include <nfs/nfs_clnt.h>
78 #include <nfs/nfs_acl.h>
79 
80 #include <nfs/nfs4.h>
81 #include <nfs/rnode4.h>
82 #include <nfs/nfs4_clnt.h>
83 
84 /*
85  * The hash queues for the access to active and cached rnodes
86  * are organized as doubly linked lists.  A reader/writer lock
87  * for each hash bucket is used to control access and to synchronize
88  * lookups, additions, and deletions from the hash queue.
89  *
90  * The rnode freelist is organized as a doubly linked list with
91  * a head pointer.  Additions and deletions are synchronized via
92  * a single mutex.
93  *
94  * In order to add an rnode to the free list, it must be hashed into
95  * a hash queue and the exclusive lock to the hash queue be held.
96  * If an rnode is not hashed into a hash queue, then it is destroyed
97  * because it represents no valuable information that can be reused
98  * about the file.  The exclusive lock to the hash queue must be
99  * held in order to prevent a lookup in the hash queue from finding
100  * the rnode and using it and assuming that the rnode is not on the
101  * freelist.  The lookup in the hash queue will have the hash queue
102  * locked, either exclusive or shared.
103  *
104  * The vnode reference count for each rnode is not allowed to drop
105  * below 1.  This prevents external entities, such as the VM
106  * subsystem, from acquiring references to vnodes already on the
107  * freelist and then trying to place them back on the freelist
108  * when their reference is released.  This means that the when an
109  * rnode is looked up in the hash queues, then either the rnode
110  * is removed from the freelist and that reference is transferred to
111  * the new reference or the vnode reference count must be incremented
112  * accordingly.  The mutex for the freelist must be held in order to
113  * accurately test to see if the rnode is on the freelist or not.
114  * The hash queue lock might be held shared and it is possible that
115  * two different threads may race to remove the rnode from the
116  * freelist.  This race can be resolved by holding the mutex for the
117  * freelist.  Please note that the mutex for the freelist does not
118  * need to be held if the rnode is not on the freelist.  It can not be
119  * placed on the freelist due to the requirement that the thread
120  * putting the rnode on the freelist must hold the exclusive lock
121  * to the hash queue and the thread doing the lookup in the hash
122  * queue is holding either a shared or exclusive lock to the hash
123  * queue.
124  *
125  * The lock ordering is:
126  *
127  *	hash bucket lock -> vnode lock
128  *	hash bucket lock -> freelist lock -> r_statelock
129  */
/*
 * Array of rnode hash buckets.  Code in this file indexes it with the
 * value returned by rtable4hash().
 */
r4hashq_t *rtable4;

/* Must be held to add to or remove from rp4freelist (see rp4_rmfree()). */
static kmutex_t rp4freelist_lock;
/* Head of the circular, doubly linked rnode freelist; NULL when empty. */
static rnode4_t *rp4freelist = NULL;
/*
 * Incremented each time make_rnode4() allocates a fresh rnode from
 * rnode4_cache; compared against nrnode to decide whether to recycle.
 */
static long rnode4_new = 0;
/* NOTE(review): presumably the number of buckets in rtable4 -- confirm. */
int rtable4size;
/* Mask applied by rtable4hash() to reduce a hash value to a bucket index. */
static int rtable4mask;
/* kmem cache from which make_rnode4() allocates rnode4_t structures. */
static struct kmem_cache *rnode4_cache;
/* NOTE(review): not referenced in this part of the file -- confirm use. */
static int rnode4_hashlen = 4;

/* Forward declarations for routines defined in this file. */
static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);
static void	r4_stub_set(rnode4_t *, nfs4_stub_type_t);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void	r4_dup_check(rnode4_t *, vfs_t *);
#endif
166 
167 /*
168  * If the vnode has pages, run the list and check for any that are
169  * still dangling.  We call this routine before putting an rnode on
170  * the free list.
171  */
172 static int
173 nfs4_dross_pages(vnode_t *vp)
174 {
175 	page_t *pp;
176 	kmutex_t *vphm;
177 
178 	vphm = page_vnode_mutex(vp);
179 	mutex_enter(vphm);
180 	if ((pp = vp->v_pages) != NULL) {
181 		do {
182 			if (pp->p_hash != PVN_VPLIST_HASH_TAG &&
183 			    pp->p_fsdata != C_NOCOMMIT) {
184 				mutex_exit(vphm);
185 				return (1);
186 			}
187 		} while ((pp = pp->p_vpnext) != vp->v_pages);
188 	}
189 	mutex_exit(vphm);
190 
191 	return (0);
192 }
193 
194 /*
195  * Flush any pages left on this rnode.
196  */
197 static void
198 r4flushpages(rnode4_t *rp, cred_t *cr)
199 {
200 	vnode_t *vp;
201 	int error;
202 
203 	/*
204 	 * Before freeing anything, wait until all asynchronous
205 	 * activity is done on this rnode.  This will allow all
206 	 * asynchronous read ahead and write behind i/o's to
207 	 * finish.
208 	 */
209 	mutex_enter(&rp->r_statelock);
210 	while (rp->r_count > 0)
211 		cv_wait(&rp->r_cv, &rp->r_statelock);
212 	mutex_exit(&rp->r_statelock);
213 
214 	/*
215 	 * Flush and invalidate all pages associated with the vnode.
216 	 */
217 	vp = RTOV4(rp);
218 	if (nfs4_has_pages(vp)) {
219 		ASSERT(vp->v_type != VCHR);
220 		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
221 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
222 			if (error && (error == ENOSPC || error == EDQUOT)) {
223 				mutex_enter(&rp->r_statelock);
224 				if (!rp->r_error)
225 					rp->r_error = error;
226 				mutex_exit(&rp->r_statelock);
227 			}
228 		}
229 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
230 	}
231 }
232 
233 /*
234  * Free the resources associated with an rnode.
235  */
236 static void
237 r4inactive(rnode4_t *rp, cred_t *cr)
238 {
239 	vnode_t *vp;
240 	char *contents;
241 	int size;
242 	vsecattr_t *vsp;
243 	vnode_t *xattr;
244 
245 	r4flushpages(rp, cr);
246 
247 	vp = RTOV4(rp);
248 
249 	/*
250 	 * Free any held caches which may be
251 	 * associated with this rnode.
252 	 */
253 	mutex_enter(&rp->r_statelock);
254 	contents = rp->r_symlink.contents;
255 	size = rp->r_symlink.size;
256 	rp->r_symlink.contents = NULL;
257 	vsp = rp->r_secattr;
258 	rp->r_secattr = NULL;
259 	xattr = rp->r_xattr_dir;
260 	rp->r_xattr_dir = NULL;
261 	mutex_exit(&rp->r_statelock);
262 
263 	/*
264 	 * Free the access cache entries.
265 	 */
266 	(void) nfs4_access_purge_rp(rp);
267 
268 	/*
269 	 * Free the readdir cache entries.
270 	 */
271 	nfs4_purge_rddir_cache(vp);
272 
273 	/*
274 	 * Free the symbolic link cache.
275 	 */
276 	if (contents != NULL) {
277 
278 		kmem_free((void *)contents, size);
279 	}
280 
281 	/*
282 	 * Free any cached ACL.
283 	 */
284 	if (vsp != NULL)
285 		nfs4_acl_free_cache(vsp);
286 
287 	/*
288 	 * Release the cached xattr_dir
289 	 */
290 	if (xattr != NULL)
291 		VN_RELE(xattr);
292 }
293 
294 /*
295  * We have seen a case that the fh passed in is for "." which
296  * should be a VROOT node, however, the fh is different from the
297  * root fh stored in the mntinfo4_t. The invalid fh might be
298  * from a misbehaved server and will panic the client system at
299  * a later time. To avoid the panic, we drop the bad fh, use
300  * the root fh from mntinfo4_t, and print an error message
301  * for attention.
302  */
303 nfs4_sharedfh_t *
304 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
305     int *wasbad)
306 {
307 	char *s;
308 
309 	*wasbad = 0;
310 	s = fn_name(nm);
311 	ASSERT(strcmp(s, "..") != 0);
312 
313 	if ((s[0] == '.' && s[1] == '\0') && fh &&
314 	    !SFH4_SAME(mi->mi_rootfh, fh)) {
315 #ifdef DEBUG
316 		nfs4_fhandle_t fhandle;
317 
318 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
319 		    "Server %s returns a different "
320 		    "root filehandle for the path %s:",
321 		    mi->mi_curr_serv->sv_hostname,
322 		    mi->mi_curr_serv->sv_path);
323 
324 		/* print the bad fh */
325 		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
326 		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
327 		    fhandle.fh_len);
328 		nfs4_printfhandle(&fhandle);
329 
330 		/* print mi_rootfh */
331 		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
332 		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
333 		    fhandle.fh_len);
334 		nfs4_printfhandle(&fhandle);
335 #endif
336 		/* use mi_rootfh instead; fh will be rele by the caller */
337 		fh = mi->mi_rootfh;
338 		*wasbad = 1;
339 	}
340 
341 	kmem_free(s, MAXNAMELEN);
342 	return (fh);
343 }
344 
345 void
346 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
347     hrtime_t t, cred_t *cr, int index)
348 {
349 	int is_stub;
350 	vattr_t *attr;
351 	/*
352 	 * Don't add to attrcache if time overflow, but
353 	 * no need to check because either attr is null or the time
354 	 * values in it were processed by nfs4_time_ntov(), which checks
355 	 * for time overflows.
356 	 */
357 	attr = garp ? &garp->n4g_va : NULL;
358 
359 	if (attr) {
360 		if (!newnode) {
361 			rw_exit(&rtable4[index].r_lock);
362 #ifdef DEBUG
363 			if (vp->v_type != attr->va_type &&
364 			    vp->v_type != VNON && attr->va_type != VNON) {
365 				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
366 				    "makenfs4node: type (%d) doesn't "
367 				    "match type of found node at %p (%d)",
368 				    attr->va_type, (void *)vp, vp->v_type);
369 			}
370 #endif
371 			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
372 		} else {
373 			rnode4_t *rp = VTOR4(vp);
374 
375 			vp->v_type = attr->va_type;
376 			vp->v_rdev = attr->va_rdev;
377 
378 			/*
379 			 * Turn this object into a "stub" object if we
380 			 * crossed an underlying server fs boundary.
381 			 * To make this check, during mount we save the
382 			 * fsid of the server object being mounted.
383 			 * Here we compare this object's server fsid
384 			 * with the fsid we saved at mount.  If they
385 			 * are different, we crossed server fs boundary.
386 			 *
387 			 * The stub type is set (or not) at rnode
388 			 * creation time and it never changes for life
389 			 * of the rnode.
390 			 *
391 			 * This stub will be for a mirror-mount, rather than
392 			 * a referral (the latter also sets R4SRVSTUB).
393 			 *
394 			 * The stub type is also set during RO failover,
395 			 * nfs4_remap_file().
396 			 *
397 			 * We don't bother with taking r_state_lock to
398 			 * set the stub type because this is a new rnode
399 			 * and we're holding the hash bucket r_lock RW_WRITER.
400 			 * No other thread could have obtained access
401 			 * to this rnode.
402 			 */
403 			is_stub = 0;
404 			if (garp->n4g_fsid_valid) {
405 				fattr4_fsid ga_fsid = garp->n4g_fsid;
406 				servinfo4_t *svp = rp->r_server;
407 
408 				rp->r_srv_fsid = ga_fsid;
409 
410 				(void) nfs_rw_enter_sig(&svp->sv_lock,
411 				    RW_READER, 0);
412 				if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
413 					is_stub = 1;
414 				nfs_rw_exit(&svp->sv_lock);
415 			}
416 
417 			if (is_stub)
418 				r4_stub_mirrormount(rp);
419 			else
420 				r4_stub_none(rp);
421 
422 			/* Can not cache partial attr */
423 			if (attr->va_mask == AT_ALL)
424 				nfs4_attrcache_noinval(vp, garp, t);
425 			else
426 				PURGE_ATTRCACHE4(vp);
427 
428 			rw_exit(&rtable4[index].r_lock);
429 		}
430 	} else {
431 		if (newnode) {
432 			PURGE_ATTRCACHE4(vp);
433 		}
434 		rw_exit(&rtable4[index].r_lock);
435 	}
436 }
437 
438 /*
439  * Find or create an rnode based primarily on filehandle.  To be
440  * used when dvp (vnode for parent directory) is not available;
441  * otherwise, makenfs4node() should be used.
442  *
443  * The nfs4_fname_t argument *npp is consumed and nulled out.
444  */
445 
446 vnode_t *
447 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
448     nfs4_fname_t **npp, nfs4_ga_res_t *garp,
449     mntinfo4_t *mi, cred_t *cr, hrtime_t t)
450 {
451 	vfs_t *vfsp = mi->mi_vfsp;
452 	int newnode = 0;
453 	vnode_t *vp;
454 	rnode4_t *rp;
455 	svnode_t *svp;
456 	nfs4_fname_t *name, *svpname;
457 	int index;
458 
459 	ASSERT(npp && *npp);
460 	name = *npp;
461 	*npp = NULL;
462 
463 	index = rtable4hash(sfh);
464 	rw_enter(&rtable4[index].r_lock, RW_READER);
465 
466 	vp = make_rnode4(sfh, &rtable4[index], vfsp,
467 	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
468 
469 	svp = VTOSV(vp);
470 	rp = VTOR4(vp);
471 	if (newnode) {
472 		svp->sv_forw = svp->sv_back = svp;
473 		svp->sv_name = name;
474 		if (psfh != NULL)
475 			sfh4_hold(psfh);
476 		svp->sv_dfh = psfh;
477 	} else {
478 		/*
479 		 * It is possible that due to a server
480 		 * side rename fnames have changed.
481 		 * update the fname here.
482 		 */
483 		mutex_enter(&rp->r_svlock);
484 		svpname = svp->sv_name;
485 		if (svp->sv_name != name) {
486 			svp->sv_name = name;
487 			mutex_exit(&rp->r_svlock);
488 			fn_rele(&svpname);
489 		} else {
490 			mutex_exit(&rp->r_svlock);
491 			fn_rele(&name);
492 		}
493 	}
494 
495 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
496 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
497 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
498 
499 	return (vp);
500 }
501 
502 /*
503  * Find or create a vnode for the given filehandle, filesystem, parent, and
504  * name.  The reference to nm is consumed, so the caller must first do an
505  * fn_hold() if it wants to continue using nm after this call.
506  */
507 vnode_t *
508 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
509     hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
510 {
511 	vnode_t *vp;
512 	int newnode;
513 	int index;
514 	mntinfo4_t *mi = VFTOMI4(vfsp);
515 	int had_badfh = 0;
516 	rnode4_t *rp;
517 
518 	ASSERT(dvp != NULL);
519 
520 	fh = badrootfh_check(fh, nm, mi, &had_badfh);
521 
522 	index = rtable4hash(fh);
523 	rw_enter(&rtable4[index].r_lock, RW_READER);
524 
525 	/*
526 	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
527 	 */
528 	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
529 	    nfs4_putapage, &newnode, cr);
530 
531 	rp = VTOR4(vp);
532 	sv_activate(&vp, dvp, &nm, newnode);
533 	if (dvp->v_flag & V_XATTRDIR) {
534 		mutex_enter(&rp->r_statelock);
535 		rp->r_flags |= R4ISXATTR;
536 		mutex_exit(&rp->r_statelock);
537 	}
538 
539 	/* if getting a bad file handle, do not cache the attributes. */
540 	if (had_badfh) {
541 		rw_exit(&rtable4[index].r_lock);
542 		return (vp);
543 	}
544 
545 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
546 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
547 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
548 
549 	return (vp);
550 }
551 
552 /*
553  * Hash on address of filehandle object.
554  * XXX totally untuned.
555  */
556 
557 int
558 rtable4hash(nfs4_sharedfh_t *fh)
559 {
560 	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
561 }
562 
563 /*
564  * Find or create the vnode for the given filehandle and filesystem.
565  * *newnode is set to zero if the vnode already existed; non-zero if it had
566  * to be created.
567  *
568  * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
569  */
570 
571 static vnode_t *
572 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
573     struct vnodeops *vops,
574     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
575     int *newnode, cred_t *cr)
576 {
577 	rnode4_t *rp;
578 	rnode4_t *trp;
579 	vnode_t *vp;
580 	mntinfo4_t *mi;
581 
582 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
583 
584 	mi = VFTOMI4(vfsp);
585 
586 start:
587 	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
588 		vp = RTOV4(rp);
589 		*newnode = 0;
590 		return (vp);
591 	}
592 	rw_exit(&rhtp->r_lock);
593 
594 	mutex_enter(&rp4freelist_lock);
595 
596 	if (rp4freelist != NULL && rnode4_new >= nrnode) {
597 		rp = rp4freelist;
598 		rp4_rmfree(rp);
599 		mutex_exit(&rp4freelist_lock);
600 
601 		vp = RTOV4(rp);
602 
603 		if (rp->r_flags & R4HASHED) {
604 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
605 			mutex_enter(&vp->v_lock);
606 			if (vp->v_count > 1) {
607 				VN_RELE_LOCKED(vp);
608 				mutex_exit(&vp->v_lock);
609 				rw_exit(&rp->r_hashq->r_lock);
610 				rw_enter(&rhtp->r_lock, RW_READER);
611 				goto start;
612 			}
613 			mutex_exit(&vp->v_lock);
614 			rp4_rmhash_locked(rp);
615 			rw_exit(&rp->r_hashq->r_lock);
616 		}
617 
618 		r4inactive(rp, cr);
619 
620 		mutex_enter(&vp->v_lock);
621 		if (vp->v_count > 1) {
622 			VN_RELE_LOCKED(vp);
623 			mutex_exit(&vp->v_lock);
624 			rw_enter(&rhtp->r_lock, RW_READER);
625 			goto start;
626 		}
627 		mutex_exit(&vp->v_lock);
628 		vn_invalid(vp);
629 
630 		/*
631 		 * destroy old locks before bzero'ing and
632 		 * recreating the locks below.
633 		 */
634 		uninit_rnode4(rp);
635 
636 		/*
637 		 * Make sure that if rnode is recycled then
638 		 * VFS count is decremented properly before
639 		 * reuse.
640 		 */
641 		VFS_RELE(vp->v_vfsp);
642 		vn_reinit(vp);
643 	} else {
644 		vnode_t *new_vp;
645 
646 		mutex_exit(&rp4freelist_lock);
647 
648 		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
649 		new_vp = vn_alloc(KM_SLEEP);
650 
651 		atomic_inc_ulong((ulong_t *)&rnode4_new);
652 #ifdef DEBUG
653 		clstat4_debug.nrnode.value.ui64++;
654 #endif
655 		vp = new_vp;
656 	}
657 
658 	bzero(rp, sizeof (*rp));
659 	rp->r_vnode = vp;
660 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
661 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
662 	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
663 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
664 	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
665 	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
666 	rp->created_v4 = 0;
667 	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
668 	    offsetof(nfs4_open_stream_t, os_node));
669 	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
670 	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
671 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
672 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
673 	rp->r_flags = R4READDIRWATTR;
674 	rp->r_fh = fh;
675 	rp->r_hashq = rhtp;
676 	sfh4_hold(rp->r_fh);
677 	rp->r_server = mi->mi_curr_serv;
678 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
679 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
680 	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
681 
682 	rddir4_cache_create(rp);
683 	rp->r_putapage = putapage;
684 	vn_setops(vp, vops);
685 	vp->v_data = (caddr_t)rp;
686 	vp->v_vfsp = vfsp;
687 	VFS_HOLD(vfsp);
688 	vp->v_type = VNON;
689 	vp->v_flag |= VMODSORT;
690 	if (isrootfh(fh, rp))
691 		vp->v_flag = VROOT;
692 	vn_exists(vp);
693 
694 	/*
695 	 * There is a race condition if someone else
696 	 * alloc's the rnode while no locks are held, so we
697 	 * check again and recover if found.
698 	 */
699 	rw_enter(&rhtp->r_lock, RW_WRITER);
700 	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
701 		vp = RTOV4(trp);
702 		*newnode = 0;
703 		rw_exit(&rhtp->r_lock);
704 		rp4_addfree(rp, cr);
705 		rw_enter(&rhtp->r_lock, RW_READER);
706 		return (vp);
707 	}
708 	rp4_addhash(rp);
709 	*newnode = 1;
710 	return (vp);
711 }
712 
713 static void
714 uninit_rnode4(rnode4_t *rp)
715 {
716 	vnode_t *vp = RTOV4(rp);
717 
718 	ASSERT(rp != NULL);
719 	ASSERT(vp != NULL);
720 	ASSERT(vp->v_count == 1);
721 	ASSERT(rp->r_count == 0);
722 	ASSERT(rp->r_mapcnt == 0);
723 	if (rp->r_flags & R4LODANGLERS) {
724 		nfs4_flush_lock_owners(rp);
725 	}
726 	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
727 	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
728 	ASSERT(!(rp->r_flags & R4HASHED));
729 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
730 	nfs4_clear_open_streams(rp);
731 	list_destroy(&rp->r_open_streams);
732 
733 	/*
734 	 * Destroy the rddir cache first since we need to grab the r_statelock.
735 	 */
736 	mutex_enter(&rp->r_statelock);
737 	rddir4_cache_destroy(rp);
738 	mutex_exit(&rp->r_statelock);
739 	sv_uninit(&rp->r_svnode);
740 	sfh4_rele(&rp->r_fh);
741 	nfs_rw_destroy(&rp->r_rwlock);
742 	nfs_rw_destroy(&rp->r_lkserlock);
743 	mutex_destroy(&rp->r_statelock);
744 	mutex_destroy(&rp->r_statev4_lock);
745 	mutex_destroy(&rp->r_os_lock);
746 	cv_destroy(&rp->r_cv);
747 	cv_destroy(&rp->r_commit.c_cv);
748 	nfs_rw_destroy(&rp->r_deleg_recall_lock);
749 	if (rp->r_flags & R4DELMAPLIST)
750 		list_destroy(&rp->r_indelmap);
751 }
752 
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 *
 * The caller hands in one vnode reference (v_count >= 1) on an rnode
 * that is not currently on the freelist.  That reference is either
 * dropped (if other references exist), consumed by destroying the
 * rnode, or kept as the freelist's reference.  cr is used for any
 * page flushing or close operations needed along the way.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * or if recovery failed on it or the filesystem has been
	 * unmounted, then just free it instead of putting it on the
	 * rnode freelist.  In every case this only applies when no
	 * i/o is in progress on the rnode (r_count == 0).
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				/*
				 * Someone else holds the vnode; drop our
				 * reference and let them deal with it.
				 */
				VN_RELE_LOCKED(vp);
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 *
	 * Each of the checks below that has to drop the hash queue
	 * lock to do its cleanup jumps back here afterwards so that
	 * all of the checks are redone under the lock.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are no more references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		/*
		 * When running in a zone other than the mount's zone
		 * the open streams are only cleared locally; otherwise
		 * the files are actually closed.
		 */
		if (nfs_zone() != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there are no pages.
	 * If there are, flush and commit all of the dirty and
	 * uncommitted pages, assuming the file system isn't read only.
	 */
	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
		rw_exit(&rp->r_hashq->r_lock);
		r4flushpages(rp, cr);
		goto again;
	}

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached, the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.
	 */
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * Link the rnode in just before the current head, i.e. at the
	 * tail of the circular list.  If there is no cached data or
	 * metadata for this file, then also make it the new head so
	 * that it will be reused before other rnodes which may have
	 * cached data or metadata associated with them.
	 */
	mutex_enter(&rp4freelist_lock);
	if (rp4freelist == NULL) {
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rp4freelist = rp;
	} else {
		rp->r_freef = rp4freelist;
		rp->r_freeb = rp4freelist->r_freeb;
		rp4freelist->r_freeb->r_freef = rp;
		rp4freelist->r_freeb = rp;
		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
			rp4freelist = rp;
	}
	mutex_exit(&rp4freelist_lock);

	rw_exit(&rp->r_hashq->r_lock);

	/* Drop the xattr directory hold only after all locks are released. */
	if (xattr)
		VN_RELE(xattr);
}
939 
940 /*
941  * Remove an rnode from the free list.
942  *
943  * The caller must be holding rp4freelist_lock and the rnode
944  * must be on the freelist.
945  */
946 static void
947 rp4_rmfree(rnode4_t *rp)
948 {
949 
950 	ASSERT(MUTEX_HELD(&rp4freelist_lock));
951 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
952 
953 	if (rp == rp4freelist) {
954 		rp4freelist = rp->r_freef;
955 		if (rp == rp4freelist)
956 			rp4freelist = NULL;
957 	}
958 	rp->r_freeb->r_freef = rp->r_freef;
959 	rp->r_freef->r_freeb = rp->r_freeb;
960 
961 	rp->r_freef = rp->r_freeb = NULL;
962 }
963 
964 /*
965  * Put a rnode in the hash table.
966  *
967  * The caller must be holding the exclusive hash queue lock
968  */
969 void
970 rp4_addhash(rnode4_t *rp)
971 {
972 	mntinfo4_t *mi;
973 
974 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
975 	ASSERT(!(rp->r_flags & R4HASHED));
976 
977 #ifdef DEBUG
978 	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
979 #endif
980 
981 	rp->r_hashf = rp->r_hashq->r_hashf;
982 	rp->r_hashq->r_hashf = rp;
983 	rp->r_hashb = (rnode4_t *)rp->r_hashq;
984 	rp->r_hashf->r_hashb = rp;
985 
986 	mutex_enter(&rp->r_statelock);
987 	rp->r_flags |= R4HASHED;
988 	mutex_exit(&rp->r_statelock);
989 
990 	mi = VTOMI4(RTOV4(rp));
991 	mutex_enter(&mi->mi_rnodes_lock);
992 	list_insert_tail(&mi->mi_rnodes, rp);
993 	mutex_exit(&mi->mi_rnodes_lock);
994 }
995 
996 /*
997  * Remove a rnode from the hash table.
998  *
999  * The caller must be holding the hash queue lock.
1000  */
1001 void
1002 rp4_rmhash_locked(rnode4_t *rp)
1003 {
1004 	mntinfo4_t *mi;
1005 
1006 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
1007 	ASSERT(rp->r_flags & R4HASHED);
1008 
1009 	rp->r_hashb->r_hashf = rp->r_hashf;
1010 	rp->r_hashf->r_hashb = rp->r_hashb;
1011 
1012 	mutex_enter(&rp->r_statelock);
1013 	rp->r_flags &= ~R4HASHED;
1014 	mutex_exit(&rp->r_statelock);
1015 
1016 	mi = VTOMI4(RTOV4(rp));
1017 	mutex_enter(&mi->mi_rnodes_lock);
1018 	if (list_link_active(&rp->r_mi_link))
1019 		list_remove(&mi->mi_rnodes, rp);
1020 	mutex_exit(&mi->mi_rnodes_lock);
1021 }
1022 
1023 /*
1024  * Remove a rnode from the hash table.
1025  *
1026  * The caller must not be holding the hash queue lock.
1027  */
1028 void
1029 rp4_rmhash(rnode4_t *rp)
1030 {
1031 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1032 	rp4_rmhash_locked(rp);
1033 	rw_exit(&rp->r_hashq->r_lock);
1034 }
1035 
/*
 * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
 * Returns NULL if no match.  If an rnode is returned, the caller owns a
 * reference on the master vnode: either a new hold, or the reference
 * that was keeping the rnode on the freelist (the rnode is removed from
 * the freelist in that case).
 *
 * The caller must be holding the hash queue lock, either shared or exclusive.
 */
rnode4_t *
r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
	rnode4_t *rp;
	vnode_t *vp;

	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));

	/* The bucket header doubles as the list terminator. */
	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
		vp = RTOV4(rp);
		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {

			/* Skip rnodes on which recovery has failed. */
			mutex_enter(&rp->r_statelock);
			if (rp->r_flags & R4RECOVERR) {
				mutex_exit(&rp->r_statelock);
				continue;
			}
			mutex_exit(&rp->r_statelock);
#ifdef DEBUG
			r4_dup_check(rp, vfsp);
#endif
			/*
			 * Unlocked peek first; the test is repeated under
			 * rp4freelist_lock because, with the bucket lock
			 * possibly held shared, another thread may be
			 * racing to take the rnode off the freelist.
			 */
			if (rp->r_freef != NULL) {
				mutex_enter(&rp4freelist_lock);
				/*
				 * If the rnode is on the freelist,
				 * then remove it and use that reference
				 * as the new reference.  Otherwise,
				 * need to increment the reference count.
				 */
				if (rp->r_freef != NULL) {
					rp4_rmfree(rp);
					mutex_exit(&rp4freelist_lock);
				} else {
					mutex_exit(&rp4freelist_lock);
					VN_HOLD(vp);
				}
			} else
				VN_HOLD(vp);

			/*
			 * if root vnode, set v_flag to indicate that
			 */
			if (isrootfh(fh, rp)) {
				if (!(vp->v_flag & VROOT)) {
					mutex_enter(&vp->v_lock);
					vp->v_flag |= VROOT;
					mutex_exit(&vp->v_lock);
				}
			}
			return (rp);
		}
	}
	return (NULL);
}
1097 
1098 /*
1099  * Lookup an rnode by fhandle. Just a wrapper for r4find()
1100  * that assumes the caller hasn't already got the lock
1101  * on the hash bucket.
1102  */
1103 rnode4_t *
1104 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1105 {
1106 	rnode4_t *rp;
1107 	int index;
1108 
1109 	index = rtable4hash(fh);
1110 	rw_enter(&rtable4[index].r_lock, RW_READER);
1111 	rp = r4find(&rtable4[index], fh, vfsp);
1112 	rw_exit(&rtable4[index].r_lock);
1113 
1114 	return (rp);
1115 }
1116 
1117 /*
1118  * Return 1 if there is an active vnode belonging to this vfs in the
1119  * rtable4 cache.
1120  *
1121  * Several of these checks are done without holding the usual
1122  * locks.  This is safe because destroy_rtable4(), rp4_addfree(),
1123  * etc. will redo the necessary checks before actually destroying
1124  * any rnodes.
1125  */
1126 int
1127 check_rtable4(struct vfs *vfsp)
1128 {
1129 	rnode4_t *rp;
1130 	vnode_t *vp;
1131 	mntinfo4_t *mi;
1132 
1133 	ASSERT(vfsp != NULL);
1134 	mi = VFTOMI4(vfsp);
1135 
1136 	mutex_enter(&mi->mi_rnodes_lock);
1137 	for (rp = list_head(&mi->mi_rnodes); rp != NULL;
1138 	    rp = list_next(&mi->mi_rnodes, rp)) {
1139 		vp = RTOV4(rp);
1140 
1141 		if (rp->r_freef == NULL ||
1142 		    (nfs4_has_pages(vp) && (rp->r_flags & R4DIRTY)) ||
1143 		    rp->r_count > 0) {
1144 			mutex_exit(&mi->mi_rnodes_lock);
1145 			return (1);
1146 		}
1147 	}
1148 	mutex_exit(&mi->mi_rnodes_lock);
1149 
1150 	return (0);
1151 }
1152 
1153 /*
1154  * Destroy inactive vnodes from the hash queues which
1155  * belong to this vfs. All of the vnodes should be inactive.
1156  * It is essential that we destroy all rnodes in case of
1157  * forced unmount as well as in normal unmount case.
1158  */
1159 
1160 void
1161 destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1162 {
1163 	rnode4_t *rp;
1164 	mntinfo4_t *mi;
1165 
1166 	ASSERT(vfsp != NULL);
1167 
1168 	mi = VFTOMI4(vfsp);
1169 
1170 	mutex_enter(&rp4freelist_lock);
1171 	mutex_enter(&mi->mi_rnodes_lock);
1172 	while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
1173 		/*
1174 		 * If the rnode is no longer on the freelist it is not
1175 		 * ours and it will be handled by some other thread, so
1176 		 * skip it.
1177 		 */
1178 		if (rp->r_freef == NULL)
1179 			continue;
1180 		mutex_exit(&mi->mi_rnodes_lock);
1181 
1182 		rp4_rmfree(rp);
1183 		mutex_exit(&rp4freelist_lock);
1184 
1185 		rp4_rmhash(rp);
1186 
1187 		/*
1188 		 * This call to rp4_addfree will end up destroying the
1189 		 * rnode, but in a safe way with the appropriate set
1190 		 * of checks done.
1191 		 */
1192 		rp4_addfree(rp, cr);
1193 
1194 		mutex_enter(&rp4freelist_lock);
1195 		mutex_enter(&mi->mi_rnodes_lock);
1196 	}
1197 	mutex_exit(&mi->mi_rnodes_lock);
1198 	mutex_exit(&rp4freelist_lock);
1199 }
1200 
1201 /*
1202  * This routine destroys all the resources of an rnode
1203  * and finally the rnode itself.
1204  */
1205 static void
1206 destroy_rnode4(rnode4_t *rp)
1207 {
1208 	vnode_t *vp;
1209 	vfs_t *vfsp;
1210 
1211 	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1212 
1213 	vp = RTOV4(rp);
1214 	vfsp = vp->v_vfsp;
1215 
1216 	uninit_rnode4(rp);
1217 	atomic_dec_ulong((ulong_t *)&rnode4_new);
1218 #ifdef DEBUG
1219 	clstat4_debug.nrnode.value.ui64--;
1220 #endif
1221 	kmem_cache_free(rnode4_cache, rp);
1222 	vn_invalid(vp);
1223 	vn_free(vp);
1224 	VFS_RELE(vfsp);
1225 }
1226 
1227 /*
1228  * Invalidate the attributes on all rnodes forcing the next getattr
1229  * to go over the wire.  Used to flush stale uid and gid mappings.
1230  * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1231  */
1232 void
1233 nfs4_rnode_invalidate(struct vfs *vfsp)
1234 {
1235 	int index;
1236 	rnode4_t *rp;
1237 	vnode_t *vp;
1238 
1239 	/*
1240 	 * Walk the hash queues looking for rnodes.
1241 	 */
1242 	for (index = 0; index < rtable4size; index++) {
1243 		rw_enter(&rtable4[index].r_lock, RW_READER);
1244 		for (rp = rtable4[index].r_hashf;
1245 		    rp != (rnode4_t *)(&rtable4[index]);
1246 		    rp = rp->r_hashf) {
1247 			vp = RTOV4(rp);
1248 			if (vfsp != NULL && vp->v_vfsp != vfsp)
1249 				continue;
1250 
1251 			if (!mutex_tryenter(&rp->r_statelock))
1252 				continue;
1253 
1254 			/*
1255 			 * Expire the attributes by resetting the change
1256 			 * and attr timeout.
1257 			 */
1258 			rp->r_change = 0;
1259 			PURGE_ATTRCACHE4_LOCKED(rp);
1260 			mutex_exit(&rp->r_statelock);
1261 		}
1262 		rw_exit(&rtable4[index].r_lock);
1263 	}
1264 }
1265 
1266 /*
1267  * Flush all vnodes in this (or every) vfs.
1268  * Used by nfs_sync and by nfs_unmount.
1269  */
1270 void
1271 r4flush(struct vfs *vfsp, cred_t *cr)
1272 {
1273 	int index;
1274 	rnode4_t *rp;
1275 	vnode_t *vp, **vplist;
1276 	long num, cnt;
1277 
1278 	/*
1279 	 * Check to see whether there is anything to do.
1280 	 */
1281 	num = rnode4_new;
1282 	if (num == 0)
1283 		return;
1284 
1285 	/*
1286 	 * Allocate a slot for all currently active rnodes on the
1287 	 * supposition that they all may need flushing.
1288 	 */
1289 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1290 	cnt = 0;
1291 
1292 	/*
1293 	 * If the vfs is known we can do fast path by iterating all rnodes that
1294 	 * belongs to this vfs.  This is much faster than the traditional way
1295 	 * of iterating rtable4 (below) in a case there is a lot of rnodes that
1296 	 * does not belong to our vfs.
1297 	 */
1298 	if (vfsp != NULL) {
1299 		mntinfo4_t *mi = VFTOMI4(vfsp);
1300 
1301 		mutex_enter(&mi->mi_rnodes_lock);
1302 		for (rp = list_head(&mi->mi_rnodes); rp != NULL;
1303 		    rp = list_next(&mi->mi_rnodes, rp)) {
1304 			vp = RTOV4(rp);
1305 			/*
1306 			 * Don't bother sync'ing a vp if it
1307 			 * is part of virtual swap device or
1308 			 * if VFS is read-only
1309 			 */
1310 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1311 				continue;
1312 			/*
1313 			 * If the vnode has pages and is marked as either dirty
1314 			 * or mmap'd, hold and add this vnode to the list of
1315 			 * vnodes to flush.
1316 			 */
1317 			ASSERT(vp->v_vfsp == vfsp);
1318 			if (nfs4_has_pages(vp) &&
1319 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1320 				VN_HOLD(vp);
1321 				vplist[cnt++] = vp;
1322 				if (cnt == num) {
1323 					/*
1324 					 * The vplist is full because there is
1325 					 * too many rnodes.  We are done for
1326 					 * now.
1327 					 */
1328 					break;
1329 				}
1330 			}
1331 		}
1332 		mutex_exit(&mi->mi_rnodes_lock);
1333 
1334 		goto done;
1335 	}
1336 
1337 	ASSERT(vfsp == NULL);
1338 
1339 	/*
1340 	 * Walk the hash queues looking for rnodes with page
1341 	 * lists associated with them.  Make a list of these
1342 	 * files.
1343 	 */
1344 	for (index = 0; index < rtable4size; index++) {
1345 		rw_enter(&rtable4[index].r_lock, RW_READER);
1346 		for (rp = rtable4[index].r_hashf;
1347 		    rp != (rnode4_t *)(&rtable4[index]);
1348 		    rp = rp->r_hashf) {
1349 			vp = RTOV4(rp);
1350 			/*
1351 			 * Don't bother sync'ing a vp if it
1352 			 * is part of virtual swap device or
1353 			 * if VFS is read-only
1354 			 */
1355 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1356 				continue;
1357 			/*
1358 			 * If the vnode has pages and is marked as either dirty
1359 			 * or mmap'd, hold and add this vnode to the list of
1360 			 * vnodes to flush.
1361 			 */
1362 			if (nfs4_has_pages(vp) &&
1363 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1364 				VN_HOLD(vp);
1365 				vplist[cnt++] = vp;
1366 				if (cnt == num) {
1367 					rw_exit(&rtable4[index].r_lock);
1368 					/*
1369 					 * The vplist is full because there is
1370 					 * too many rnodes.  We are done for
1371 					 * now.
1372 					 */
1373 					goto done;
1374 				}
1375 			}
1376 		}
1377 		rw_exit(&rtable4[index].r_lock);
1378 	}
1379 
1380 done:
1381 
1382 	/*
1383 	 * Flush and release all of the files on the list.
1384 	 */
1385 	while (cnt-- > 0) {
1386 		vp = vplist[cnt];
1387 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
1388 		VN_RELE(vp);
1389 	}
1390 
1391 	/*
1392 	 * Free the space allocated to hold the list.
1393 	 */
1394 	kmem_free(vplist, num * sizeof (*vplist));
1395 }
1396 
1397 int
1398 nfs4_free_data_reclaim(rnode4_t *rp)
1399 {
1400 	char *contents;
1401 	vnode_t *xattr;
1402 	int size;
1403 	vsecattr_t *vsp;
1404 	int freed;
1405 	bool_t rdc = FALSE;
1406 
1407 	/*
1408 	 * Free any held caches which may
1409 	 * be associated with this rnode.
1410 	 */
1411 	mutex_enter(&rp->r_statelock);
1412 	if (rp->r_dir != NULL)
1413 		rdc = TRUE;
1414 	contents = rp->r_symlink.contents;
1415 	size = rp->r_symlink.size;
1416 	rp->r_symlink.contents = NULL;
1417 	vsp = rp->r_secattr;
1418 	rp->r_secattr = NULL;
1419 	xattr = rp->r_xattr_dir;
1420 	rp->r_xattr_dir = NULL;
1421 	mutex_exit(&rp->r_statelock);
1422 
1423 	/*
1424 	 * Free the access cache entries.
1425 	 */
1426 	freed = nfs4_access_purge_rp(rp);
1427 
1428 	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1429 		return (freed);
1430 
1431 	/*
1432 	 * Free the readdir cache entries, incompletely if we can't block.
1433 	 */
1434 	nfs4_purge_rddir_cache(RTOV4(rp));
1435 
1436 	/*
1437 	 * Free the symbolic link cache.
1438 	 */
1439 	if (contents != NULL) {
1440 
1441 		kmem_free((void *)contents, size);
1442 	}
1443 
1444 	/*
1445 	 * Free any cached ACL.
1446 	 */
1447 	if (vsp != NULL)
1448 		nfs4_acl_free_cache(vsp);
1449 
1450 	/*
1451 	 * Release the xattr directory vnode
1452 	 */
1453 	if (xattr != NULL)
1454 		VN_RELE(xattr);
1455 
1456 	return (1);
1457 }
1458 
1459 static int
1460 nfs4_active_data_reclaim(rnode4_t *rp)
1461 {
1462 	char *contents;
1463 	vnode_t *xattr = NULL;
1464 	int size;
1465 	vsecattr_t *vsp;
1466 	int freed;
1467 	bool_t rdc = FALSE;
1468 
1469 	/*
1470 	 * Free any held credentials and caches which
1471 	 * may be associated with this rnode.
1472 	 */
1473 	if (!mutex_tryenter(&rp->r_statelock))
1474 		return (0);
1475 	contents = rp->r_symlink.contents;
1476 	size = rp->r_symlink.size;
1477 	rp->r_symlink.contents = NULL;
1478 	vsp = rp->r_secattr;
1479 	rp->r_secattr = NULL;
1480 	if (rp->r_dir != NULL)
1481 		rdc = TRUE;
1482 	/*
1483 	 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
1484 	 * on the same r_hashq queue. We are not mandated to free all caches.
1485 	 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
1486 	 * rnode 'rp' is freed or put on the free list.
1487 	 *
1488 	 * We will retain NFS4_XATTR_DIR_NOTSUPP because:
1489 	 * - it has no associated rnode4_t (its v_data is NULL),
1490 	 * - it is preallocated statically and will never go away,
1491 	 * so we cannot save anything by releasing it.
1492 	 */
1493 	if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP &&
1494 	    VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) {
1495 		xattr = rp->r_xattr_dir;
1496 		rp->r_xattr_dir = NULL;
1497 	}
1498 	mutex_exit(&rp->r_statelock);
1499 
1500 	/*
1501 	 * Free the access cache entries.
1502 	 */
1503 	freed = nfs4_access_purge_rp(rp);
1504 
1505 	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1506 		return (freed);
1507 
1508 	/*
1509 	 * Free the symbolic link cache.
1510 	 */
1511 	if (contents != NULL) {
1512 
1513 		kmem_free((void *)contents, size);
1514 	}
1515 
1516 	/*
1517 	 * Free any cached ACL.
1518 	 */
1519 	if (vsp != NULL)
1520 		nfs4_acl_free_cache(vsp);
1521 
1522 	nfs4_purge_rddir_cache(RTOV4(rp));
1523 
1524 	/*
1525 	 * Release the xattr directory vnode
1526 	 */
1527 	if (xattr != NULL)
1528 		VN_RELE(xattr);
1529 
1530 	return (1);
1531 }
1532 
1533 static int
1534 nfs4_free_reclaim(void)
1535 {
1536 	int freed;
1537 	rnode4_t *rp;
1538 
1539 #ifdef DEBUG
1540 	clstat4_debug.f_reclaim.value.ui64++;
1541 #endif
1542 	freed = 0;
1543 	mutex_enter(&rp4freelist_lock);
1544 	rp = rp4freelist;
1545 	if (rp != NULL) {
1546 		do {
1547 			if (nfs4_free_data_reclaim(rp))
1548 				freed = 1;
1549 		} while ((rp = rp->r_freef) != rp4freelist);
1550 	}
1551 	mutex_exit(&rp4freelist_lock);
1552 	return (freed);
1553 }
1554 
1555 static int
1556 nfs4_active_reclaim(void)
1557 {
1558 	int freed;
1559 	int index;
1560 	rnode4_t *rp;
1561 
1562 #ifdef DEBUG
1563 	clstat4_debug.a_reclaim.value.ui64++;
1564 #endif
1565 	freed = 0;
1566 	for (index = 0; index < rtable4size; index++) {
1567 		rw_enter(&rtable4[index].r_lock, RW_READER);
1568 		for (rp = rtable4[index].r_hashf;
1569 		    rp != (rnode4_t *)(&rtable4[index]);
1570 		    rp = rp->r_hashf) {
1571 			if (nfs4_active_data_reclaim(rp))
1572 				freed = 1;
1573 		}
1574 		rw_exit(&rtable4[index].r_lock);
1575 	}
1576 	return (freed);
1577 }
1578 
1579 static int
1580 nfs4_rnode_reclaim(void)
1581 {
1582 	int freed;
1583 	rnode4_t *rp;
1584 	vnode_t *vp;
1585 
1586 #ifdef DEBUG
1587 	clstat4_debug.r_reclaim.value.ui64++;
1588 #endif
1589 	freed = 0;
1590 	mutex_enter(&rp4freelist_lock);
1591 	while ((rp = rp4freelist) != NULL) {
1592 		rp4_rmfree(rp);
1593 		mutex_exit(&rp4freelist_lock);
1594 		if (rp->r_flags & R4HASHED) {
1595 			vp = RTOV4(rp);
1596 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1597 			mutex_enter(&vp->v_lock);
1598 			if (vp->v_count > 1) {
1599 				VN_RELE_LOCKED(vp);
1600 				mutex_exit(&vp->v_lock);
1601 				rw_exit(&rp->r_hashq->r_lock);
1602 				mutex_enter(&rp4freelist_lock);
1603 				continue;
1604 			}
1605 			mutex_exit(&vp->v_lock);
1606 			rp4_rmhash_locked(rp);
1607 			rw_exit(&rp->r_hashq->r_lock);
1608 		}
1609 		/*
1610 		 * This call to rp_addfree will end up destroying the
1611 		 * rnode, but in a safe way with the appropriate set
1612 		 * of checks done.
1613 		 */
1614 		rp4_addfree(rp, CRED());
1615 		mutex_enter(&rp4freelist_lock);
1616 	}
1617 	mutex_exit(&rp4freelist_lock);
1618 	return (freed);
1619 }
1620 
/*
 * Reclaim callback registered for rnode4_cache.  Tries progressively
 * more aggressive steps and stops at the first one that reports having
 * freed something: cached data of freelist rnodes, then cached data of
 * hashed rnodes, then the freelist rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	if (!nfs4_free_reclaim() && !nfs4_active_reclaim())
		(void) nfs4_rnode_reclaim();
}
1636 
1637 /*
1638  * Returns the clientid4 to use for the given mntinfo4.  Note that the
1639  * clientid can change if the caller drops mi_recovlock.
1640  */
1641 
1642 clientid4
1643 mi2clientid(mntinfo4_t *mi)
1644 {
1645 	nfs4_server_t	*sp;
1646 	clientid4	clientid = 0;
1647 
1648 	/* this locks down sp if it is found */
1649 	sp = find_nfs4_server(mi);
1650 	if (sp != NULL) {
1651 		clientid = sp->clientid;
1652 		mutex_exit(&sp->s_lock);
1653 		nfs4_server_rele(sp);
1654 	}
1655 	return (clientid);
1656 }
1657 
1658 /*
1659  * Return the current lease time for the server associated with the given
1660  * file.  Note that the lease time could change immediately after this
1661  * call.
1662  */
1663 
1664 time_t
1665 r2lease_time(rnode4_t *rp)
1666 {
1667 	nfs4_server_t	*sp;
1668 	time_t		lease_time;
1669 	mntinfo4_t	*mi = VTOMI4(RTOV4(rp));
1670 
1671 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1672 
1673 	/* this locks down sp if it is found */
1674 	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1675 
1676 	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1677 		if (sp != NULL) {
1678 			mutex_exit(&sp->s_lock);
1679 			nfs4_server_rele(sp);
1680 		}
1681 		nfs_rw_exit(&mi->mi_recovlock);
1682 		return (1);		/* 1 second */
1683 	}
1684 
1685 	ASSERT(sp != NULL);
1686 
1687 	lease_time = sp->s_lease_time;
1688 
1689 	mutex_exit(&sp->s_lock);
1690 	nfs4_server_rele(sp);
1691 	nfs_rw_exit(&mi->mi_recovlock);
1692 
1693 	return (lease_time);
1694 }
1695 
1696 /*
1697  * Return a list with information about all the known open instances for
1698  * a filesystem. The caller must call r4releopenlist() when done with the
1699  * list.
1700  *
1701  * We are safe at looking at os_valid and os_pending_close across dropping
1702  * the 'os_sync_lock' to count up the number of open streams and then
1703  * allocate memory for the osp list due to:
1704  *	-Looking at os_pending_close is safe since this routine is
1705  *	only called via recovery, and os_pending_close can only be set via
1706  *	a non-recovery operation (which are all blocked when recovery
1707  *	is active).
1708  *
1709  *	-Examining os_valid is safe since non-recovery operations, which
1710  *	could potentially switch os_valid to 0, are blocked (via
1711  *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
1712  *	(which means we are the only recovery thread potentially acting
1713  *	on this open stream).
1714  */
1715 
1716 nfs4_opinst_t *
1717 r4mkopenlist(mntinfo4_t *mi)
1718 {
1719 	nfs4_opinst_t *reopenlist, *rep;
1720 	rnode4_t *rp;
1721 	vnode_t *vp;
1722 	vfs_t *vfsp = mi->mi_vfsp;
1723 	int numosp;
1724 	nfs4_open_stream_t *osp;
1725 	int index;
1726 	open_delegation_type4 dtype;
1727 	int hold_vnode;
1728 
1729 	reopenlist = NULL;
1730 
1731 	for (index = 0; index < rtable4size; index++) {
1732 		rw_enter(&rtable4[index].r_lock, RW_READER);
1733 		for (rp = rtable4[index].r_hashf;
1734 		    rp != (rnode4_t *)(&rtable4[index]);
1735 		    rp = rp->r_hashf) {
1736 
1737 			vp = RTOV4(rp);
1738 			if (vp->v_vfsp != vfsp)
1739 				continue;
1740 			hold_vnode = 0;
1741 
1742 			mutex_enter(&rp->r_os_lock);
1743 
1744 			/* Count the number of valid open_streams of the file */
1745 			numosp = 0;
1746 			for (osp = list_head(&rp->r_open_streams); osp != NULL;
1747 			    osp = list_next(&rp->r_open_streams, osp)) {
1748 				mutex_enter(&osp->os_sync_lock);
1749 				if (osp->os_valid && !osp->os_pending_close)
1750 					numosp++;
1751 				mutex_exit(&osp->os_sync_lock);
1752 			}
1753 
1754 			/* Fill in the valid open streams per vp */
1755 			if (numosp > 0) {
1756 				int j;
1757 
1758 				hold_vnode = 1;
1759 
1760 				/*
1761 				 * Add a new open instance to the list
1762 				 */
1763 				rep = kmem_zalloc(sizeof (*reopenlist),
1764 				    KM_SLEEP);
1765 				rep->re_next = reopenlist;
1766 				reopenlist = rep;
1767 
1768 				rep->re_vp = vp;
1769 				rep->re_osp = kmem_zalloc(
1770 				    numosp * sizeof (*(rep->re_osp)),
1771 				    KM_SLEEP);
1772 				rep->re_numosp = numosp;
1773 
1774 				j = 0;
1775 				for (osp = list_head(&rp->r_open_streams);
1776 				    osp != NULL;
1777 				    osp = list_next(&rp->r_open_streams, osp)) {
1778 
1779 					mutex_enter(&osp->os_sync_lock);
1780 					if (osp->os_valid &&
1781 					    !osp->os_pending_close) {
1782 						osp->os_ref_count++;
1783 						rep->re_osp[j] = osp;
1784 						j++;
1785 					}
1786 					mutex_exit(&osp->os_sync_lock);
1787 				}
1788 				/*
1789 				 * Assuming valid osp(s) stays valid between
1790 				 * the time obtaining j and numosp.
1791 				 */
1792 				ASSERT(j == numosp);
1793 			}
1794 
1795 			mutex_exit(&rp->r_os_lock);
1796 			/* do this here to keep v_lock > r_os_lock */
1797 			if (hold_vnode)
1798 				VN_HOLD(vp);
1799 			mutex_enter(&rp->r_statev4_lock);
1800 			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
1801 				/*
1802 				 * If this rnode holds a delegation,
1803 				 * but if there are no valid open streams,
1804 				 * then just discard the delegation
1805 				 * without doing delegreturn.
1806 				 */
1807 				if (numosp > 0)
1808 					rp->r_deleg_needs_recovery =
1809 					    rp->r_deleg_type;
1810 			}
1811 			/* Save the delegation type for use outside the lock */
1812 			dtype = rp->r_deleg_type;
1813 			mutex_exit(&rp->r_statev4_lock);
1814 
1815 			/*
1816 			 * If we have a delegation then get rid of it.
1817 			 * We've set rp->r_deleg_needs_recovery so we have
1818 			 * enough information to recover.
1819 			 */
1820 			if (dtype != OPEN_DELEGATE_NONE) {
1821 				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
1822 			}
1823 		}
1824 		rw_exit(&rtable4[index].r_lock);
1825 	}
1826 	return (reopenlist);
1827 }
1828 
1829 /*
1830  * Given a filesystem id, check to see if any rnodes
1831  * within this fsid reside in the rnode cache, other
1832  * than one we know about.
1833  *
1834  * Return 1 if an rnode is found, 0 otherwise
1835  */
1836 int
1837 r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid)
1838 {
1839 	rnode4_t *rp;
1840 	vnode_t *vp;
1841 	vfs_t *vfsp = mi->mi_vfsp;
1842 	fattr4_fsid *fsid;
1843 	int index, found = 0;
1844 
1845 	for (index = 0; index < rtable4size; index++) {
1846 		rw_enter(&rtable4[index].r_lock, RW_READER);
1847 		for (rp = rtable4[index].r_hashf;
1848 		    rp != (rnode4_t *)(&rtable4[index]);
1849 		    rp = rp->r_hashf) {
1850 
1851 			vp = RTOV4(rp);
1852 			if (vp->v_vfsp != vfsp)
1853 				continue;
1854 
1855 			/*
1856 			 * XXX there might be a case where a
1857 			 * replicated fs may have the same fsid
1858 			 * across two different servers. This
1859 			 * check isn't good enough in that case
1860 			 */
1861 			fsid = &rp->r_srv_fsid;
1862 			if (FATTR4_FSID_EQ(moved_fsid, fsid)) {
1863 				found = 1;
1864 				break;
1865 			}
1866 		}
1867 		rw_exit(&rtable4[index].r_lock);
1868 
1869 		if (found)
1870 			break;
1871 	}
1872 	return (found);
1873 }
1874 
1875 /*
1876  * Release the list of open instance references.
1877  */
1878 
1879 void
1880 r4releopenlist(nfs4_opinst_t *reopenp)
1881 {
1882 	nfs4_opinst_t *rep, *next;
1883 	int i;
1884 
1885 	for (rep = reopenp; rep; rep = next) {
1886 		next = rep->re_next;
1887 
1888 		for (i = 0; i < rep->re_numosp; i++)
1889 			open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1890 
1891 		VN_RELE(rep->re_vp);
1892 		kmem_free(rep->re_osp,
1893 		    rep->re_numosp * sizeof (*(rep->re_osp)));
1894 
1895 		kmem_free(rep, sizeof (*rep));
1896 	}
1897 }
1898 
1899 int
1900 nfs4_rnode_init(void)
1901 {
1902 	ulong_t nrnode4_max;
1903 	int i;
1904 
1905 	/*
1906 	 * Compute the size of the rnode4 hash table
1907 	 */
1908 	if (nrnode <= 0)
1909 		nrnode = ncsize;
1910 	nrnode4_max =
1911 	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1912 	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1913 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1914 		    "!setting nrnode to max value of %ld", nrnode4_max);
1915 		nrnode = nrnode4_max;
1916 	}
1917 	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1918 	rtable4mask = rtable4size - 1;
1919 
1920 	/*
1921 	 * Allocate and initialize the hash buckets
1922 	 */
1923 	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1924 	for (i = 0; i < rtable4size; i++) {
1925 		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1926 		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1927 		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1928 	}
1929 
1930 	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1931 	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1932 
1933 	return (0);
1934 }
1935 
1936 int
1937 nfs4_rnode_fini(void)
1938 {
1939 	int i;
1940 
1941 	/*
1942 	 * Deallocate the rnode hash queues
1943 	 */
1944 	kmem_cache_destroy(rnode4_cache);
1945 
1946 	for (i = 0; i < rtable4size; i++)
1947 		rw_destroy(&rtable4[i].r_lock);
1948 
1949 	kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1950 
1951 	return (0);
1952 }
1953 
1954 /*
1955  * Return non-zero if the given filehandle refers to the root filehandle
1956  * for the given rnode.
1957  */
1958 
1959 static int
1960 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1961 {
1962 	int isroot;
1963 
1964 	isroot = 0;
1965 	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1966 		isroot = 1;
1967 
1968 	return (isroot);
1969 }
1970 
1971 /*
1972  * The r4_stub_* routines assume that the rnode is newly activated, and
1973  * that the caller either holds the hash bucket r_lock for this rnode as
1974  * RW_WRITER, or holds r_statelock.
1975  */
1976 static void
1977 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
1978 {
1979 	vnode_t *vp = RTOV4(rp);
1980 	krwlock_t *hash_lock = &rp->r_hashq->r_lock;
1981 
1982 	ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));
1983 
1984 	rp->r_stub_type = type;
1985 
1986 	/*
1987 	 * Safely switch this vnode to the trigger vnodeops.
1988 	 *
1989 	 * Currently, we don't ever switch a trigger vnode back to using
1990 	 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
1991 	 * a new v4 object is not a trigger, and it will already have the
1992 	 * correct v4 vnodeops by default. So, no "else" case required here.
1993 	 */
1994 	if (type != NFS4_STUB_NONE)
1995 		vn_setops(vp, nfs4_trigger_vnodeops);
1996 }
1997 
1998 void
1999 r4_stub_mirrormount(rnode4_t *rp)
2000 {
2001 	r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
2002 }
2003 
2004 void
2005 r4_stub_referral(rnode4_t *rp)
2006 {
2007 	DTRACE_PROBE1(nfs4clnt__func__referral__moved,
2008 	    vnode_t *, RTOV4(rp));
2009 	r4_stub_set(rp, NFS4_STUB_REFERRAL);
2010 }
2011 
2012 void
2013 r4_stub_none(rnode4_t *rp)
2014 {
2015 	r4_stub_set(rp, NFS4_STUB_NONE);
2016 }
2017 
2018 #ifdef DEBUG
2019 
2020 /*
2021  * Look in the rnode table for other rnodes that have the same filehandle.
2022  * Assume the lock is held for the hash chain of checkrp
2023  */
2024 
2025 static void
2026 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
2027 {
2028 	rnode4_t *rp;
2029 	vnode_t *tvp;
2030 	nfs4_fhandle_t fh, fh2;
2031 	int index;
2032 
2033 	if (!r4_check_for_dups)
2034 		return;
2035 
2036 	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));
2037 
2038 	sfh4_copyval(checkrp->r_fh, &fh);
2039 
2040 	for (index = 0; index < rtable4size; index++) {
2041 
2042 		if (&rtable4[index] != checkrp->r_hashq)
2043 			rw_enter(&rtable4[index].r_lock, RW_READER);
2044 
2045 		for (rp = rtable4[index].r_hashf;
2046 		    rp != (rnode4_t *)(&rtable4[index]);
2047 		    rp = rp->r_hashf) {
2048 
2049 			if (rp == checkrp)
2050 				continue;
2051 
2052 			tvp = RTOV4(rp);
2053 			if (tvp->v_vfsp != vfsp)
2054 				continue;
2055 
2056 			sfh4_copyval(rp->r_fh, &fh2);
2057 			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
2058 				cmn_err(CE_PANIC, "rnodes with same fs, fh "
2059 				    "(%p, %p)", (void *)checkrp, (void *)rp);
2060 			}
2061 		}
2062 
2063 		if (&rtable4[index] != checkrp->r_hashq)
2064 			rw_exit(&rtable4[index].r_lock);
2065 	}
2066 }
2067 
2068 #endif /* DEBUG */
2069