xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs4_rnode.c (revision c138f478d2bc94e73ab8f6a084e323bec25e62f5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
29  *	All Rights Reserved
30  */
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/proc.h>
39 #include <sys/user.h>
40 #include <sys/time.h>
41 #include <sys/buf.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/socket.h>
45 #include <sys/uio.h>
46 #include <sys/tiuser.h>
47 #include <sys/swap.h>
48 #include <sys/errno.h>
49 #include <sys/debug.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/cmn_err.h>
53 #include <sys/vtrace.h>
54 #include <sys/session.h>
55 #include <sys/dnlc.h>
56 #include <sys/bitmap.h>
57 #include <sys/acl.h>
58 #include <sys/ddi.h>
59 #include <sys/pathname.h>
60 #include <sys/flock.h>
61 #include <sys/dirent.h>
62 #include <sys/flock.h>
63 #include <sys/callb.h>
64 
65 #include <rpc/types.h>
66 #include <rpc/xdr.h>
67 #include <rpc/auth.h>
68 #include <rpc/rpcsec_gss.h>
69 #include <rpc/clnt.h>
70 
71 #include <nfs/nfs.h>
72 #include <nfs/nfs_clnt.h>
73 #include <nfs/nfs_acl.h>
74 
75 #include <nfs/nfs4.h>
76 #include <nfs/rnode4.h>
77 #include <nfs/nfs4_clnt.h>
78 
79 /*
80  * The hash queues for the access to active and cached rnodes
81  * are organized as doubly linked lists.  A reader/writer lock
82  * for each hash bucket is used to control access and to synchronize
83  * lookups, additions, and deletions from the hash queue.
84  *
85  * The rnode freelist is organized as a doubly linked list with
86  * a head pointer.  Additions and deletions are synchronized via
87  * a single mutex.
88  *
89  * In order to add an rnode to the free list, it must be hashed into
90  * a hash queue and the exclusive lock to the hash queue be held.
91  * If an rnode is not hashed into a hash queue, then it is destroyed
92  * because it represents no valuable information that can be reused
93  * about the file.  The exclusive lock to the hash queue must be
94  * held in order to prevent a lookup in the hash queue from finding
95  * the rnode and using it and assuming that the rnode is not on the
96  * freelist.  The lookup in the hash queue will have the hash queue
97  * locked, either exclusive or shared.
98  *
99  * The vnode reference count for each rnode is not allowed to drop
100  * below 1.  This prevents external entities, such as the VM
101  * subsystem, from acquiring references to vnodes already on the
102  * freelist and then trying to place them back on the freelist
103  * when their reference is released.  This means that when an
104  * rnode is looked up in the hash queues, then either the rnode
105  * is removed from the freelist and that reference is transferred to
106  * the new reference or the vnode reference count must be incremented
107  * accordingly.  The mutex for the freelist must be held in order to
108  * accurately test to see if the rnode is on the freelist or not.
109  * The hash queue lock might be held shared and it is possible that
110  * two different threads may race to remove the rnode from the
111  * freelist.  This race can be resolved by holding the mutex for the
112  * freelist.  Please note that the mutex for the freelist does not
113  * need to be held if the rnode is not on the freelist.  It can not be
114  * placed on the freelist due to the requirement that the thread
115  * putting the rnode on the freelist must hold the exclusive lock
116  * to the hash queue and the thread doing the lookup in the hash
117  * queue is holding either a shared or exclusive lock to the hash
118  * queue.
119  *
120  * The lock ordering is:
121  *
122  *	hash bucket lock -> vnode lock
123  *	hash bucket lock -> freelist lock -> r_statelock
124  */
125 r4hashq_t *rtable4;
126 
127 static kmutex_t rp4freelist_lock;
128 static rnode4_t *rp4freelist = NULL;
129 static long rnode4_new = 0;
130 int rtable4size;
131 static int rtable4mask;
132 static struct kmem_cache *rnode4_cache;
133 static int rnode4_hashlen = 4;
134 
135 static void	r4inactive(rnode4_t *, cred_t *);
136 static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
137 		    struct vnodeops *,
138 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
139 		    cred_t *),
140 		    int *, cred_t *);
141 static void	rp4_rmfree(rnode4_t *);
142 int		nfs4_free_data_reclaim(rnode4_t *);
143 static int	nfs4_active_data_reclaim(rnode4_t *);
144 static int	nfs4_free_reclaim(void);
145 static int	nfs4_active_reclaim(void);
146 static int	nfs4_rnode_reclaim(void);
147 static void	nfs4_reclaim(void *);
148 static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
149 static void	uninit_rnode4(rnode4_t *);
150 static void	destroy_rnode4(rnode4_t *);
151 
152 #ifdef DEBUG
153 static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
154 static int nfs4_rnode_debug = 0;
155 /* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
156 static int nfs4_rnode_nofreelist = 0;
157 /* give messages on colliding shared filehandles */
158 static void	r4_dup_check(rnode4_t *, vfs_t *);
159 #endif
160 
161 /*
162  * Free the resources associated with an rnode.
163  */
164 static void
165 r4inactive(rnode4_t *rp, cred_t *cr)
166 {
167 	vnode_t *vp;
168 	char *contents;
169 	int size;
170 	vsecattr_t *vsp;
171 	vnode_t *xattr;
172 	int error;
173 
174 	/*
175 	 * Before freeing anything, wait until all asynchronous
176 	 * activity is done on this rnode.  This will allow all
177 	 * asynchronous read ahead and write behind i/o's to
178 	 * finish.
179 	 */
180 	mutex_enter(&rp->r_statelock);
181 	while (rp->r_count > 0)
182 		cv_wait(&rp->r_cv, &rp->r_statelock);
183 	mutex_exit(&rp->r_statelock);
184 
185 	/*
186 	 * Flush and invalidate all pages associated with the vnode.
187 	 */
188 	vp = RTOV4(rp);
189 	if (nfs4_has_pages(vp)) {
190 		ASSERT(vp->v_type != VCHR);
191 		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
192 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr);
193 			if (error && (error == ENOSPC || error == EDQUOT)) {
194 				mutex_enter(&rp->r_statelock);
195 				if (!rp->r_error)
196 					rp->r_error = error;
197 				mutex_exit(&rp->r_statelock);
198 			}
199 		}
200 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
201 	}
202 
203 	/*
204 	 * Free any held caches which may be
205 	 * associated with this rnode.
206 	 */
207 	mutex_enter(&rp->r_statelock);
208 	contents = rp->r_symlink.contents;
209 	size = rp->r_symlink.size;
210 	rp->r_symlink.contents = NULL;
211 	vsp = rp->r_secattr;
212 	rp->r_secattr = NULL;
213 	xattr = rp->r_xattr_dir;
214 	rp->r_xattr_dir = NULL;
215 	mutex_exit(&rp->r_statelock);
216 
217 	/*
218 	 * Free the access cache entries.
219 	 */
220 	(void) nfs4_access_purge_rp(rp);
221 
222 	/*
223 	 * Free the readdir cache entries.
224 	 */
225 	nfs4_purge_rddir_cache(vp);
226 
227 	/*
228 	 * Free the symbolic link cache.
229 	 */
230 	if (contents != NULL) {
231 
232 		kmem_free((void *)contents, size);
233 	}
234 
235 	/*
236 	 * Free any cached ACL.
237 	 */
238 	if (vsp != NULL)
239 		nfs4_acl_free_cache(vsp);
240 
241 	/*
242 	 * Release the cached xattr_dir
243 	 */
244 	if (xattr != NULL)
245 		VN_RELE(xattr);
246 }
247 
248 /*
249  * We have seen a case that the fh passed in is for "." which
250  * should be a VROOT node, however, the fh is different from the
251  * root fh stored in the mntinfo4_t. The invalid fh might be
252  * from a misbehaved server and will panic the client system at
253  * a later time. To avoid the panic, we drop the bad fh, use
254  * the root fh from mntinfo4_t, and print an error message
255  * for attention.
256  */
257 nfs4_sharedfh_t *
258 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
259     int *wasbad)
260 {
261 	char *s;
262 
263 	*wasbad = 0;
264 	s = fn_name(nm);
265 	ASSERT(strcmp(s, "..") != 0);
266 
267 	if ((s[0] == '.' && s[1] == '\0') && fh &&
268 					!SFH4_SAME(mi->mi_rootfh, fh)) {
269 #ifdef DEBUG
270 		nfs4_fhandle_t fhandle;
271 
272 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
273 		    "Server %s returns a different "
274 		    "root filehandle for the path %s:",
275 		    mi->mi_curr_serv->sv_hostname,
276 		    mi->mi_curr_serv->sv_path);
277 
278 		/* print the bad fh */
279 		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
280 		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
281 			fhandle.fh_len);
282 		nfs4_printfhandle(&fhandle);
283 
284 		/* print mi_rootfh */
285 		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
286 		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
287 			fhandle.fh_len);
288 		nfs4_printfhandle(&fhandle);
289 #endif
290 		/* use mi_rootfh instead; fh will be rele by the caller */
291 		fh = mi->mi_rootfh;
292 		*wasbad = 1;
293 	}
294 
295 	kmem_free(s, MAXNAMELEN);
296 	return (fh);
297 }
298 
299 void
300 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
301     hrtime_t t, cred_t *cr, int index)
302 {
303 	vattr_t *attr;
304 	/*
305 	 * Don't add to attrcache if time overflow, but
306 	 * no need to check because either attr is null or the time
307 	 * values in it were processed by nfs4_time_ntov(), which checks
308 	 * for time overflows.
309 	 */
310 	attr = garp ? &garp->n4g_va : NULL;
311 
312 	if (attr) {
313 		if (!newnode) {
314 			rw_exit(&rtable4[index].r_lock);
315 #ifdef DEBUG
316 			if (vp->v_type != attr->va_type &&
317 			    vp->v_type != VNON && attr->va_type != VNON) {
318 				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
319 					"makenfs4node: type (%d) doesn't "
320 					"match type of found node at %p (%d)",
321 					attr->va_type, (void *)vp, vp->v_type);
322 			}
323 #endif
324 			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
325 		} else {
326 			rnode4_t *rp = VTOR4(vp);
327 
328 			vp->v_type = attr->va_type;
329 			vp->v_rdev = attr->va_rdev;
330 
331 			/*
332 			 * Turn this object into a "stub" object if we
333 			 * crossed an underlying server fs boundary.  To
334 			 * make this check, during mount we save the
335 			 * fsid of the server object being mounted.
336 			 * Here we compare this object's server fsid
337 			 * with the fsid we saved at mount.  If they
338 			 * are different, we crossed server fs boundary.
339 			 *
340 			 * The stub flag is set (or not) at rnode
341 			 * creation time and it never changes for life
342 			 * of rnode.
343 			 *
344 			 * We don't bother with taking r_state_lock
345 			 * to set R4SRVSTUB flag because this is a new
346 			 * rnode and we're holding rtable lock.  No other
347 			 * thread could have obtained access to this
348 			 * rnode.
349 			 */
350 			if (garp->n4g_fsid_valid) {
351 				rp->r_srv_fsid = garp->n4g_fsid;
352 
353 				if (vp->v_type == VDIR) {
354 					servinfo4_t *svp = rp->r_server;
355 
356 					(void) nfs_rw_enter_sig(&svp->sv_lock,
357 								RW_READER, 0);
358 					if (!FATTR4_FSID_EQ(&garp->n4g_fsid,
359 							    &svp->sv_fsid)) {
360 						rp->r_flags |= R4SRVSTUB;
361 					}
362 					nfs_rw_exit(&svp->sv_lock);
363 				}
364 			}
365 
366 			/* Can not cache partial attr */
367 			if (attr->va_mask == AT_ALL)
368 				nfs4_attrcache_noinval(vp, garp, t);
369 			else
370 				PURGE_ATTRCACHE4(vp);
371 
372 			rw_exit(&rtable4[index].r_lock);
373 		}
374 	} else {
375 		if (newnode) {
376 			PURGE_ATTRCACHE4(vp);
377 		}
378 		rw_exit(&rtable4[index].r_lock);
379 	}
380 }
381 
382 /*
383  * Find or create an rnode based primarily on filehandle.  To be
384  * used when dvp (vnode for parent directory) is not available;
385  * otherwise, makenfs4node() should be used.
386  *
387  * The nfs4_fname_t argument *npp is consumed and nulled out.
388  */
389 
390 vnode_t *
391 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
392 	nfs4_fname_t **npp, nfs4_ga_res_t *garp,
393 	mntinfo4_t *mi, cred_t *cr, hrtime_t t)
394 {
395 	vfs_t *vfsp = mi->mi_vfsp;
396 	int newnode = 0;
397 	vnode_t *vp;
398 	rnode4_t *rp;
399 	svnode_t *svp;
400 	nfs4_fname_t *name;
401 	int index;
402 
403 	ASSERT(npp && *npp);
404 	name = *npp;
405 	*npp = NULL;
406 
407 	index = rtable4hash(sfh);
408 	rw_enter(&rtable4[index].r_lock, RW_READER);
409 
410 	rp = r4find(&rtable4[index], sfh, vfsp);
411 	if (rp != NULL) {
412 		rw_exit(&rtable4[index].r_lock);
413 		vp = RTOV4(rp);
414 		fn_rele(&name);
415 		return (vp);
416 	}
417 
418 	vp = make_rnode4(sfh, &rtable4[index], vfsp,
419 	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
420 	if (newnode) {
421 		svp = vtosv(vp);
422 		svp->sv_forw = svp->sv_back = svp;
423 		svp->sv_name = name;
424 		if (psfh != NULL)
425 			sfh4_hold(psfh);
426 		svp->sv_dfh = psfh;
427 	} else {
428 		fn_rele(&name);
429 	}
430 
431 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
432 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
433 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
434 
435 	return (vp);
436 }
437 
438 /*
439  * Find or create a vnode for the given filehandle, filesystem, parent, and
440  * name.  The reference to nm is consumed, so the caller must first do an
441  * fn_hold() if it wants to continue using nm after this call.
442  */
443 vnode_t *
444 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
445 	hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
446 {
447 	vnode_t *vp;
448 	int newnode;
449 	int index;
450 	mntinfo4_t *mi = VFTOMI4(vfsp);
451 	int had_badfh = 0;
452 	rnode4_t *rp;
453 
454 	ASSERT(dvp != NULL);
455 
456 	fh = badrootfh_check(fh, nm, mi, &had_badfh);
457 
458 	index = rtable4hash(fh);
459 	rw_enter(&rtable4[index].r_lock, RW_READER);
460 
461 	/*
462 	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
463 	 */
464 	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
465 	    nfs4_putapage, &newnode, cr);
466 
467 	rp = VTOR4(vp);
468 	sv_activate(&vp, dvp, &nm, newnode);
469 	if (dvp->v_flag & V_XATTRDIR) {
470 		mutex_enter(&rp->r_statelock);
471 		rp->r_flags |= R4ISXATTR;
472 		mutex_exit(&rp->r_statelock);
473 	}
474 
475 	/* if getting a bad file handle, do not cache the attributes. */
476 	if (had_badfh) {
477 		rw_exit(&rtable4[index].r_lock);
478 		return (vp);
479 	}
480 
481 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
482 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
483 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
484 
485 	return (vp);
486 }
487 
488 /*
489  * Hash on address of filehandle object.
490  * XXX totally untuned.
491  */
492 
493 int
494 rtable4hash(nfs4_sharedfh_t *fh)
495 {
496 	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
497 }
498 
499 /*
500  * Find or create the vnode for the given filehandle and filesystem.
501  * *newnode is set to zero if the vnode already existed; non-zero if it had
502  * to be created.
503  *
504  * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
505  */
506 
507 static vnode_t *
508 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
509     struct vnodeops *vops,
510     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
511     int *newnode, cred_t *cr)
512 {
513 	rnode4_t *rp;
514 	rnode4_t *trp;
515 	vnode_t *vp;
516 	mntinfo4_t *mi;
517 
518 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
519 
520 	mi = VFTOMI4(vfsp);
521 
522 start:
523 	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
524 		vp = RTOV4(rp);
525 		*newnode = 0;
526 		return (vp);
527 	}
528 	rw_exit(&rhtp->r_lock);
529 
530 	mutex_enter(&rp4freelist_lock);
531 
532 	if (rp4freelist != NULL && rnode4_new >= nrnode) {
533 		rp = rp4freelist;
534 		rp4_rmfree(rp);
535 		mutex_exit(&rp4freelist_lock);
536 
537 		vp = RTOV4(rp);
538 
539 		if (rp->r_flags & R4HASHED) {
540 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
541 			mutex_enter(&vp->v_lock);
542 			if (vp->v_count > 1) {
543 				vp->v_count--;
544 				mutex_exit(&vp->v_lock);
545 				rw_exit(&rp->r_hashq->r_lock);
546 				rw_enter(&rhtp->r_lock, RW_READER);
547 				goto start;
548 			}
549 			mutex_exit(&vp->v_lock);
550 			rp4_rmhash_locked(rp);
551 			rw_exit(&rp->r_hashq->r_lock);
552 		}
553 
554 		r4inactive(rp, cr);
555 
556 		mutex_enter(&vp->v_lock);
557 		if (vp->v_count > 1) {
558 			vp->v_count--;
559 			mutex_exit(&vp->v_lock);
560 			rw_enter(&rhtp->r_lock, RW_READER);
561 			goto start;
562 		}
563 		mutex_exit(&vp->v_lock);
564 		vn_invalid(vp);
565 
566 		/*
567 		 * destroy old locks before bzero'ing and
568 		 * recreating the locks below.
569 		 */
570 		uninit_rnode4(rp);
571 
572 		/*
573 		 * Make sure that if rnode is recycled then
574 		 * VFS count is decremented properly before
575 		 * reuse.
576 		 */
577 		VFS_RELE(vp->v_vfsp);
578 		vn_reinit(vp);
579 	} else {
580 		vnode_t *new_vp;
581 
582 		mutex_exit(&rp4freelist_lock);
583 
584 		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
585 		new_vp = vn_alloc(KM_SLEEP);
586 
587 		atomic_add_long((ulong_t *)&rnode4_new, 1);
588 #ifdef DEBUG
589 		clstat4_debug.nrnode.value.ui64++;
590 #endif
591 		vp = new_vp;
592 	}
593 
594 	bzero(rp, sizeof (*rp));
595 	rp->r_vnode = vp;
596 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
597 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
598 	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
599 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
600 	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
601 	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
602 	rp->created_v4 = 0;
603 	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
604 	    offsetof(nfs4_open_stream_t, os_node));
605 	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
606 	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
607 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
608 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
609 	rp->r_flags = R4READDIRWATTR;
610 	rp->r_fh = fh;
611 	rp->r_hashq = rhtp;
612 	sfh4_hold(rp->r_fh);
613 	rp->r_server = mi->mi_curr_serv;
614 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
615 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
616 	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
617 
618 	rddir4_cache_create(rp);
619 	rp->r_putapage = putapage;
620 	vn_setops(vp, vops);
621 	vp->v_data = (caddr_t)rp;
622 	vp->v_vfsp = vfsp;
623 	VFS_HOLD(vfsp);
624 	vp->v_type = VNON;
625 	if (isrootfh(fh, rp))
626 		vp->v_flag = VROOT;
627 	vn_exists(vp);
628 
629 	/*
630 	 * There is a race condition if someone else
631 	 * alloc's the rnode while no locks are held, so we
632 	 * check again and recover if found.
633 	 */
634 	rw_enter(&rhtp->r_lock, RW_WRITER);
635 	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
636 		vp = RTOV4(trp);
637 		*newnode = 0;
638 		rw_exit(&rhtp->r_lock);
639 		rp4_addfree(rp, cr);
640 		rw_enter(&rhtp->r_lock, RW_READER);
641 		return (vp);
642 	}
643 	rp4_addhash(rp);
644 	*newnode = 1;
645 	return (vp);
646 }
647 
648 static void
649 uninit_rnode4(rnode4_t *rp)
650 {
651 	vnode_t *vp = RTOV4(rp);
652 
653 	ASSERT(rp != NULL);
654 	ASSERT(vp != NULL);
655 	ASSERT(vp->v_count == 1);
656 	ASSERT(rp->r_count == 0);
657 	ASSERT(rp->r_mapcnt == 0);
658 	if (rp->r_flags & R4LODANGLERS) {
659 		nfs4_flush_lock_owners(rp);
660 	}
661 	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
662 	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
663 	ASSERT(!(rp->r_flags & R4HASHED));
664 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
665 	nfs4_clear_open_streams(rp);
666 	list_destroy(&rp->r_open_streams);
667 
668 	/*
669 	 * Destroy the rddir cache first since we need to grab the r_statelock.
670 	 */
671 	mutex_enter(&rp->r_statelock);
672 	rddir4_cache_destroy(rp);
673 	mutex_exit(&rp->r_statelock);
674 	sv_uninit(&rp->r_svnode);
675 	sfh4_rele(&rp->r_fh);
676 	nfs_rw_destroy(&rp->r_rwlock);
677 	nfs_rw_destroy(&rp->r_lkserlock);
678 	mutex_destroy(&rp->r_statelock);
679 	mutex_destroy(&rp->r_statev4_lock);
680 	mutex_destroy(&rp->r_os_lock);
681 	cv_destroy(&rp->r_cv);
682 	cv_destroy(&rp->r_commit.c_cv);
683 	nfs_rw_destroy(&rp->r_deleg_recall_lock);
684 	if (rp->r_flags & R4DELMAPLIST)
685 		list_destroy(&rp->r_indelmap);
686 }
687 
688 /*
689  * Put an rnode on the free list.
690  *
691  * Rnodes which were allocated above and beyond the normal limit
692  * are immediately freed.
693  */
694 void
695 rp4_addfree(rnode4_t *rp, cred_t *cr)
696 {
697 	vnode_t *vp;
698 	vnode_t *xattr;
699 	struct vfs *vfsp;
700 
701 	vp = RTOV4(rp);
702 	ASSERT(vp->v_count >= 1);
703 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
704 
705 	/*
706 	 * If we have too many rnodes allocated and there are no
707 	 * references to this rnode, or if the rnode is no longer
708 	 * accessible by it does not reside in the hash queues,
709 	 * or if an i/o error occurred while writing to the file,
710 	 * then just free it instead of putting it on the rnode
711 	 * freelist.
712 	 */
713 	vfsp = vp->v_vfsp;
714 	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
715 #ifdef DEBUG
716 	    (nfs4_rnode_nofreelist != 0) ||
717 #endif
718 	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
719 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
720 		if (rp->r_flags & R4HASHED) {
721 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
722 			mutex_enter(&vp->v_lock);
723 			if (vp->v_count > 1) {
724 				vp->v_count--;
725 				mutex_exit(&vp->v_lock);
726 				rw_exit(&rp->r_hashq->r_lock);
727 				return;
728 			}
729 			mutex_exit(&vp->v_lock);
730 			rp4_rmhash_locked(rp);
731 			rw_exit(&rp->r_hashq->r_lock);
732 		}
733 
734 		/*
735 		 * Make sure we don't have a delegation on this rnode
736 		 * before destroying it.
737 		 */
738 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
739 			(void) nfs4delegreturn(rp,
740 				NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
741 		}
742 
743 		r4inactive(rp, cr);
744 
745 		/*
746 		 * Recheck the vnode reference count.  We need to
747 		 * make sure that another reference has not been
748 		 * acquired while we were not holding v_lock.  The
749 		 * rnode is not in the rnode hash queues; one
750 		 * way for a reference to have been acquired
751 		 * is for a VOP_PUTPAGE because the rnode was marked
752 		 * with R4DIRTY or for a modified page.  This
753 		 * reference may have been acquired before our call
754 		 * to r4inactive.  The i/o may have been completed,
755 		 * thus allowing r4inactive to complete, but the
756 		 * reference to the vnode may not have been released
757 		 * yet.  In any case, the rnode can not be destroyed
758 		 * until the other references to this vnode have been
759 		 * released.  The other references will take care of
760 		 * either destroying the rnode or placing it on the
761 		 * rnode freelist.  If there are no other references,
762 		 * then the rnode may be safely destroyed.
763 		 */
764 		mutex_enter(&vp->v_lock);
765 		if (vp->v_count > 1) {
766 			vp->v_count--;
767 			mutex_exit(&vp->v_lock);
768 			return;
769 		}
770 		mutex_exit(&vp->v_lock);
771 
772 		destroy_rnode4(rp);
773 		return;
774 	}
775 
776 	/*
777 	 * Lock the hash queue and then recheck the reference count
778 	 * to ensure that no other threads have acquired a reference
779 	 * to indicate that the rnode should not be placed on the
780 	 * freelist.  If another reference has been acquired, then
781 	 * just release this one and let the other thread complete
782 	 * the processing of adding this rnode to the freelist.
783 	 */
784 again:
785 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
786 
787 	mutex_enter(&vp->v_lock);
788 	if (vp->v_count > 1) {
789 		vp->v_count--;
790 		mutex_exit(&vp->v_lock);
791 		rw_exit(&rp->r_hashq->r_lock);
792 		return;
793 	}
794 	mutex_exit(&vp->v_lock);
795 
796 	/*
797 	 * Make sure we don't put an rnode with a delegation
798 	 * on the free list.
799 	 */
800 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
801 		rw_exit(&rp->r_hashq->r_lock);
802 		(void) nfs4delegreturn(rp,
803 			NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
804 		goto again;
805 	}
806 
807 	/*
808 	 * Now that we have the hash queue lock, and we know there
809 	 * are not anymore references on the vnode, check to make
810 	 * sure there aren't any open streams still on the rnode.
811 	 * If so, drop the hash queue lock, remove the open streams,
812 	 * and recheck the v_count.
813 	 */
814 	mutex_enter(&rp->r_os_lock);
815 	if (list_head(&rp->r_open_streams) != NULL) {
816 		mutex_exit(&rp->r_os_lock);
817 		rw_exit(&rp->r_hashq->r_lock);
818 		if (nfs_zone() != VTOMI4(vp)->mi_zone)
819 			nfs4_clear_open_streams(rp);
820 		else
821 			(void) nfs4close_all(vp, cr);
822 		goto again;
823 	}
824 	mutex_exit(&rp->r_os_lock);
825 
826 	/*
827 	 * Before we put it on the freelist, make sure there is no
828 	 * active xattr directory cached, the freelist will not
829 	 * have its entries r4inactive'd if there is still an active
830 	 * rnode, thus nothing in the freelist can hold another
831 	 * rnode active.
832 	 */
833 	xattr = rp->r_xattr_dir;
834 	rp->r_xattr_dir = NULL;
835 
836 	/*
837 	 * If there is no cached data or metadata for this file, then
838 	 * put the rnode on the front of the freelist so that it will
839 	 * be reused before other rnodes which may have cached data or
840 	 * metadata associated with them.
841 	 */
842 	mutex_enter(&rp4freelist_lock);
843 	if (rp4freelist == NULL) {
844 		rp->r_freef = rp;
845 		rp->r_freeb = rp;
846 		rp4freelist = rp;
847 	} else {
848 		rp->r_freef = rp4freelist;
849 		rp->r_freeb = rp4freelist->r_freeb;
850 		rp4freelist->r_freeb->r_freef = rp;
851 		rp4freelist->r_freeb = rp;
852 		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
853 				rp->r_symlink.contents == NULL &&
854 				rp->r_secattr == NULL)
855 			rp4freelist = rp;
856 	}
857 	mutex_exit(&rp4freelist_lock);
858 
859 	rw_exit(&rp->r_hashq->r_lock);
860 
861 	if (xattr)
862 		VN_RELE(xattr);
863 }
864 
865 /*
866  * Remove an rnode from the free list.
867  *
868  * The caller must be holding rp4freelist_lock and the rnode
869  * must be on the freelist.
870  */
871 static void
872 rp4_rmfree(rnode4_t *rp)
873 {
874 
875 	ASSERT(MUTEX_HELD(&rp4freelist_lock));
876 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
877 
878 	if (rp == rp4freelist) {
879 		rp4freelist = rp->r_freef;
880 		if (rp == rp4freelist)
881 			rp4freelist = NULL;
882 	}
883 	rp->r_freeb->r_freef = rp->r_freef;
884 	rp->r_freef->r_freeb = rp->r_freeb;
885 
886 	rp->r_freef = rp->r_freeb = NULL;
887 }
888 
889 /*
890  * Put a rnode in the hash table.
891  *
892  * The caller must be holding the exclusive hash queue lock
893  */
894 void
895 rp4_addhash(rnode4_t *rp)
896 {
897 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
898 	ASSERT(!(rp->r_flags & R4HASHED));
899 
900 #ifdef DEBUG
901 	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
902 #endif
903 
904 	rp->r_hashf = rp->r_hashq->r_hashf;
905 	rp->r_hashq->r_hashf = rp;
906 	rp->r_hashb = (rnode4_t *)rp->r_hashq;
907 	rp->r_hashf->r_hashb = rp;
908 
909 	mutex_enter(&rp->r_statelock);
910 	rp->r_flags |= R4HASHED;
911 	mutex_exit(&rp->r_statelock);
912 }
913 
914 /*
915  * Remove a rnode from the hash table.
916  *
917  * The caller must be holding the hash queue lock.
918  */
919 void
920 rp4_rmhash_locked(rnode4_t *rp)
921 {
922 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
923 	ASSERT(rp->r_flags & R4HASHED);
924 
925 	rp->r_hashb->r_hashf = rp->r_hashf;
926 	rp->r_hashf->r_hashb = rp->r_hashb;
927 
928 	mutex_enter(&rp->r_statelock);
929 	rp->r_flags &= ~R4HASHED;
930 	mutex_exit(&rp->r_statelock);
931 }
932 
933 /*
934  * Remove a rnode from the hash table.
935  *
936  * The caller must not be holding the hash queue lock.
937  */
938 void
939 rp4_rmhash(rnode4_t *rp)
940 {
941 
942 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
943 	rp4_rmhash_locked(rp);
944 	rw_exit(&rp->r_hashq->r_lock);
945 }
946 
947 /*
948  * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
949  * Returns NULL if no match.  If an rnode is returned, the reference count
950  * on the master vnode is incremented.
951  *
952  * The caller must be holding the hash queue lock, either shared or exclusive.
953  */
954 rnode4_t *
955 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
956 {
957 	rnode4_t *rp;
958 	vnode_t *vp;
959 
960 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
961 
962 	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
963 		vp = RTOV4(rp);
964 		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
965 
966 			mutex_enter(&rp->r_statelock);
967 			if (rp->r_flags & R4RECOVERR) {
968 				mutex_exit(&rp->r_statelock);
969 				continue;
970 			}
971 			mutex_exit(&rp->r_statelock);
972 #ifdef DEBUG
973 			r4_dup_check(rp, vfsp);
974 #endif
975 			if (rp->r_freef != NULL) {
976 				mutex_enter(&rp4freelist_lock);
977 				/*
978 				 * If the rnode is on the freelist,
979 				 * then remove it and use that reference
980 				 * as the new reference.  Otherwise,
981 				 * need to increment the reference count.
982 				 */
983 				if (rp->r_freef != NULL) {
984 					rp4_rmfree(rp);
985 					mutex_exit(&rp4freelist_lock);
986 				} else {
987 					mutex_exit(&rp4freelist_lock);
988 					VN_HOLD(vp);
989 				}
990 			} else
991 				VN_HOLD(vp);
992 
993 			/*
994 			 * if root vnode, set v_flag to indicate that
995 			 */
996 			if (isrootfh(fh, rp)) {
997 				if (!(vp->v_flag & VROOT)) {
998 					mutex_enter(&vp->v_lock);
999 					vp->v_flag |= VROOT;
1000 					mutex_exit(&vp->v_lock);
1001 				}
1002 			}
1003 			return (rp);
1004 		}
1005 	}
1006 	return (NULL);
1007 }
1008 
1009 /*
1010  * Lookup an rnode by fhandle. Just a wrapper for r4find()
1011  * that assumes the caller hasn't already got the lock
1012  * on the hash bucket.
1013  */
1014 rnode4_t *
1015 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1016 {
1017 	rnode4_t *rp;
1018 	int index;
1019 
1020 	index = rtable4hash(fh);
1021 	rw_enter(&rtable4[index].r_lock, RW_READER);
1022 	rp = r4find(&rtable4[index], fh, vfsp);
1023 	rw_exit(&rtable4[index].r_lock);
1024 
1025 	return (rp);
1026 }
1027 
1028 /*
1029  * Return 1 if there is a active vnode belonging to this vfs in the
1030  * rtable4 cache.
1031  *
1032  * Several of these checks are done without holding the usual
1033  * locks.  This is safe because destroy_rtable(), rp_addfree(),
1034  * etc. will redo the necessary checks before actually destroying
1035  * any rnodes.
1036  */
1037 int
1038 check_rtable4(struct vfs *vfsp)
1039 {
1040 	rnode4_t *rp;
1041 	vnode_t *vp;
1042 	char *busy = NULL;
1043 	int index;
1044 
1045 	for (index = 0; index < rtable4size; index++) {
1046 		rw_enter(&rtable4[index].r_lock, RW_READER);
1047 
1048 		for (rp = rtable4[index].r_hashf;
1049 		    rp != (rnode4_t *)(&rtable4[index]);
1050 		    rp = rp->r_hashf) {
1051 
1052 			vp = RTOV4(rp);
1053 			if (vp->v_vfsp == vfsp) {
1054 				if (rp->r_freef == NULL) {
1055 					busy = "not on free list";
1056 				} else if (nfs4_has_pages(vp) &&
1057 					    (rp->r_flags & R4DIRTY)) {
1058 					busy = "dirty pages";
1059 				} else if (rp->r_count > 0) {
1060 					busy = "r_count > 0";
1061 				}
1062 
1063 				if (busy != NULL) {
1064 #ifdef DEBUG
1065 					char *path;
1066 
1067 					path = fn_path(rp->r_svnode.sv_name);
1068 					NFS4_DEBUG(nfs4_rnode_debug,
1069 					    (CE_NOTE, "check_rtable4: " "%s %s",
1070 					    path, busy));
1071 					kmem_free(path, strlen(path)+1);
1072 #endif
1073 					rw_exit(&rtable4[index].r_lock);
1074 					return (1);
1075 				}
1076 			}
1077 		}
1078 		rw_exit(&rtable4[index].r_lock);
1079 	}
1080 	return (0);
1081 }
1082 
1083 /*
1084  * Destroy inactive vnodes from the hash queues which
1085  * belong to this vfs. All of the vnodes should be inactive.
1086  * It is essential that we destory all rnodes in case of
1087  * forced unmount as well as in normal unmount case.
1088  */
1089 
1090 void
1091 destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1092 {
1093 	int index;
1094 	vnode_t *vp;
1095 	rnode4_t *rp, *r_hashf, *rlist;
1096 
1097 	rlist = NULL;
1098 
1099 	for (index = 0; index < rtable4size; index++) {
1100 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1101 		for (rp = rtable4[index].r_hashf;
1102 		    rp != (rnode4_t *)(&rtable4[index]);
1103 		    rp = r_hashf) {
1104 			/* save the hash pointer before destroying */
1105 			r_hashf = rp->r_hashf;
1106 
1107 			vp = RTOV4(rp);
1108 			if (vp->v_vfsp == vfsp) {
1109 				mutex_enter(&rp4freelist_lock);
1110 				if (rp->r_freef != NULL) {
1111 					rp4_rmfree(rp);
1112 					mutex_exit(&rp4freelist_lock);
1113 					rp4_rmhash_locked(rp);
1114 					rp->r_hashf = rlist;
1115 					rlist = rp;
1116 				} else
1117 					mutex_exit(&rp4freelist_lock);
1118 			}
1119 		}
1120 		rw_exit(&rtable4[index].r_lock);
1121 	}
1122 
1123 	for (rp = rlist; rp != NULL; rp = r_hashf) {
1124 		r_hashf = rp->r_hashf;
1125 		/*
1126 		 * This call to rp4_addfree will end up destroying the
1127 		 * rnode, but in a safe way with the appropriate set
1128 		 * of checks done.
1129 		 */
1130 		rp4_addfree(rp, cr);
1131 	}
1132 }
1133 
1134 /*
1135  * This routine destroys all the resources of an rnode
1136  * and finally the rnode itself.
1137  */
1138 static void
1139 destroy_rnode4(rnode4_t *rp)
1140 {
1141 	vnode_t *vp;
1142 	vfs_t *vfsp;
1143 
1144 	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1145 
1146 	vp = RTOV4(rp);
1147 	vfsp = vp->v_vfsp;
1148 
1149 	uninit_rnode4(rp);
1150 	atomic_add_long((ulong_t *)&rnode4_new, -1);
1151 #ifdef DEBUG
1152 	clstat4_debug.nrnode.value.ui64--;
1153 #endif
1154 	kmem_cache_free(rnode4_cache, rp);
1155 	vn_invalid(vp);
1156 	vn_free(vp);
1157 	VFS_RELE(vfsp);
1158 }
1159 
1160 /*
1161  * Invalidate the attributes on all rnodes forcing the next getattr
1162  * to go over the wire.  Used to flush stale uid and gid mappings.
1163  * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1164  */
1165 void
1166 nfs4_rnode_invalidate(struct vfs *vfsp)
1167 {
1168 	int index;
1169 	rnode4_t *rp;
1170 	vnode_t *vp;
1171 
1172 	/*
1173 	 * Walk the hash queues looking for rnodes.
1174 	 */
1175 	for (index = 0; index < rtable4size; index++) {
1176 		rw_enter(&rtable4[index].r_lock, RW_READER);
1177 		for (rp = rtable4[index].r_hashf;
1178 		    rp != (rnode4_t *)(&rtable4[index]);
1179 		    rp = rp->r_hashf) {
1180 			vp = RTOV4(rp);
1181 			if (vfsp != NULL && vp->v_vfsp != vfsp)
1182 				continue;
1183 
1184 			if (!mutex_tryenter(&rp->r_statelock))
1185 				continue;
1186 
1187 			/*
1188 			 * Expire the attributes by resetting the change
1189 			 * and attr timeout.
1190 			 */
1191 			rp->r_change = 0;
1192 			PURGE_ATTRCACHE4_LOCKED(rp);
1193 			mutex_exit(&rp->r_statelock);
1194 		}
1195 		rw_exit(&rtable4[index].r_lock);
1196 	}
1197 }
1198 
1199 /*
1200  * Flush all vnodes in this (or every) vfs.
1201  * Used by nfs_sync and by nfs_unmount.
1202  */
1203 void
1204 r4flush(struct vfs *vfsp, cred_t *cr)
1205 {
1206 	int index;
1207 	rnode4_t *rp;
1208 	vnode_t *vp, **vplist;
1209 	long num, cnt;
1210 
1211 	/*
1212 	 * Check to see whether there is anything to do.
1213 	 */
1214 	num = rnode4_new;
1215 	if (num == 0)
1216 		return;
1217 
1218 	/*
1219 	 * Allocate a slot for all currently active rnodes on the
1220 	 * supposition that they all may need flushing.
1221 	 */
1222 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1223 	cnt = 0;
1224 
1225 	/*
1226 	 * Walk the hash queues looking for rnodes with page
1227 	 * lists associated with them.  Make a list of these
1228 	 * files.
1229 	 */
1230 	for (index = 0; index < rtable4size; index++) {
1231 		rw_enter(&rtable4[index].r_lock, RW_READER);
1232 		for (rp = rtable4[index].r_hashf;
1233 		    rp != (rnode4_t *)(&rtable4[index]);
1234 		    rp = rp->r_hashf) {
1235 			vp = RTOV4(rp);
1236 			/*
1237 			 * Don't bother sync'ing a vp if it
1238 			 * is part of virtual swap device or
1239 			 * if VFS is read-only
1240 			 */
1241 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1242 				continue;
1243 			/*
1244 			 * If flushing all mounted file systems or
1245 			 * the vnode belongs to this vfs, has pages
1246 			 * and is marked as either dirty or mmap'd,
1247 			 * hold and add this vnode to the list of
1248 			 * vnodes to flush.
1249 			 */
1250 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
1251 			    nfs4_has_pages(vp) &&
1252 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1253 				VN_HOLD(vp);
1254 				vplist[cnt++] = vp;
1255 				if (cnt == num) {
1256 					rw_exit(&rtable4[index].r_lock);
1257 					goto toomany;
1258 				}
1259 			}
1260 		}
1261 		rw_exit(&rtable4[index].r_lock);
1262 	}
1263 toomany:
1264 
1265 	/*
1266 	 * Flush and release all of the files on the list.
1267 	 */
1268 	while (cnt-- > 0) {
1269 		vp = vplist[cnt];
1270 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr);
1271 		VN_RELE(vp);
1272 	}
1273 
1274 	/*
1275 	 * Free the space allocated to hold the list.
1276 	 */
1277 	kmem_free(vplist, num * sizeof (*vplist));
1278 }
1279 
1280 int
1281 nfs4_free_data_reclaim(rnode4_t *rp)
1282 {
1283 	char *contents;
1284 	vnode_t *xattr;
1285 	int size;
1286 	vsecattr_t *vsp;
1287 	int freed;
1288 	bool_t rdc = FALSE;
1289 
1290 	/*
1291 	 * Free any held caches which may
1292 	 * be associated with this rnode.
1293 	 */
1294 	mutex_enter(&rp->r_statelock);
1295 	if (rp->r_dir != NULL)
1296 		rdc = TRUE;
1297 	contents = rp->r_symlink.contents;
1298 	size = rp->r_symlink.size;
1299 	rp->r_symlink.contents = NULL;
1300 	vsp = rp->r_secattr;
1301 	rp->r_secattr = NULL;
1302 	xattr = rp->r_xattr_dir;
1303 	rp->r_xattr_dir = NULL;
1304 	mutex_exit(&rp->r_statelock);
1305 
1306 	/*
1307 	 * Free the access cache entries.
1308 	 */
1309 	freed = nfs4_access_purge_rp(rp);
1310 
1311 	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1312 		return (freed);
1313 
1314 	/*
1315 	 * Free the readdir cache entries, incompletely if we can't block.
1316 	 */
1317 	nfs4_purge_rddir_cache(RTOV4(rp));
1318 
1319 	/*
1320 	 * Free the symbolic link cache.
1321 	 */
1322 	if (contents != NULL) {
1323 
1324 		kmem_free((void *)contents, size);
1325 	}
1326 
1327 	/*
1328 	 * Free any cached ACL.
1329 	 */
1330 	if (vsp != NULL)
1331 		nfs4_acl_free_cache(vsp);
1332 
1333 	/*
1334 	 * Release the xattr directory vnode
1335 	 */
1336 	if (xattr != NULL)
1337 		VN_RELE(xattr);
1338 
1339 	return (1);
1340 }
1341 
1342 static int
1343 nfs4_active_data_reclaim(rnode4_t *rp)
1344 {
1345 	char *contents;
1346 	vnode_t *xattr;
1347 	int size;
1348 	vsecattr_t *vsp;
1349 	int freed;
1350 	bool_t rdc = FALSE;
1351 
1352 	/*
1353 	 * Free any held credentials and caches which
1354 	 * may be associated with this rnode.
1355 	 */
1356 	if (!mutex_tryenter(&rp->r_statelock))
1357 		return (0);
1358 	contents = rp->r_symlink.contents;
1359 	size = rp->r_symlink.size;
1360 	rp->r_symlink.contents = NULL;
1361 	vsp = rp->r_secattr;
1362 	rp->r_secattr = NULL;
1363 	if (rp->r_dir != NULL)
1364 		rdc = TRUE;
1365 	xattr = rp->r_xattr_dir;
1366 	rp->r_xattr_dir = NULL;
1367 	mutex_exit(&rp->r_statelock);
1368 
1369 	/*
1370 	 * Free the access cache entries.
1371 	 */
1372 	freed = nfs4_access_purge_rp(rp);
1373 
1374 	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1375 		return (freed);
1376 
1377 	/*
1378 	 * Free the symbolic link cache.
1379 	 */
1380 	if (contents != NULL) {
1381 
1382 		kmem_free((void *)contents, size);
1383 	}
1384 
1385 	/*
1386 	 * Free any cached ACL.
1387 	 */
1388 	if (vsp != NULL)
1389 		nfs4_acl_free_cache(vsp);
1390 
1391 	nfs4_purge_rddir_cache(RTOV4(rp));
1392 
1393 	/*
1394 	 * Release the xattr directory vnode
1395 	 */
1396 	if (xattr != NULL)
1397 		VN_RELE(xattr);
1398 
1399 	return (1);
1400 }
1401 
1402 static int
1403 nfs4_free_reclaim(void)
1404 {
1405 	int freed;
1406 	rnode4_t *rp;
1407 
1408 #ifdef DEBUG
1409 	clstat4_debug.f_reclaim.value.ui64++;
1410 #endif
1411 	freed = 0;
1412 	mutex_enter(&rp4freelist_lock);
1413 	rp = rp4freelist;
1414 	if (rp != NULL) {
1415 		do {
1416 			if (nfs4_free_data_reclaim(rp))
1417 				freed = 1;
1418 		} while ((rp = rp->r_freef) != rp4freelist);
1419 	}
1420 	mutex_exit(&rp4freelist_lock);
1421 	return (freed);
1422 }
1423 
1424 static int
1425 nfs4_active_reclaim(void)
1426 {
1427 	int freed;
1428 	int index;
1429 	rnode4_t *rp;
1430 
1431 #ifdef DEBUG
1432 	clstat4_debug.a_reclaim.value.ui64++;
1433 #endif
1434 	freed = 0;
1435 	for (index = 0; index < rtable4size; index++) {
1436 		rw_enter(&rtable4[index].r_lock, RW_READER);
1437 		for (rp = rtable4[index].r_hashf;
1438 		    rp != (rnode4_t *)(&rtable4[index]);
1439 		    rp = rp->r_hashf) {
1440 			if (nfs4_active_data_reclaim(rp))
1441 				freed = 1;
1442 		}
1443 		rw_exit(&rtable4[index].r_lock);
1444 	}
1445 	return (freed);
1446 }
1447 
/*
 * Drain the rnode free list.  Each rnode is taken off the free list
 * and, unless its vnode turns out to have additional references, is
 * removed from its hash queue and handed to rp4_addfree().
 *
 * rp4freelist_lock is dropped while an individual rnode is being
 * processed and re-taken before the list head is examined again.
 *
 * NOTE(review): 'freed' is initialized to 0 and never modified, so
 * this function always returns 0.  The only caller visible in this
 * file, nfs4_reclaim(), discards the return value.
 */
static int
nfs4_rnode_reclaim(void)
{
	int freed;
	rnode4_t *rp;
	vnode_t *vp;

#ifdef DEBUG
	clstat4_debug.r_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rp4freelist_lock);
	while ((rp = rp4freelist) != NULL) {
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);
		if (rp->r_flags & R4HASHED) {
			vp = RTOV4(rp);
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				/*
				 * More than one reference remains on the
				 * vnode: drop one, leave the rnode hashed
				 * and move on to the next free list entry.
				 */
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				mutex_enter(&rp4freelist_lock);
				continue;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}
		/*
		 * This call to rp4_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp4_addfree(rp, CRED());
		mutex_enter(&rp4freelist_lock);
	}
	mutex_exit(&rp4freelist_lock);
	return (freed);
}
1489 
/*
 * Reclaim routine handed to kmem_cache_create() for rnode4_cache in
 * nfs4_rnode_init().  The three reclaim stages are tried in order,
 * stopping at the first one that reports having freed something:
 * cached data of rnodes on the free list, cached data of rnodes in
 * the hash queues, and finally the free list rnodes themselves.
 * The argument is not used.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	/* short-circuit: a later stage runs only if the earlier one failed */
	if (nfs4_free_reclaim() || nfs4_active_reclaim())
		return;

	(void) nfs4_rnode_reclaim();
}
1506 
1507 /*
1508  * Returns the clientid4 to use for the given mntinfo4.  Note that the
1509  * clientid can change if the caller drops mi_recovlock.
1510  */
1511 
1512 clientid4
1513 mi2clientid(mntinfo4_t *mi)
1514 {
1515 	nfs4_server_t	*sp;
1516 	clientid4	clientid = 0;
1517 
1518 	/* this locks down sp if it is found */
1519 	sp = find_nfs4_server(mi);
1520 	if (sp != NULL) {
1521 		clientid = sp->clientid;
1522 		mutex_exit(&sp->s_lock);
1523 		nfs4_server_rele(sp);
1524 	}
1525 	return (clientid);
1526 }
1527 
1528 /*
1529  * Return the current lease time for the server associated with the given
1530  * file.  Note that the lease time could change immediately after this
1531  * call.
1532  */
1533 
1534 time_t
1535 r2lease_time(rnode4_t *rp)
1536 {
1537 	nfs4_server_t	*sp;
1538 	time_t		lease_time;
1539 	mntinfo4_t	*mi = VTOMI4(RTOV4(rp));
1540 
1541 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1542 
1543 	/* this locks down sp if it is found */
1544 	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1545 
1546 	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1547 		if (sp != NULL) {
1548 			mutex_exit(&sp->s_lock);
1549 			nfs4_server_rele(sp);
1550 		}
1551 		nfs_rw_exit(&mi->mi_recovlock);
1552 		return (1);		/* 1 second */
1553 	}
1554 
1555 	ASSERT(sp != NULL);
1556 
1557 	lease_time = sp->s_lease_time;
1558 
1559 	mutex_exit(&sp->s_lock);
1560 	nfs4_server_rele(sp);
1561 	nfs_rw_exit(&mi->mi_recovlock);
1562 
1563 	return (lease_time);
1564 }
1565 
/*
 * Return a list with information about all the known open instances for
 * a filesystem. The caller must call r4releopenlist() when done with the
 * list.
 *
 * One list entry is created per rnode of mi's vfs that has at least one
 * open stream with os_valid set and os_pending_close clear.  The entry
 * records the vnode (with a hold taken via VN_HOLD) and an array of
 * those open streams, each with its os_ref_count incremented.  New
 * entries are pushed on the front of the list.  NULL is returned when
 * no such rnode exists.
 *
 * As a side effect, every rnode of the vfs that holds a delegation has
 * it discarded via nfs4delegreturn(rp, NFS4_DR_DISCARD); if the rnode
 * also has valid open streams, r_deleg_needs_recovery is first set to
 * the delegation type.
 *
 * We are safe at looking at os_valid and os_pending_close across dropping
 * the 'os_sync_lock' to count up the number of open streams and then
 * allocate memory for the osp list due to:
 *	-Looking at os_pending_close is safe since this routine is
 *	only called via recovery, and os_pending_close can only be set via
 *	a non-recovery operation (which are all blocked when recovery
 *	is active).
 *
 *	-Examining os_valid is safe since non-recovery operations, which
 *	could potentially switch os_valid to 0, are blocked (via
 *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
 *	(which means we are the only recovery thread potentially acting
 *	on this open stream).
 */

nfs4_opinst_t *
r4mkopenlist(mntinfo4_t *mi)
{
	nfs4_opinst_t *reopenlist, *rep;
	rnode4_t *rp;
	vnode_t *vp;
	vfs_t *vfsp = mi->mi_vfsp;
	int numosp;
	nfs4_open_stream_t *osp;
	int index;
	open_delegation_type4 dtype;
	int hold_vnode;

	reopenlist = NULL;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			/* Only rnodes of the requested file system matter */
			vp = RTOV4(rp);
			if (vp->v_vfsp != vfsp)
				continue;
			hold_vnode = 0;

			/* r_os_lock is held across both passes below */
			mutex_enter(&rp->r_os_lock);

			/* Count the number of valid open_streams of the file */
			numosp = 0;
			for (osp = list_head(&rp->r_open_streams); osp != NULL;
			    osp = list_next(&rp->r_open_streams, osp)) {
				mutex_enter(&osp->os_sync_lock);
				if (osp->os_valid && !osp->os_pending_close)
					numosp++;
				mutex_exit(&osp->os_sync_lock);
			}

			/* Fill in the valid open streams per vp */
			if (numosp > 0) {
				int j;

				hold_vnode = 1;

				/*
				 * Add a new open instance to the list
				 */
				rep = kmem_zalloc(sizeof (*reopenlist),
					KM_SLEEP);
				rep->re_next = reopenlist;
				reopenlist = rep;

				rep->re_vp = vp;
				rep->re_osp = kmem_zalloc(
					numosp * sizeof (*(rep->re_osp)),
					KM_SLEEP);
				rep->re_numosp = numosp;

				/*
				 * Second pass: take a reference on each
				 * qualifying open stream and record it.
				 */
				j = 0;
				for (osp = list_head(&rp->r_open_streams);
				    osp != NULL;
				    osp = list_next(&rp->r_open_streams, osp)) {

					mutex_enter(&osp->os_sync_lock);
					if (osp->os_valid &&
					    !osp->os_pending_close) {
						osp->os_ref_count++;
						rep->re_osp[j] = osp;
						j++;
					}
					mutex_exit(&osp->os_sync_lock);
				}
				/*
				 * Assuming valid osp(s) stays valid between
				 * the time obtaining j and numosp.
				 */
				ASSERT(j == numosp);
			}

			mutex_exit(&rp->r_os_lock);
			/* do this here to keep v_lock > r_os_lock */
			if (hold_vnode)
				VN_HOLD(vp);
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
				/*
				 * If this rnode holds a delegation,
				 * but if there are no valid open streams,
				 * then just discard the delegation
				 * without doing delegreturn.
				 */
				if (numosp > 0)
					rp->r_deleg_needs_recovery =
							rp->r_deleg_type;
			}
			/* Save the delegation type for use outside the lock */
			dtype = rp->r_deleg_type;
			mutex_exit(&rp->r_statev4_lock);

			/*
			 * If we have a delegation then get rid of it.
			 * We've set rp->r_deleg_needs_recovery so we have
			 * enough information to recover.
			 */
			if (dtype != OPEN_DELEGATE_NONE) {
				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (reopenlist);
}
1698 
1699 /*
1700  * Release the list of open instance references.
1701  */
1702 
1703 void
1704 r4releopenlist(nfs4_opinst_t *reopenp)
1705 {
1706 	nfs4_opinst_t *rep, *next;
1707 	int i;
1708 
1709 	for (rep = reopenp; rep; rep = next) {
1710 		next = rep->re_next;
1711 
1712 		for (i = 0; i < rep->re_numosp; i++)
1713 		    open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1714 
1715 		VN_RELE(rep->re_vp);
1716 		kmem_free(rep->re_osp,
1717 		    rep->re_numosp * sizeof (*(rep->re_osp)));
1718 
1719 		kmem_free(rep, sizeof (*rep));
1720 	}
1721 }
1722 
1723 int
1724 nfs4_rnode_init(void)
1725 {
1726 	ulong_t nrnode4_max;
1727 	int i;
1728 
1729 	/*
1730 	 * Compute the size of the rnode4 hash table
1731 	 */
1732 	if (nrnode <= 0)
1733 		nrnode = ncsize;
1734 	nrnode4_max =
1735 	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1736 	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1737 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1738 		    "setting nrnode to max value of %ld", nrnode4_max);
1739 		nrnode = nrnode4_max;
1740 	}
1741 	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1742 	rtable4mask = rtable4size - 1;
1743 
1744 	/*
1745 	 * Allocate and initialize the hash buckets
1746 	 */
1747 	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1748 	for (i = 0; i < rtable4size; i++) {
1749 		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1750 		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1751 		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1752 	}
1753 
1754 	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1755 	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1756 
1757 	return (0);
1758 }
1759 
1760 int
1761 nfs4_rnode_fini(void)
1762 {
1763 	int i;
1764 
1765 	/*
1766 	 * Deallocate the rnode hash queues
1767 	 */
1768 	kmem_cache_destroy(rnode4_cache);
1769 
1770 	for (i = 0; i < rtable4size; i++)
1771 		rw_destroy(&rtable4[i].r_lock);
1772 
1773 	kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1774 
1775 	return (0);
1776 }
1777 
1778 /*
1779  * Return non-zero if the given filehandle refers to the root filehandle
1780  * for the given rnode.
1781  */
1782 
1783 static int
1784 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1785 {
1786 	int isroot;
1787 
1788 	isroot = 0;
1789 	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1790 		isroot = 1;
1791 
1792 	return (isroot);
1793 }
1794 
1795 #ifdef DEBUG
1796 
1797 /*
1798  * Look in the rnode table for other rnodes that have the same filehandle.
1799  * Assume the lock is held for the hash chain of checkrp
1800  */
1801 
1802 static void
1803 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
1804 {
1805 	rnode4_t *rp;
1806 	vnode_t *tvp;
1807 	nfs4_fhandle_t fh, fh2;
1808 	int index;
1809 
1810 	if (!r4_check_for_dups)
1811 		return;
1812 
1813 	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));
1814 
1815 	sfh4_copyval(checkrp->r_fh, &fh);
1816 
1817 	for (index = 0; index < rtable4size; index++) {
1818 
1819 		if (&rtable4[index] != checkrp->r_hashq)
1820 			rw_enter(&rtable4[index].r_lock, RW_READER);
1821 
1822 		for (rp = rtable4[index].r_hashf;
1823 		    rp != (rnode4_t *)(&rtable4[index]);
1824 		    rp = rp->r_hashf) {
1825 
1826 			if (rp == checkrp)
1827 				continue;
1828 
1829 			tvp = RTOV4(rp);
1830 			if (tvp->v_vfsp != vfsp)
1831 				continue;
1832 
1833 			sfh4_copyval(rp->r_fh, &fh2);
1834 			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
1835 				cmn_err(CE_PANIC, "rnodes with same fs, fh "
1836 				    "(%p, %p)", (void *)checkrp, (void *)rp);
1837 			}
1838 		}
1839 
1840 		if (&rtable4[index] != checkrp->r_hashq)
1841 			rw_exit(&rtable4[index].r_lock);
1842 	}
1843 }
1844 
1845 #endif /* DEBUG */
1846