xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_rnode.c (revision bb57d1f5164aca913cbd286ae1b61c896167cfa7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/proc.h>
38 #include <sys/user.h>
39 #include <sys/time.h>
40 #include <sys/buf.h>
41 #include <sys/vfs.h>
42 #include <sys/vnode.h>
43 #include <sys/socket.h>
44 #include <sys/uio.h>
45 #include <sys/tiuser.h>
46 #include <sys/swap.h>
47 #include <sys/errno.h>
48 #include <sys/debug.h>
49 #include <sys/kmem.h>
50 #include <sys/kstat.h>
51 #include <sys/cmn_err.h>
52 #include <sys/vtrace.h>
53 #include <sys/session.h>
54 #include <sys/dnlc.h>
55 #include <sys/bitmap.h>
56 #include <sys/acl.h>
57 #include <sys/ddi.h>
58 #include <sys/pathname.h>
59 #include <sys/flock.h>
60 #include <sys/dirent.h>
61 #include <sys/flock.h>
62 #include <sys/callb.h>
63 
64 #include <rpc/types.h>
65 #include <rpc/xdr.h>
66 #include <rpc/auth.h>
67 #include <rpc/rpcsec_gss.h>
68 #include <rpc/clnt.h>
69 
70 #include <nfs/nfs.h>
71 #include <nfs/nfs_clnt.h>
72 #include <nfs/nfs_acl.h>
73 
74 #include <nfs/nfs4.h>
75 #include <nfs/rnode4.h>
76 #include <nfs/nfs4_clnt.h>
77 
78 /*
79  * The hash queues for the access to active and cached rnodes
80  * are organized as doubly linked lists.  A reader/writer lock
81  * for each hash bucket is used to control access and to synchronize
82  * lookups, additions, and deletions from the hash queue.
83  *
84  * The rnode freelist is organized as a doubly linked list with
85  * a head pointer.  Additions and deletions are synchronized via
86  * a single mutex.
87  *
88  * In order to add an rnode to the free list, it must be hashed into
89  * a hash queue and the exclusive lock to the hash queue be held.
90  * If an rnode is not hashed into a hash queue, then it is destroyed
91  * because it represents no valuable information that can be reused
92  * about the file.  The exclusive lock to the hash queue must be
93  * held in order to prevent a lookup in the hash queue from finding
94  * the rnode and using it and assuming that the rnode is not on the
95  * freelist.  The lookup in the hash queue will have the hash queue
96  * locked, either exclusive or shared.
97  *
98  * The vnode reference count for each rnode is not allowed to drop
99  * below 1.  This prevents external entities, such as the VM
100  * subsystem, from acquiring references to vnodes already on the
101  * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
103  * rnode is looked up in the hash queues, then either the rnode
104  * is removed from the freelist and that reference is transferred to
105  * the new reference or the vnode reference count must be incremented
106  * accordingly.  The mutex for the freelist must be held in order to
107  * accurately test to see if the rnode is on the freelist or not.
108  * The hash queue lock might be held shared and it is possible that
109  * two different threads may race to remove the rnode from the
110  * freelist.  This race can be resolved by holding the mutex for the
111  * freelist.  Please note that the mutex for the freelist does not
112  * need to be held if the rnode is not on the freelist.  It can not be
113  * placed on the freelist due to the requirement that the thread
114  * putting the rnode on the freelist must hold the exclusive lock
115  * to the hash queue and the thread doing the lookup in the hash
116  * queue is holding either a shared or exclusive lock to the hash
117  * queue.
118  *
119  * The lock ordering is:
120  *
121  *	hash bucket lock -> vnode lock
122  *	hash bucket lock -> freelist lock -> r_statelock
123  */
124 r4hashq_t *rtable4;
125 
126 static kmutex_t rp4freelist_lock;
127 static rnode4_t *rp4freelist = NULL;
128 static long rnode4_new = 0;
129 int rtable4size;
130 static int rtable4mask;
131 static struct kmem_cache *rnode4_cache;
132 static int rnode4_hashlen = 4;
133 
134 static void	r4inactive(rnode4_t *, cred_t *);
135 static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
136 		    struct vnodeops *,
137 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
138 		    cred_t *),
139 		    int *, cred_t *);
140 static void	rp4_rmfree(rnode4_t *);
141 int		nfs4_free_data_reclaim(rnode4_t *);
142 static int	nfs4_active_data_reclaim(rnode4_t *);
143 static int	nfs4_free_reclaim(void);
144 static int	nfs4_active_reclaim(void);
145 static int	nfs4_rnode_reclaim(void);
146 static void	nfs4_reclaim(void *);
147 static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
148 static void	uninit_rnode4(rnode4_t *);
149 static void	destroy_rnode4(rnode4_t *);
150 static void	r4_stub_set(rnode4_t *, nfs4_stub_type_t);
151 
152 #ifdef DEBUG
153 static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
154 static int nfs4_rnode_debug = 0;
155 /* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
156 static int nfs4_rnode_nofreelist = 0;
157 /* give messages on colliding shared filehandles */
158 static void	r4_dup_check(rnode4_t *, vfs_t *);
159 #endif
160 
161 /*
162  * If the vnode has pages, run the list and check for any that are
163  * still dangling.  We call this routine before putting an rnode on
164  * the free list.
165  */
166 static int
167 nfs4_dross_pages(vnode_t *vp)
168 {
169 	page_t *pp;
170 	kmutex_t *vphm;
171 
172 	vphm = page_vnode_mutex(vp);
173 	mutex_enter(vphm);
174 	if ((pp = vp->v_pages) != NULL) {
175 		do {
176 			if (pp->p_fsdata != C_NOCOMMIT) {
177 				mutex_exit(vphm);
178 				return (1);
179 			}
180 		} while ((pp = pp->p_vpnext) != vp->v_pages);
181 	}
182 	mutex_exit(vphm);
183 
184 	return (0);
185 }
186 
187 /*
188  * Flush any pages left on this rnode.
189  */
190 static void
191 r4flushpages(rnode4_t *rp, cred_t *cr)
192 {
193 	vnode_t *vp;
194 	int error;
195 
196 	/*
197 	 * Before freeing anything, wait until all asynchronous
198 	 * activity is done on this rnode.  This will allow all
199 	 * asynchronous read ahead and write behind i/o's to
200 	 * finish.
201 	 */
202 	mutex_enter(&rp->r_statelock);
203 	while (rp->r_count > 0)
204 		cv_wait(&rp->r_cv, &rp->r_statelock);
205 	mutex_exit(&rp->r_statelock);
206 
207 	/*
208 	 * Flush and invalidate all pages associated with the vnode.
209 	 */
210 	vp = RTOV4(rp);
211 	if (nfs4_has_pages(vp)) {
212 		ASSERT(vp->v_type != VCHR);
213 		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
214 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
215 			if (error && (error == ENOSPC || error == EDQUOT)) {
216 				mutex_enter(&rp->r_statelock);
217 				if (!rp->r_error)
218 					rp->r_error = error;
219 				mutex_exit(&rp->r_statelock);
220 			}
221 		}
222 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
223 	}
224 }
225 
226 /*
227  * Free the resources associated with an rnode.
228  */
229 static void
230 r4inactive(rnode4_t *rp, cred_t *cr)
231 {
232 	vnode_t *vp;
233 	char *contents;
234 	int size;
235 	vsecattr_t *vsp;
236 	vnode_t *xattr;
237 
238 	r4flushpages(rp, cr);
239 
240 	vp = RTOV4(rp);
241 
242 	/*
243 	 * Free any held caches which may be
244 	 * associated with this rnode.
245 	 */
246 	mutex_enter(&rp->r_statelock);
247 	contents = rp->r_symlink.contents;
248 	size = rp->r_symlink.size;
249 	rp->r_symlink.contents = NULL;
250 	vsp = rp->r_secattr;
251 	rp->r_secattr = NULL;
252 	xattr = rp->r_xattr_dir;
253 	rp->r_xattr_dir = NULL;
254 	mutex_exit(&rp->r_statelock);
255 
256 	/*
257 	 * Free the access cache entries.
258 	 */
259 	(void) nfs4_access_purge_rp(rp);
260 
261 	/*
262 	 * Free the readdir cache entries.
263 	 */
264 	nfs4_purge_rddir_cache(vp);
265 
266 	/*
267 	 * Free the symbolic link cache.
268 	 */
269 	if (contents != NULL) {
270 
271 		kmem_free((void *)contents, size);
272 	}
273 
274 	/*
275 	 * Free any cached ACL.
276 	 */
277 	if (vsp != NULL)
278 		nfs4_acl_free_cache(vsp);
279 
280 	/*
281 	 * Release the cached xattr_dir
282 	 */
283 	if (xattr != NULL)
284 		VN_RELE(xattr);
285 }
286 
287 /*
288  * We have seen a case that the fh passed in is for "." which
289  * should be a VROOT node, however, the fh is different from the
290  * root fh stored in the mntinfo4_t. The invalid fh might be
291  * from a misbehaved server and will panic the client system at
292  * a later time. To avoid the panic, we drop the bad fh, use
293  * the root fh from mntinfo4_t, and print an error message
294  * for attention.
295  */
296 nfs4_sharedfh_t *
297 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
298     int *wasbad)
299 {
300 	char *s;
301 
302 	*wasbad = 0;
303 	s = fn_name(nm);
304 	ASSERT(strcmp(s, "..") != 0);
305 
306 	if ((s[0] == '.' && s[1] == '\0') && fh &&
307 	    !SFH4_SAME(mi->mi_rootfh, fh)) {
308 #ifdef DEBUG
309 		nfs4_fhandle_t fhandle;
310 
311 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
312 		    "Server %s returns a different "
313 		    "root filehandle for the path %s:",
314 		    mi->mi_curr_serv->sv_hostname,
315 		    mi->mi_curr_serv->sv_path);
316 
317 		/* print the bad fh */
318 		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
319 		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
320 		    fhandle.fh_len);
321 		nfs4_printfhandle(&fhandle);
322 
323 		/* print mi_rootfh */
324 		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
325 		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
326 		    fhandle.fh_len);
327 		nfs4_printfhandle(&fhandle);
328 #endif
329 		/* use mi_rootfh instead; fh will be rele by the caller */
330 		fh = mi->mi_rootfh;
331 		*wasbad = 1;
332 	}
333 
334 	kmem_free(s, MAXNAMELEN);
335 	return (fh);
336 }
337 
338 void
339 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
340     hrtime_t t, cred_t *cr, int index)
341 {
342 	int is_stub;
343 	vattr_t *attr;
344 	/*
345 	 * Don't add to attrcache if time overflow, but
346 	 * no need to check because either attr is null or the time
347 	 * values in it were processed by nfs4_time_ntov(), which checks
348 	 * for time overflows.
349 	 */
350 	attr = garp ? &garp->n4g_va : NULL;
351 
352 	if (attr) {
353 		if (!newnode) {
354 			rw_exit(&rtable4[index].r_lock);
355 #ifdef DEBUG
356 			if (vp->v_type != attr->va_type &&
357 			    vp->v_type != VNON && attr->va_type != VNON) {
358 				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
359 				    "makenfs4node: type (%d) doesn't "
360 				    "match type of found node at %p (%d)",
361 				    attr->va_type, (void *)vp, vp->v_type);
362 			}
363 #endif
364 			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
365 		} else {
366 			rnode4_t *rp = VTOR4(vp);
367 
368 			vp->v_type = attr->va_type;
369 			vp->v_rdev = attr->va_rdev;
370 
371 			/*
372 			 * Turn this object into a "stub" object if we
373 			 * crossed an underlying server fs boundary.
374 			 * To make this check, during mount we save the
375 			 * fsid of the server object being mounted.
376 			 * Here we compare this object's server fsid
377 			 * with the fsid we saved at mount.  If they
378 			 * are different, we crossed server fs boundary.
379 			 *
380 			 * The stub type is set (or not) at rnode
381 			 * creation time and it never changes for life
382 			 * of the rnode.
383 			 *
384 			 * The stub type is also set during RO failover,
385 			 * nfs4_remap_file().
386 			 *
387 			 * This stub will be for a mirror-mount.
388 			 *
389 			 * We don't bother with taking r_state_lock to
390 			 * set the stub type because this is a new rnode
391 			 * and we're holding the hash bucket r_lock RW_WRITER.
392 			 * No other thread could have obtained access
393 			 * to this rnode.
394 			 */
395 			is_stub = 0;
396 			if (garp->n4g_fsid_valid) {
397 				fattr4_fsid ga_fsid = garp->n4g_fsid;
398 				servinfo4_t *svp = rp->r_server;
399 
400 				rp->r_srv_fsid = ga_fsid;
401 
402 				(void) nfs_rw_enter_sig(&svp->sv_lock,
403 				    RW_READER, 0);
404 				if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
405 					is_stub = 1;
406 				nfs_rw_exit(&svp->sv_lock);
407 			}
408 
409 			if (is_stub)
410 				r4_stub_mirrormount(rp);
411 			else
412 				r4_stub_none(rp);
413 
414 			/* Can not cache partial attr */
415 			if (attr->va_mask == AT_ALL)
416 				nfs4_attrcache_noinval(vp, garp, t);
417 			else
418 				PURGE_ATTRCACHE4(vp);
419 
420 			rw_exit(&rtable4[index].r_lock);
421 		}
422 	} else {
423 		if (newnode) {
424 			PURGE_ATTRCACHE4(vp);
425 		}
426 		rw_exit(&rtable4[index].r_lock);
427 	}
428 }
429 
430 /*
431  * Find or create an rnode based primarily on filehandle.  To be
432  * used when dvp (vnode for parent directory) is not available;
433  * otherwise, makenfs4node() should be used.
434  *
435  * The nfs4_fname_t argument *npp is consumed and nulled out.
436  */
437 
438 vnode_t *
439 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
440     nfs4_fname_t **npp, nfs4_ga_res_t *garp,
441     mntinfo4_t *mi, cred_t *cr, hrtime_t t)
442 {
443 	vfs_t *vfsp = mi->mi_vfsp;
444 	int newnode = 0;
445 	vnode_t *vp;
446 	rnode4_t *rp;
447 	svnode_t *svp;
448 	nfs4_fname_t *name;
449 	int index;
450 
451 	ASSERT(npp && *npp);
452 	name = *npp;
453 	*npp = NULL;
454 
455 	index = rtable4hash(sfh);
456 	rw_enter(&rtable4[index].r_lock, RW_READER);
457 
458 	rp = r4find(&rtable4[index], sfh, vfsp);
459 	if (rp != NULL) {
460 		rw_exit(&rtable4[index].r_lock);
461 		vp = RTOV4(rp);
462 		fn_rele(&name);
463 		return (vp);
464 	}
465 
466 	vp = make_rnode4(sfh, &rtable4[index], vfsp,
467 	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
468 	if (newnode) {
469 		svp = vtosv(vp);
470 		svp->sv_forw = svp->sv_back = svp;
471 		svp->sv_name = name;
472 		if (psfh != NULL)
473 			sfh4_hold(psfh);
474 		svp->sv_dfh = psfh;
475 	} else {
476 		fn_rele(&name);
477 	}
478 
479 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
480 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
481 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
482 
483 	return (vp);
484 }
485 
486 /*
487  * Find or create a vnode for the given filehandle, filesystem, parent, and
488  * name.  The reference to nm is consumed, so the caller must first do an
489  * fn_hold() if it wants to continue using nm after this call.
490  */
491 vnode_t *
492 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
493     hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
494 {
495 	vnode_t *vp;
496 	int newnode;
497 	int index;
498 	mntinfo4_t *mi = VFTOMI4(vfsp);
499 	int had_badfh = 0;
500 	rnode4_t *rp;
501 
502 	ASSERT(dvp != NULL);
503 
504 	fh = badrootfh_check(fh, nm, mi, &had_badfh);
505 
506 	index = rtable4hash(fh);
507 	rw_enter(&rtable4[index].r_lock, RW_READER);
508 
509 	/*
510 	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
511 	 */
512 	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
513 	    nfs4_putapage, &newnode, cr);
514 
515 	rp = VTOR4(vp);
516 	sv_activate(&vp, dvp, &nm, newnode);
517 	if (dvp->v_flag & V_XATTRDIR) {
518 		mutex_enter(&rp->r_statelock);
519 		rp->r_flags |= R4ISXATTR;
520 		mutex_exit(&rp->r_statelock);
521 	}
522 
523 	/* if getting a bad file handle, do not cache the attributes. */
524 	if (had_badfh) {
525 		rw_exit(&rtable4[index].r_lock);
526 		return (vp);
527 	}
528 
529 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
530 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
531 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
532 
533 	return (vp);
534 }
535 
536 /*
537  * Hash on address of filehandle object.
538  * XXX totally untuned.
539  */
540 
541 int
542 rtable4hash(nfs4_sharedfh_t *fh)
543 {
544 	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
545 }
546 
547 /*
548  * Find or create the vnode for the given filehandle and filesystem.
549  * *newnode is set to zero if the vnode already existed; non-zero if it had
550  * to be created.
551  *
552  * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
553  */
554 
555 static vnode_t *
556 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
557     struct vnodeops *vops,
558     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
559     int *newnode, cred_t *cr)
560 {
561 	rnode4_t *rp;
562 	rnode4_t *trp;
563 	vnode_t *vp;
564 	mntinfo4_t *mi;
565 
566 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
567 
568 	mi = VFTOMI4(vfsp);
569 
570 start:
571 	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
572 		vp = RTOV4(rp);
573 		*newnode = 0;
574 		return (vp);
575 	}
576 	rw_exit(&rhtp->r_lock);
577 
578 	mutex_enter(&rp4freelist_lock);
579 
580 	if (rp4freelist != NULL && rnode4_new >= nrnode) {
581 		rp = rp4freelist;
582 		rp4_rmfree(rp);
583 		mutex_exit(&rp4freelist_lock);
584 
585 		vp = RTOV4(rp);
586 
587 		if (rp->r_flags & R4HASHED) {
588 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
589 			mutex_enter(&vp->v_lock);
590 			if (vp->v_count > 1) {
591 				vp->v_count--;
592 				mutex_exit(&vp->v_lock);
593 				rw_exit(&rp->r_hashq->r_lock);
594 				rw_enter(&rhtp->r_lock, RW_READER);
595 				goto start;
596 			}
597 			mutex_exit(&vp->v_lock);
598 			rp4_rmhash_locked(rp);
599 			rw_exit(&rp->r_hashq->r_lock);
600 		}
601 
602 		r4inactive(rp, cr);
603 
604 		mutex_enter(&vp->v_lock);
605 		if (vp->v_count > 1) {
606 			vp->v_count--;
607 			mutex_exit(&vp->v_lock);
608 			rw_enter(&rhtp->r_lock, RW_READER);
609 			goto start;
610 		}
611 		mutex_exit(&vp->v_lock);
612 		vn_invalid(vp);
613 
614 		/*
615 		 * destroy old locks before bzero'ing and
616 		 * recreating the locks below.
617 		 */
618 		uninit_rnode4(rp);
619 
620 		/*
621 		 * Make sure that if rnode is recycled then
622 		 * VFS count is decremented properly before
623 		 * reuse.
624 		 */
625 		VFS_RELE(vp->v_vfsp);
626 		vn_reinit(vp);
627 	} else {
628 		vnode_t *new_vp;
629 
630 		mutex_exit(&rp4freelist_lock);
631 
632 		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
633 		new_vp = vn_alloc(KM_SLEEP);
634 
635 		atomic_add_long((ulong_t *)&rnode4_new, 1);
636 #ifdef DEBUG
637 		clstat4_debug.nrnode.value.ui64++;
638 #endif
639 		vp = new_vp;
640 	}
641 
642 	bzero(rp, sizeof (*rp));
643 	rp->r_vnode = vp;
644 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
645 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
646 	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
647 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
648 	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
649 	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
650 	rp->created_v4 = 0;
651 	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
652 	    offsetof(nfs4_open_stream_t, os_node));
653 	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
654 	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
655 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
656 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
657 	rp->r_flags = R4READDIRWATTR;
658 	rp->r_fh = fh;
659 	rp->r_hashq = rhtp;
660 	sfh4_hold(rp->r_fh);
661 	rp->r_server = mi->mi_curr_serv;
662 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
663 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
664 	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
665 
666 	rddir4_cache_create(rp);
667 	rp->r_putapage = putapage;
668 	vn_setops(vp, vops);
669 	vp->v_data = (caddr_t)rp;
670 	vp->v_vfsp = vfsp;
671 	VFS_HOLD(vfsp);
672 	vp->v_type = VNON;
673 	if (isrootfh(fh, rp))
674 		vp->v_flag = VROOT;
675 	vn_exists(vp);
676 
677 	/*
678 	 * There is a race condition if someone else
679 	 * alloc's the rnode while no locks are held, so we
680 	 * check again and recover if found.
681 	 */
682 	rw_enter(&rhtp->r_lock, RW_WRITER);
683 	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
684 		vp = RTOV4(trp);
685 		*newnode = 0;
686 		rw_exit(&rhtp->r_lock);
687 		rp4_addfree(rp, cr);
688 		rw_enter(&rhtp->r_lock, RW_READER);
689 		return (vp);
690 	}
691 	rp4_addhash(rp);
692 	*newnode = 1;
693 	return (vp);
694 }
695 
696 static void
697 uninit_rnode4(rnode4_t *rp)
698 {
699 	vnode_t *vp = RTOV4(rp);
700 
701 	ASSERT(rp != NULL);
702 	ASSERT(vp != NULL);
703 	ASSERT(vp->v_count == 1);
704 	ASSERT(rp->r_count == 0);
705 	ASSERT(rp->r_mapcnt == 0);
706 	if (rp->r_flags & R4LODANGLERS) {
707 		nfs4_flush_lock_owners(rp);
708 	}
709 	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
710 	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
711 	ASSERT(!(rp->r_flags & R4HASHED));
712 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
713 	nfs4_clear_open_streams(rp);
714 	list_destroy(&rp->r_open_streams);
715 
716 	/*
717 	 * Destroy the rddir cache first since we need to grab the r_statelock.
718 	 */
719 	mutex_enter(&rp->r_statelock);
720 	rddir4_cache_destroy(rp);
721 	mutex_exit(&rp->r_statelock);
722 	sv_uninit(&rp->r_svnode);
723 	sfh4_rele(&rp->r_fh);
724 	nfs_rw_destroy(&rp->r_rwlock);
725 	nfs_rw_destroy(&rp->r_lkserlock);
726 	mutex_destroy(&rp->r_statelock);
727 	mutex_destroy(&rp->r_statev4_lock);
728 	mutex_destroy(&rp->r_os_lock);
729 	cv_destroy(&rp->r_cv);
730 	cv_destroy(&rp->r_commit.c_cv);
731 	nfs_rw_destroy(&rp->r_deleg_recall_lock);
732 	if (rp->r_flags & R4DELMAPLIST)
733 		list_destroy(&rp->r_indelmap);
734 }
735 
736 /*
737  * Put an rnode on the free list.
738  *
739  * Rnodes which were allocated above and beyond the normal limit
740  * are immediately freed.
741  */
742 void
743 rp4_addfree(rnode4_t *rp, cred_t *cr)
744 {
745 	vnode_t *vp;
746 	vnode_t *xattr;
747 	struct vfs *vfsp;
748 
749 	vp = RTOV4(rp);
750 	ASSERT(vp->v_count >= 1);
751 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
752 
753 	/*
754 	 * If we have too many rnodes allocated and there are no
755 	 * references to this rnode, or if the rnode is no longer
756 	 * accessible by it does not reside in the hash queues,
757 	 * or if an i/o error occurred while writing to the file,
758 	 * then just free it instead of putting it on the rnode
759 	 * freelist.
760 	 */
761 	vfsp = vp->v_vfsp;
762 	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
763 #ifdef DEBUG
764 	    (nfs4_rnode_nofreelist != 0) ||
765 #endif
766 	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
767 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
768 		if (rp->r_flags & R4HASHED) {
769 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
770 			mutex_enter(&vp->v_lock);
771 			if (vp->v_count > 1) {
772 				vp->v_count--;
773 				mutex_exit(&vp->v_lock);
774 				rw_exit(&rp->r_hashq->r_lock);
775 				return;
776 			}
777 			mutex_exit(&vp->v_lock);
778 			rp4_rmhash_locked(rp);
779 			rw_exit(&rp->r_hashq->r_lock);
780 		}
781 
782 		/*
783 		 * Make sure we don't have a delegation on this rnode
784 		 * before destroying it.
785 		 */
786 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
787 			(void) nfs4delegreturn(rp,
788 			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
789 		}
790 
791 		r4inactive(rp, cr);
792 
793 		/*
794 		 * Recheck the vnode reference count.  We need to
795 		 * make sure that another reference has not been
796 		 * acquired while we were not holding v_lock.  The
797 		 * rnode is not in the rnode hash queues; one
798 		 * way for a reference to have been acquired
799 		 * is for a VOP_PUTPAGE because the rnode was marked
800 		 * with R4DIRTY or for a modified page.  This
801 		 * reference may have been acquired before our call
802 		 * to r4inactive.  The i/o may have been completed,
803 		 * thus allowing r4inactive to complete, but the
804 		 * reference to the vnode may not have been released
805 		 * yet.  In any case, the rnode can not be destroyed
806 		 * until the other references to this vnode have been
807 		 * released.  The other references will take care of
808 		 * either destroying the rnode or placing it on the
809 		 * rnode freelist.  If there are no other references,
810 		 * then the rnode may be safely destroyed.
811 		 */
812 		mutex_enter(&vp->v_lock);
813 		if (vp->v_count > 1) {
814 			vp->v_count--;
815 			mutex_exit(&vp->v_lock);
816 			return;
817 		}
818 		mutex_exit(&vp->v_lock);
819 
820 		destroy_rnode4(rp);
821 		return;
822 	}
823 
824 	/*
825 	 * Lock the hash queue and then recheck the reference count
826 	 * to ensure that no other threads have acquired a reference
827 	 * to indicate that the rnode should not be placed on the
828 	 * freelist.  If another reference has been acquired, then
829 	 * just release this one and let the other thread complete
830 	 * the processing of adding this rnode to the freelist.
831 	 */
832 again:
833 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
834 
835 	mutex_enter(&vp->v_lock);
836 	if (vp->v_count > 1) {
837 		vp->v_count--;
838 		mutex_exit(&vp->v_lock);
839 		rw_exit(&rp->r_hashq->r_lock);
840 		return;
841 	}
842 	mutex_exit(&vp->v_lock);
843 
844 	/*
845 	 * Make sure we don't put an rnode with a delegation
846 	 * on the free list.
847 	 */
848 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
849 		rw_exit(&rp->r_hashq->r_lock);
850 		(void) nfs4delegreturn(rp,
851 		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
852 		goto again;
853 	}
854 
855 	/*
856 	 * Now that we have the hash queue lock, and we know there
857 	 * are not anymore references on the vnode, check to make
858 	 * sure there aren't any open streams still on the rnode.
859 	 * If so, drop the hash queue lock, remove the open streams,
860 	 * and recheck the v_count.
861 	 */
862 	mutex_enter(&rp->r_os_lock);
863 	if (list_head(&rp->r_open_streams) != NULL) {
864 		mutex_exit(&rp->r_os_lock);
865 		rw_exit(&rp->r_hashq->r_lock);
866 		if (nfs_zone() != VTOMI4(vp)->mi_zone)
867 			nfs4_clear_open_streams(rp);
868 		else
869 			(void) nfs4close_all(vp, cr);
870 		goto again;
871 	}
872 	mutex_exit(&rp->r_os_lock);
873 
874 	/*
875 	 * Before we put it on the freelist, make sure there are no pages.
876 	 * If there are, flush and commit of all of the dirty and
877 	 * uncommitted pages, assuming the file system isn't read only.
878 	 */
879 	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
880 		rw_exit(&rp->r_hashq->r_lock);
881 		r4flushpages(rp, cr);
882 		goto again;
883 	}
884 
885 	/*
886 	 * Before we put it on the freelist, make sure there is no
887 	 * active xattr directory cached, the freelist will not
888 	 * have its entries r4inactive'd if there is still an active
889 	 * rnode, thus nothing in the freelist can hold another
890 	 * rnode active.
891 	 */
892 	xattr = rp->r_xattr_dir;
893 	rp->r_xattr_dir = NULL;
894 
895 	/*
896 	 * If there is no cached data or metadata for this file, then
897 	 * put the rnode on the front of the freelist so that it will
898 	 * be reused before other rnodes which may have cached data or
899 	 * metadata associated with them.
900 	 */
901 	mutex_enter(&rp4freelist_lock);
902 	if (rp4freelist == NULL) {
903 		rp->r_freef = rp;
904 		rp->r_freeb = rp;
905 		rp4freelist = rp;
906 	} else {
907 		rp->r_freef = rp4freelist;
908 		rp->r_freeb = rp4freelist->r_freeb;
909 		rp4freelist->r_freeb->r_freef = rp;
910 		rp4freelist->r_freeb = rp;
911 		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
912 		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
913 			rp4freelist = rp;
914 	}
915 	mutex_exit(&rp4freelist_lock);
916 
917 	rw_exit(&rp->r_hashq->r_lock);
918 
919 	if (xattr)
920 		VN_RELE(xattr);
921 }
922 
923 /*
924  * Remove an rnode from the free list.
925  *
926  * The caller must be holding rp4freelist_lock and the rnode
927  * must be on the freelist.
928  */
929 static void
930 rp4_rmfree(rnode4_t *rp)
931 {
932 
933 	ASSERT(MUTEX_HELD(&rp4freelist_lock));
934 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
935 
936 	if (rp == rp4freelist) {
937 		rp4freelist = rp->r_freef;
938 		if (rp == rp4freelist)
939 			rp4freelist = NULL;
940 	}
941 	rp->r_freeb->r_freef = rp->r_freef;
942 	rp->r_freef->r_freeb = rp->r_freeb;
943 
944 	rp->r_freef = rp->r_freeb = NULL;
945 }
946 
947 /*
948  * Put a rnode in the hash table.
949  *
950  * The caller must be holding the exclusive hash queue lock
951  */
952 void
953 rp4_addhash(rnode4_t *rp)
954 {
955 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
956 	ASSERT(!(rp->r_flags & R4HASHED));
957 
958 #ifdef DEBUG
959 	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
960 #endif
961 
962 	rp->r_hashf = rp->r_hashq->r_hashf;
963 	rp->r_hashq->r_hashf = rp;
964 	rp->r_hashb = (rnode4_t *)rp->r_hashq;
965 	rp->r_hashf->r_hashb = rp;
966 
967 	mutex_enter(&rp->r_statelock);
968 	rp->r_flags |= R4HASHED;
969 	mutex_exit(&rp->r_statelock);
970 }
971 
972 /*
973  * Remove a rnode from the hash table.
974  *
975  * The caller must be holding the hash queue lock.
976  */
977 void
978 rp4_rmhash_locked(rnode4_t *rp)
979 {
980 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
981 	ASSERT(rp->r_flags & R4HASHED);
982 
983 	rp->r_hashb->r_hashf = rp->r_hashf;
984 	rp->r_hashf->r_hashb = rp->r_hashb;
985 
986 	mutex_enter(&rp->r_statelock);
987 	rp->r_flags &= ~R4HASHED;
988 	mutex_exit(&rp->r_statelock);
989 }
990 
991 /*
992  * Remove a rnode from the hash table.
993  *
994  * The caller must not be holding the hash queue lock.
995  */
996 void
997 rp4_rmhash(rnode4_t *rp)
998 {
999 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1000 	rp4_rmhash_locked(rp);
1001 	rw_exit(&rp->r_hashq->r_lock);
1002 }
1003 
1004 /*
1005  * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
1006  * Returns NULL if no match.  If an rnode is returned, the reference count
1007  * on the master vnode is incremented.
1008  *
1009  * The caller must be holding the hash queue lock, either shared or exclusive.
1010  */
1011 rnode4_t *
1012 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
1013 {
1014 	rnode4_t *rp;
1015 	vnode_t *vp;
1016 
1017 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
1018 
1019 	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
1020 		vp = RTOV4(rp);
1021 		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
1022 
1023 			mutex_enter(&rp->r_statelock);
1024 			if (rp->r_flags & R4RECOVERR) {
1025 				mutex_exit(&rp->r_statelock);
1026 				continue;
1027 			}
1028 			mutex_exit(&rp->r_statelock);
1029 #ifdef DEBUG
1030 			r4_dup_check(rp, vfsp);
1031 #endif
1032 			if (rp->r_freef != NULL) {
1033 				mutex_enter(&rp4freelist_lock);
1034 				/*
1035 				 * If the rnode is on the freelist,
1036 				 * then remove it and use that reference
1037 				 * as the new reference.  Otherwise,
1038 				 * need to increment the reference count.
1039 				 */
1040 				if (rp->r_freef != NULL) {
1041 					rp4_rmfree(rp);
1042 					mutex_exit(&rp4freelist_lock);
1043 				} else {
1044 					mutex_exit(&rp4freelist_lock);
1045 					VN_HOLD(vp);
1046 				}
1047 			} else
1048 				VN_HOLD(vp);
1049 
1050 			/*
1051 			 * if root vnode, set v_flag to indicate that
1052 			 */
1053 			if (isrootfh(fh, rp)) {
1054 				if (!(vp->v_flag & VROOT)) {
1055 					mutex_enter(&vp->v_lock);
1056 					vp->v_flag |= VROOT;
1057 					mutex_exit(&vp->v_lock);
1058 				}
1059 			}
1060 			return (rp);
1061 		}
1062 	}
1063 	return (NULL);
1064 }
1065 
1066 /*
1067  * Lookup an rnode by fhandle. Just a wrapper for r4find()
1068  * that assumes the caller hasn't already got the lock
1069  * on the hash bucket.
1070  */
1071 rnode4_t *
1072 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1073 {
1074 	rnode4_t *rp;
1075 	int index;
1076 
1077 	index = rtable4hash(fh);
1078 	rw_enter(&rtable4[index].r_lock, RW_READER);
1079 	rp = r4find(&rtable4[index], fh, vfsp);
1080 	rw_exit(&rtable4[index].r_lock);
1081 
1082 	return (rp);
1083 }
1084 
1085 /*
1086  * Return 1 if there is a active vnode belonging to this vfs in the
1087  * rtable4 cache.
1088  *
1089  * Several of these checks are done without holding the usual
1090  * locks.  This is safe because destroy_rtable(), rp_addfree(),
1091  * etc. will redo the necessary checks before actually destroying
1092  * any rnodes.
1093  */
1094 int
1095 check_rtable4(struct vfs *vfsp)
1096 {
1097 	rnode4_t *rp;
1098 	vnode_t *vp;
1099 	char *busy = NULL;
1100 	int index;
1101 
1102 	for (index = 0; index < rtable4size; index++) {
1103 		rw_enter(&rtable4[index].r_lock, RW_READER);
1104 
1105 		for (rp = rtable4[index].r_hashf;
1106 		    rp != (rnode4_t *)(&rtable4[index]);
1107 		    rp = rp->r_hashf) {
1108 
1109 			vp = RTOV4(rp);
1110 			if (vp->v_vfsp == vfsp) {
1111 				if (rp->r_freef == NULL) {
1112 					busy = "not on free list";
1113 				} else if (nfs4_has_pages(vp) &&
1114 				    (rp->r_flags & R4DIRTY)) {
1115 					busy = "dirty pages";
1116 				} else if (rp->r_count > 0) {
1117 					busy = "r_count > 0";
1118 				}
1119 
1120 				if (busy != NULL) {
1121 #ifdef DEBUG
1122 					char *path;
1123 
1124 					path = fn_path(rp->r_svnode.sv_name);
1125 					NFS4_DEBUG(nfs4_rnode_debug,
1126 					    (CE_NOTE, "check_rtable4: " "%s %s",
1127 					    path, busy));
1128 					kmem_free(path, strlen(path)+1);
1129 #endif
1130 					rw_exit(&rtable4[index].r_lock);
1131 					return (1);
1132 				}
1133 			}
1134 		}
1135 		rw_exit(&rtable4[index].r_lock);
1136 	}
1137 	return (0);
1138 }
1139 
1140 /*
1141  * Destroy inactive vnodes from the hash queues which
1142  * belong to this vfs. All of the vnodes should be inactive.
1143  * It is essential that we destroy all rnodes in case of
1144  * forced unmount as well as in normal unmount case.
1145  */
1146 
1147 void
1148 destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1149 {
1150 	int index;
1151 	vnode_t *vp;
1152 	rnode4_t *rp, *r_hashf, *rlist;
1153 
1154 	rlist = NULL;
1155 
1156 	for (index = 0; index < rtable4size; index++) {
1157 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1158 		for (rp = rtable4[index].r_hashf;
1159 		    rp != (rnode4_t *)(&rtable4[index]);
1160 		    rp = r_hashf) {
1161 			/* save the hash pointer before destroying */
1162 			r_hashf = rp->r_hashf;
1163 
1164 			vp = RTOV4(rp);
1165 			if (vp->v_vfsp == vfsp) {
1166 				mutex_enter(&rp4freelist_lock);
1167 				if (rp->r_freef != NULL) {
1168 					rp4_rmfree(rp);
1169 					mutex_exit(&rp4freelist_lock);
1170 					rp4_rmhash_locked(rp);
1171 					rp->r_hashf = rlist;
1172 					rlist = rp;
1173 				} else
1174 					mutex_exit(&rp4freelist_lock);
1175 			}
1176 		}
1177 		rw_exit(&rtable4[index].r_lock);
1178 	}
1179 
1180 	for (rp = rlist; rp != NULL; rp = r_hashf) {
1181 		r_hashf = rp->r_hashf;
1182 		/*
1183 		 * This call to rp4_addfree will end up destroying the
1184 		 * rnode, but in a safe way with the appropriate set
1185 		 * of checks done.
1186 		 */
1187 		rp4_addfree(rp, cr);
1188 	}
1189 }
1190 
1191 /*
1192  * This routine destroys all the resources of an rnode
1193  * and finally the rnode itself.
1194  */
1195 static void
1196 destroy_rnode4(rnode4_t *rp)
1197 {
1198 	vnode_t *vp;
1199 	vfs_t *vfsp;
1200 
1201 	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1202 
1203 	vp = RTOV4(rp);
1204 	vfsp = vp->v_vfsp;
1205 
1206 	uninit_rnode4(rp);
1207 	atomic_add_long((ulong_t *)&rnode4_new, -1);
1208 #ifdef DEBUG
1209 	clstat4_debug.nrnode.value.ui64--;
1210 #endif
1211 	kmem_cache_free(rnode4_cache, rp);
1212 	vn_invalid(vp);
1213 	vn_free(vp);
1214 	VFS_RELE(vfsp);
1215 }
1216 
1217 /*
1218  * Invalidate the attributes on all rnodes forcing the next getattr
1219  * to go over the wire.  Used to flush stale uid and gid mappings.
1220  * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1221  */
1222 void
1223 nfs4_rnode_invalidate(struct vfs *vfsp)
1224 {
1225 	int index;
1226 	rnode4_t *rp;
1227 	vnode_t *vp;
1228 
1229 	/*
1230 	 * Walk the hash queues looking for rnodes.
1231 	 */
1232 	for (index = 0; index < rtable4size; index++) {
1233 		rw_enter(&rtable4[index].r_lock, RW_READER);
1234 		for (rp = rtable4[index].r_hashf;
1235 		    rp != (rnode4_t *)(&rtable4[index]);
1236 		    rp = rp->r_hashf) {
1237 			vp = RTOV4(rp);
1238 			if (vfsp != NULL && vp->v_vfsp != vfsp)
1239 				continue;
1240 
1241 			if (!mutex_tryenter(&rp->r_statelock))
1242 				continue;
1243 
1244 			/*
1245 			 * Expire the attributes by resetting the change
1246 			 * and attr timeout.
1247 			 */
1248 			rp->r_change = 0;
1249 			PURGE_ATTRCACHE4_LOCKED(rp);
1250 			mutex_exit(&rp->r_statelock);
1251 		}
1252 		rw_exit(&rtable4[index].r_lock);
1253 	}
1254 }
1255 
1256 /*
1257  * Flush all vnodes in this (or every) vfs.
1258  * Used by nfs_sync and by nfs_unmount.
1259  */
1260 void
1261 r4flush(struct vfs *vfsp, cred_t *cr)
1262 {
1263 	int index;
1264 	rnode4_t *rp;
1265 	vnode_t *vp, **vplist;
1266 	long num, cnt;
1267 
1268 	/*
1269 	 * Check to see whether there is anything to do.
1270 	 */
1271 	num = rnode4_new;
1272 	if (num == 0)
1273 		return;
1274 
1275 	/*
1276 	 * Allocate a slot for all currently active rnodes on the
1277 	 * supposition that they all may need flushing.
1278 	 */
1279 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1280 	cnt = 0;
1281 
1282 	/*
1283 	 * Walk the hash queues looking for rnodes with page
1284 	 * lists associated with them.  Make a list of these
1285 	 * files.
1286 	 */
1287 	for (index = 0; index < rtable4size; index++) {
1288 		rw_enter(&rtable4[index].r_lock, RW_READER);
1289 		for (rp = rtable4[index].r_hashf;
1290 		    rp != (rnode4_t *)(&rtable4[index]);
1291 		    rp = rp->r_hashf) {
1292 			vp = RTOV4(rp);
1293 			/*
1294 			 * Don't bother sync'ing a vp if it
1295 			 * is part of virtual swap device or
1296 			 * if VFS is read-only
1297 			 */
1298 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1299 				continue;
1300 			/*
1301 			 * If flushing all mounted file systems or
1302 			 * the vnode belongs to this vfs, has pages
1303 			 * and is marked as either dirty or mmap'd,
1304 			 * hold and add this vnode to the list of
1305 			 * vnodes to flush.
1306 			 */
1307 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
1308 			    nfs4_has_pages(vp) &&
1309 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1310 				VN_HOLD(vp);
1311 				vplist[cnt++] = vp;
1312 				if (cnt == num) {
1313 					rw_exit(&rtable4[index].r_lock);
1314 					goto toomany;
1315 				}
1316 			}
1317 		}
1318 		rw_exit(&rtable4[index].r_lock);
1319 	}
1320 toomany:
1321 
1322 	/*
1323 	 * Flush and release all of the files on the list.
1324 	 */
1325 	while (cnt-- > 0) {
1326 		vp = vplist[cnt];
1327 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
1328 		VN_RELE(vp);
1329 	}
1330 
1331 	/*
1332 	 * Free the space allocated to hold the list.
1333 	 */
1334 	kmem_free(vplist, num * sizeof (*vplist));
1335 }
1336 
1337 int
1338 nfs4_free_data_reclaim(rnode4_t *rp)
1339 {
1340 	char *contents;
1341 	vnode_t *xattr;
1342 	int size;
1343 	vsecattr_t *vsp;
1344 	int freed;
1345 	bool_t rdc = FALSE;
1346 
1347 	/*
1348 	 * Free any held caches which may
1349 	 * be associated with this rnode.
1350 	 */
1351 	mutex_enter(&rp->r_statelock);
1352 	if (rp->r_dir != NULL)
1353 		rdc = TRUE;
1354 	contents = rp->r_symlink.contents;
1355 	size = rp->r_symlink.size;
1356 	rp->r_symlink.contents = NULL;
1357 	vsp = rp->r_secattr;
1358 	rp->r_secattr = NULL;
1359 	xattr = rp->r_xattr_dir;
1360 	rp->r_xattr_dir = NULL;
1361 	mutex_exit(&rp->r_statelock);
1362 
1363 	/*
1364 	 * Free the access cache entries.
1365 	 */
1366 	freed = nfs4_access_purge_rp(rp);
1367 
1368 	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1369 		return (freed);
1370 
1371 	/*
1372 	 * Free the readdir cache entries, incompletely if we can't block.
1373 	 */
1374 	nfs4_purge_rddir_cache(RTOV4(rp));
1375 
1376 	/*
1377 	 * Free the symbolic link cache.
1378 	 */
1379 	if (contents != NULL) {
1380 
1381 		kmem_free((void *)contents, size);
1382 	}
1383 
1384 	/*
1385 	 * Free any cached ACL.
1386 	 */
1387 	if (vsp != NULL)
1388 		nfs4_acl_free_cache(vsp);
1389 
1390 	/*
1391 	 * Release the xattr directory vnode
1392 	 */
1393 	if (xattr != NULL)
1394 		VN_RELE(xattr);
1395 
1396 	return (1);
1397 }
1398 
1399 static int
1400 nfs4_active_data_reclaim(rnode4_t *rp)
1401 {
1402 	char *contents;
1403 	vnode_t *xattr;
1404 	int size;
1405 	vsecattr_t *vsp;
1406 	int freed;
1407 	bool_t rdc = FALSE;
1408 
1409 	/*
1410 	 * Free any held credentials and caches which
1411 	 * may be associated with this rnode.
1412 	 */
1413 	if (!mutex_tryenter(&rp->r_statelock))
1414 		return (0);
1415 	contents = rp->r_symlink.contents;
1416 	size = rp->r_symlink.size;
1417 	rp->r_symlink.contents = NULL;
1418 	vsp = rp->r_secattr;
1419 	rp->r_secattr = NULL;
1420 	if (rp->r_dir != NULL)
1421 		rdc = TRUE;
1422 	xattr = rp->r_xattr_dir;
1423 	rp->r_xattr_dir = NULL;
1424 	mutex_exit(&rp->r_statelock);
1425 
1426 	/*
1427 	 * Free the access cache entries.
1428 	 */
1429 	freed = nfs4_access_purge_rp(rp);
1430 
1431 	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1432 		return (freed);
1433 
1434 	/*
1435 	 * Free the symbolic link cache.
1436 	 */
1437 	if (contents != NULL) {
1438 
1439 		kmem_free((void *)contents, size);
1440 	}
1441 
1442 	/*
1443 	 * Free any cached ACL.
1444 	 */
1445 	if (vsp != NULL)
1446 		nfs4_acl_free_cache(vsp);
1447 
1448 	nfs4_purge_rddir_cache(RTOV4(rp));
1449 
1450 	/*
1451 	 * Release the xattr directory vnode
1452 	 */
1453 	if (xattr != NULL)
1454 		VN_RELE(xattr);
1455 
1456 	return (1);
1457 }
1458 
1459 static int
1460 nfs4_free_reclaim(void)
1461 {
1462 	int freed;
1463 	rnode4_t *rp;
1464 
1465 #ifdef DEBUG
1466 	clstat4_debug.f_reclaim.value.ui64++;
1467 #endif
1468 	freed = 0;
1469 	mutex_enter(&rp4freelist_lock);
1470 	rp = rp4freelist;
1471 	if (rp != NULL) {
1472 		do {
1473 			if (nfs4_free_data_reclaim(rp))
1474 				freed = 1;
1475 		} while ((rp = rp->r_freef) != rp4freelist);
1476 	}
1477 	mutex_exit(&rp4freelist_lock);
1478 	return (freed);
1479 }
1480 
1481 static int
1482 nfs4_active_reclaim(void)
1483 {
1484 	int freed;
1485 	int index;
1486 	rnode4_t *rp;
1487 
1488 #ifdef DEBUG
1489 	clstat4_debug.a_reclaim.value.ui64++;
1490 #endif
1491 	freed = 0;
1492 	for (index = 0; index < rtable4size; index++) {
1493 		rw_enter(&rtable4[index].r_lock, RW_READER);
1494 		for (rp = rtable4[index].r_hashf;
1495 		    rp != (rnode4_t *)(&rtable4[index]);
1496 		    rp = rp->r_hashf) {
1497 			if (nfs4_active_data_reclaim(rp))
1498 				freed = 1;
1499 		}
1500 		rw_exit(&rtable4[index].r_lock);
1501 	}
1502 	return (freed);
1503 }
1504 
/*
 * Drain the rnode freelist: repeatedly take the rnode at the head of
 * rp4freelist off the list and hand it to rp4_addfree().
 *
 * For an rnode that is still hashed, the hash bucket is locked as writer
 * and v_count is examined under v_lock.  If someone else holds the vnode
 * (v_count > 1) only our count is dropped and the rnode is left hashed;
 * otherwise the rnode is unhashed before being passed to rp4_addfree().
 *
 * rp4freelist_lock is dropped while an individual rnode is processed and
 * re-taken before the list head is examined again.
 *
 * NOTE(review): 'freed' is initialised to 0 and never modified, so this
 * function always returns 0.  The only caller visible here,
 * nfs4_reclaim(), ignores the return value.
 */
static int
nfs4_rnode_reclaim(void)
{
	int freed;
	rnode4_t *rp;
	vnode_t *vp;

#ifdef DEBUG
	clstat4_debug.r_reclaim.value.ui64++;
#endif
	freed = 0;
	mutex_enter(&rp4freelist_lock);
	while ((rp = rp4freelist) != NULL) {
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);
		if (rp->r_flags & R4HASHED) {
			vp = RTOV4(rp);
			/* lock order here: hash bucket lock, then v_lock */
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				/*
				 * The vnode has another holder: give up
				 * our count and go on to the next rnode.
				 */
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				mutex_enter(&rp4freelist_lock);
				continue;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}
		/*
		 * This call to rp4_addfree will end up destroying the
		 * rnode, but in a safe way with the appropriate set
		 * of checks done.
		 */
		rp4_addfree(rp, CRED());
		mutex_enter(&rp4freelist_lock);
	}
	mutex_exit(&rp4freelist_lock);
	return (freed);
}
1546 
/*
 * Reclaim callback registered for rnode4_cache by nfs4_rnode_init().
 *
 * Escalates through three stages and stops at the first one that
 * reports having freed something: cached data of freelist rnodes, then
 * cached data of hashed rnodes, and finally the freelist rnodes
 * themselves.  The argument is unused.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	/* short-circuit evaluation keeps the stages in order */
	if (!nfs4_free_reclaim() && !nfs4_active_reclaim())
		(void) nfs4_rnode_reclaim();
}
1562 
1563 /*
1564  * Returns the clientid4 to use for the given mntinfo4.  Note that the
1565  * clientid can change if the caller drops mi_recovlock.
1566  */
1567 
1568 clientid4
1569 mi2clientid(mntinfo4_t *mi)
1570 {
1571 	nfs4_server_t	*sp;
1572 	clientid4	clientid = 0;
1573 
1574 	/* this locks down sp if it is found */
1575 	sp = find_nfs4_server(mi);
1576 	if (sp != NULL) {
1577 		clientid = sp->clientid;
1578 		mutex_exit(&sp->s_lock);
1579 		nfs4_server_rele(sp);
1580 	}
1581 	return (clientid);
1582 }
1583 
1584 /*
1585  * Return the current lease time for the server associated with the given
1586  * file.  Note that the lease time could change immediately after this
1587  * call.
1588  */
1589 
1590 time_t
1591 r2lease_time(rnode4_t *rp)
1592 {
1593 	nfs4_server_t	*sp;
1594 	time_t		lease_time;
1595 	mntinfo4_t	*mi = VTOMI4(RTOV4(rp));
1596 
1597 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1598 
1599 	/* this locks down sp if it is found */
1600 	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1601 
1602 	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1603 		if (sp != NULL) {
1604 			mutex_exit(&sp->s_lock);
1605 			nfs4_server_rele(sp);
1606 		}
1607 		nfs_rw_exit(&mi->mi_recovlock);
1608 		return (1);		/* 1 second */
1609 	}
1610 
1611 	ASSERT(sp != NULL);
1612 
1613 	lease_time = sp->s_lease_time;
1614 
1615 	mutex_exit(&sp->s_lock);
1616 	nfs4_server_rele(sp);
1617 	nfs_rw_exit(&mi->mi_recovlock);
1618 
1619 	return (lease_time);
1620 }
1621 
/*
 * Return a list with information about all the known open instances for
 * a filesystem. The caller must call r4releopenlist() when done with the
 * list.
 *
 * Every rnode of mi's vfs in the hash table is examined.  For each one
 * that has at least one valid, not-pending-close open stream, a
 * nfs4_opinst_t is pushed on the front of the result list; it records
 * the vnode (with a hold) and an array of the qualifying open streams
 * (each with its os_ref_count bumped).  Independently of that, any
 * delegation held by the rnode is discarded via nfs4delegreturn(), after
 * noting in r_deleg_needs_recovery that it needs recovery when open
 * streams exist.
 *
 * Returns NULL if no rnode had a qualifying open stream.
 *
 * We are safe at looking at os_valid and os_pending_close across dropping
 * the 'os_sync_lock' to count up the number of open streams and then
 * allocate memory for the osp list due to:
 *	-Looking at os_pending_close is safe since this routine is
 *	only called via recovery, and os_pending_close can only be set via
 *	a non-recovery operation (which are all blocked when recovery
 *	is active).
 *
 *	-Examining os_valid is safe since non-recovery operations, which
 *	could potentially switch os_valid to 0, are blocked (via
 *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
 *	(which means we are the only recovery thread potentially acting
 *	on this open stream).
 */

nfs4_opinst_t *
r4mkopenlist(mntinfo4_t *mi)
{
	nfs4_opinst_t *reopenlist, *rep;
	rnode4_t *rp;
	vnode_t *vp;
	vfs_t *vfsp = mi->mi_vfsp;
	int numosp;
	nfs4_open_stream_t *osp;
	int index;
	open_delegation_type4 dtype;
	int hold_vnode;

	reopenlist = NULL;

	for (index = 0; index < rtable4size; index++) {
		rw_enter(&rtable4[index].r_lock, RW_READER);
		for (rp = rtable4[index].r_hashf;
		    rp != (rnode4_t *)(&rtable4[index]);
		    rp = rp->r_hashf) {

			vp = RTOV4(rp);
			/* only rnodes of the requested filesystem */
			if (vp->v_vfsp != vfsp)
				continue;
			hold_vnode = 0;

			/* r_os_lock is held across both passes below */
			mutex_enter(&rp->r_os_lock);

			/* Count the number of valid open_streams of the file */
			numosp = 0;
			for (osp = list_head(&rp->r_open_streams); osp != NULL;
			    osp = list_next(&rp->r_open_streams, osp)) {
				mutex_enter(&osp->os_sync_lock);
				if (osp->os_valid && !osp->os_pending_close)
					numosp++;
				mutex_exit(&osp->os_sync_lock);
			}

			/* Fill in the valid open streams per vp */
			if (numosp > 0) {
				int j;

				hold_vnode = 1;

				/*
				 * Add a new open instance to the list
				 */
				rep = kmem_zalloc(sizeof (*reopenlist),
				    KM_SLEEP);
				rep->re_next = reopenlist;
				reopenlist = rep;

				rep->re_vp = vp;
				/* array sized by the count from the first pass */
				rep->re_osp = kmem_zalloc(
				    numosp * sizeof (*(rep->re_osp)),
				    KM_SLEEP);
				rep->re_numosp = numosp;

				/*
				 * Second pass: same filter as the count
				 * above, this time recording each stream.
				 */
				j = 0;
				for (osp = list_head(&rp->r_open_streams);
				    osp != NULL;
				    osp = list_next(&rp->r_open_streams, osp)) {

					mutex_enter(&osp->os_sync_lock);
					if (osp->os_valid &&
					    !osp->os_pending_close) {
						/*
						 * Reference for the list;
						 * r4releopenlist() calls
						 * open_stream_rele() on it.
						 */
						osp->os_ref_count++;
						rep->re_osp[j] = osp;
						j++;
					}
					mutex_exit(&osp->os_sync_lock);
				}
				/*
				 * Assuming valid osp(s) stays valid between
				 * the time obtaining j and numosp.
				 */
				ASSERT(j == numosp);
			}

			mutex_exit(&rp->r_os_lock);
			/* do this here to keep v_lock > r_os_lock */
			if (hold_vnode)
				VN_HOLD(vp);
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
				/*
				 * If this rnode holds a delegation,
				 * but if there are no valid open streams,
				 * then just discard the delegation
				 * without doing delegreturn.
				 */
				if (numosp > 0)
					rp->r_deleg_needs_recovery =
					    rp->r_deleg_type;
			}
			/* Save the delegation type for use outside the lock */
			dtype = rp->r_deleg_type;
			mutex_exit(&rp->r_statev4_lock);

			/*
			 * If we have a delegation then get rid of it.
			 * We've set rp->r_deleg_needs_recovery so we have
			 * enough information to recover.
			 */
			if (dtype != OPEN_DELEGATE_NONE) {
				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
			}
		}
		rw_exit(&rtable4[index].r_lock);
	}
	return (reopenlist);
}
1754 
1755 /*
1756  * Release the list of open instance references.
1757  */
1758 
1759 void
1760 r4releopenlist(nfs4_opinst_t *reopenp)
1761 {
1762 	nfs4_opinst_t *rep, *next;
1763 	int i;
1764 
1765 	for (rep = reopenp; rep; rep = next) {
1766 		next = rep->re_next;
1767 
1768 		for (i = 0; i < rep->re_numosp; i++)
1769 			open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1770 
1771 		VN_RELE(rep->re_vp);
1772 		kmem_free(rep->re_osp,
1773 		    rep->re_numosp * sizeof (*(rep->re_osp)));
1774 
1775 		kmem_free(rep, sizeof (*rep));
1776 	}
1777 }
1778 
1779 int
1780 nfs4_rnode_init(void)
1781 {
1782 	ulong_t nrnode4_max;
1783 	int i;
1784 
1785 	/*
1786 	 * Compute the size of the rnode4 hash table
1787 	 */
1788 	if (nrnode <= 0)
1789 		nrnode = ncsize;
1790 	nrnode4_max =
1791 	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1792 	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1793 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1794 		    "setting nrnode to max value of %ld", nrnode4_max);
1795 		nrnode = nrnode4_max;
1796 	}
1797 	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1798 	rtable4mask = rtable4size - 1;
1799 
1800 	/*
1801 	 * Allocate and initialize the hash buckets
1802 	 */
1803 	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1804 	for (i = 0; i < rtable4size; i++) {
1805 		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1806 		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1807 		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1808 	}
1809 
1810 	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1811 	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1812 
1813 	return (0);
1814 }
1815 
1816 int
1817 nfs4_rnode_fini(void)
1818 {
1819 	int i;
1820 
1821 	/*
1822 	 * Deallocate the rnode hash queues
1823 	 */
1824 	kmem_cache_destroy(rnode4_cache);
1825 
1826 	for (i = 0; i < rtable4size; i++)
1827 		rw_destroy(&rtable4[i].r_lock);
1828 
1829 	kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1830 
1831 	return (0);
1832 }
1833 
1834 /*
1835  * Return non-zero if the given filehandle refers to the root filehandle
1836  * for the given rnode.
1837  */
1838 
1839 static int
1840 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1841 {
1842 	int isroot;
1843 
1844 	isroot = 0;
1845 	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1846 		isroot = 1;
1847 
1848 	return (isroot);
1849 }
1850 
1851 /*
1852  * The r4_stub_* routines assume that the rnode is newly activated, and
1853  * that the caller either holds the hash bucket r_lock for this rnode as
1854  * RW_WRITER, or holds r_statelock.
1855  */
1856 static void
1857 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
1858 {
1859 	vnode_t *vp = RTOV4(rp);
1860 	krwlock_t *hash_lock = &rp->r_hashq->r_lock;
1861 
1862 	ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));
1863 
1864 	rp->r_stub_type = type;
1865 
1866 	/*
1867 	 * Safely switch this vnode to the trigger vnodeops.
1868 	 *
1869 	 * Currently, we don't ever switch a trigger vnode back to using
1870 	 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
1871 	 * a new v4 object is not a trigger, and it will already have the
1872 	 * correct v4 vnodeops by default. So, no "else" case required here.
1873 	 */
1874 	if (type != NFS4_STUB_NONE)
1875 		vn_setops(vp, nfs4_trigger_vnodeops);
1876 }
1877 
/*
 * Mark the rnode as a mirror-mount stub (NFS4_STUB_MIRRORMOUNT) via
 * r4_stub_set(); the locking requirements are those of r4_stub_set().
 */
void
r4_stub_mirrormount(rnode4_t *rp)
{
	r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
}
1883 
/*
 * Mark the rnode as not being a stub (NFS4_STUB_NONE) via r4_stub_set();
 * the locking requirements are those of r4_stub_set().
 */
void
r4_stub_none(rnode4_t *rp)
{
	r4_stub_set(rp, NFS4_STUB_NONE);
}
1889 
1890 #ifdef DEBUG
1891 
1892 /*
1893  * Look in the rnode table for other rnodes that have the same filehandle.
1894  * Assume the lock is held for the hash chain of checkrp
1895  */
1896 
1897 static void
1898 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
1899 {
1900 	rnode4_t *rp;
1901 	vnode_t *tvp;
1902 	nfs4_fhandle_t fh, fh2;
1903 	int index;
1904 
1905 	if (!r4_check_for_dups)
1906 		return;
1907 
1908 	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));
1909 
1910 	sfh4_copyval(checkrp->r_fh, &fh);
1911 
1912 	for (index = 0; index < rtable4size; index++) {
1913 
1914 		if (&rtable4[index] != checkrp->r_hashq)
1915 			rw_enter(&rtable4[index].r_lock, RW_READER);
1916 
1917 		for (rp = rtable4[index].r_hashf;
1918 		    rp != (rnode4_t *)(&rtable4[index]);
1919 		    rp = rp->r_hashf) {
1920 
1921 			if (rp == checkrp)
1922 				continue;
1923 
1924 			tvp = RTOV4(rp);
1925 			if (tvp->v_vfsp != vfsp)
1926 				continue;
1927 
1928 			sfh4_copyval(rp->r_fh, &fh2);
1929 			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
1930 				cmn_err(CE_PANIC, "rnodes with same fs, fh "
1931 				    "(%p, %p)", (void *)checkrp, (void *)rp);
1932 			}
1933 		}
1934 
1935 		if (&rtable4[index] != checkrp->r_hashq)
1936 			rw_exit(&rtable4[index].r_lock);
1937 	}
1938 }
1939 
1940 #endif /* DEBUG */
1941