xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_rnode.c (revision 0173c38a73f34277e0c97a19fedfd25d81ba8380)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/proc.h>
38 #include <sys/user.h>
39 #include <sys/time.h>
40 #include <sys/buf.h>
41 #include <sys/vfs.h>
42 #include <sys/vnode.h>
43 #include <sys/socket.h>
44 #include <sys/uio.h>
45 #include <sys/tiuser.h>
46 #include <sys/swap.h>
47 #include <sys/errno.h>
48 #include <sys/debug.h>
49 #include <sys/kmem.h>
50 #include <sys/kstat.h>
51 #include <sys/cmn_err.h>
52 #include <sys/vtrace.h>
53 #include <sys/session.h>
54 #include <sys/dnlc.h>
55 #include <sys/bitmap.h>
56 #include <sys/acl.h>
57 #include <sys/ddi.h>
58 #include <sys/pathname.h>
59 #include <sys/flock.h>
60 #include <sys/dirent.h>
61 #include <sys/flock.h>
62 #include <sys/callb.h>
63 
64 #include <rpc/types.h>
65 #include <rpc/xdr.h>
66 #include <rpc/auth.h>
67 #include <rpc/rpcsec_gss.h>
68 #include <rpc/clnt.h>
69 
70 #include <nfs/nfs.h>
71 #include <nfs/nfs_clnt.h>
72 #include <nfs/nfs_acl.h>
73 
74 #include <nfs/nfs4.h>
75 #include <nfs/rnode4.h>
76 #include <nfs/nfs4_clnt.h>
77 
78 /*
79  * The hash queues for the access to active and cached rnodes
80  * are organized as doubly linked lists.  A reader/writer lock
81  * for each hash bucket is used to control access and to synchronize
82  * lookups, additions, and deletions from the hash queue.
83  *
84  * The rnode freelist is organized as a doubly linked list with
85  * a head pointer.  Additions and deletions are synchronized via
86  * a single mutex.
87  *
88  * In order to add an rnode to the free list, it must be hashed into
89  * a hash queue and the exclusive lock to the hash queue be held.
90  * If an rnode is not hashed into a hash queue, then it is destroyed
91  * because it represents no valuable information that can be reused
92  * about the file.  The exclusive lock to the hash queue must be
93  * held in order to prevent a lookup in the hash queue from finding
94  * the rnode and using it and assuming that the rnode is not on the
95  * freelist.  The lookup in the hash queue will have the hash queue
96  * locked, either exclusive or shared.
97  *
98  * The vnode reference count for each rnode is not allowed to drop
99  * below 1.  This prevents external entities, such as the VM
100  * subsystem, from acquiring references to vnodes already on the
101  * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
105  * the new reference or the vnode reference count must be incremented
106  * accordingly.  The mutex for the freelist must be held in order to
107  * accurately test to see if the rnode is on the freelist or not.
108  * The hash queue lock might be held shared and it is possible that
109  * two different threads may race to remove the rnode from the
110  * freelist.  This race can be resolved by holding the mutex for the
111  * freelist.  Please note that the mutex for the freelist does not
112  * need to be held if the rnode is not on the freelist.  It can not be
113  * placed on the freelist due to the requirement that the thread
114  * putting the rnode on the freelist must hold the exclusive lock
115  * to the hash queue and the thread doing the lookup in the hash
116  * queue is holding either a shared or exclusive lock to the hash
117  * queue.
118  *
119  * The lock ordering is:
120  *
121  *	hash bucket lock -> vnode lock
122  *	hash bucket lock -> freelist lock -> r_statelock
123  */
/* Array of rnode hash buckets; indexed by the value of rtable4hash(). */
r4hashq_t *rtable4;

/* Serializes all manipulation of rp4freelist and the r_freef/r_freeb links. */
static kmutex_t rp4freelist_lock;
/* Head of the circular, doubly linked rnode freelist; NULL when empty. */
static rnode4_t *rp4freelist = NULL;
/*
 * Bumped each time make_rnode4() allocates a fresh rnode from rnode4_cache;
 * compared against nrnode to choose between recycling and allocating, and
 * between freeing and caching on the freelist.
 */
static long rnode4_new = 0;
/* Number of buckets in rtable4 (see the loop in check_rtable4()). */
int rtable4size;
/* Mask applied by rtable4hash() to produce a bucket index. */
static int rtable4mask;
/* kmem cache from which rnode4_t structures are allocated. */
static struct kmem_cache *rnode4_cache;
/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * target average hash chain length used when sizing rtable4 -- confirm.
 */
static int rnode4_hashlen = 4;

/* Forward declarations. */
static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
/* Passed to NFS4_DEBUG() to gate the debug messages emitted by this file. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void	r4_dup_check(rnode4_t *, vfs_t *);
#endif
159 
160 /*
161  * If the vnode has pages, run the list and check for any that are
162  * still dangling.  We call this routine before putting an rnode on
163  * the free list.
164  */
165 static int
166 nfs4_dross_pages(vnode_t *vp)
167 {
168 	page_t *pp;
169 	kmutex_t *vphm;
170 
171 	vphm = page_vnode_mutex(vp);
172 	mutex_enter(vphm);
173 	if ((pp = vp->v_pages) != NULL) {
174 		do {
175 			if (pp->p_fsdata != C_NOCOMMIT) {
176 				mutex_exit(vphm);
177 				return (1);
178 			}
179 		} while ((pp = pp->p_vpnext) != vp->v_pages);
180 	}
181 	mutex_exit(vphm);
182 
183 	return (0);
184 }
185 
186 /*
187  * Flush any pages left on this rnode.
188  */
189 static void
190 r4flushpages(rnode4_t *rp, cred_t *cr)
191 {
192 	vnode_t *vp;
193 	int error;
194 
195 	/*
196 	 * Before freeing anything, wait until all asynchronous
197 	 * activity is done on this rnode.  This will allow all
198 	 * asynchronous read ahead and write behind i/o's to
199 	 * finish.
200 	 */
201 	mutex_enter(&rp->r_statelock);
202 	while (rp->r_count > 0)
203 		cv_wait(&rp->r_cv, &rp->r_statelock);
204 	mutex_exit(&rp->r_statelock);
205 
206 	/*
207 	 * Flush and invalidate all pages associated with the vnode.
208 	 */
209 	vp = RTOV4(rp);
210 	if (nfs4_has_pages(vp)) {
211 		ASSERT(vp->v_type != VCHR);
212 		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
213 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr);
214 			if (error && (error == ENOSPC || error == EDQUOT)) {
215 				mutex_enter(&rp->r_statelock);
216 				if (!rp->r_error)
217 					rp->r_error = error;
218 				mutex_exit(&rp->r_statelock);
219 			}
220 		}
221 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
222 	}
223 }
224 
225 /*
226  * Free the resources associated with an rnode.
227  */
228 static void
229 r4inactive(rnode4_t *rp, cred_t *cr)
230 {
231 	vnode_t *vp;
232 	char *contents;
233 	int size;
234 	vsecattr_t *vsp;
235 	vnode_t *xattr;
236 
237 	r4flushpages(rp, cr);
238 
239 	vp = RTOV4(rp);
240 
241 	/*
242 	 * Free any held caches which may be
243 	 * associated with this rnode.
244 	 */
245 	mutex_enter(&rp->r_statelock);
246 	contents = rp->r_symlink.contents;
247 	size = rp->r_symlink.size;
248 	rp->r_symlink.contents = NULL;
249 	vsp = rp->r_secattr;
250 	rp->r_secattr = NULL;
251 	xattr = rp->r_xattr_dir;
252 	rp->r_xattr_dir = NULL;
253 	mutex_exit(&rp->r_statelock);
254 
255 	/*
256 	 * Free the access cache entries.
257 	 */
258 	(void) nfs4_access_purge_rp(rp);
259 
260 	/*
261 	 * Free the readdir cache entries.
262 	 */
263 	nfs4_purge_rddir_cache(vp);
264 
265 	/*
266 	 * Free the symbolic link cache.
267 	 */
268 	if (contents != NULL) {
269 
270 		kmem_free((void *)contents, size);
271 	}
272 
273 	/*
274 	 * Free any cached ACL.
275 	 */
276 	if (vsp != NULL)
277 		nfs4_acl_free_cache(vsp);
278 
279 	/*
280 	 * Release the cached xattr_dir
281 	 */
282 	if (xattr != NULL)
283 		VN_RELE(xattr);
284 }
285 
286 /*
287  * We have seen a case that the fh passed in is for "." which
288  * should be a VROOT node, however, the fh is different from the
289  * root fh stored in the mntinfo4_t. The invalid fh might be
290  * from a misbehaved server and will panic the client system at
291  * a later time. To avoid the panic, we drop the bad fh, use
292  * the root fh from mntinfo4_t, and print an error message
293  * for attention.
294  */
295 nfs4_sharedfh_t *
296 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
297     int *wasbad)
298 {
299 	char *s;
300 
301 	*wasbad = 0;
302 	s = fn_name(nm);
303 	ASSERT(strcmp(s, "..") != 0);
304 
305 	if ((s[0] == '.' && s[1] == '\0') && fh &&
306 					!SFH4_SAME(mi->mi_rootfh, fh)) {
307 #ifdef DEBUG
308 		nfs4_fhandle_t fhandle;
309 
310 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
311 		    "Server %s returns a different "
312 		    "root filehandle for the path %s:",
313 		    mi->mi_curr_serv->sv_hostname,
314 		    mi->mi_curr_serv->sv_path);
315 
316 		/* print the bad fh */
317 		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
318 		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
319 			fhandle.fh_len);
320 		nfs4_printfhandle(&fhandle);
321 
322 		/* print mi_rootfh */
323 		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
324 		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
325 			fhandle.fh_len);
326 		nfs4_printfhandle(&fhandle);
327 #endif
328 		/* use mi_rootfh instead; fh will be rele by the caller */
329 		fh = mi->mi_rootfh;
330 		*wasbad = 1;
331 	}
332 
333 	kmem_free(s, MAXNAMELEN);
334 	return (fh);
335 }
336 
337 void
338 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
339     hrtime_t t, cred_t *cr, int index)
340 {
341 	vattr_t *attr;
342 	/*
343 	 * Don't add to attrcache if time overflow, but
344 	 * no need to check because either attr is null or the time
345 	 * values in it were processed by nfs4_time_ntov(), which checks
346 	 * for time overflows.
347 	 */
348 	attr = garp ? &garp->n4g_va : NULL;
349 
350 	if (attr) {
351 		if (!newnode) {
352 			rw_exit(&rtable4[index].r_lock);
353 #ifdef DEBUG
354 			if (vp->v_type != attr->va_type &&
355 			    vp->v_type != VNON && attr->va_type != VNON) {
356 				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
357 					"makenfs4node: type (%d) doesn't "
358 					"match type of found node at %p (%d)",
359 					attr->va_type, (void *)vp, vp->v_type);
360 			}
361 #endif
362 			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
363 		} else {
364 			rnode4_t *rp = VTOR4(vp);
365 
366 			vp->v_type = attr->va_type;
367 			vp->v_rdev = attr->va_rdev;
368 
369 			/*
370 			 * Turn this object into a "stub" object if we
371 			 * crossed an underlying server fs boundary.  To
372 			 * make this check, during mount we save the
373 			 * fsid of the server object being mounted.
374 			 * Here we compare this object's server fsid
375 			 * with the fsid we saved at mount.  If they
376 			 * are different, we crossed server fs boundary.
377 			 *
378 			 * The stub flag is set (or not) at rnode
379 			 * creation time and it never changes for life
380 			 * of rnode.
381 			 *
382 			 * We don't bother with taking r_state_lock
383 			 * to set R4SRVSTUB flag because this is a new
384 			 * rnode and we're holding rtable lock.  No other
385 			 * thread could have obtained access to this
386 			 * rnode.
387 			 */
388 			if (garp->n4g_fsid_valid) {
389 				rp->r_srv_fsid = garp->n4g_fsid;
390 
391 				if (vp->v_type == VDIR) {
392 					servinfo4_t *svp = rp->r_server;
393 
394 					(void) nfs_rw_enter_sig(&svp->sv_lock,
395 								RW_READER, 0);
396 					if (!FATTR4_FSID_EQ(&garp->n4g_fsid,
397 							    &svp->sv_fsid)) {
398 						rp->r_flags |= R4SRVSTUB;
399 					}
400 					nfs_rw_exit(&svp->sv_lock);
401 				}
402 			}
403 
404 			/* Can not cache partial attr */
405 			if (attr->va_mask == AT_ALL)
406 				nfs4_attrcache_noinval(vp, garp, t);
407 			else
408 				PURGE_ATTRCACHE4(vp);
409 
410 			rw_exit(&rtable4[index].r_lock);
411 		}
412 	} else {
413 		if (newnode) {
414 			PURGE_ATTRCACHE4(vp);
415 		}
416 		rw_exit(&rtable4[index].r_lock);
417 	}
418 }
419 
420 /*
421  * Find or create an rnode based primarily on filehandle.  To be
422  * used when dvp (vnode for parent directory) is not available;
423  * otherwise, makenfs4node() should be used.
424  *
425  * The nfs4_fname_t argument *npp is consumed and nulled out.
426  */
427 
428 vnode_t *
429 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
430 	nfs4_fname_t **npp, nfs4_ga_res_t *garp,
431 	mntinfo4_t *mi, cred_t *cr, hrtime_t t)
432 {
433 	vfs_t *vfsp = mi->mi_vfsp;
434 	int newnode = 0;
435 	vnode_t *vp;
436 	rnode4_t *rp;
437 	svnode_t *svp;
438 	nfs4_fname_t *name;
439 	int index;
440 
441 	ASSERT(npp && *npp);
442 	name = *npp;
443 	*npp = NULL;
444 
445 	index = rtable4hash(sfh);
446 	rw_enter(&rtable4[index].r_lock, RW_READER);
447 
448 	rp = r4find(&rtable4[index], sfh, vfsp);
449 	if (rp != NULL) {
450 		rw_exit(&rtable4[index].r_lock);
451 		vp = RTOV4(rp);
452 		fn_rele(&name);
453 		return (vp);
454 	}
455 
456 	vp = make_rnode4(sfh, &rtable4[index], vfsp,
457 	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
458 	if (newnode) {
459 		svp = vtosv(vp);
460 		svp->sv_forw = svp->sv_back = svp;
461 		svp->sv_name = name;
462 		if (psfh != NULL)
463 			sfh4_hold(psfh);
464 		svp->sv_dfh = psfh;
465 	} else {
466 		fn_rele(&name);
467 	}
468 
469 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
470 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
471 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
472 
473 	return (vp);
474 }
475 
476 /*
477  * Find or create a vnode for the given filehandle, filesystem, parent, and
478  * name.  The reference to nm is consumed, so the caller must first do an
479  * fn_hold() if it wants to continue using nm after this call.
480  */
481 vnode_t *
482 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
483 	hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
484 {
485 	vnode_t *vp;
486 	int newnode;
487 	int index;
488 	mntinfo4_t *mi = VFTOMI4(vfsp);
489 	int had_badfh = 0;
490 	rnode4_t *rp;
491 
492 	ASSERT(dvp != NULL);
493 
494 	fh = badrootfh_check(fh, nm, mi, &had_badfh);
495 
496 	index = rtable4hash(fh);
497 	rw_enter(&rtable4[index].r_lock, RW_READER);
498 
499 	/*
500 	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
501 	 */
502 	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
503 	    nfs4_putapage, &newnode, cr);
504 
505 	rp = VTOR4(vp);
506 	sv_activate(&vp, dvp, &nm, newnode);
507 	if (dvp->v_flag & V_XATTRDIR) {
508 		mutex_enter(&rp->r_statelock);
509 		rp->r_flags |= R4ISXATTR;
510 		mutex_exit(&rp->r_statelock);
511 	}
512 
513 	/* if getting a bad file handle, do not cache the attributes. */
514 	if (had_badfh) {
515 		rw_exit(&rtable4[index].r_lock);
516 		return (vp);
517 	}
518 
519 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
520 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
521 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
522 
523 	return (vp);
524 }
525 
526 /*
527  * Hash on address of filehandle object.
528  * XXX totally untuned.
529  */
530 
531 int
532 rtable4hash(nfs4_sharedfh_t *fh)
533 {
534 	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
535 }
536 
/*
 * Find or create the vnode for the given filehandle and filesystem.
 * *newnode is set to zero if the vnode already existed; non-zero if it had
 * to be created.
 *
 * fh:       filehandle identifying the object.
 * rhtp:     hash bucket for fh; the caller holds its r_lock as reader.
 * vfsp:     the filesystem the object belongs to.
 * vops:     vnode operations installed on a new vnode.
 * putapage: stored in r_putapage of a new rnode.
 * newnode:  out parameter, see above.
 * cr:       credentials used when recycling or discarding an rnode.
 *
 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
 * The bucket lock is held on return: as writer when a new rnode was
 * inserted (*newnode != 0), as reader otherwise.  It is dropped and
 * re-acquired internally, so the caller can not assume that the bucket
 * contents are unchanged across this call.
 */

static vnode_t *
make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int *newnode, cred_t *cr)
{
	rnode4_t *rp;
	rnode4_t *trp;
	vnode_t *vp;
	mntinfo4_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI4(vfsp);

start:
	/*
	 * Fast path: the rnode is already hashed.  r4find() has taken
	 * care of the vnode reference.
	 */
	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(rp);
		*newnode = 0;
		return (vp);
	}
	/* Not found; drop the bucket lock while an rnode is obtained. */
	rw_exit(&rhtp->r_lock);

	mutex_enter(&rp4freelist_lock);

	/*
	 * Recycle the rnode at the head of the freelist once the number
	 * of allocated rnodes has reached nrnode; otherwise allocate.
	 */
	if (rp4freelist != NULL && rnode4_new >= nrnode) {
		rp = rp4freelist;
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);

		vp = RTOV4(rp);

		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Someone else holds a reference to this vnode.
			 * Give up the reference taken over from the
			 * freelist and start again.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the reference count; a reference may have
		 * been acquired while v_lock was not held.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);

		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		uninit_rnode4(rp);

		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rp4freelist_lock);

		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		atomic_add_long((ulong_t *)&rnode4_new, 1);
#ifdef DEBUG
		clstat4_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	/*
	 * Initialize the rnode from scratch.  Every lock and condition
	 * variable created here must be torn down in uninit_rnode4().
	 */
	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
	rp->created_v4 = 0;
	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
	    offsetof(nfs4_open_stream_t, os_node));
	/* The lock owner list starts out empty (head points at itself). */
	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_flags = R4READDIRWATTR;
	rp->r_fh = fh;
	rp->r_hashq = rhtp;
	/* The rnode keeps its own hold on the shared filehandle. */
	sfh4_hold(rp->r_fh);
	rp->r_server = mi->mi_curr_serv;
	rp->r_deleg_type = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);

	rddir4_cache_create(rp);
	rp->r_putapage = putapage;
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	VFS_HOLD(vfsp);
	/* The real type is filled in later from the object's attributes. */
	vp->v_type = VNON;
	if (isrootfh(fh, rp))
		vp->v_flag = VROOT;
	vn_exists(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
		/*
		 * Lost the race: use the rnode that is already hashed and
		 * dispose of ours.  rp4_addfree() takes hash bucket locks
		 * itself, so the bucket lock is dropped around the call and
		 * then re-taken as reader for the caller.
		 */
		vp = RTOV4(trp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp4_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	/* Insert the new rnode; the bucket lock stays held as writer. */
	rp4_addhash(rp);
	*newnode = 1;
	return (vp);
}
685 
686 static void
687 uninit_rnode4(rnode4_t *rp)
688 {
689 	vnode_t *vp = RTOV4(rp);
690 
691 	ASSERT(rp != NULL);
692 	ASSERT(vp != NULL);
693 	ASSERT(vp->v_count == 1);
694 	ASSERT(rp->r_count == 0);
695 	ASSERT(rp->r_mapcnt == 0);
696 	if (rp->r_flags & R4LODANGLERS) {
697 		nfs4_flush_lock_owners(rp);
698 	}
699 	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
700 	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
701 	ASSERT(!(rp->r_flags & R4HASHED));
702 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
703 	nfs4_clear_open_streams(rp);
704 	list_destroy(&rp->r_open_streams);
705 
706 	/*
707 	 * Destroy the rddir cache first since we need to grab the r_statelock.
708 	 */
709 	mutex_enter(&rp->r_statelock);
710 	rddir4_cache_destroy(rp);
711 	mutex_exit(&rp->r_statelock);
712 	sv_uninit(&rp->r_svnode);
713 	sfh4_rele(&rp->r_fh);
714 	nfs_rw_destroy(&rp->r_rwlock);
715 	nfs_rw_destroy(&rp->r_lkserlock);
716 	mutex_destroy(&rp->r_statelock);
717 	mutex_destroy(&rp->r_statev4_lock);
718 	mutex_destroy(&rp->r_os_lock);
719 	cv_destroy(&rp->r_cv);
720 	cv_destroy(&rp->r_commit.c_cv);
721 	nfs_rw_destroy(&rp->r_deleg_recall_lock);
722 	if (rp->r_flags & R4DELMAPLIST)
723 		list_destroy(&rp->r_indelmap);
724 }
725 
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 *
 * rp: the rnode being released; the caller's vnode reference is
 *     consumed by this call.
 * cr: credentials used for flushing pages and closing open files.
 *
 * If another reference to the vnode shows up at any of the recheck
 * points below, the caller's reference is simply dropped and the
 * remaining holder becomes responsible for the rnode.
 */
void
rp4_addfree(rnode4_t *rp, cred_t *cr)
{
	vnode_t *vp;
	vnode_t *xattr;
	struct vfs *vfsp;

	vp = RTOV4(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
#ifdef DEBUG
	    (nfs4_rnode_nofreelist != 0) ||
#endif
	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & R4HASHED) {
			/*
			 * Unhash under the exclusive bucket lock, unless
			 * a lookup has picked up a reference meanwhile.
			 */
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		/*
		 * Make sure we don't have a delegation on this rnode
		 * before destroying it.
		 */
		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
			(void) nfs4delegreturn(rp,
				NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		}

		r4inactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues; one
		 * way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with R4DIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to r4inactive.  The i/o may have been completed,
		 * thus allowing r4inactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode4(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 *
	 * Each of the cleanup steps below that has to drop the hash
	 * queue lock jumps back here, so that all of the checks are
	 * redone with the lock held.
	 */
again:
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * Make sure we don't put an rnode with a delegation
	 * on the free list.
	 */
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
		rw_exit(&rp->r_hashq->r_lock);
		(void) nfs4delegreturn(rp,
			NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
		goto again;
	}

	/*
	 * Now that we have the hash queue lock, and we know there
	 * are not anymore references on the vnode, check to make
	 * sure there aren't any open streams still on the rnode.
	 * If so, drop the hash queue lock, remove the open streams,
	 * and recheck the v_count.
	 */
	mutex_enter(&rp->r_os_lock);
	if (list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);
		/*
		 * Outside the mount's zone the open streams are only
		 * cleared locally; inside it the files are closed.
		 */
		if (nfs_zone() != VTOMI4(vp)->mi_zone)
			nfs4_clear_open_streams(rp);
		else
			(void) nfs4close_all(vp, cr);
		goto again;
	}
	mutex_exit(&rp->r_os_lock);

	/*
	 * Before we put it on the freelist, make sure there are no pages.
	 * If there are, flush and commit of all of the dirty and
	 * uncommitted pages, assuming the file system isn't read only.
	 */
	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
		rw_exit(&rp->r_hashq->r_lock);
		r4flushpages(rp, cr);
		goto again;
	}

	/*
	 * Before we put it on the freelist, make sure there is no
	 * active xattr directory cached, the freelist will not
	 * have its entries r4inactive'd if there is still an active
	 * rnode, thus nothing in the freelist can hold another
	 * rnode active.
	 */
	xattr = rp->r_xattr_dir;
	rp->r_xattr_dir = NULL;

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rp4freelist_lock);
	if (rp4freelist == NULL) {
		/* Empty list: the rnode becomes a one-element ring. */
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rp4freelist = rp;
	} else {
		/*
		 * Link in just behind the current head (i.e. at the
		 * tail); moving the head pointer to rp afterwards
		 * turns that into an insertion at the front.
		 */
		rp->r_freef = rp4freelist;
		rp->r_freeb = rp4freelist->r_freeb;
		rp4freelist->r_freeb->r_freef = rp;
		rp4freelist->r_freeb = rp;
		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
			rp4freelist = rp;
	}
	mutex_exit(&rp4freelist_lock);

	rw_exit(&rp->r_hashq->r_lock);

	/* Release the xattr directory only after all locks are dropped. */
	if (xattr)
		VN_RELE(xattr);
}
912 
913 /*
914  * Remove an rnode from the free list.
915  *
916  * The caller must be holding rp4freelist_lock and the rnode
917  * must be on the freelist.
918  */
919 static void
920 rp4_rmfree(rnode4_t *rp)
921 {
922 
923 	ASSERT(MUTEX_HELD(&rp4freelist_lock));
924 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
925 
926 	if (rp == rp4freelist) {
927 		rp4freelist = rp->r_freef;
928 		if (rp == rp4freelist)
929 			rp4freelist = NULL;
930 	}
931 	rp->r_freeb->r_freef = rp->r_freef;
932 	rp->r_freef->r_freeb = rp->r_freeb;
933 
934 	rp->r_freef = rp->r_freeb = NULL;
935 }
936 
937 /*
938  * Put a rnode in the hash table.
939  *
940  * The caller must be holding the exclusive hash queue lock
941  */
942 void
943 rp4_addhash(rnode4_t *rp)
944 {
945 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
946 	ASSERT(!(rp->r_flags & R4HASHED));
947 
948 #ifdef DEBUG
949 	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
950 #endif
951 
952 	rp->r_hashf = rp->r_hashq->r_hashf;
953 	rp->r_hashq->r_hashf = rp;
954 	rp->r_hashb = (rnode4_t *)rp->r_hashq;
955 	rp->r_hashf->r_hashb = rp;
956 
957 	mutex_enter(&rp->r_statelock);
958 	rp->r_flags |= R4HASHED;
959 	mutex_exit(&rp->r_statelock);
960 }
961 
962 /*
963  * Remove a rnode from the hash table.
964  *
965  * The caller must be holding the hash queue lock.
966  */
967 void
968 rp4_rmhash_locked(rnode4_t *rp)
969 {
970 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
971 	ASSERT(rp->r_flags & R4HASHED);
972 
973 	rp->r_hashb->r_hashf = rp->r_hashf;
974 	rp->r_hashf->r_hashb = rp->r_hashb;
975 
976 	mutex_enter(&rp->r_statelock);
977 	rp->r_flags &= ~R4HASHED;
978 	mutex_exit(&rp->r_statelock);
979 }
980 
981 /*
982  * Remove a rnode from the hash table.
983  *
984  * The caller must not be holding the hash queue lock.
985  */
986 void
987 rp4_rmhash(rnode4_t *rp)
988 {
989 
990 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
991 	rp4_rmhash_locked(rp);
992 	rw_exit(&rp->r_hashq->r_lock);
993 }
994 
995 /*
996  * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
997  * Returns NULL if no match.  If an rnode is returned, the reference count
998  * on the master vnode is incremented.
999  *
1000  * The caller must be holding the hash queue lock, either shared or exclusive.
1001  */
1002 rnode4_t *
1003 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
1004 {
1005 	rnode4_t *rp;
1006 	vnode_t *vp;
1007 
1008 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
1009 
1010 	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
1011 		vp = RTOV4(rp);
1012 		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
1013 
1014 			mutex_enter(&rp->r_statelock);
1015 			if (rp->r_flags & R4RECOVERR) {
1016 				mutex_exit(&rp->r_statelock);
1017 				continue;
1018 			}
1019 			mutex_exit(&rp->r_statelock);
1020 #ifdef DEBUG
1021 			r4_dup_check(rp, vfsp);
1022 #endif
1023 			if (rp->r_freef != NULL) {
1024 				mutex_enter(&rp4freelist_lock);
1025 				/*
1026 				 * If the rnode is on the freelist,
1027 				 * then remove it and use that reference
1028 				 * as the new reference.  Otherwise,
1029 				 * need to increment the reference count.
1030 				 */
1031 				if (rp->r_freef != NULL) {
1032 					rp4_rmfree(rp);
1033 					mutex_exit(&rp4freelist_lock);
1034 				} else {
1035 					mutex_exit(&rp4freelist_lock);
1036 					VN_HOLD(vp);
1037 				}
1038 			} else
1039 				VN_HOLD(vp);
1040 
1041 			/*
1042 			 * if root vnode, set v_flag to indicate that
1043 			 */
1044 			if (isrootfh(fh, rp)) {
1045 				if (!(vp->v_flag & VROOT)) {
1046 					mutex_enter(&vp->v_lock);
1047 					vp->v_flag |= VROOT;
1048 					mutex_exit(&vp->v_lock);
1049 				}
1050 			}
1051 			return (rp);
1052 		}
1053 	}
1054 	return (NULL);
1055 }
1056 
1057 /*
1058  * Lookup an rnode by fhandle. Just a wrapper for r4find()
1059  * that assumes the caller hasn't already got the lock
1060  * on the hash bucket.
1061  */
1062 rnode4_t *
1063 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1064 {
1065 	rnode4_t *rp;
1066 	int index;
1067 
1068 	index = rtable4hash(fh);
1069 	rw_enter(&rtable4[index].r_lock, RW_READER);
1070 	rp = r4find(&rtable4[index], fh, vfsp);
1071 	rw_exit(&rtable4[index].r_lock);
1072 
1073 	return (rp);
1074 }
1075 
1076 /*
1077  * Return 1 if there is a active vnode belonging to this vfs in the
1078  * rtable4 cache.
1079  *
1080  * Several of these checks are done without holding the usual
1081  * locks.  This is safe because destroy_rtable(), rp_addfree(),
1082  * etc. will redo the necessary checks before actually destroying
1083  * any rnodes.
1084  */
1085 int
1086 check_rtable4(struct vfs *vfsp)
1087 {
1088 	rnode4_t *rp;
1089 	vnode_t *vp;
1090 	char *busy = NULL;
1091 	int index;
1092 
1093 	for (index = 0; index < rtable4size; index++) {
1094 		rw_enter(&rtable4[index].r_lock, RW_READER);
1095 
1096 		for (rp = rtable4[index].r_hashf;
1097 		    rp != (rnode4_t *)(&rtable4[index]);
1098 		    rp = rp->r_hashf) {
1099 
1100 			vp = RTOV4(rp);
1101 			if (vp->v_vfsp == vfsp) {
1102 				if (rp->r_freef == NULL) {
1103 					busy = "not on free list";
1104 				} else if (nfs4_has_pages(vp) &&
1105 					    (rp->r_flags & R4DIRTY)) {
1106 					busy = "dirty pages";
1107 				} else if (rp->r_count > 0) {
1108 					busy = "r_count > 0";
1109 				}
1110 
1111 				if (busy != NULL) {
1112 #ifdef DEBUG
1113 					char *path;
1114 
1115 					path = fn_path(rp->r_svnode.sv_name);
1116 					NFS4_DEBUG(nfs4_rnode_debug,
1117 					    (CE_NOTE, "check_rtable4: " "%s %s",
1118 					    path, busy));
1119 					kmem_free(path, strlen(path)+1);
1120 #endif
1121 					rw_exit(&rtable4[index].r_lock);
1122 					return (1);
1123 				}
1124 			}
1125 		}
1126 		rw_exit(&rtable4[index].r_lock);
1127 	}
1128 	return (0);
1129 }
1130 
1131 /*
1132  * Destroy inactive vnodes from the hash queues which
1133  * belong to this vfs. All of the vnodes should be inactive.
1134  * It is essential that we destory all rnodes in case of
1135  * forced unmount as well as in normal unmount case.
1136  */
1137 
1138 void
1139 destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1140 {
1141 	int index;
1142 	vnode_t *vp;
1143 	rnode4_t *rp, *r_hashf, *rlist;
1144 
1145 	rlist = NULL;
1146 
1147 	for (index = 0; index < rtable4size; index++) {
1148 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1149 		for (rp = rtable4[index].r_hashf;
1150 		    rp != (rnode4_t *)(&rtable4[index]);
1151 		    rp = r_hashf) {
1152 			/* save the hash pointer before destroying */
1153 			r_hashf = rp->r_hashf;
1154 
1155 			vp = RTOV4(rp);
1156 			if (vp->v_vfsp == vfsp) {
1157 				mutex_enter(&rp4freelist_lock);
1158 				if (rp->r_freef != NULL) {
1159 					rp4_rmfree(rp);
1160 					mutex_exit(&rp4freelist_lock);
1161 					rp4_rmhash_locked(rp);
1162 					rp->r_hashf = rlist;
1163 					rlist = rp;
1164 				} else
1165 					mutex_exit(&rp4freelist_lock);
1166 			}
1167 		}
1168 		rw_exit(&rtable4[index].r_lock);
1169 	}
1170 
1171 	for (rp = rlist; rp != NULL; rp = r_hashf) {
1172 		r_hashf = rp->r_hashf;
1173 		/*
1174 		 * This call to rp4_addfree will end up destroying the
1175 		 * rnode, but in a safe way with the appropriate set
1176 		 * of checks done.
1177 		 */
1178 		rp4_addfree(rp, cr);
1179 	}
1180 }
1181 
1182 /*
1183  * This routine destroys all the resources of an rnode
1184  * and finally the rnode itself.
1185  */
1186 static void
1187 destroy_rnode4(rnode4_t *rp)
1188 {
1189 	vnode_t *vp;
1190 	vfs_t *vfsp;
1191 
1192 	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1193 
1194 	vp = RTOV4(rp);
1195 	vfsp = vp->v_vfsp;
1196 
1197 	uninit_rnode4(rp);
1198 	atomic_add_long((ulong_t *)&rnode4_new, -1);
1199 #ifdef DEBUG
1200 	clstat4_debug.nrnode.value.ui64--;
1201 #endif
1202 	kmem_cache_free(rnode4_cache, rp);
1203 	vn_invalid(vp);
1204 	vn_free(vp);
1205 	VFS_RELE(vfsp);
1206 }
1207 
1208 /*
1209  * Invalidate the attributes on all rnodes forcing the next getattr
1210  * to go over the wire.  Used to flush stale uid and gid mappings.
1211  * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1212  */
1213 void
1214 nfs4_rnode_invalidate(struct vfs *vfsp)
1215 {
1216 	int index;
1217 	rnode4_t *rp;
1218 	vnode_t *vp;
1219 
1220 	/*
1221 	 * Walk the hash queues looking for rnodes.
1222 	 */
1223 	for (index = 0; index < rtable4size; index++) {
1224 		rw_enter(&rtable4[index].r_lock, RW_READER);
1225 		for (rp = rtable4[index].r_hashf;
1226 		    rp != (rnode4_t *)(&rtable4[index]);
1227 		    rp = rp->r_hashf) {
1228 			vp = RTOV4(rp);
1229 			if (vfsp != NULL && vp->v_vfsp != vfsp)
1230 				continue;
1231 
1232 			if (!mutex_tryenter(&rp->r_statelock))
1233 				continue;
1234 
1235 			/*
1236 			 * Expire the attributes by resetting the change
1237 			 * and attr timeout.
1238 			 */
1239 			rp->r_change = 0;
1240 			PURGE_ATTRCACHE4_LOCKED(rp);
1241 			mutex_exit(&rp->r_statelock);
1242 		}
1243 		rw_exit(&rtable4[index].r_lock);
1244 	}
1245 }
1246 
1247 /*
1248  * Flush all vnodes in this (or every) vfs.
1249  * Used by nfs_sync and by nfs_unmount.
1250  */
1251 void
1252 r4flush(struct vfs *vfsp, cred_t *cr)
1253 {
1254 	int index;
1255 	rnode4_t *rp;
1256 	vnode_t *vp, **vplist;
1257 	long num, cnt;
1258 
1259 	/*
1260 	 * Check to see whether there is anything to do.
1261 	 */
1262 	num = rnode4_new;
1263 	if (num == 0)
1264 		return;
1265 
1266 	/*
1267 	 * Allocate a slot for all currently active rnodes on the
1268 	 * supposition that they all may need flushing.
1269 	 */
1270 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1271 	cnt = 0;
1272 
1273 	/*
1274 	 * Walk the hash queues looking for rnodes with page
1275 	 * lists associated with them.  Make a list of these
1276 	 * files.
1277 	 */
1278 	for (index = 0; index < rtable4size; index++) {
1279 		rw_enter(&rtable4[index].r_lock, RW_READER);
1280 		for (rp = rtable4[index].r_hashf;
1281 		    rp != (rnode4_t *)(&rtable4[index]);
1282 		    rp = rp->r_hashf) {
1283 			vp = RTOV4(rp);
1284 			/*
1285 			 * Don't bother sync'ing a vp if it
1286 			 * is part of virtual swap device or
1287 			 * if VFS is read-only
1288 			 */
1289 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1290 				continue;
1291 			/*
1292 			 * If flushing all mounted file systems or
1293 			 * the vnode belongs to this vfs, has pages
1294 			 * and is marked as either dirty or mmap'd,
1295 			 * hold and add this vnode to the list of
1296 			 * vnodes to flush.
1297 			 */
1298 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
1299 			    nfs4_has_pages(vp) &&
1300 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1301 				VN_HOLD(vp);
1302 				vplist[cnt++] = vp;
1303 				if (cnt == num) {
1304 					rw_exit(&rtable4[index].r_lock);
1305 					goto toomany;
1306 				}
1307 			}
1308 		}
1309 		rw_exit(&rtable4[index].r_lock);
1310 	}
1311 toomany:
1312 
1313 	/*
1314 	 * Flush and release all of the files on the list.
1315 	 */
1316 	while (cnt-- > 0) {
1317 		vp = vplist[cnt];
1318 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr);
1319 		VN_RELE(vp);
1320 	}
1321 
1322 	/*
1323 	 * Free the space allocated to hold the list.
1324 	 */
1325 	kmem_free(vplist, num * sizeof (*vplist));
1326 }
1327 
1328 int
1329 nfs4_free_data_reclaim(rnode4_t *rp)
1330 {
1331 	char *contents;
1332 	vnode_t *xattr;
1333 	int size;
1334 	vsecattr_t *vsp;
1335 	int freed;
1336 	bool_t rdc = FALSE;
1337 
1338 	/*
1339 	 * Free any held caches which may
1340 	 * be associated with this rnode.
1341 	 */
1342 	mutex_enter(&rp->r_statelock);
1343 	if (rp->r_dir != NULL)
1344 		rdc = TRUE;
1345 	contents = rp->r_symlink.contents;
1346 	size = rp->r_symlink.size;
1347 	rp->r_symlink.contents = NULL;
1348 	vsp = rp->r_secattr;
1349 	rp->r_secattr = NULL;
1350 	xattr = rp->r_xattr_dir;
1351 	rp->r_xattr_dir = NULL;
1352 	mutex_exit(&rp->r_statelock);
1353 
1354 	/*
1355 	 * Free the access cache entries.
1356 	 */
1357 	freed = nfs4_access_purge_rp(rp);
1358 
1359 	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1360 		return (freed);
1361 
1362 	/*
1363 	 * Free the readdir cache entries, incompletely if we can't block.
1364 	 */
1365 	nfs4_purge_rddir_cache(RTOV4(rp));
1366 
1367 	/*
1368 	 * Free the symbolic link cache.
1369 	 */
1370 	if (contents != NULL) {
1371 
1372 		kmem_free((void *)contents, size);
1373 	}
1374 
1375 	/*
1376 	 * Free any cached ACL.
1377 	 */
1378 	if (vsp != NULL)
1379 		nfs4_acl_free_cache(vsp);
1380 
1381 	/*
1382 	 * Release the xattr directory vnode
1383 	 */
1384 	if (xattr != NULL)
1385 		VN_RELE(xattr);
1386 
1387 	return (1);
1388 }
1389 
1390 static int
1391 nfs4_active_data_reclaim(rnode4_t *rp)
1392 {
1393 	char *contents;
1394 	vnode_t *xattr;
1395 	int size;
1396 	vsecattr_t *vsp;
1397 	int freed;
1398 	bool_t rdc = FALSE;
1399 
1400 	/*
1401 	 * Free any held credentials and caches which
1402 	 * may be associated with this rnode.
1403 	 */
1404 	if (!mutex_tryenter(&rp->r_statelock))
1405 		return (0);
1406 	contents = rp->r_symlink.contents;
1407 	size = rp->r_symlink.size;
1408 	rp->r_symlink.contents = NULL;
1409 	vsp = rp->r_secattr;
1410 	rp->r_secattr = NULL;
1411 	if (rp->r_dir != NULL)
1412 		rdc = TRUE;
1413 	xattr = rp->r_xattr_dir;
1414 	rp->r_xattr_dir = NULL;
1415 	mutex_exit(&rp->r_statelock);
1416 
1417 	/*
1418 	 * Free the access cache entries.
1419 	 */
1420 	freed = nfs4_access_purge_rp(rp);
1421 
1422 	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1423 		return (freed);
1424 
1425 	/*
1426 	 * Free the symbolic link cache.
1427 	 */
1428 	if (contents != NULL) {
1429 
1430 		kmem_free((void *)contents, size);
1431 	}
1432 
1433 	/*
1434 	 * Free any cached ACL.
1435 	 */
1436 	if (vsp != NULL)
1437 		nfs4_acl_free_cache(vsp);
1438 
1439 	nfs4_purge_rddir_cache(RTOV4(rp));
1440 
1441 	/*
1442 	 * Release the xattr directory vnode
1443 	 */
1444 	if (xattr != NULL)
1445 		VN_RELE(xattr);
1446 
1447 	return (1);
1448 }
1449 
1450 static int
1451 nfs4_free_reclaim(void)
1452 {
1453 	int freed;
1454 	rnode4_t *rp;
1455 
1456 #ifdef DEBUG
1457 	clstat4_debug.f_reclaim.value.ui64++;
1458 #endif
1459 	freed = 0;
1460 	mutex_enter(&rp4freelist_lock);
1461 	rp = rp4freelist;
1462 	if (rp != NULL) {
1463 		do {
1464 			if (nfs4_free_data_reclaim(rp))
1465 				freed = 1;
1466 		} while ((rp = rp->r_freef) != rp4freelist);
1467 	}
1468 	mutex_exit(&rp4freelist_lock);
1469 	return (freed);
1470 }
1471 
1472 static int
1473 nfs4_active_reclaim(void)
1474 {
1475 	int freed;
1476 	int index;
1477 	rnode4_t *rp;
1478 
1479 #ifdef DEBUG
1480 	clstat4_debug.a_reclaim.value.ui64++;
1481 #endif
1482 	freed = 0;
1483 	for (index = 0; index < rtable4size; index++) {
1484 		rw_enter(&rtable4[index].r_lock, RW_READER);
1485 		for (rp = rtable4[index].r_hashf;
1486 		    rp != (rnode4_t *)(&rtable4[index]);
1487 		    rp = rp->r_hashf) {
1488 			if (nfs4_active_data_reclaim(rp))
1489 				freed = 1;
1490 		}
1491 		rw_exit(&rtable4[index].r_lock);
1492 	}
1493 	return (freed);
1494 }
1495 
1496 static int
1497 nfs4_rnode_reclaim(void)
1498 {
1499 	int freed;
1500 	rnode4_t *rp;
1501 	vnode_t *vp;
1502 
1503 #ifdef DEBUG
1504 	clstat4_debug.r_reclaim.value.ui64++;
1505 #endif
1506 	freed = 0;
1507 	mutex_enter(&rp4freelist_lock);
1508 	while ((rp = rp4freelist) != NULL) {
1509 		rp4_rmfree(rp);
1510 		mutex_exit(&rp4freelist_lock);
1511 		if (rp->r_flags & R4HASHED) {
1512 			vp = RTOV4(rp);
1513 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1514 			mutex_enter(&vp->v_lock);
1515 			if (vp->v_count > 1) {
1516 				vp->v_count--;
1517 				mutex_exit(&vp->v_lock);
1518 				rw_exit(&rp->r_hashq->r_lock);
1519 				mutex_enter(&rp4freelist_lock);
1520 				continue;
1521 			}
1522 			mutex_exit(&vp->v_lock);
1523 			rp4_rmhash_locked(rp);
1524 			rw_exit(&rp->r_hashq->r_lock);
1525 		}
1526 		/*
1527 		 * This call to rp_addfree will end up destroying the
1528 		 * rnode, but in a safe way with the appropriate set
1529 		 * of checks done.
1530 		 */
1531 		rp4_addfree(rp, CRED());
1532 		mutex_enter(&rp4freelist_lock);
1533 	}
1534 	mutex_exit(&rp4freelist_lock);
1535 	return (freed);
1536 }
1537 
/*
 * Reclaim callback passed to kmem_cache_create() for rnode4_cache in
 * nfs4_rnode_init().  The reclaim steps are tried in order and the
 * first one that reports success ends the attempt:
 *	1. cached data of rnodes on the freelist,
 *	2. cached data of rnodes in the hash table,
 *	3. the freelist rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	if (!nfs4_free_reclaim() && !nfs4_active_reclaim())
		(void) nfs4_rnode_reclaim();
}
1554 
1555 /*
1556  * Returns the clientid4 to use for the given mntinfo4.  Note that the
1557  * clientid can change if the caller drops mi_recovlock.
1558  */
1559 
1560 clientid4
1561 mi2clientid(mntinfo4_t *mi)
1562 {
1563 	nfs4_server_t	*sp;
1564 	clientid4	clientid = 0;
1565 
1566 	/* this locks down sp if it is found */
1567 	sp = find_nfs4_server(mi);
1568 	if (sp != NULL) {
1569 		clientid = sp->clientid;
1570 		mutex_exit(&sp->s_lock);
1571 		nfs4_server_rele(sp);
1572 	}
1573 	return (clientid);
1574 }
1575 
1576 /*
1577  * Return the current lease time for the server associated with the given
1578  * file.  Note that the lease time could change immediately after this
1579  * call.
1580  */
1581 
1582 time_t
1583 r2lease_time(rnode4_t *rp)
1584 {
1585 	nfs4_server_t	*sp;
1586 	time_t		lease_time;
1587 	mntinfo4_t	*mi = VTOMI4(RTOV4(rp));
1588 
1589 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1590 
1591 	/* this locks down sp if it is found */
1592 	sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1593 
1594 	if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1595 		if (sp != NULL) {
1596 			mutex_exit(&sp->s_lock);
1597 			nfs4_server_rele(sp);
1598 		}
1599 		nfs_rw_exit(&mi->mi_recovlock);
1600 		return (1);		/* 1 second */
1601 	}
1602 
1603 	ASSERT(sp != NULL);
1604 
1605 	lease_time = sp->s_lease_time;
1606 
1607 	mutex_exit(&sp->s_lock);
1608 	nfs4_server_rele(sp);
1609 	nfs_rw_exit(&mi->mi_recovlock);
1610 
1611 	return (lease_time);
1612 }
1613 
1614 /*
1615  * Return a list with information about all the known open instances for
1616  * a filesystem. The caller must call r4releopenlist() when done with the
1617  * list.
1618  *
1619  * We are safe at looking at os_valid and os_pending_close across dropping
1620  * the 'os_sync_lock' to count up the number of open streams and then
1621  * allocate memory for the osp list due to:
1622  *	-Looking at os_pending_close is safe since this routine is
1623  *	only called via recovery, and os_pending_close can only be set via
1624  *	a non-recovery operation (which are all blocked when recovery
1625  *	is active).
1626  *
1627  *	-Examining os_valid is safe since non-recovery operations, which
1628  *	could potentially switch os_valid to 0, are blocked (via
1629  *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
1630  *	(which means we are the only recovery thread potentially acting
1631  *	on this open stream).
1632  */
1633 
1634 nfs4_opinst_t *
1635 r4mkopenlist(mntinfo4_t *mi)
1636 {
1637 	nfs4_opinst_t *reopenlist, *rep;
1638 	rnode4_t *rp;
1639 	vnode_t *vp;
1640 	vfs_t *vfsp = mi->mi_vfsp;
1641 	int numosp;
1642 	nfs4_open_stream_t *osp;
1643 	int index;
1644 	open_delegation_type4 dtype;
1645 	int hold_vnode;
1646 
1647 	reopenlist = NULL;
1648 
1649 	for (index = 0; index < rtable4size; index++) {
1650 		rw_enter(&rtable4[index].r_lock, RW_READER);
1651 		for (rp = rtable4[index].r_hashf;
1652 		    rp != (rnode4_t *)(&rtable4[index]);
1653 		    rp = rp->r_hashf) {
1654 
1655 			vp = RTOV4(rp);
1656 			if (vp->v_vfsp != vfsp)
1657 				continue;
1658 			hold_vnode = 0;
1659 
1660 			mutex_enter(&rp->r_os_lock);
1661 
1662 			/* Count the number of valid open_streams of the file */
1663 			numosp = 0;
1664 			for (osp = list_head(&rp->r_open_streams); osp != NULL;
1665 			    osp = list_next(&rp->r_open_streams, osp)) {
1666 				mutex_enter(&osp->os_sync_lock);
1667 				if (osp->os_valid && !osp->os_pending_close)
1668 					numosp++;
1669 				mutex_exit(&osp->os_sync_lock);
1670 			}
1671 
1672 			/* Fill in the valid open streams per vp */
1673 			if (numosp > 0) {
1674 				int j;
1675 
1676 				hold_vnode = 1;
1677 
1678 				/*
1679 				 * Add a new open instance to the list
1680 				 */
1681 				rep = kmem_zalloc(sizeof (*reopenlist),
1682 					KM_SLEEP);
1683 				rep->re_next = reopenlist;
1684 				reopenlist = rep;
1685 
1686 				rep->re_vp = vp;
1687 				rep->re_osp = kmem_zalloc(
1688 					numosp * sizeof (*(rep->re_osp)),
1689 					KM_SLEEP);
1690 				rep->re_numosp = numosp;
1691 
1692 				j = 0;
1693 				for (osp = list_head(&rp->r_open_streams);
1694 				    osp != NULL;
1695 				    osp = list_next(&rp->r_open_streams, osp)) {
1696 
1697 					mutex_enter(&osp->os_sync_lock);
1698 					if (osp->os_valid &&
1699 					    !osp->os_pending_close) {
1700 						osp->os_ref_count++;
1701 						rep->re_osp[j] = osp;
1702 						j++;
1703 					}
1704 					mutex_exit(&osp->os_sync_lock);
1705 				}
1706 				/*
1707 				 * Assuming valid osp(s) stays valid between
1708 				 * the time obtaining j and numosp.
1709 				 */
1710 				ASSERT(j == numosp);
1711 			}
1712 
1713 			mutex_exit(&rp->r_os_lock);
1714 			/* do this here to keep v_lock > r_os_lock */
1715 			if (hold_vnode)
1716 				VN_HOLD(vp);
1717 			mutex_enter(&rp->r_statev4_lock);
1718 			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
1719 				/*
1720 				 * If this rnode holds a delegation,
1721 				 * but if there are no valid open streams,
1722 				 * then just discard the delegation
1723 				 * without doing delegreturn.
1724 				 */
1725 				if (numosp > 0)
1726 					rp->r_deleg_needs_recovery =
1727 							rp->r_deleg_type;
1728 			}
1729 			/* Save the delegation type for use outside the lock */
1730 			dtype = rp->r_deleg_type;
1731 			mutex_exit(&rp->r_statev4_lock);
1732 
1733 			/*
1734 			 * If we have a delegation then get rid of it.
1735 			 * We've set rp->r_deleg_needs_recovery so we have
1736 			 * enough information to recover.
1737 			 */
1738 			if (dtype != OPEN_DELEGATE_NONE) {
1739 				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
1740 			}
1741 		}
1742 		rw_exit(&rtable4[index].r_lock);
1743 	}
1744 	return (reopenlist);
1745 }
1746 
1747 /*
1748  * Release the list of open instance references.
1749  */
1750 
1751 void
1752 r4releopenlist(nfs4_opinst_t *reopenp)
1753 {
1754 	nfs4_opinst_t *rep, *next;
1755 	int i;
1756 
1757 	for (rep = reopenp; rep; rep = next) {
1758 		next = rep->re_next;
1759 
1760 		for (i = 0; i < rep->re_numosp; i++)
1761 		    open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1762 
1763 		VN_RELE(rep->re_vp);
1764 		kmem_free(rep->re_osp,
1765 		    rep->re_numosp * sizeof (*(rep->re_osp)));
1766 
1767 		kmem_free(rep, sizeof (*rep));
1768 	}
1769 }
1770 
1771 int
1772 nfs4_rnode_init(void)
1773 {
1774 	ulong_t nrnode4_max;
1775 	int i;
1776 
1777 	/*
1778 	 * Compute the size of the rnode4 hash table
1779 	 */
1780 	if (nrnode <= 0)
1781 		nrnode = ncsize;
1782 	nrnode4_max =
1783 	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1784 	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1785 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1786 		    "setting nrnode to max value of %ld", nrnode4_max);
1787 		nrnode = nrnode4_max;
1788 	}
1789 	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1790 	rtable4mask = rtable4size - 1;
1791 
1792 	/*
1793 	 * Allocate and initialize the hash buckets
1794 	 */
1795 	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1796 	for (i = 0; i < rtable4size; i++) {
1797 		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1798 		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1799 		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1800 	}
1801 
1802 	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1803 	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1804 
1805 	return (0);
1806 }
1807 
1808 int
1809 nfs4_rnode_fini(void)
1810 {
1811 	int i;
1812 
1813 	/*
1814 	 * Deallocate the rnode hash queues
1815 	 */
1816 	kmem_cache_destroy(rnode4_cache);
1817 
1818 	for (i = 0; i < rtable4size; i++)
1819 		rw_destroy(&rtable4[i].r_lock);
1820 
1821 	kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1822 
1823 	return (0);
1824 }
1825 
1826 /*
1827  * Return non-zero if the given filehandle refers to the root filehandle
1828  * for the given rnode.
1829  */
1830 
1831 static int
1832 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1833 {
1834 	int isroot;
1835 
1836 	isroot = 0;
1837 	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1838 		isroot = 1;
1839 
1840 	return (isroot);
1841 }
1842 
#ifdef DEBUG

/*
 * Debug-only consistency check, active when r4_check_for_dups is set:
 * scan the whole rnode table for another rnode on `vfsp' whose
 * filehandle value equals that of `checkrp', and panic if one exists.
 *
 * The caller must hold the lock of checkrp's own hash chain; every
 * other chain is locked as reader here while it is scanned.
 */

static void
r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
{
	r4hashq_t *bucket;
	rnode4_t *rp;
	nfs4_fhandle_t want, have;
	int index;
	int foreign;

	if (!r4_check_for_dups)
		return;

	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));

	sfh4_copyval(checkrp->r_fh, &want);

	for (index = 0; index < rtable4size; index++) {
		bucket = &rtable4[index];

		/* checkrp's own chain is already locked by the caller */
		foreign = (bucket != checkrp->r_hashq);
		if (foreign)
			rw_enter(&bucket->r_lock, RW_READER);

		for (rp = bucket->r_hashf; rp != (rnode4_t *)bucket;
		    rp = rp->r_hashf) {

			/* ignore checkrp itself and other filesystems */
			if (rp == checkrp || RTOV4(rp)->v_vfsp != vfsp)
				continue;

			sfh4_copyval(rp->r_fh, &have);
			if (nfs4cmpfhandle(&want, &have) == 0) {
				cmn_err(CE_PANIC, "rnodes with same fs, fh "
				    "(%p, %p)", (void *)checkrp, (void *)rp);
			}
		}

		if (foreign)
			rw_exit(&bucket->r_lock);
	}
}

#endif /* DEBUG */
1894