xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs4_rnode.c (revision 92034044e95e6f6e8fb6a3dddf68ddc2561c6870)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 /*
32  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
33  */
34 
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/systm.h>
38 #include <sys/cred.h>
39 #include <sys/proc.h>
40 #include <sys/user.h>
41 #include <sys/time.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/socket.h>
46 #include <sys/uio.h>
47 #include <sys/tiuser.h>
48 #include <sys/swap.h>
49 #include <sys/errno.h>
50 #include <sys/debug.h>
51 #include <sys/kmem.h>
52 #include <sys/kstat.h>
53 #include <sys/cmn_err.h>
54 #include <sys/vtrace.h>
55 #include <sys/session.h>
56 #include <sys/dnlc.h>
57 #include <sys/bitmap.h>
58 #include <sys/acl.h>
59 #include <sys/ddi.h>
60 #include <sys/pathname.h>
61 #include <sys/flock.h>
62 #include <sys/dirent.h>
63 #include <sys/flock.h>
64 #include <sys/callb.h>
65 #include <sys/sdt.h>
66 
67 #include <vm/pvn.h>
68 
69 #include <rpc/types.h>
70 #include <rpc/xdr.h>
71 #include <rpc/auth.h>
72 #include <rpc/rpcsec_gss.h>
73 #include <rpc/clnt.h>
74 
75 #include <nfs/nfs.h>
76 #include <nfs/nfs_clnt.h>
77 #include <nfs/nfs_acl.h>
78 
79 #include <nfs/nfs4.h>
80 #include <nfs/rnode4.h>
81 #include <nfs/nfs4_clnt.h>
82 
83 /*
84  * The hash queues for the access to active and cached rnodes
85  * are organized as doubly linked lists.  A reader/writer lock
86  * for each hash bucket is used to control access and to synchronize
87  * lookups, additions, and deletions from the hash queue.
88  *
89  * The rnode freelist is organized as a doubly linked list with
90  * a head pointer.  Additions and deletions are synchronized via
91  * a single mutex.
92  *
93  * In order to add an rnode to the free list, it must be hashed into
94  * a hash queue and the exclusive lock to the hash queue be held.
95  * If an rnode is not hashed into a hash queue, then it is destroyed
96  * because it represents no valuable information that can be reused
97  * about the file.  The exclusive lock to the hash queue must be
98  * held in order to prevent a lookup in the hash queue from finding
99  * the rnode and using it and assuming that the rnode is not on the
100  * freelist.  The lookup in the hash queue will have the hash queue
101  * locked, either exclusive or shared.
102  *
103  * The vnode reference count for each rnode is not allowed to drop
104  * below 1.  This prevents external entities, such as the VM
105  * subsystem, from acquiring references to vnodes already on the
106  * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
108  * rnode is looked up in the hash queues, then either the rnode
109  * is removed from the freelist and that reference is transferred to
110  * the new reference or the vnode reference count must be incremented
111  * accordingly.  The mutex for the freelist must be held in order to
112  * accurately test to see if the rnode is on the freelist or not.
113  * The hash queue lock might be held shared and it is possible that
114  * two different threads may race to remove the rnode from the
115  * freelist.  This race can be resolved by holding the mutex for the
116  * freelist.  Please note that the mutex for the freelist does not
117  * need to be held if the rnode is not on the freelist.  It can not be
118  * placed on the freelist due to the requirement that the thread
119  * putting the rnode on the freelist must hold the exclusive lock
120  * to the hash queue and the thread doing the lookup in the hash
121  * queue is holding either a shared or exclusive lock to the hash
122  * queue.
123  *
124  * The lock ordering is:
125  *
126  *	hash bucket lock -> vnode lock
127  *	hash bucket lock -> freelist lock -> r_statelock
128  */
/* Array of rnode hash buckets; indexed by the value of rtable4hash(). */
r4hashq_t *rtable4;

/* Serializes every insertion into and removal from rp4freelist. */
static kmutex_t rp4freelist_lock;
/* Head of the circular, doubly linked rnode freelist; NULL when empty. */
static rnode4_t *rp4freelist = NULL;
/* Number of rnodes allocated from rnode4_cache; compared against nrnode. */
static long rnode4_new = 0;
/* NOTE(review): presumably the bucket count of rtable4 -- set elsewhere. */
int rtable4size;
/* Mask applied by rtable4hash() to turn a hash into a bucket index. */
static int rtable4mask;
/* kmem cache from which make_rnode4() allocates new rnode4_t structures. */
static struct kmem_cache *rnode4_cache;
/* NOTE(review): presumably a hash chain length tunable -- confirm at init. */
static int rnode4_hashlen = 4;

/* Forward declarations for routines defined later in this file. */
static void	r4inactive(rnode4_t *, cred_t *);
static vnode_t	*make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
		    cred_t *),
		    int *, cred_t *);
static void	rp4_rmfree(rnode4_t *);
int		nfs4_free_data_reclaim(rnode4_t *);
static int	nfs4_active_data_reclaim(rnode4_t *);
static int	nfs4_free_reclaim(void);
static int	nfs4_active_reclaim(void);
static int	nfs4_rnode_reclaim(void);
static void	nfs4_reclaim(void *);
static int	isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void	uninit_rnode4(rnode4_t *);
static void	destroy_rnode4(rnode4_t *);
static void	r4_stub_set(rnode4_t *, nfs4_stub_type_t);

#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void	r4_dup_check(rnode4_t *, vfs_t *);
#endif
165 
166 /*
167  * If the vnode has pages, run the list and check for any that are
168  * still dangling.  We call this routine before putting an rnode on
169  * the free list.
170  */
171 static int
nfs4_dross_pages(vnode_t * vp)172 nfs4_dross_pages(vnode_t *vp)
173 {
174 	page_t *pp;
175 	kmutex_t *vphm;
176 
177 	vphm = page_vnode_mutex(vp);
178 	mutex_enter(vphm);
179 	if ((pp = vp->v_pages) != NULL) {
180 		do {
181 			if (pp->p_hash != PVN_VPLIST_HASH_TAG &&
182 			    pp->p_fsdata != C_NOCOMMIT) {
183 				mutex_exit(vphm);
184 				return (1);
185 			}
186 		} while ((pp = pp->p_vpnext) != vp->v_pages);
187 	}
188 	mutex_exit(vphm);
189 
190 	return (0);
191 }
192 
193 /*
194  * Flush any pages left on this rnode.
195  */
196 static void
r4flushpages(rnode4_t * rp,cred_t * cr)197 r4flushpages(rnode4_t *rp, cred_t *cr)
198 {
199 	vnode_t *vp;
200 	int error;
201 
202 	/*
203 	 * Before freeing anything, wait until all asynchronous
204 	 * activity is done on this rnode.  This will allow all
205 	 * asynchronous read ahead and write behind i/o's to
206 	 * finish.
207 	 */
208 	mutex_enter(&rp->r_statelock);
209 	while (rp->r_count > 0)
210 		cv_wait(&rp->r_cv, &rp->r_statelock);
211 	mutex_exit(&rp->r_statelock);
212 
213 	/*
214 	 * Flush and invalidate all pages associated with the vnode.
215 	 */
216 	vp = RTOV4(rp);
217 	if (nfs4_has_pages(vp)) {
218 		ASSERT(vp->v_type != VCHR);
219 		if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
220 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
221 			if (error && (error == ENOSPC || error == EDQUOT)) {
222 				mutex_enter(&rp->r_statelock);
223 				if (!rp->r_error)
224 					rp->r_error = error;
225 				mutex_exit(&rp->r_statelock);
226 			}
227 		}
228 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
229 	}
230 }
231 
232 /*
233  * Free the resources associated with an rnode.
234  */
235 static void
r4inactive(rnode4_t * rp,cred_t * cr)236 r4inactive(rnode4_t *rp, cred_t *cr)
237 {
238 	vnode_t *vp;
239 	char *contents;
240 	int size;
241 	vsecattr_t *vsp;
242 	vnode_t *xattr;
243 
244 	r4flushpages(rp, cr);
245 
246 	vp = RTOV4(rp);
247 
248 	/*
249 	 * Free any held caches which may be
250 	 * associated with this rnode.
251 	 */
252 	mutex_enter(&rp->r_statelock);
253 	contents = rp->r_symlink.contents;
254 	size = rp->r_symlink.size;
255 	rp->r_symlink.contents = NULL;
256 	vsp = rp->r_secattr;
257 	rp->r_secattr = NULL;
258 	xattr = rp->r_xattr_dir;
259 	rp->r_xattr_dir = NULL;
260 	mutex_exit(&rp->r_statelock);
261 
262 	/*
263 	 * Free the access cache entries.
264 	 */
265 	(void) nfs4_access_purge_rp(rp);
266 
267 	/*
268 	 * Free the readdir cache entries.
269 	 */
270 	nfs4_purge_rddir_cache(vp);
271 
272 	/*
273 	 * Free the symbolic link cache.
274 	 */
275 	if (contents != NULL) {
276 
277 		kmem_free((void *)contents, size);
278 	}
279 
280 	/*
281 	 * Free any cached ACL.
282 	 */
283 	if (vsp != NULL)
284 		nfs4_acl_free_cache(vsp);
285 
286 	/*
287 	 * Release the cached xattr_dir
288 	 */
289 	if (xattr != NULL)
290 		VN_RELE(xattr);
291 }
292 
293 /*
294  * We have seen a case that the fh passed in is for "." which
295  * should be a VROOT node, however, the fh is different from the
296  * root fh stored in the mntinfo4_t. The invalid fh might be
297  * from a misbehaved server and will panic the client system at
298  * a later time. To avoid the panic, we drop the bad fh, use
299  * the root fh from mntinfo4_t, and print an error message
300  * for attention.
301  */
302 nfs4_sharedfh_t *
badrootfh_check(nfs4_sharedfh_t * fh,nfs4_fname_t * nm,mntinfo4_t * mi,int * wasbad)303 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
304     int *wasbad)
305 {
306 	char *s;
307 
308 	*wasbad = 0;
309 	s = fn_name(nm);
310 	ASSERT(strcmp(s, "..") != 0);
311 
312 	if ((s[0] == '.' && s[1] == '\0') && fh &&
313 	    !SFH4_SAME(mi->mi_rootfh, fh)) {
314 #ifdef DEBUG
315 		nfs4_fhandle_t fhandle;
316 
317 		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
318 		    "Server %s returns a different "
319 		    "root filehandle for the path %s:",
320 		    mi->mi_curr_serv->sv_hostname,
321 		    mi->mi_curr_serv->sv_path);
322 
323 		/* print the bad fh */
324 		fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
325 		bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
326 		    fhandle.fh_len);
327 		nfs4_printfhandle(&fhandle);
328 
329 		/* print mi_rootfh */
330 		fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
331 		bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
332 		    fhandle.fh_len);
333 		nfs4_printfhandle(&fhandle);
334 #endif
335 		/* use mi_rootfh instead; fh will be rele by the caller */
336 		fh = mi->mi_rootfh;
337 		*wasbad = 1;
338 	}
339 
340 	kmem_free(s, MAXNAMELEN);
341 	return (fh);
342 }
343 
344 void
r4_do_attrcache(vnode_t * vp,nfs4_ga_res_t * garp,int newnode,hrtime_t t,cred_t * cr,int index)345 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
346     hrtime_t t, cred_t *cr, int index)
347 {
348 	int is_stub;
349 	vattr_t *attr;
350 	/*
351 	 * Don't add to attrcache if time overflow, but
352 	 * no need to check because either attr is null or the time
353 	 * values in it were processed by nfs4_time_ntov(), which checks
354 	 * for time overflows.
355 	 */
356 	attr = garp ? &garp->n4g_va : NULL;
357 
358 	if (attr) {
359 		if (!newnode) {
360 			rw_exit(&rtable4[index].r_lock);
361 #ifdef DEBUG
362 			if (vp->v_type != attr->va_type &&
363 			    vp->v_type != VNON && attr->va_type != VNON) {
364 				zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
365 				    "makenfs4node: type (%d) doesn't "
366 				    "match type of found node at %p (%d)",
367 				    attr->va_type, (void *)vp, vp->v_type);
368 			}
369 #endif
370 			nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
371 		} else {
372 			rnode4_t *rp = VTOR4(vp);
373 
374 			vp->v_type = attr->va_type;
375 			vp->v_rdev = attr->va_rdev;
376 
377 			/*
378 			 * Turn this object into a "stub" object if we
379 			 * crossed an underlying server fs boundary.
380 			 * To make this check, during mount we save the
381 			 * fsid of the server object being mounted.
382 			 * Here we compare this object's server fsid
383 			 * with the fsid we saved at mount.  If they
384 			 * are different, we crossed server fs boundary.
385 			 *
386 			 * The stub type is set (or not) at rnode
387 			 * creation time and it never changes for life
388 			 * of the rnode.
389 			 *
390 			 * This stub will be for a mirror-mount, rather than
391 			 * a referral (the latter also sets R4SRVSTUB).
392 			 *
393 			 * The stub type is also set during RO failover,
394 			 * nfs4_remap_file().
395 			 *
396 			 * We don't bother with taking r_state_lock to
397 			 * set the stub type because this is a new rnode
398 			 * and we're holding the hash bucket r_lock RW_WRITER.
399 			 * No other thread could have obtained access
400 			 * to this rnode.
401 			 */
402 			is_stub = 0;
403 			if (garp->n4g_fsid_valid) {
404 				fattr4_fsid ga_fsid = garp->n4g_fsid;
405 				servinfo4_t *svp = rp->r_server;
406 
407 				rp->r_srv_fsid = ga_fsid;
408 
409 				(void) nfs_rw_enter_sig(&svp->sv_lock,
410 				    RW_READER, 0);
411 				if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
412 					is_stub = 1;
413 				nfs_rw_exit(&svp->sv_lock);
414 			}
415 
416 			if (is_stub)
417 				r4_stub_mirrormount(rp);
418 			else
419 				r4_stub_none(rp);
420 
421 			/* Can not cache partial attr */
422 			if (attr->va_mask == AT_ALL)
423 				nfs4_attrcache_noinval(vp, garp, t);
424 			else
425 				PURGE_ATTRCACHE4(vp);
426 
427 			rw_exit(&rtable4[index].r_lock);
428 		}
429 	} else {
430 		if (newnode) {
431 			PURGE_ATTRCACHE4(vp);
432 		}
433 		rw_exit(&rtable4[index].r_lock);
434 	}
435 }
436 
437 /*
438  * Find or create an rnode based primarily on filehandle.  To be
439  * used when dvp (vnode for parent directory) is not available;
440  * otherwise, makenfs4node() should be used.
441  *
442  * The nfs4_fname_t argument *npp is consumed and nulled out.
443  */
444 
445 vnode_t *
makenfs4node_by_fh(nfs4_sharedfh_t * sfh,nfs4_sharedfh_t * psfh,nfs4_fname_t ** npp,nfs4_ga_res_t * garp,mntinfo4_t * mi,cred_t * cr,hrtime_t t)446 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
447     nfs4_fname_t **npp, nfs4_ga_res_t *garp,
448     mntinfo4_t *mi, cred_t *cr, hrtime_t t)
449 {
450 	vfs_t *vfsp = mi->mi_vfsp;
451 	int newnode = 0;
452 	vnode_t *vp;
453 	rnode4_t *rp;
454 	svnode_t *svp;
455 	nfs4_fname_t *name, *svpname;
456 	int index;
457 
458 	ASSERT(npp && *npp);
459 	name = *npp;
460 	*npp = NULL;
461 
462 	index = rtable4hash(sfh);
463 	rw_enter(&rtable4[index].r_lock, RW_READER);
464 
465 	vp = make_rnode4(sfh, &rtable4[index], vfsp,
466 	    nfs4_vnodeops, nfs4_putapage, &newnode, cr);
467 
468 	svp = VTOSV(vp);
469 	rp = VTOR4(vp);
470 	if (newnode) {
471 		svp->sv_forw = svp->sv_back = svp;
472 		svp->sv_name = name;
473 		if (psfh != NULL)
474 			sfh4_hold(psfh);
475 		svp->sv_dfh = psfh;
476 	} else {
477 		/*
478 		 * It is possible that due to a server
479 		 * side rename fnames have changed.
480 		 * update the fname here.
481 		 */
482 		mutex_enter(&rp->r_svlock);
483 		svpname = svp->sv_name;
484 		if (svp->sv_name != name) {
485 			svp->sv_name = name;
486 			mutex_exit(&rp->r_svlock);
487 			fn_rele(&svpname);
488 		} else {
489 			mutex_exit(&rp->r_svlock);
490 			fn_rele(&name);
491 		}
492 	}
493 
494 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
495 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
496 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
497 
498 	return (vp);
499 }
500 
501 /*
502  * Find or create a vnode for the given filehandle, filesystem, parent, and
503  * name.  The reference to nm is consumed, so the caller must first do an
504  * fn_hold() if it wants to continue using nm after this call.
505  */
506 vnode_t *
makenfs4node(nfs4_sharedfh_t * fh,nfs4_ga_res_t * garp,struct vfs * vfsp,hrtime_t t,cred_t * cr,vnode_t * dvp,nfs4_fname_t * nm)507 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
508     hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
509 {
510 	vnode_t *vp;
511 	int newnode;
512 	int index;
513 	mntinfo4_t *mi = VFTOMI4(vfsp);
514 	int had_badfh = 0;
515 	rnode4_t *rp;
516 
517 	ASSERT(dvp != NULL);
518 
519 	fh = badrootfh_check(fh, nm, mi, &had_badfh);
520 
521 	index = rtable4hash(fh);
522 	rw_enter(&rtable4[index].r_lock, RW_READER);
523 
524 	/*
525 	 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
526 	 */
527 	vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
528 	    nfs4_putapage, &newnode, cr);
529 
530 	rp = VTOR4(vp);
531 	sv_activate(&vp, dvp, &nm, newnode);
532 	if (dvp->v_flag & V_XATTRDIR) {
533 		mutex_enter(&rp->r_statelock);
534 		rp->r_flags |= R4ISXATTR;
535 		mutex_exit(&rp->r_statelock);
536 	}
537 
538 	/* if getting a bad file handle, do not cache the attributes. */
539 	if (had_badfh) {
540 		rw_exit(&rtable4[index].r_lock);
541 		return (vp);
542 	}
543 
544 	ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
545 	r4_do_attrcache(vp, garp, newnode, t, cr, index);
546 	ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
547 
548 	return (vp);
549 }
550 
551 /*
552  * Hash on address of filehandle object.
553  * XXX totally untuned.
554  */
555 
556 int
rtable4hash(nfs4_sharedfh_t * fh)557 rtable4hash(nfs4_sharedfh_t *fh)
558 {
559 	return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
560 }
561 
562 /*
563  * Find or create the vnode for the given filehandle and filesystem.
564  * *newnode is set to zero if the vnode already existed; non-zero if it had
565  * to be created.
566  *
567  * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
568  */
569 
570 static vnode_t *
make_rnode4(nfs4_sharedfh_t * fh,r4hashq_t * rhtp,struct vfs * vfsp,struct vnodeops * vops,int (* putapage)(vnode_t *,page_t *,u_offset_t *,size_t *,int,cred_t *),int * newnode,cred_t * cr)571 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
572     struct vnodeops *vops,
573     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
574     int *newnode, cred_t *cr)
575 {
576 	rnode4_t *rp;
577 	rnode4_t *trp;
578 	vnode_t *vp;
579 	mntinfo4_t *mi;
580 
581 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
582 
583 	mi = VFTOMI4(vfsp);
584 
585 start:
586 	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
587 		vp = RTOV4(rp);
588 		*newnode = 0;
589 		return (vp);
590 	}
591 	rw_exit(&rhtp->r_lock);
592 
593 	mutex_enter(&rp4freelist_lock);
594 
595 	if (rp4freelist != NULL && rnode4_new >= nrnode) {
596 		rp = rp4freelist;
597 		rp4_rmfree(rp);
598 		mutex_exit(&rp4freelist_lock);
599 
600 		vp = RTOV4(rp);
601 
602 		if (rp->r_flags & R4HASHED) {
603 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
604 			mutex_enter(&vp->v_lock);
605 			if (vp->v_count > 1) {
606 				vp->v_count--;
607 				mutex_exit(&vp->v_lock);
608 				rw_exit(&rp->r_hashq->r_lock);
609 				rw_enter(&rhtp->r_lock, RW_READER);
610 				goto start;
611 			}
612 			mutex_exit(&vp->v_lock);
613 			rp4_rmhash_locked(rp);
614 			rw_exit(&rp->r_hashq->r_lock);
615 		}
616 
617 		r4inactive(rp, cr);
618 
619 		mutex_enter(&vp->v_lock);
620 		if (vp->v_count > 1) {
621 			vp->v_count--;
622 			mutex_exit(&vp->v_lock);
623 			rw_enter(&rhtp->r_lock, RW_READER);
624 			goto start;
625 		}
626 		mutex_exit(&vp->v_lock);
627 		vn_invalid(vp);
628 
629 		/*
630 		 * destroy old locks before bzero'ing and
631 		 * recreating the locks below.
632 		 */
633 		uninit_rnode4(rp);
634 
635 		/*
636 		 * Make sure that if rnode is recycled then
637 		 * VFS count is decremented properly before
638 		 * reuse.
639 		 */
640 		VFS_RELE(vp->v_vfsp);
641 		vn_reinit(vp);
642 	} else {
643 		vnode_t *new_vp;
644 
645 		mutex_exit(&rp4freelist_lock);
646 
647 		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
648 		new_vp = vn_alloc(KM_SLEEP);
649 
650 		atomic_inc_ulong((ulong_t *)&rnode4_new);
651 #ifdef DEBUG
652 		clstat4_debug.nrnode.value.ui64++;
653 #endif
654 		vp = new_vp;
655 	}
656 
657 	bzero(rp, sizeof (*rp));
658 	rp->r_vnode = vp;
659 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
660 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
661 	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
662 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
663 	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
664 	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
665 	rp->created_v4 = 0;
666 	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
667 	    offsetof(nfs4_open_stream_t, os_node));
668 	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
669 	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
670 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
671 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
672 	rp->r_flags = R4READDIRWATTR;
673 	rp->r_fh = fh;
674 	rp->r_hashq = rhtp;
675 	sfh4_hold(rp->r_fh);
676 	rp->r_server = mi->mi_curr_serv;
677 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
678 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
679 	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
680 
681 	rddir4_cache_create(rp);
682 	rp->r_putapage = putapage;
683 	vn_setops(vp, vops);
684 	vp->v_data = (caddr_t)rp;
685 	vp->v_vfsp = vfsp;
686 	VFS_HOLD(vfsp);
687 	vp->v_type = VNON;
688 	vp->v_flag |= VMODSORT;
689 	if (isrootfh(fh, rp))
690 		vp->v_flag = VROOT;
691 	vn_exists(vp);
692 
693 	/*
694 	 * There is a race condition if someone else
695 	 * alloc's the rnode while no locks are held, so we
696 	 * check again and recover if found.
697 	 */
698 	rw_enter(&rhtp->r_lock, RW_WRITER);
699 	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
700 		vp = RTOV4(trp);
701 		*newnode = 0;
702 		rw_exit(&rhtp->r_lock);
703 		rp4_addfree(rp, cr);
704 		rw_enter(&rhtp->r_lock, RW_READER);
705 		return (vp);
706 	}
707 	rp4_addhash(rp);
708 	*newnode = 1;
709 	return (vp);
710 }
711 
712 static void
uninit_rnode4(rnode4_t * rp)713 uninit_rnode4(rnode4_t *rp)
714 {
715 	vnode_t *vp = RTOV4(rp);
716 
717 	ASSERT(rp != NULL);
718 	ASSERT(vp != NULL);
719 	ASSERT(vp->v_count == 1);
720 	ASSERT(rp->r_count == 0);
721 	ASSERT(rp->r_mapcnt == 0);
722 	if (rp->r_flags & R4LODANGLERS) {
723 		nfs4_flush_lock_owners(rp);
724 	}
725 	ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
726 	ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
727 	ASSERT(!(rp->r_flags & R4HASHED));
728 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
729 	nfs4_clear_open_streams(rp);
730 	list_destroy(&rp->r_open_streams);
731 
732 	/*
733 	 * Destroy the rddir cache first since we need to grab the r_statelock.
734 	 */
735 	mutex_enter(&rp->r_statelock);
736 	rddir4_cache_destroy(rp);
737 	mutex_exit(&rp->r_statelock);
738 	sv_uninit(&rp->r_svnode);
739 	sfh4_rele(&rp->r_fh);
740 	nfs_rw_destroy(&rp->r_rwlock);
741 	nfs_rw_destroy(&rp->r_lkserlock);
742 	mutex_destroy(&rp->r_statelock);
743 	mutex_destroy(&rp->r_statev4_lock);
744 	mutex_destroy(&rp->r_os_lock);
745 	cv_destroy(&rp->r_cv);
746 	cv_destroy(&rp->r_commit.c_cv);
747 	nfs_rw_destroy(&rp->r_deleg_recall_lock);
748 	if (rp->r_flags & R4DELMAPLIST)
749 		list_destroy(&rp->r_indelmap);
750 }
751 
752 /*
753  * Put an rnode on the free list.
754  *
755  * Rnodes which were allocated above and beyond the normal limit
756  * are immediately freed.
757  */
758 void
rp4_addfree(rnode4_t * rp,cred_t * cr)759 rp4_addfree(rnode4_t *rp, cred_t *cr)
760 {
761 	vnode_t *vp;
762 	vnode_t *xattr;
763 	struct vfs *vfsp;
764 
765 	vp = RTOV4(rp);
766 	ASSERT(vp->v_count >= 1);
767 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
768 
769 	/*
770 	 * If we have too many rnodes allocated and there are no
771 	 * references to this rnode, or if the rnode is no longer
772 	 * accessible by it does not reside in the hash queues,
773 	 * or if an i/o error occurred while writing to the file,
774 	 * then just free it instead of putting it on the rnode
775 	 * freelist.
776 	 */
777 	vfsp = vp->v_vfsp;
778 	if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
779 #ifdef DEBUG
780 	    (nfs4_rnode_nofreelist != 0) ||
781 #endif
782 	    rp->r_error || (rp->r_flags & R4RECOVERR) ||
783 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
784 		if (rp->r_flags & R4HASHED) {
785 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
786 			mutex_enter(&vp->v_lock);
787 			if (vp->v_count > 1) {
788 				vp->v_count--;
789 				mutex_exit(&vp->v_lock);
790 				rw_exit(&rp->r_hashq->r_lock);
791 				return;
792 			}
793 			mutex_exit(&vp->v_lock);
794 			rp4_rmhash_locked(rp);
795 			rw_exit(&rp->r_hashq->r_lock);
796 		}
797 
798 		/*
799 		 * Make sure we don't have a delegation on this rnode
800 		 * before destroying it.
801 		 */
802 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
803 			(void) nfs4delegreturn(rp,
804 			    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
805 		}
806 
807 		r4inactive(rp, cr);
808 
809 		/*
810 		 * Recheck the vnode reference count.  We need to
811 		 * make sure that another reference has not been
812 		 * acquired while we were not holding v_lock.  The
813 		 * rnode is not in the rnode hash queues; one
814 		 * way for a reference to have been acquired
815 		 * is for a VOP_PUTPAGE because the rnode was marked
816 		 * with R4DIRTY or for a modified page.  This
817 		 * reference may have been acquired before our call
818 		 * to r4inactive.  The i/o may have been completed,
819 		 * thus allowing r4inactive to complete, but the
820 		 * reference to the vnode may not have been released
821 		 * yet.  In any case, the rnode can not be destroyed
822 		 * until the other references to this vnode have been
823 		 * released.  The other references will take care of
824 		 * either destroying the rnode or placing it on the
825 		 * rnode freelist.  If there are no other references,
826 		 * then the rnode may be safely destroyed.
827 		 */
828 		mutex_enter(&vp->v_lock);
829 		if (vp->v_count > 1) {
830 			vp->v_count--;
831 			mutex_exit(&vp->v_lock);
832 			return;
833 		}
834 		mutex_exit(&vp->v_lock);
835 
836 		destroy_rnode4(rp);
837 		return;
838 	}
839 
840 	/*
841 	 * Lock the hash queue and then recheck the reference count
842 	 * to ensure that no other threads have acquired a reference
843 	 * to indicate that the rnode should not be placed on the
844 	 * freelist.  If another reference has been acquired, then
845 	 * just release this one and let the other thread complete
846 	 * the processing of adding this rnode to the freelist.
847 	 */
848 again:
849 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
850 
851 	mutex_enter(&vp->v_lock);
852 	if (vp->v_count > 1) {
853 		vp->v_count--;
854 		mutex_exit(&vp->v_lock);
855 		rw_exit(&rp->r_hashq->r_lock);
856 		return;
857 	}
858 	mutex_exit(&vp->v_lock);
859 
860 	/*
861 	 * Make sure we don't put an rnode with a delegation
862 	 * on the free list.
863 	 */
864 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
865 		rw_exit(&rp->r_hashq->r_lock);
866 		(void) nfs4delegreturn(rp,
867 		    NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
868 		goto again;
869 	}
870 
871 	/*
872 	 * Now that we have the hash queue lock, and we know there
873 	 * are not anymore references on the vnode, check to make
874 	 * sure there aren't any open streams still on the rnode.
875 	 * If so, drop the hash queue lock, remove the open streams,
876 	 * and recheck the v_count.
877 	 */
878 	mutex_enter(&rp->r_os_lock);
879 	if (list_head(&rp->r_open_streams) != NULL) {
880 		mutex_exit(&rp->r_os_lock);
881 		rw_exit(&rp->r_hashq->r_lock);
882 		if (nfs_zone() != VTOMI4(vp)->mi_zone)
883 			nfs4_clear_open_streams(rp);
884 		else
885 			(void) nfs4close_all(vp, cr);
886 		goto again;
887 	}
888 	mutex_exit(&rp->r_os_lock);
889 
890 	/*
891 	 * Before we put it on the freelist, make sure there are no pages.
892 	 * If there are, flush and commit of all of the dirty and
893 	 * uncommitted pages, assuming the file system isn't read only.
894 	 */
895 	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
896 		rw_exit(&rp->r_hashq->r_lock);
897 		r4flushpages(rp, cr);
898 		goto again;
899 	}
900 
901 	/*
902 	 * Before we put it on the freelist, make sure there is no
903 	 * active xattr directory cached, the freelist will not
904 	 * have its entries r4inactive'd if there is still an active
905 	 * rnode, thus nothing in the freelist can hold another
906 	 * rnode active.
907 	 */
908 	xattr = rp->r_xattr_dir;
909 	rp->r_xattr_dir = NULL;
910 
911 	/*
912 	 * If there is no cached data or metadata for this file, then
913 	 * put the rnode on the front of the freelist so that it will
914 	 * be reused before other rnodes which may have cached data or
915 	 * metadata associated with them.
916 	 */
917 	mutex_enter(&rp4freelist_lock);
918 	if (rp4freelist == NULL) {
919 		rp->r_freef = rp;
920 		rp->r_freeb = rp;
921 		rp4freelist = rp;
922 	} else {
923 		rp->r_freef = rp4freelist;
924 		rp->r_freeb = rp4freelist->r_freeb;
925 		rp4freelist->r_freeb->r_freef = rp;
926 		rp4freelist->r_freeb = rp;
927 		if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
928 		    rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
929 			rp4freelist = rp;
930 	}
931 	mutex_exit(&rp4freelist_lock);
932 
933 	rw_exit(&rp->r_hashq->r_lock);
934 
935 	if (xattr)
936 		VN_RELE(xattr);
937 }
938 
939 /*
940  * Remove an rnode from the free list.
941  *
942  * The caller must be holding rp4freelist_lock and the rnode
943  * must be on the freelist.
944  */
945 static void
rp4_rmfree(rnode4_t * rp)946 rp4_rmfree(rnode4_t *rp)
947 {
948 
949 	ASSERT(MUTEX_HELD(&rp4freelist_lock));
950 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
951 
952 	if (rp == rp4freelist) {
953 		rp4freelist = rp->r_freef;
954 		if (rp == rp4freelist)
955 			rp4freelist = NULL;
956 	}
957 	rp->r_freeb->r_freef = rp->r_freef;
958 	rp->r_freef->r_freeb = rp->r_freeb;
959 
960 	rp->r_freef = rp->r_freeb = NULL;
961 }
962 
963 /*
964  * Put a rnode in the hash table.
965  *
966  * The caller must be holding the exclusive hash queue lock
967  */
968 void
rp4_addhash(rnode4_t * rp)969 rp4_addhash(rnode4_t *rp)
970 {
971 	mntinfo4_t *mi;
972 
973 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
974 	ASSERT(!(rp->r_flags & R4HASHED));
975 
976 #ifdef DEBUG
977 	r4_dup_check(rp, RTOV4(rp)->v_vfsp);
978 #endif
979 
980 	rp->r_hashf = rp->r_hashq->r_hashf;
981 	rp->r_hashq->r_hashf = rp;
982 	rp->r_hashb = (rnode4_t *)rp->r_hashq;
983 	rp->r_hashf->r_hashb = rp;
984 
985 	mutex_enter(&rp->r_statelock);
986 	rp->r_flags |= R4HASHED;
987 	mutex_exit(&rp->r_statelock);
988 
989 	mi = VTOMI4(RTOV4(rp));
990 	mutex_enter(&mi->mi_rnodes_lock);
991 	list_insert_tail(&mi->mi_rnodes, rp);
992 	mutex_exit(&mi->mi_rnodes_lock);
993 }
994 
995 /*
996  * Remove a rnode from the hash table.
997  *
998  * The caller must be holding the hash queue lock.
999  */
1000 void
rp4_rmhash_locked(rnode4_t * rp)1001 rp4_rmhash_locked(rnode4_t *rp)
1002 {
1003 	mntinfo4_t *mi;
1004 
1005 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
1006 	ASSERT(rp->r_flags & R4HASHED);
1007 
1008 	rp->r_hashb->r_hashf = rp->r_hashf;
1009 	rp->r_hashf->r_hashb = rp->r_hashb;
1010 
1011 	mutex_enter(&rp->r_statelock);
1012 	rp->r_flags &= ~R4HASHED;
1013 	mutex_exit(&rp->r_statelock);
1014 
1015 	mi = VTOMI4(RTOV4(rp));
1016 	mutex_enter(&mi->mi_rnodes_lock);
1017 	if (list_link_active(&rp->r_mi_link))
1018 		list_remove(&mi->mi_rnodes, rp);
1019 	mutex_exit(&mi->mi_rnodes_lock);
1020 }
1021 
1022 /*
1023  * Remove a rnode from the hash table.
1024  *
1025  * The caller must not be holding the hash queue lock.
1026  */
1027 void
rp4_rmhash(rnode4_t * rp)1028 rp4_rmhash(rnode4_t *rp)
1029 {
1030 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1031 	rp4_rmhash_locked(rp);
1032 	rw_exit(&rp->r_hashq->r_lock);
1033 }
1034 
1035 /*
1036  * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
1037  * Returns NULL if no match.  If an rnode is returned, the reference count
1038  * on the master vnode is incremented.
1039  *
1040  * The caller must be holding the hash queue lock, either shared or exclusive.
1041  */
1042 rnode4_t *
r4find(r4hashq_t * rhtp,nfs4_sharedfh_t * fh,struct vfs * vfsp)1043 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
1044 {
1045 	rnode4_t *rp;
1046 	vnode_t *vp;
1047 
1048 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
1049 
1050 	for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
1051 		vp = RTOV4(rp);
1052 		if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
1053 
1054 			mutex_enter(&rp->r_statelock);
1055 			if (rp->r_flags & R4RECOVERR) {
1056 				mutex_exit(&rp->r_statelock);
1057 				continue;
1058 			}
1059 			mutex_exit(&rp->r_statelock);
1060 #ifdef DEBUG
1061 			r4_dup_check(rp, vfsp);
1062 #endif
1063 			if (rp->r_freef != NULL) {
1064 				mutex_enter(&rp4freelist_lock);
1065 				/*
1066 				 * If the rnode is on the freelist,
1067 				 * then remove it and use that reference
1068 				 * as the new reference.  Otherwise,
1069 				 * need to increment the reference count.
1070 				 */
1071 				if (rp->r_freef != NULL) {
1072 					rp4_rmfree(rp);
1073 					mutex_exit(&rp4freelist_lock);
1074 				} else {
1075 					mutex_exit(&rp4freelist_lock);
1076 					VN_HOLD(vp);
1077 				}
1078 			} else
1079 				VN_HOLD(vp);
1080 
1081 			/*
1082 			 * if root vnode, set v_flag to indicate that
1083 			 */
1084 			if (isrootfh(fh, rp)) {
1085 				if (!(vp->v_flag & VROOT)) {
1086 					mutex_enter(&vp->v_lock);
1087 					vp->v_flag |= VROOT;
1088 					mutex_exit(&vp->v_lock);
1089 				}
1090 			}
1091 			return (rp);
1092 		}
1093 	}
1094 	return (NULL);
1095 }
1096 
1097 /*
1098  * Lookup an rnode by fhandle. Just a wrapper for r4find()
1099  * that assumes the caller hasn't already got the lock
1100  * on the hash bucket.
1101  */
1102 rnode4_t *
r4find_unlocked(nfs4_sharedfh_t * fh,struct vfs * vfsp)1103 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1104 {
1105 	rnode4_t *rp;
1106 	int index;
1107 
1108 	index = rtable4hash(fh);
1109 	rw_enter(&rtable4[index].r_lock, RW_READER);
1110 	rp = r4find(&rtable4[index], fh, vfsp);
1111 	rw_exit(&rtable4[index].r_lock);
1112 
1113 	return (rp);
1114 }
1115 
1116 /*
1117  * Return 1 if there is an active vnode belonging to this vfs in the
1118  * rtable4 cache.
1119  *
1120  * Several of these checks are done without holding the usual
1121  * locks.  This is safe because destroy_rtable4(), rp4_addfree(),
1122  * etc. will redo the necessary checks before actually destroying
1123  * any rnodes.
1124  */
1125 int
check_rtable4(struct vfs * vfsp)1126 check_rtable4(struct vfs *vfsp)
1127 {
1128 	rnode4_t *rp;
1129 	vnode_t *vp;
1130 	mntinfo4_t *mi;
1131 
1132 	ASSERT(vfsp != NULL);
1133 	mi = VFTOMI4(vfsp);
1134 
1135 	mutex_enter(&mi->mi_rnodes_lock);
1136 	for (rp = list_head(&mi->mi_rnodes); rp != NULL;
1137 	    rp = list_next(&mi->mi_rnodes, rp)) {
1138 		vp = RTOV4(rp);
1139 
1140 		if (rp->r_freef == NULL ||
1141 		    (nfs4_has_pages(vp) && (rp->r_flags & R4DIRTY)) ||
1142 		    rp->r_count > 0) {
1143 			mutex_exit(&mi->mi_rnodes_lock);
1144 			return (1);
1145 		}
1146 	}
1147 	mutex_exit(&mi->mi_rnodes_lock);
1148 
1149 	return (0);
1150 }
1151 
1152 /*
1153  * Destroy inactive vnodes from the hash queues which
1154  * belong to this vfs. All of the vnodes should be inactive.
1155  * It is essential that we destroy all rnodes in case of
1156  * forced unmount as well as in normal unmount case.
1157  */
1158 
1159 void
destroy_rtable4(struct vfs * vfsp,cred_t * cr)1160 destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1161 {
1162 	rnode4_t *rp;
1163 	mntinfo4_t *mi;
1164 
1165 	ASSERT(vfsp != NULL);
1166 
1167 	mi = VFTOMI4(vfsp);
1168 
1169 	mutex_enter(&rp4freelist_lock);
1170 	mutex_enter(&mi->mi_rnodes_lock);
1171 	while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
1172 		/*
1173 		 * If the rnode is no longer on the freelist it is not
1174 		 * ours and it will be handled by some other thread, so
1175 		 * skip it.
1176 		 */
1177 		if (rp->r_freef == NULL)
1178 			continue;
1179 		mutex_exit(&mi->mi_rnodes_lock);
1180 
1181 		rp4_rmfree(rp);
1182 		mutex_exit(&rp4freelist_lock);
1183 
1184 		rp4_rmhash(rp);
1185 
1186 		/*
1187 		 * This call to rp4_addfree will end up destroying the
1188 		 * rnode, but in a safe way with the appropriate set
1189 		 * of checks done.
1190 		 */
1191 		rp4_addfree(rp, cr);
1192 
1193 		mutex_enter(&rp4freelist_lock);
1194 		mutex_enter(&mi->mi_rnodes_lock);
1195 	}
1196 	mutex_exit(&mi->mi_rnodes_lock);
1197 	mutex_exit(&rp4freelist_lock);
1198 }
1199 
1200 /*
1201  * This routine destroys all the resources of an rnode
1202  * and finally the rnode itself.
1203  */
1204 static void
destroy_rnode4(rnode4_t * rp)1205 destroy_rnode4(rnode4_t *rp)
1206 {
1207 	vnode_t *vp;
1208 	vfs_t *vfsp;
1209 
1210 	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1211 
1212 	vp = RTOV4(rp);
1213 	vfsp = vp->v_vfsp;
1214 
1215 	uninit_rnode4(rp);
1216 	atomic_dec_ulong((ulong_t *)&rnode4_new);
1217 #ifdef DEBUG
1218 	clstat4_debug.nrnode.value.ui64--;
1219 #endif
1220 	kmem_cache_free(rnode4_cache, rp);
1221 	vn_invalid(vp);
1222 	vn_free(vp);
1223 	VFS_RELE(vfsp);
1224 }
1225 
1226 /*
1227  * Invalidate the attributes on all rnodes forcing the next getattr
1228  * to go over the wire.  Used to flush stale uid and gid mappings.
1229  * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1230  */
1231 void
nfs4_rnode_invalidate(struct vfs * vfsp)1232 nfs4_rnode_invalidate(struct vfs *vfsp)
1233 {
1234 	int index;
1235 	rnode4_t *rp;
1236 	vnode_t *vp;
1237 
1238 	/*
1239 	 * Walk the hash queues looking for rnodes.
1240 	 */
1241 	for (index = 0; index < rtable4size; index++) {
1242 		rw_enter(&rtable4[index].r_lock, RW_READER);
1243 		for (rp = rtable4[index].r_hashf;
1244 		    rp != (rnode4_t *)(&rtable4[index]);
1245 		    rp = rp->r_hashf) {
1246 			vp = RTOV4(rp);
1247 			if (vfsp != NULL && vp->v_vfsp != vfsp)
1248 				continue;
1249 
1250 			if (!mutex_tryenter(&rp->r_statelock))
1251 				continue;
1252 
1253 			/*
1254 			 * Expire the attributes by resetting the change
1255 			 * and attr timeout.
1256 			 */
1257 			rp->r_change = 0;
1258 			PURGE_ATTRCACHE4_LOCKED(rp);
1259 			mutex_exit(&rp->r_statelock);
1260 		}
1261 		rw_exit(&rtable4[index].r_lock);
1262 	}
1263 }
1264 
1265 /*
1266  * Flush all vnodes in this (or every) vfs.
1267  * Used by nfs_sync and by nfs_unmount.
1268  */
1269 void
r4flush(struct vfs * vfsp,cred_t * cr)1270 r4flush(struct vfs *vfsp, cred_t *cr)
1271 {
1272 	int index;
1273 	rnode4_t *rp;
1274 	vnode_t *vp, **vplist;
1275 	long num, cnt;
1276 
1277 	/*
1278 	 * Check to see whether there is anything to do.
1279 	 */
1280 	num = rnode4_new;
1281 	if (num == 0)
1282 		return;
1283 
1284 	/*
1285 	 * Allocate a slot for all currently active rnodes on the
1286 	 * supposition that they all may need flushing.
1287 	 */
1288 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1289 	cnt = 0;
1290 
1291 	/*
1292 	 * If the vfs is known we can do fast path by iterating all rnodes that
1293 	 * belongs to this vfs.  This is much faster than the traditional way
1294 	 * of iterating rtable4 (below) in a case there is a lot of rnodes that
1295 	 * does not belong to our vfs.
1296 	 */
1297 	if (vfsp != NULL) {
1298 		mntinfo4_t *mi = VFTOMI4(vfsp);
1299 
1300 		mutex_enter(&mi->mi_rnodes_lock);
1301 		for (rp = list_head(&mi->mi_rnodes); rp != NULL;
1302 		    rp = list_next(&mi->mi_rnodes, rp)) {
1303 			vp = RTOV4(rp);
1304 			/*
1305 			 * Don't bother sync'ing a vp if it
1306 			 * is part of virtual swap device or
1307 			 * if VFS is read-only
1308 			 */
1309 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1310 				continue;
1311 			/*
1312 			 * If the vnode has pages and is marked as either dirty
1313 			 * or mmap'd, hold and add this vnode to the list of
1314 			 * vnodes to flush.
1315 			 */
1316 			ASSERT(vp->v_vfsp == vfsp);
1317 			if (nfs4_has_pages(vp) &&
1318 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1319 				VN_HOLD(vp);
1320 				vplist[cnt++] = vp;
1321 				if (cnt == num) {
1322 					/*
1323 					 * The vplist is full because there is
1324 					 * too many rnodes.  We are done for
1325 					 * now.
1326 					 */
1327 					break;
1328 				}
1329 			}
1330 		}
1331 		mutex_exit(&mi->mi_rnodes_lock);
1332 
1333 		goto done;
1334 	}
1335 
1336 	ASSERT(vfsp == NULL);
1337 
1338 	/*
1339 	 * Walk the hash queues looking for rnodes with page
1340 	 * lists associated with them.  Make a list of these
1341 	 * files.
1342 	 */
1343 	for (index = 0; index < rtable4size; index++) {
1344 		rw_enter(&rtable4[index].r_lock, RW_READER);
1345 		for (rp = rtable4[index].r_hashf;
1346 		    rp != (rnode4_t *)(&rtable4[index]);
1347 		    rp = rp->r_hashf) {
1348 			vp = RTOV4(rp);
1349 			/*
1350 			 * Don't bother sync'ing a vp if it
1351 			 * is part of virtual swap device or
1352 			 * if VFS is read-only
1353 			 */
1354 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1355 				continue;
1356 			/*
1357 			 * If the vnode has pages and is marked as either dirty
1358 			 * or mmap'd, hold and add this vnode to the list of
1359 			 * vnodes to flush.
1360 			 */
1361 			if (nfs4_has_pages(vp) &&
1362 			    ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1363 				VN_HOLD(vp);
1364 				vplist[cnt++] = vp;
1365 				if (cnt == num) {
1366 					rw_exit(&rtable4[index].r_lock);
1367 					/*
1368 					 * The vplist is full because there is
1369 					 * too many rnodes.  We are done for
1370 					 * now.
1371 					 */
1372 					goto done;
1373 				}
1374 			}
1375 		}
1376 		rw_exit(&rtable4[index].r_lock);
1377 	}
1378 
1379 done:
1380 
1381 	/*
1382 	 * Flush and release all of the files on the list.
1383 	 */
1384 	while (cnt-- > 0) {
1385 		vp = vplist[cnt];
1386 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
1387 		VN_RELE(vp);
1388 	}
1389 
1390 	/*
1391 	 * Free the space allocated to hold the list.
1392 	 */
1393 	kmem_free(vplist, num * sizeof (*vplist));
1394 }
1395 
1396 int
nfs4_free_data_reclaim(rnode4_t * rp)1397 nfs4_free_data_reclaim(rnode4_t *rp)
1398 {
1399 	char *contents;
1400 	vnode_t *xattr;
1401 	int size;
1402 	vsecattr_t *vsp;
1403 	int freed;
1404 	bool_t rdc = FALSE;
1405 
1406 	/*
1407 	 * Free any held caches which may
1408 	 * be associated with this rnode.
1409 	 */
1410 	mutex_enter(&rp->r_statelock);
1411 	if (rp->r_dir != NULL)
1412 		rdc = TRUE;
1413 	contents = rp->r_symlink.contents;
1414 	size = rp->r_symlink.size;
1415 	rp->r_symlink.contents = NULL;
1416 	vsp = rp->r_secattr;
1417 	rp->r_secattr = NULL;
1418 	xattr = rp->r_xattr_dir;
1419 	rp->r_xattr_dir = NULL;
1420 	mutex_exit(&rp->r_statelock);
1421 
1422 	/*
1423 	 * Free the access cache entries.
1424 	 */
1425 	freed = nfs4_access_purge_rp(rp);
1426 
1427 	if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1428 		return (freed);
1429 
1430 	/*
1431 	 * Free the readdir cache entries, incompletely if we can't block.
1432 	 */
1433 	nfs4_purge_rddir_cache(RTOV4(rp));
1434 
1435 	/*
1436 	 * Free the symbolic link cache.
1437 	 */
1438 	if (contents != NULL) {
1439 
1440 		kmem_free((void *)contents, size);
1441 	}
1442 
1443 	/*
1444 	 * Free any cached ACL.
1445 	 */
1446 	if (vsp != NULL)
1447 		nfs4_acl_free_cache(vsp);
1448 
1449 	/*
1450 	 * Release the xattr directory vnode
1451 	 */
1452 	if (xattr != NULL)
1453 		VN_RELE(xattr);
1454 
1455 	return (1);
1456 }
1457 
1458 static int
nfs4_active_data_reclaim(rnode4_t * rp)1459 nfs4_active_data_reclaim(rnode4_t *rp)
1460 {
1461 	char *contents;
1462 	vnode_t *xattr = NULL;
1463 	int size;
1464 	vsecattr_t *vsp;
1465 	int freed;
1466 	bool_t rdc = FALSE;
1467 
1468 	/*
1469 	 * Free any held credentials and caches which
1470 	 * may be associated with this rnode.
1471 	 */
1472 	if (!mutex_tryenter(&rp->r_statelock))
1473 		return (0);
1474 	contents = rp->r_symlink.contents;
1475 	size = rp->r_symlink.size;
1476 	rp->r_symlink.contents = NULL;
1477 	vsp = rp->r_secattr;
1478 	rp->r_secattr = NULL;
1479 	if (rp->r_dir != NULL)
1480 		rdc = TRUE;
1481 	/*
1482 	 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
1483 	 * on the same r_hashq queue. We are not mandated to free all caches.
1484 	 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
1485 	 * rnode 'rp' is freed or put on the free list.
1486 	 *
1487 	 * We will retain NFS4_XATTR_DIR_NOTSUPP because:
1488 	 * - it has no associated rnode4_t (its v_data is NULL),
1489 	 * - it is preallocated statically and will never go away,
1490 	 * so we cannot save anything by releasing it.
1491 	 */
1492 	if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP &&
1493 	    VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) {
1494 		xattr = rp->r_xattr_dir;
1495 		rp->r_xattr_dir = NULL;
1496 	}
1497 	mutex_exit(&rp->r_statelock);
1498 
1499 	/*
1500 	 * Free the access cache entries.
1501 	 */
1502 	freed = nfs4_access_purge_rp(rp);
1503 
1504 	if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1505 		return (freed);
1506 
1507 	/*
1508 	 * Free the symbolic link cache.
1509 	 */
1510 	if (contents != NULL) {
1511 
1512 		kmem_free((void *)contents, size);
1513 	}
1514 
1515 	/*
1516 	 * Free any cached ACL.
1517 	 */
1518 	if (vsp != NULL)
1519 		nfs4_acl_free_cache(vsp);
1520 
1521 	nfs4_purge_rddir_cache(RTOV4(rp));
1522 
1523 	/*
1524 	 * Release the xattr directory vnode
1525 	 */
1526 	if (xattr != NULL)
1527 		VN_RELE(xattr);
1528 
1529 	return (1);
1530 }
1531 
1532 static int
nfs4_free_reclaim(void)1533 nfs4_free_reclaim(void)
1534 {
1535 	int freed;
1536 	rnode4_t *rp;
1537 
1538 #ifdef DEBUG
1539 	clstat4_debug.f_reclaim.value.ui64++;
1540 #endif
1541 	freed = 0;
1542 	mutex_enter(&rp4freelist_lock);
1543 	rp = rp4freelist;
1544 	if (rp != NULL) {
1545 		do {
1546 			if (nfs4_free_data_reclaim(rp))
1547 				freed = 1;
1548 		} while ((rp = rp->r_freef) != rp4freelist);
1549 	}
1550 	mutex_exit(&rp4freelist_lock);
1551 	return (freed);
1552 }
1553 
1554 static int
nfs4_active_reclaim(void)1555 nfs4_active_reclaim(void)
1556 {
1557 	int freed;
1558 	int index;
1559 	rnode4_t *rp;
1560 
1561 #ifdef DEBUG
1562 	clstat4_debug.a_reclaim.value.ui64++;
1563 #endif
1564 	freed = 0;
1565 	for (index = 0; index < rtable4size; index++) {
1566 		rw_enter(&rtable4[index].r_lock, RW_READER);
1567 		for (rp = rtable4[index].r_hashf;
1568 		    rp != (rnode4_t *)(&rtable4[index]);
1569 		    rp = rp->r_hashf) {
1570 			if (nfs4_active_data_reclaim(rp))
1571 				freed = 1;
1572 		}
1573 		rw_exit(&rtable4[index].r_lock);
1574 	}
1575 	return (freed);
1576 }
1577 
1578 static int
nfs4_rnode_reclaim(void)1579 nfs4_rnode_reclaim(void)
1580 {
1581 	int freed;
1582 	rnode4_t *rp;
1583 	vnode_t *vp;
1584 
1585 #ifdef DEBUG
1586 	clstat4_debug.r_reclaim.value.ui64++;
1587 #endif
1588 	freed = 0;
1589 	mutex_enter(&rp4freelist_lock);
1590 	while ((rp = rp4freelist) != NULL) {
1591 		rp4_rmfree(rp);
1592 		mutex_exit(&rp4freelist_lock);
1593 		if (rp->r_flags & R4HASHED) {
1594 			vp = RTOV4(rp);
1595 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1596 			mutex_enter(&vp->v_lock);
1597 			if (vp->v_count > 1) {
1598 				vp->v_count--;
1599 				mutex_exit(&vp->v_lock);
1600 				rw_exit(&rp->r_hashq->r_lock);
1601 				mutex_enter(&rp4freelist_lock);
1602 				continue;
1603 			}
1604 			mutex_exit(&vp->v_lock);
1605 			rp4_rmhash_locked(rp);
1606 			rw_exit(&rp->r_hashq->r_lock);
1607 		}
1608 		/*
1609 		 * This call to rp_addfree will end up destroying the
1610 		 * rnode, but in a safe way with the appropriate set
1611 		 * of checks done.
1612 		 */
1613 		rp4_addfree(rp, CRED());
1614 		mutex_enter(&rp4freelist_lock);
1615 	}
1616 	mutex_exit(&rp4freelist_lock);
1617 	return (freed);
1618 }
1619 
/*
 * Reclaim callback for the rnode4 kmem cache (registered in
 * nfs4_rnode_init()).  Tries progressively more drastic steps and stops
 * at the first one that frees something: caches of freelist rnodes,
 * caches of hashed rnodes, and finally the freelist rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	if (nfs4_free_reclaim() || nfs4_active_reclaim())
		return;

	(void) nfs4_rnode_reclaim();
}
1635 
1636 /*
1637  * Returns the clientid4 to use for the given mntinfo4.  Note that the
1638  * clientid can change if the caller drops mi_recovlock.
1639  */
1640 
1641 clientid4
mi2clientid(mntinfo4_t * mi)1642 mi2clientid(mntinfo4_t *mi)
1643 {
1644 	nfs4_server_t	*sp;
1645 	clientid4	clientid = 0;
1646 
1647 	/* this locks down sp if it is found */
1648 	sp = find_nfs4_server(mi);
1649 	if (sp != NULL) {
1650 		clientid = sp->clientid;
1651 		mutex_exit(&sp->s_lock);
1652 		nfs4_server_rele(sp);
1653 	}
1654 	return (clientid);
1655 }
1656 
1657 /*
1658  * Return a list with information about all the known open instances for
1659  * a filesystem. The caller must call r4releopenlist() when done with the
1660  * list.
1661  *
1662  * We are safe at looking at os_valid and os_pending_close across dropping
1663  * the 'os_sync_lock' to count up the number of open streams and then
1664  * allocate memory for the osp list due to:
1665  *	-Looking at os_pending_close is safe since this routine is
1666  *	only called via recovery, and os_pending_close can only be set via
1667  *	a non-recovery operation (which are all blocked when recovery
1668  *	is active).
1669  *
1670  *	-Examining os_valid is safe since non-recovery operations, which
1671  *	could potentially switch os_valid to 0, are blocked (via
1672  *	nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
1673  *	(which means we are the only recovery thread potentially acting
1674  *	on this open stream).
1675  */
1676 
1677 nfs4_opinst_t *
r4mkopenlist(mntinfo4_t * mi)1678 r4mkopenlist(mntinfo4_t *mi)
1679 {
1680 	nfs4_opinst_t *reopenlist, *rep;
1681 	rnode4_t *rp;
1682 	vnode_t *vp;
1683 	vfs_t *vfsp = mi->mi_vfsp;
1684 	int numosp;
1685 	nfs4_open_stream_t *osp;
1686 	int index;
1687 	open_delegation_type4 dtype;
1688 	int hold_vnode;
1689 
1690 	reopenlist = NULL;
1691 
1692 	for (index = 0; index < rtable4size; index++) {
1693 		rw_enter(&rtable4[index].r_lock, RW_READER);
1694 		for (rp = rtable4[index].r_hashf;
1695 		    rp != (rnode4_t *)(&rtable4[index]);
1696 		    rp = rp->r_hashf) {
1697 
1698 			vp = RTOV4(rp);
1699 			if (vp->v_vfsp != vfsp)
1700 				continue;
1701 			hold_vnode = 0;
1702 
1703 			mutex_enter(&rp->r_os_lock);
1704 
1705 			/* Count the number of valid open_streams of the file */
1706 			numosp = 0;
1707 			for (osp = list_head(&rp->r_open_streams); osp != NULL;
1708 			    osp = list_next(&rp->r_open_streams, osp)) {
1709 				mutex_enter(&osp->os_sync_lock);
1710 				if (osp->os_valid && !osp->os_pending_close)
1711 					numosp++;
1712 				mutex_exit(&osp->os_sync_lock);
1713 			}
1714 
1715 			/* Fill in the valid open streams per vp */
1716 			if (numosp > 0) {
1717 				int j;
1718 
1719 				hold_vnode = 1;
1720 
1721 				/*
1722 				 * Add a new open instance to the list
1723 				 */
1724 				rep = kmem_zalloc(sizeof (*reopenlist),
1725 				    KM_SLEEP);
1726 				rep->re_next = reopenlist;
1727 				reopenlist = rep;
1728 
1729 				rep->re_vp = vp;
1730 				rep->re_osp = kmem_zalloc(
1731 				    numosp * sizeof (*(rep->re_osp)),
1732 				    KM_SLEEP);
1733 				rep->re_numosp = numosp;
1734 
1735 				j = 0;
1736 				for (osp = list_head(&rp->r_open_streams);
1737 				    osp != NULL;
1738 				    osp = list_next(&rp->r_open_streams, osp)) {
1739 
1740 					mutex_enter(&osp->os_sync_lock);
1741 					if (osp->os_valid &&
1742 					    !osp->os_pending_close) {
1743 						osp->os_ref_count++;
1744 						rep->re_osp[j] = osp;
1745 						j++;
1746 					}
1747 					mutex_exit(&osp->os_sync_lock);
1748 				}
1749 				/*
1750 				 * Assuming valid osp(s) stays valid between
1751 				 * the time obtaining j and numosp.
1752 				 */
1753 				ASSERT(j == numosp);
1754 			}
1755 
1756 			mutex_exit(&rp->r_os_lock);
1757 			/* do this here to keep v_lock > r_os_lock */
1758 			if (hold_vnode)
1759 				VN_HOLD(vp);
1760 			mutex_enter(&rp->r_statev4_lock);
1761 			if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
1762 				/*
1763 				 * If this rnode holds a delegation,
1764 				 * but if there are no valid open streams,
1765 				 * then just discard the delegation
1766 				 * without doing delegreturn.
1767 				 */
1768 				if (numosp > 0)
1769 					rp->r_deleg_needs_recovery =
1770 					    rp->r_deleg_type;
1771 			}
1772 			/* Save the delegation type for use outside the lock */
1773 			dtype = rp->r_deleg_type;
1774 			mutex_exit(&rp->r_statev4_lock);
1775 
1776 			/*
1777 			 * If we have a delegation then get rid of it.
1778 			 * We've set rp->r_deleg_needs_recovery so we have
1779 			 * enough information to recover.
1780 			 */
1781 			if (dtype != OPEN_DELEGATE_NONE) {
1782 				(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
1783 			}
1784 		}
1785 		rw_exit(&rtable4[index].r_lock);
1786 	}
1787 	return (reopenlist);
1788 }
1789 
1790 /*
1791  * Given a filesystem id, check to see if any rnodes
1792  * within this fsid reside in the rnode cache, other
1793  * than one we know about.
1794  *
1795  * Return 1 if an rnode is found, 0 otherwise
1796  */
1797 int
r4find_by_fsid(mntinfo4_t * mi,fattr4_fsid * moved_fsid)1798 r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid)
1799 {
1800 	rnode4_t *rp;
1801 	vnode_t *vp;
1802 	vfs_t *vfsp = mi->mi_vfsp;
1803 	fattr4_fsid *fsid;
1804 	int index, found = 0;
1805 
1806 	for (index = 0; index < rtable4size; index++) {
1807 		rw_enter(&rtable4[index].r_lock, RW_READER);
1808 		for (rp = rtable4[index].r_hashf;
1809 		    rp != (rnode4_t *)(&rtable4[index]);
1810 		    rp = rp->r_hashf) {
1811 
1812 			vp = RTOV4(rp);
1813 			if (vp->v_vfsp != vfsp)
1814 				continue;
1815 
1816 			/*
1817 			 * XXX there might be a case where a
1818 			 * replicated fs may have the same fsid
1819 			 * across two different servers. This
1820 			 * check isn't good enough in that case
1821 			 */
1822 			fsid = &rp->r_srv_fsid;
1823 			if (FATTR4_FSID_EQ(moved_fsid, fsid)) {
1824 				found = 1;
1825 				break;
1826 			}
1827 		}
1828 		rw_exit(&rtable4[index].r_lock);
1829 
1830 		if (found)
1831 			break;
1832 	}
1833 	return (found);
1834 }
1835 
1836 /*
1837  * Release the list of open instance references.
1838  */
1839 
1840 void
r4releopenlist(nfs4_opinst_t * reopenp)1841 r4releopenlist(nfs4_opinst_t *reopenp)
1842 {
1843 	nfs4_opinst_t *rep, *next;
1844 	int i;
1845 
1846 	for (rep = reopenp; rep; rep = next) {
1847 		next = rep->re_next;
1848 
1849 		for (i = 0; i < rep->re_numosp; i++)
1850 			open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1851 
1852 		VN_RELE(rep->re_vp);
1853 		kmem_free(rep->re_osp,
1854 		    rep->re_numosp * sizeof (*(rep->re_osp)));
1855 
1856 		kmem_free(rep, sizeof (*rep));
1857 	}
1858 }
1859 
1860 int
nfs4_rnode_init(void)1861 nfs4_rnode_init(void)
1862 {
1863 	ulong_t nrnode4_max;
1864 	int i;
1865 
1866 	/*
1867 	 * Compute the size of the rnode4 hash table
1868 	 */
1869 	if (nrnode <= 0)
1870 		nrnode = ncsize;
1871 	nrnode4_max =
1872 	    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1873 	if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1874 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1875 		    "!setting nrnode to max value of %ld", nrnode4_max);
1876 		nrnode = nrnode4_max;
1877 	}
1878 	rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1879 	rtable4mask = rtable4size - 1;
1880 
1881 	/*
1882 	 * Allocate and initialize the hash buckets
1883 	 */
1884 	rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1885 	for (i = 0; i < rtable4size; i++) {
1886 		rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1887 		rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1888 		rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1889 	}
1890 
1891 	rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1892 	    0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1893 
1894 	return (0);
1895 }
1896 
1897 int
nfs4_rnode_fini(void)1898 nfs4_rnode_fini(void)
1899 {
1900 	int i;
1901 
1902 	/*
1903 	 * Deallocate the rnode hash queues
1904 	 */
1905 	kmem_cache_destroy(rnode4_cache);
1906 
1907 	for (i = 0; i < rtable4size; i++)
1908 		rw_destroy(&rtable4[i].r_lock);
1909 
1910 	kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1911 
1912 	return (0);
1913 }
1914 
1915 /*
1916  * Return non-zero if the given filehandle refers to the root filehandle
1917  * for the given rnode.
1918  */
1919 
1920 static int
isrootfh(nfs4_sharedfh_t * fh,rnode4_t * rp)1921 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1922 {
1923 	int isroot;
1924 
1925 	isroot = 0;
1926 	if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1927 		isroot = 1;
1928 
1929 	return (isroot);
1930 }
1931 
1932 /*
1933  * The r4_stub_* routines assume that the rnode is newly activated, and
1934  * that the caller either holds the hash bucket r_lock for this rnode as
1935  * RW_WRITER, or holds r_statelock.
1936  */
1937 static void
r4_stub_set(rnode4_t * rp,nfs4_stub_type_t type)1938 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
1939 {
1940 	vnode_t *vp = RTOV4(rp);
1941 	krwlock_t *hash_lock = &rp->r_hashq->r_lock;
1942 
1943 	ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));
1944 
1945 	rp->r_stub_type = type;
1946 
1947 	/*
1948 	 * Safely switch this vnode to the trigger vnodeops.
1949 	 *
1950 	 * Currently, we don't ever switch a trigger vnode back to using
1951 	 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
1952 	 * a new v4 object is not a trigger, and it will already have the
1953 	 * correct v4 vnodeops by default. So, no "else" case required here.
1954 	 */
1955 	if (type != NFS4_STUB_NONE)
1956 		vn_setops(vp, nfs4_trigger_vnodeops);
1957 }
1958 
1959 void
r4_stub_mirrormount(rnode4_t * rp)1960 r4_stub_mirrormount(rnode4_t *rp)
1961 {
1962 	r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
1963 }
1964 
1965 void
r4_stub_referral(rnode4_t * rp)1966 r4_stub_referral(rnode4_t *rp)
1967 {
1968 	DTRACE_PROBE1(nfs4clnt__func__referral__moved,
1969 	    vnode_t *, RTOV4(rp));
1970 	r4_stub_set(rp, NFS4_STUB_REFERRAL);
1971 }
1972 
1973 void
r4_stub_none(rnode4_t * rp)1974 r4_stub_none(rnode4_t *rp)
1975 {
1976 	r4_stub_set(rp, NFS4_STUB_NONE);
1977 }
1978 
1979 #ifdef DEBUG
1980 
1981 /*
1982  * Look in the rnode table for other rnodes that have the same filehandle.
1983  * Assume the lock is held for the hash chain of checkrp
1984  */
1985 
1986 static void
r4_dup_check(rnode4_t * checkrp,vfs_t * vfsp)1987 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
1988 {
1989 	rnode4_t *rp;
1990 	vnode_t *tvp;
1991 	nfs4_fhandle_t fh, fh2;
1992 	int index;
1993 
1994 	if (!r4_check_for_dups)
1995 		return;
1996 
1997 	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));
1998 
1999 	sfh4_copyval(checkrp->r_fh, &fh);
2000 
2001 	for (index = 0; index < rtable4size; index++) {
2002 
2003 		if (&rtable4[index] != checkrp->r_hashq)
2004 			rw_enter(&rtable4[index].r_lock, RW_READER);
2005 
2006 		for (rp = rtable4[index].r_hashf;
2007 		    rp != (rnode4_t *)(&rtable4[index]);
2008 		    rp = rp->r_hashf) {
2009 
2010 			if (rp == checkrp)
2011 				continue;
2012 
2013 			tvp = RTOV4(rp);
2014 			if (tvp->v_vfsp != vfsp)
2015 				continue;
2016 
2017 			sfh4_copyval(rp->r_fh, &fh2);
2018 			if (nfs4cmpfhandle(&fh, &fh2) == 0) {
2019 				cmn_err(CE_PANIC, "rnodes with same fs, fh "
2020 				    "(%p, %p)", (void *)checkrp, (void *)rp);
2021 			}
2022 		}
2023 
2024 		if (&rtable4[index] != checkrp->r_hashq)
2025 			rw_exit(&rtable4[index].r_lock);
2026 	}
2027 }
2028 
2029 #endif /* DEBUG */
2030