xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs_subr.c (revision a07094369b21309434206d9b3601d162693466fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
27  *	All rights reserved.
28  */
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/proc.h>
37 #include <sys/user.h>
38 #include <sys/time.h>
39 #include <sys/buf.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/socket.h>
43 #include <sys/uio.h>
44 #include <sys/tiuser.h>
45 #include <sys/swap.h>
46 #include <sys/errno.h>
47 #include <sys/debug.h>
48 #include <sys/kmem.h>
49 #include <sys/kstat.h>
50 #include <sys/cmn_err.h>
51 #include <sys/vtrace.h>
52 #include <sys/session.h>
53 #include <sys/dnlc.h>
54 #include <sys/bitmap.h>
55 #include <sys/acl.h>
56 #include <sys/ddi.h>
57 #include <sys/pathname.h>
58 #include <sys/flock.h>
59 #include <sys/dirent.h>
60 #include <sys/flock.h>
61 #include <sys/callb.h>
62 #include <sys/atomic.h>
63 #include <sys/list.h>
64 
65 #include <rpc/types.h>
66 #include <rpc/xdr.h>
67 #include <rpc/auth.h>
68 #include <rpc/clnt.h>
69 
70 #include <nfs/nfs.h>
71 #include <nfs/nfs4.h>
72 #include <nfs/nfs_clnt.h>
73 #include <nfs/rnode.h>
74 #include <nfs/nfs_acl.h>
75 
76 /*
77  * The hash queues for the access to active and cached rnodes
78  * are organized as doubly linked lists.  A reader/writer lock
79  * for each hash bucket is used to control access and to synchronize
80  * lookups, additions, and deletions from the hash queue.
81  *
82  * The rnode freelist is organized as a doubly linked list with
83  * a head pointer.  Additions and deletions are synchronized via
84  * a single mutex.
85  *
86  * In order to add an rnode to the free list, it must be hashed into
87  * a hash queue and the exclusive lock to the hash queue be held.
88  * If an rnode is not hashed into a hash queue, then it is destroyed
89  * because it represents no valuable information that can be reused
90  * about the file.  The exclusive lock to the hash queue must be
91  * held in order to prevent a lookup in the hash queue from finding
92  * the rnode and using it and assuming that the rnode is not on the
93  * freelist.  The lookup in the hash queue will have the hash queue
94  * locked, either exclusive or shared.
95  *
96  * The vnode reference count for each rnode is not allowed to drop
97  * below 1.  This prevents external entities, such as the VM
98  * subsystem, from acquiring references to vnodes already on the
99  * freelist and then trying to place them back on the freelist
100  * when their reference is released.  This means that when an
101  * rnode is looked up in the hash queues, then either the rnode
102  * is removed from the freelist and that reference is transferred to
103  * the new reference or the vnode reference count must be incremented
104  * accordingly.  The mutex for the freelist must be held in order to
105  * accurately test to see if the rnode is on the freelist or not.
106  * The hash queue lock might be held shared and it is possible that
107  * two different threads may race to remove the rnode from the
108  * freelist.  This race can be resolved by holding the mutex for the
109  * freelist.  Please note that the mutex for the freelist does not
110  * need to be held if the rnode is not on the freelist.  It can not be
111  * placed on the freelist due to the requirement that the thread
112  * putting the rnode on the freelist must hold the exclusive lock
113  * to the hash queue and the thread doing the lookup in the hash
114  * queue is holding either a shared or exclusive lock to the hash
115  * queue.
116  *
117  * The lock ordering is:
118  *
119  *	hash bucket lock -> vnode lock
120  *	hash bucket lock -> freelist lock
121  */
122 static rhashq_t *rtable;
123 
124 static kmutex_t rpfreelist_lock;
125 static rnode_t *rpfreelist = NULL;
126 static long rnew = 0;
127 long nrnode = 0;
128 
129 static int rtablesize;
130 static int rtablemask;
131 
132 static int hashlen = 4;
133 
134 static struct kmem_cache *rnode_cache;
135 
136 /*
137  * Mutex to protect the following variables:
138  *	nfs_major
139  *	nfs_minor
140  */
141 kmutex_t nfs_minor_lock;
142 int nfs_major;
143 int nfs_minor;
144 
145 /* Do we allow preepoch (negative) time values otw? */
146 bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
147 
148 /*
149  * Access cache
150  */
151 static acache_hash_t *acache;
152 static long nacache;	/* used strictly to size the number of hash queues */
153 
154 static int acachesize;
155 static int acachemask;
156 static struct kmem_cache *acache_cache;
157 
158 /*
159  * Client side utilities
160  */
161 
162 /*
163  * client side statistics
164  */
165 static const struct clstat clstat_tmpl = {
166 	{ "calls",	KSTAT_DATA_UINT64 },
167 	{ "badcalls",	KSTAT_DATA_UINT64 },
168 	{ "clgets",	KSTAT_DATA_UINT64 },
169 	{ "cltoomany",	KSTAT_DATA_UINT64 },
170 #ifdef DEBUG
171 	{ "clalloc",	KSTAT_DATA_UINT64 },
172 	{ "noresponse",	KSTAT_DATA_UINT64 },
173 	{ "failover",	KSTAT_DATA_UINT64 },
174 	{ "remap",	KSTAT_DATA_UINT64 },
175 #endif
176 };
177 
178 /*
179  * The following are statistics that describe behavior of the system as a whole
180  * and don't correspond to any one particular zone.
181  */
182 #ifdef DEBUG
183 static struct clstat_debug {
184 	kstat_named_t	nrnode;			/* number of allocated rnodes */
185 	kstat_named_t	access;			/* size of access cache */
186 	kstat_named_t	dirent;			/* size of readdir cache */
187 	kstat_named_t	dirents;		/* size of readdir buf cache */
188 	kstat_named_t	reclaim;		/* number of reclaims */
189 	kstat_named_t	clreclaim;		/* number of cl reclaims */
190 	kstat_named_t	f_reclaim;		/* number of free reclaims */
191 	kstat_named_t	a_reclaim;		/* number of active reclaims */
192 	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
193 	kstat_named_t	rpath;			/* bytes used to store rpaths */
194 } clstat_debug = {
195 	{ "nrnode",	KSTAT_DATA_UINT64 },
196 	{ "access",	KSTAT_DATA_UINT64 },
197 	{ "dirent",	KSTAT_DATA_UINT64 },
198 	{ "dirents",	KSTAT_DATA_UINT64 },
199 	{ "reclaim",	KSTAT_DATA_UINT64 },
200 	{ "clreclaim",	KSTAT_DATA_UINT64 },
201 	{ "f_reclaim",	KSTAT_DATA_UINT64 },
202 	{ "a_reclaim",	KSTAT_DATA_UINT64 },
203 	{ "r_reclaim",	KSTAT_DATA_UINT64 },
204 	{ "r_path",	KSTAT_DATA_UINT64 },
205 };
206 #endif	/* DEBUG */
207 
208 /*
209  * We keep a global list of per-zone client data, so we can clean up all zones
210  * if we get low on memory.
211  */
212 static list_t nfs_clnt_list;
213 static kmutex_t nfs_clnt_list_lock;
214 static zone_key_t nfsclnt_zone_key;
215 
216 static struct kmem_cache *chtab_cache;
217 
218 /*
219  * Some servers do not properly update the attributes of the
220  * directory when changes are made.  To allow interoperability
221  * with these broken servers, the nfs_disable_rddir_cache
222  * parameter must be set in /etc/system
223  */
224 int nfs_disable_rddir_cache = 0;
225 
226 int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
227 		    struct chtab **);
228 void		clfree(CLIENT *, struct chtab *);
229 static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
230 		    struct chtab **, struct nfs_clnt *);
231 static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
232 		    struct chtab **, struct nfs_clnt *);
233 static void	clreclaim(void *);
234 static int	nfs_feedback(int, int, mntinfo_t *);
235 static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
236 		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
237 		    failinfo_t *);
238 static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
239 		    caddr_t, cred_t *, int *, int, failinfo_t *);
240 static void	rinactive(rnode_t *, cred_t *);
241 static int	rtablehash(nfs_fhandle *);
242 static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
243 		    struct vnodeops *,
244 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
245 			cred_t *),
246 		    int (*)(const void *, const void *), int *, cred_t *,
247 		    char *, char *);
248 static void	rp_rmfree(rnode_t *);
249 static void	rp_addhash(rnode_t *);
250 static void	rp_rmhash_locked(rnode_t *);
251 static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
252 static void	destroy_rnode(rnode_t *);
253 static void	rddir_cache_free(rddir_cache *);
254 static int	nfs_free_data_reclaim(rnode_t *);
255 static int	nfs_active_data_reclaim(rnode_t *);
256 static int	nfs_free_reclaim(void);
257 static int	nfs_active_reclaim(void);
258 static int	nfs_rnode_reclaim(void);
259 static void	nfs_reclaim(void *);
260 static int	failover_safe(failinfo_t *);
261 static void	failover_newserver(mntinfo_t *mi);
262 static void	failover_thread(mntinfo_t *mi);
263 static int	failover_wait(mntinfo_t *);
264 static int	failover_remap(failinfo_t *);
265 static int	failover_lookup(char *, vnode_t *,
266 		    int (*)(vnode_t *, char *, vnode_t **,
267 			struct pathname *, int, vnode_t *, cred_t *, int),
268 		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
269 		    vnode_t **);
270 static void	nfs_free_r_path(rnode_t *);
271 static void	nfs_set_vroot(vnode_t *);
272 static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
273 
274 /*
275  * from rpcsec module (common/rpcsec)
276  */
277 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
278 extern void sec_clnt_freeh(AUTH *);
279 extern void sec_clnt_freeinfo(struct sec_data *);
280 
/*
 * EIO or EINTR are not recoverable errors.
 *
 * Evaluates to non-zero for every other error value.  The argument is
 * expanded twice, so it must not have side effects.  Both the argument
 * and the whole expansion are parenthesized so that the macro behaves
 * like a function call no matter what expression is passed in or what
 * operators surround the use.
 */
#define	IS_RECOVERABLE_ERROR(error)	\
	(!(((error) == EINTR) || ((error) == EIO)))
285 
286 /*
287  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
288  */
289 static int
290 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
291     struct chtab **chp, struct nfs_clnt *nfscl)
292 {
293 	struct chhead *ch, *newch;
294 	struct chhead **plistp;
295 	struct chtab *cp;
296 	int error;
297 	k_sigset_t smask;
298 
299 	if (newcl == NULL || chp == NULL || ci == NULL)
300 		return (EINVAL);
301 
302 	*newcl = NULL;
303 	*chp = NULL;
304 
305 	/*
306 	 * Find an unused handle or create one
307 	 */
308 	newch = NULL;
309 	nfscl->nfscl_stat.clgets.value.ui64++;
310 top:
311 	/*
312 	 * Find the correct entry in the cache to check for free
313 	 * client handles.  The search is based on the RPC program
314 	 * number, program version number, dev_t for the transport
315 	 * device, and the protocol family.
316 	 */
317 	mutex_enter(&nfscl->nfscl_chtable_lock);
318 	plistp = &nfscl->nfscl_chtable;
319 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
320 		if (ch->ch_prog == ci->cl_prog &&
321 		    ch->ch_vers == ci->cl_vers &&
322 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
323 		    (strcmp(ch->ch_protofmly,
324 			svp->sv_knconf->knc_protofmly) == 0))
325 			break;
326 		plistp = &ch->ch_next;
327 	}
328 
329 	/*
330 	 * If we didn't find a cache entry for this quadruple, then
331 	 * create one.  If we don't have one already preallocated,
332 	 * then drop the cache lock, create one, and then start over.
333 	 * If we did have a preallocated entry, then just add it to
334 	 * the front of the list.
335 	 */
336 	if (ch == NULL) {
337 		if (newch == NULL) {
338 			mutex_exit(&nfscl->nfscl_chtable_lock);
339 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
340 			newch->ch_timesused = 0;
341 			newch->ch_prog = ci->cl_prog;
342 			newch->ch_vers = ci->cl_vers;
343 			newch->ch_dev = svp->sv_knconf->knc_rdev;
344 			newch->ch_protofmly = kmem_alloc(
345 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
346 			    KM_SLEEP);
347 			(void) strcpy(newch->ch_protofmly,
348 			    svp->sv_knconf->knc_protofmly);
349 			newch->ch_list = NULL;
350 			goto top;
351 		}
352 		ch = newch;
353 		newch = NULL;
354 		ch->ch_next = nfscl->nfscl_chtable;
355 		nfscl->nfscl_chtable = ch;
356 	/*
357 	 * We found a cache entry, but if it isn't on the front of the
358 	 * list, then move it to the front of the list to try to take
359 	 * advantage of locality of operations.
360 	 */
361 	} else if (ch != nfscl->nfscl_chtable) {
362 		*plistp = ch->ch_next;
363 		ch->ch_next = nfscl->nfscl_chtable;
364 		nfscl->nfscl_chtable = ch;
365 	}
366 
367 	/*
368 	 * If there was a free client handle cached, then remove it
369 	 * from the list, init it, and use it.
370 	 */
371 	if (ch->ch_list != NULL) {
372 		cp = ch->ch_list;
373 		ch->ch_list = cp->ch_list;
374 		mutex_exit(&nfscl->nfscl_chtable_lock);
375 		if (newch != NULL) {
376 			kmem_free(newch->ch_protofmly,
377 			    strlen(newch->ch_protofmly) + 1);
378 			kmem_free(newch, sizeof (*newch));
379 		}
380 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
381 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
382 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
383 		    &cp->ch_client->cl_auth);
384 		if (error || cp->ch_client->cl_auth == NULL) {
385 			CLNT_DESTROY(cp->ch_client);
386 			kmem_cache_free(chtab_cache, cp);
387 			return ((error != 0) ? error : EINTR);
388 		}
389 		ch->ch_timesused++;
390 		*newcl = cp->ch_client;
391 		*chp = cp;
392 		return (0);
393 	}
394 
395 	/*
396 	 * There weren't any free client handles which fit, so allocate
397 	 * a new one and use that.
398 	 */
399 #ifdef DEBUG
400 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
401 #endif
402 	mutex_exit(&nfscl->nfscl_chtable_lock);
403 
404 	nfscl->nfscl_stat.cltoomany.value.ui64++;
405 	if (newch != NULL) {
406 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
407 		kmem_free(newch, sizeof (*newch));
408 	}
409 
410 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
411 	cp->ch_head = ch;
412 
413 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
414 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
415 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
416 	sigunintr(&smask);
417 
418 	if (error != 0) {
419 		kmem_cache_free(chtab_cache, cp);
420 #ifdef DEBUG
421 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
422 #endif
423 		/*
424 		 * Warning is unnecessary if error is EINTR.
425 		 */
426 		if (error != EINTR) {
427 			nfs_cmn_err(error, CE_WARN,
428 			    "clget: couldn't create handle: %m\n");
429 		}
430 		return (error);
431 	}
432 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
433 	auth_destroy(cp->ch_client->cl_auth);
434 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
435 	    &cp->ch_client->cl_auth);
436 	if (error || cp->ch_client->cl_auth == NULL) {
437 		CLNT_DESTROY(cp->ch_client);
438 		kmem_cache_free(chtab_cache, cp);
439 #ifdef DEBUG
440 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
441 #endif
442 		return ((error != 0) ? error : EINTR);
443 	}
444 	ch->ch_timesused++;
445 	*newcl = cp->ch_client;
446 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
447 	*chp = cp;
448 	return (0);
449 }
450 
451 int
452 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
453     struct chtab **chp)
454 {
455 	struct nfs_clnt *nfscl;
456 
457 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
458 	ASSERT(nfscl != NULL);
459 
460 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
461 }
462 
463 static int
464 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
465     struct chtab **chp, struct nfs_clnt *nfscl)
466 {
467 	clinfo_t ci;
468 	int error;
469 
470 	/*
471 	 * Set read buffer size to rsize
472 	 * and add room for RPC headers.
473 	 */
474 	ci.cl_readsize = mi->mi_tsize;
475 	if (ci.cl_readsize != 0)
476 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
477 
478 	/*
479 	 * If soft mount and server is down just try once.
480 	 * meaning: do not retransmit.
481 	 */
482 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
483 		ci.cl_retrans = 0;
484 	else
485 		ci.cl_retrans = mi->mi_retrans;
486 
487 	ci.cl_prog = NFS_ACL_PROGRAM;
488 	ci.cl_vers = mi->mi_vers;
489 	ci.cl_flags = mi->mi_flags;
490 
491 	/*
492 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
493 	 * security flavor, the client tries to establish a security context
494 	 * by contacting the server. If the connection is timed out or reset,
495 	 * e.g. server reboot, we will try again.
496 	 */
497 	do {
498 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
499 
500 		if (error == 0)
501 			break;
502 
503 		/*
504 		 * For forced unmount or zone shutdown, bail out, no retry.
505 		 */
506 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
507 			error = EIO;
508 			break;
509 		}
510 
511 		/* do not retry for softmount */
512 		if (!(mi->mi_flags & MI_HARD))
513 			break;
514 
515 		/* let the caller deal with the failover case */
516 		if (FAILOVER_MOUNT(mi))
517 			break;
518 
519 	} while (error == ETIMEDOUT || error == ECONNRESET);
520 
521 	return (error);
522 }
523 
524 static int
525 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
526     struct chtab **chp, struct nfs_clnt *nfscl)
527 {
528 	clinfo_t ci;
529 	int error;
530 
531 	/*
532 	 * Set read buffer size to rsize
533 	 * and add room for RPC headers.
534 	 */
535 	ci.cl_readsize = mi->mi_tsize;
536 	if (ci.cl_readsize != 0)
537 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
538 
539 	/*
540 	 * If soft mount and server is down just try once.
541 	 * meaning: do not retransmit.
542 	 */
543 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
544 		ci.cl_retrans = 0;
545 	else
546 		ci.cl_retrans = mi->mi_retrans;
547 
548 	ci.cl_prog = mi->mi_prog;
549 	ci.cl_vers = mi->mi_vers;
550 	ci.cl_flags = mi->mi_flags;
551 
552 	/*
553 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
554 	 * security flavor, the client tries to establish a security context
555 	 * by contacting the server. If the connection is timed out or reset,
556 	 * e.g. server reboot, we will try again.
557 	 */
558 	do {
559 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
560 
561 		if (error == 0)
562 			break;
563 
564 		/*
565 		 * For forced unmount or zone shutdown, bail out, no retry.
566 		 */
567 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
568 			error = EIO;
569 			break;
570 		}
571 
572 		/* do not retry for softmount */
573 		if (!(mi->mi_flags & MI_HARD))
574 			break;
575 
576 		/* let the caller deal with the failover case */
577 		if (FAILOVER_MOUNT(mi))
578 			break;
579 
580 	} while (error == ETIMEDOUT || error == ECONNRESET);
581 
582 	return (error);
583 }
584 
585 static void
586 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
587 {
588 	if (cl->cl_auth != NULL) {
589 		sec_clnt_freeh(cl->cl_auth);
590 		cl->cl_auth = NULL;
591 	}
592 
593 	/*
594 	 * Timestamp this cache entry so that we know when it was last
595 	 * used.
596 	 */
597 	cp->ch_freed = gethrestime_sec();
598 
599 	/*
600 	 * Add the free client handle to the front of the list.
601 	 * This way, the list will be sorted in youngest to oldest
602 	 * order.
603 	 */
604 	mutex_enter(&nfscl->nfscl_chtable_lock);
605 	cp->ch_list = cp->ch_head->ch_list;
606 	cp->ch_head->ch_list = cp;
607 	mutex_exit(&nfscl->nfscl_chtable_lock);
608 }
609 
610 void
611 clfree(CLIENT *cl, struct chtab *cp)
612 {
613 	struct nfs_clnt *nfscl;
614 
615 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
616 	ASSERT(nfscl != NULL);
617 
618 	clfree_impl(cl, cp, nfscl);
619 }
620 
621 #define	CL_HOLDTIME	60	/* time to hold client handles */
622 
623 static void
624 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
625 {
626 	struct chhead *ch;
627 	struct chtab *cp;	/* list of objects that can be reclaimed */
628 	struct chtab *cpe;
629 	struct chtab *cpl;
630 	struct chtab **cpp;
631 #ifdef DEBUG
632 	int n = 0;
633 #endif
634 
635 	/*
636 	 * Need to reclaim some memory, so step through the cache
637 	 * looking through the lists for entries which can be freed.
638 	 */
639 	cp = NULL;
640 
641 	mutex_enter(&nfscl->nfscl_chtable_lock);
642 
643 	/*
644 	 * Here we step through each non-NULL quadruple and start to
645 	 * construct the reclaim list pointed to by cp.  Note that
646 	 * cp will contain all eligible chtab entries.  When this traversal
647 	 * completes, chtab entries from the last quadruple will be at the
648 	 * front of cp and entries from previously inspected quadruples have
649 	 * been appended to the rear of cp.
650 	 */
651 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
652 		if (ch->ch_list == NULL)
653 			continue;
654 		/*
655 		 * Search each list for entries older then
656 		 * cl_holdtime seconds.  The lists are maintained
657 		 * in youngest to oldest order so that when the
658 		 * first entry is found which is old enough, then
659 		 * all of the rest of the entries on the list will
660 		 * be old enough as well.
661 		 */
662 		cpl = ch->ch_list;
663 		cpp = &ch->ch_list;
664 		while (cpl != NULL &&
665 			cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
666 			cpp = &cpl->ch_list;
667 			cpl = cpl->ch_list;
668 		}
669 		if (cpl != NULL) {
670 			*cpp = NULL;
671 			if (cp != NULL) {
672 				cpe = cpl;
673 				while (cpe->ch_list != NULL)
674 					cpe = cpe->ch_list;
675 				cpe->ch_list = cp;
676 			}
677 			cp = cpl;
678 		}
679 	}
680 
681 	mutex_exit(&nfscl->nfscl_chtable_lock);
682 
683 	/*
684 	 * If cp is empty, then there is nothing to reclaim here.
685 	 */
686 	if (cp == NULL)
687 		return;
688 
689 	/*
690 	 * Step through the list of entries to free, destroying each client
691 	 * handle and kmem_free'ing the memory for each entry.
692 	 */
693 	while (cp != NULL) {
694 #ifdef DEBUG
695 		n++;
696 #endif
697 		CLNT_DESTROY(cp->ch_client);
698 		cpl = cp->ch_list;
699 		kmem_cache_free(chtab_cache, cp);
700 		cp = cpl;
701 	}
702 
703 #ifdef DEBUG
704 	/*
705 	 * Update clalloc so that nfsstat shows the current number
706 	 * of allocated client handles.
707 	 */
708 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
709 #endif
710 }
711 
712 /* ARGSUSED */
713 static void
714 clreclaim(void *all)
715 {
716 	struct nfs_clnt *nfscl;
717 
718 #ifdef DEBUG
719 	clstat_debug.clreclaim.value.ui64++;
720 #endif
721 	/*
722 	 * The system is low on memory; go through and try to reclaim some from
723 	 * every zone on the system.
724 	 */
725 	mutex_enter(&nfs_clnt_list_lock);
726 	nfscl = list_head(&nfs_clnt_list);
727 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
728 		clreclaim_zone(nfscl, CL_HOLDTIME);
729 	mutex_exit(&nfs_clnt_list_lock);
730 }
731 
732 /*
733  * Minimum time-out values indexed by call type
734  * These units are in "eighths" of a second to avoid multiplies
735  */
736 static unsigned int minimum_timeo[] = {
737 	6, 7, 10
738 };
739 
740 /*
741  * Back off for retransmission timeout, MAXTIMO is in hz of a sec
742  */
743 #define	MAXTIMO	(20*hz)
744 #define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
745 #define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
746 
747 #define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
748 #define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
749 #define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
750 
751 /*
752  * Function called when rfscall notices that we have been
753  * re-transmitting, or when we get a response without retransmissions.
754  * Return 1 if the transfer size was adjusted down - 0 if no change.
755  */
756 static int
757 nfs_feedback(int flag, int which, mntinfo_t *mi)
758 {
759 	int kind;
760 	int r = 0;
761 
762 	mutex_enter(&mi->mi_lock);
763 	if (flag == FEEDBACK_REXMIT1) {
764 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
765 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
766 			goto done;
767 		if (mi->mi_curread > MIN_NFS_TSIZE) {
768 			mi->mi_curread /= 2;
769 			if (mi->mi_curread < MIN_NFS_TSIZE)
770 				mi->mi_curread = MIN_NFS_TSIZE;
771 			r = 1;
772 		}
773 
774 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
775 			mi->mi_curwrite /= 2;
776 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
777 				mi->mi_curwrite = MIN_NFS_TSIZE;
778 			r = 1;
779 		}
780 	} else if (flag == FEEDBACK_OK) {
781 		kind = mi->mi_timer_type[which];
782 		if (kind == 0 ||
783 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
784 			goto done;
785 		if (kind == 1) {
786 			if (mi->mi_curread >= mi->mi_tsize)
787 				goto done;
788 			mi->mi_curread +=  MIN_NFS_TSIZE;
789 			if (mi->mi_curread > mi->mi_tsize/2)
790 				mi->mi_curread = mi->mi_tsize;
791 		} else if (kind == 2) {
792 			if (mi->mi_curwrite >= mi->mi_stsize)
793 				goto done;
794 			mi->mi_curwrite += MIN_NFS_TSIZE;
795 			if (mi->mi_curwrite > mi->mi_stsize/2)
796 				mi->mi_curwrite = mi->mi_stsize;
797 		}
798 	}
799 done:
800 	mutex_exit(&mi->mi_lock);
801 	return (r);
802 }
803 
804 #ifdef DEBUG
805 static int rfs2call_hits = 0;
806 static int rfs2call_misses = 0;
807 #endif
808 
809 int
810 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
811     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
812     enum nfsstat *statusp, int flags, failinfo_t *fi)
813 {
814 	int rpcerror;
815 	enum clnt_stat rpc_status;
816 
817 	ASSERT(statusp != NULL);
818 
819 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
820 	    cr, douprintf, &rpc_status, flags, fi);
821 	if (!rpcerror) {
822 		/*
823 		 * See crnetadjust() for comments.
824 		 */
825 		if (*statusp == NFSERR_ACCES &&
826 		    (cr = crnetadjust(cr)) != NULL) {
827 #ifdef DEBUG
828 			rfs2call_hits++;
829 #endif
830 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
831 			    resp, cr, douprintf, NULL, flags, fi);
832 			crfree(cr);
833 #ifdef DEBUG
834 			if (*statusp == NFSERR_ACCES)
835 				rfs2call_misses++;
836 #endif
837 		}
838 	} else if (rpc_status == RPC_PROCUNAVAIL) {
839 		*statusp = NFSERR_OPNOTSUPP;
840 		rpcerror = 0;
841 	}
842 
843 	return (rpcerror);
844 }
845 
/*
 * Default jukebox retry delay: 10 * hz clock ticks.  The expansion is
 * parenthesized so that the macro can be used inside a larger
 * expression without changing its value.
 */
#define	NFS3_JUKEBOX_DELAY	(10 * hz)

/*
 * Delay, as passed to delay(), that rfs3call() waits between retries
 * of a call answered with NFS3ERR_JUKEBOX.  It starts out as 0 here.
 * NOTE(review): presumably set to NFS3_JUKEBOX_DELAY during module
 * initialization, which is outside this part of the file - confirm.
 */
static clock_t nfs3_jukebox_delay = 0;
849 
850 #ifdef DEBUG
851 static int rfs3call_hits = 0;
852 static int rfs3call_misses = 0;
853 #endif
854 
855 int
856 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
857     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
858     nfsstat3 *statusp, int flags, failinfo_t *fi)
859 {
860 	int rpcerror;
861 	int user_informed;
862 
863 	user_informed = 0;
864 	do {
865 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
866 		    cr, douprintf, NULL, flags, fi);
867 		if (!rpcerror) {
868 			cred_t *crr;
869 			if (*statusp == NFS3ERR_JUKEBOX) {
870 				if (ttoproc(curthread) == &p0) {
871 					rpcerror = EAGAIN;
872 					break;
873 				}
874 				if (!user_informed) {
875 					user_informed = 1;
876 					uprintf(
877 		"file temporarily unavailable on the server, retrying...\n");
878 				}
879 				delay(nfs3_jukebox_delay);
880 			}
881 			/*
882 			 * See crnetadjust() for comments.
883 			 */
884 			else if (*statusp == NFS3ERR_ACCES &&
885 			    (crr = crnetadjust(cr)) != NULL) {
886 #ifdef DEBUG
887 				rfs3call_hits++;
888 #endif
889 				rpcerror = rfscall(mi, which, xdrargs, argsp,
890 				    xdrres, resp, crr, douprintf,
891 				    NULL, flags, fi);
892 
893 				crfree(crr);
894 #ifdef DEBUG
895 				if (*statusp == NFS3ERR_ACCES)
896 					rfs3call_misses++;
897 #endif
898 			}
899 		}
900 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
901 
902 	return (rpcerror);
903 }
904 
/*
 * True when the server recorded in the rnode behind fi->vp is the
 * current server of the mount that vnode belongs to.
 */
#define	VALID_FH(fi)	\
	(VTOR((fi)->vp)->r_server == VTOMI((fi)->vp)->mi_curr_serv)

/*
 * Reader count on a mount; rfscall() adjusts it with mi_lock held.
 * The last reader to leave wakes up everybody waiting on
 * mi_failover_cv.
 *
 * The macro argument is parenthesized so that any pointer expression
 * may be passed.  Both macros expand to a brace-enclosed block and so
 * must not be used as the unbraced body of an if that has an else.
 */
#define	INC_READERS(mi)		{ \
	(mi)->mi_readers++; \
}
#define	DEC_READERS(mi)		{ \
	(mi)->mi_readers--; \
	if ((mi)->mi_readers == 0) \
		cv_broadcast(&(mi)->mi_failover_cv); \
}
914 
915 static int
916 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
917     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
918     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
919 {
920 	CLIENT *client;
921 	struct chtab *ch;
922 	enum clnt_stat status;
923 	struct rpc_err rpcerr;
924 	struct timeval wait;
925 	int timeo;		/* in units of hz */
926 	int my_rsize, my_wsize;
927 	bool_t tryagain;
928 	k_sigset_t smask;
929 	servinfo_t *svp;
930 	struct nfs_clnt *nfscl;
931 	zoneid_t zoneid = getzoneid();
932 #ifdef DEBUG
933 	char *bufp;
934 #endif
935 
936 
937 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
938 		"rfscall_start:which %d mi %p", which, mi);
939 
940 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
941 	ASSERT(nfscl != NULL);
942 
943 	nfscl->nfscl_stat.calls.value.ui64++;
944 	mi->mi_reqs[which].value.ui64++;
945 
946 	rpcerr.re_status = RPC_SUCCESS;
947 
948 	/*
949 	 * In case of forced unmount or zone shutdown, return EIO.
950 	 */
951 
952 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
953 		rpcerr.re_status = RPC_FAILED;
954 		rpcerr.re_errno = EIO;
955 		return (rpcerr.re_errno);
956 	}
957 
958 	/*
959 	 * Remember the transfer sizes in case
960 	 * nfs_feedback changes them underneath us.
961 	 */
962 	my_rsize = mi->mi_curread;
963 	my_wsize = mi->mi_curwrite;
964 
965 	/*
966 	 * NFS client failover support
967 	 *
968 	 * If this rnode is not in sync with the current server (VALID_FH),
969 	 * we'd like to do a remap to get in sync.  We can be interrupted
970 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
971 	 * use the best info we have to try the RPC.  Part of that is
972 	 * unconditionally updating the filehandle copy kept for V3.
973 	 *
974 	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
975 	 * rw_enter(); we're trying to keep the current server from being
976 	 * changed on us until we're done with the remapping and have a
977 	 * matching client handle.  We don't want to send a filehandle
978 	 * to the wrong host.
979 	 */
980 failoverretry:
981 	if (FAILOVER_MOUNT(mi)) {
982 		mutex_enter(&mi->mi_lock);
983 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
984 			if (failover_wait(mi)) {
985 				mutex_exit(&mi->mi_lock);
986 				return (EINTR);
987 			}
988 		}
989 		INC_READERS(mi);
990 		mutex_exit(&mi->mi_lock);
991 		if (fi) {
992 			if (!VALID_FH(fi) &&
993 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
994 				int remaperr;
995 
996 				svp = mi->mi_curr_serv;
997 				remaperr = failover_remap(fi);
998 				if (remaperr != 0) {
999 #ifdef DEBUG
1000 					if (remaperr != EINTR)
1001 						nfs_cmn_err(remaperr, CE_WARN,
1002 					    "rfscall couldn't failover: %m");
1003 #endif
1004 					mutex_enter(&mi->mi_lock);
1005 					DEC_READERS(mi);
1006 					mutex_exit(&mi->mi_lock);
1007 					/*
1008 					 * If failover_remap returns ETIMEDOUT
1009 					 * and the filesystem is hard mounted
1010 					 * we have to retry the call with a new
1011 					 * server.
1012 					 */
1013 					if ((mi->mi_flags & MI_HARD) &&
1014 					    IS_RECOVERABLE_ERROR(remaperr)) {
1015 						if (svp == mi->mi_curr_serv)
1016 							failover_newserver(mi);
1017 						rpcerr.re_status = RPC_SUCCESS;
1018 						goto failoverretry;
1019 					}
1020 					rpcerr.re_errno = remaperr;
1021 					return (remaperr);
1022 				}
1023 			}
1024 			if (fi->fhp && fi->copyproc)
1025 				(*fi->copyproc)(fi->fhp, fi->vp);
1026 		}
1027 	}
1028 
1029 	/*
1030 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1031 	 * are guaranteed to reprocess the retry as a new request.
1032 	 */
1033 	svp = mi->mi_curr_serv;
1034 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1035 
1036 	if (FAILOVER_MOUNT(mi)) {
1037 		mutex_enter(&mi->mi_lock);
1038 		DEC_READERS(mi);
1039 		mutex_exit(&mi->mi_lock);
1040 
1041 		if ((rpcerr.re_errno == ETIMEDOUT ||
1042 				rpcerr.re_errno == ECONNRESET) &&
1043 				failover_safe(fi)) {
1044 			if (svp == mi->mi_curr_serv)
1045 				failover_newserver(mi);
1046 			goto failoverretry;
1047 		}
1048 	}
1049 	if (rpcerr.re_errno != 0)
1050 		return (rpcerr.re_errno);
1051 
1052 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1053 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1054 		timeo = (mi->mi_timeo * hz) / 10;
1055 	} else {
1056 		mutex_enter(&mi->mi_lock);
1057 		timeo = CLNT_SETTIMERS(client,
1058 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1059 		    &(mi->mi_timers[NFS_CALLTYPES]),
1060 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1061 		    (void (*)())NULL, (caddr_t)mi, 0);
1062 		mutex_exit(&mi->mi_lock);
1063 	}
1064 
1065 	/*
1066 	 * If hard mounted fs, retry call forever unless hard error occurs.
1067 	 */
1068 	do {
1069 		tryagain = FALSE;
1070 
1071 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1072 			status = RPC_FAILED;
1073 			rpcerr.re_status = RPC_FAILED;
1074 			rpcerr.re_errno = EIO;
1075 			break;
1076 		}
1077 
1078 		TICK_TO_TIMEVAL(timeo, &wait);
1079 
1080 		/*
1081 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1082 		 * and SIGTERM. (Preserving the existing masks).
1083 		 * Mask out SIGINT if mount option nointr is specified.
1084 		 */
1085 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1086 		if (!(mi->mi_flags & MI_INT))
1087 			client->cl_nosignal = TRUE;
1088 
1089 		/*
1090 		 * If there is a current signal, then don't bother
1091 		 * even trying to send out the request because we
1092 		 * won't be able to block waiting for the response.
1093 		 * Simply assume RPC_INTR and get on with it.
1094 		 */
1095 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1096 			status = RPC_INTR;
1097 		else {
1098 			status = CLNT_CALL(client, which, xdrargs, argsp,
1099 			    xdrres, resp, wait);
1100 		}
1101 
1102 		if (!(mi->mi_flags & MI_INT))
1103 			client->cl_nosignal = FALSE;
1104 		/*
1105 		 * restore original signal mask
1106 		 */
1107 		sigunintr(&smask);
1108 
1109 		switch (status) {
1110 		case RPC_SUCCESS:
1111 			if ((mi->mi_flags & MI_DYNAMIC) &&
1112 			    mi->mi_timer_type[which] != 0 &&
1113 			    (mi->mi_curread != my_rsize ||
1114 			    mi->mi_curwrite != my_wsize))
1115 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1116 			break;
1117 
1118 		case RPC_INTR:
1119 			/*
1120 			 * There is no way to recover from this error,
1121 			 * even if mount option nointr is specified.
1122 			 * SIGKILL, for example, cannot be blocked.
1123 			 */
1124 			rpcerr.re_status = RPC_INTR;
1125 			rpcerr.re_errno = EINTR;
1126 			break;
1127 
1128 		case RPC_UDERROR:
1129 			/*
1130 			 * If the NFS server is local (vold) and
1131 			 * it goes away then we get RPC_UDERROR.
1132 			 * This is a retryable error, so we would
1133 			 * loop, so check to see if the specific
1134 			 * error was ECONNRESET, indicating that
1135 			 * target did not exist at all.  If so,
1136 			 * return with RPC_PROGUNAVAIL and
1137 			 * ECONNRESET to indicate why.
1138 			 */
1139 			CLNT_GETERR(client, &rpcerr);
1140 			if (rpcerr.re_errno == ECONNRESET) {
1141 				rpcerr.re_status = RPC_PROGUNAVAIL;
1142 				rpcerr.re_errno = ECONNRESET;
1143 				break;
1144 			}
1145 			/*FALLTHROUGH*/
1146 
1147 		default:		/* probably RPC_TIMEDOUT */
1148 			if (IS_UNRECOVERABLE_RPC(status))
1149 				break;
1150 
1151 			/*
1152 			 * increment server not responding count
1153 			 */
1154 			mutex_enter(&mi->mi_lock);
1155 			mi->mi_noresponse++;
1156 			mutex_exit(&mi->mi_lock);
1157 #ifdef DEBUG
1158 			nfscl->nfscl_stat.noresponse.value.ui64++;
1159 #endif
1160 
1161 			if (!(mi->mi_flags & MI_HARD)) {
1162 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1163 				    (mi->mi_ss_call_type[which] == 0))
1164 					break;
1165 			}
1166 
1167 			/*
1168 			 * The call is in progress (over COTS).
1169 			 * Try the CLNT_CALL again, but don't
1170 			 * print a noisy error message.
1171 			 */
1172 			if (status == RPC_INPROGRESS) {
1173 				tryagain = TRUE;
1174 				break;
1175 			}
1176 
1177 			if (flags & RFSCALL_SOFT)
1178 				break;
1179 
1180 			/*
1181 			 * On zone shutdown, just move on.
1182 			 */
1183 			if (zone_status_get(curproc->p_zone) >=
1184 			    ZONE_IS_SHUTTING_DOWN) {
1185 				rpcerr.re_status = RPC_FAILED;
1186 				rpcerr.re_errno = EIO;
1187 				break;
1188 			}
1189 
1190 			/*
1191 			 * NFS client failover support
1192 			 *
1193 			 * If the current server just failed us, we'll
1194 			 * start the process of finding a new server.
1195 			 * After that, we can just retry.
1196 			 */
1197 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1198 				if (svp == mi->mi_curr_serv)
1199 					failover_newserver(mi);
1200 				clfree_impl(client, ch, nfscl);
1201 				goto failoverretry;
1202 			}
1203 
1204 			tryagain = TRUE;
1205 			timeo = backoff(timeo);
1206 			mutex_enter(&mi->mi_lock);
1207 			if (!(mi->mi_flags & MI_PRINTED)) {
1208 				mi->mi_flags |= MI_PRINTED;
1209 				mutex_exit(&mi->mi_lock);
1210 #ifdef DEBUG
1211 				zprintf(zoneid,
1212 			"NFS%d server %s not responding still trying\n",
1213 				    mi->mi_vers, svp->sv_hostname);
1214 #else
1215 				zprintf(zoneid,
1216 			"NFS server %s not responding still trying\n",
1217 				    svp->sv_hostname);
1218 #endif
1219 			} else
1220 				mutex_exit(&mi->mi_lock);
1221 			if (*douprintf && curproc->p_sessp->s_vp != NULL) {
1222 				*douprintf = 0;
1223 				if (!(mi->mi_flags & MI_NOPRINT))
1224 #ifdef DEBUG
1225 					uprintf(
1226 			    "NFS%d server %s not responding still trying\n",
1227 					    mi->mi_vers, svp->sv_hostname);
1228 #else
1229 					uprintf(
1230 			    "NFS server %s not responding still trying\n",
1231 					    svp->sv_hostname);
1232 #endif
1233 			}
1234 
1235 			/*
1236 			 * If doing dynamic adjustment of transfer
1237 			 * size and if it's a read or write call
1238 			 * and if the transfer size changed while
1239 			 * retransmitting or if the feedback routine
1240 			 * changed the transfer size,
1241 			 * then exit rfscall so that the transfer
1242 			 * size can be adjusted at the vnops level.
1243 			 */
1244 			if ((mi->mi_flags & MI_DYNAMIC) &&
1245 			    mi->mi_timer_type[which] != 0 &&
1246 			    (mi->mi_curread != my_rsize ||
1247 			    mi->mi_curwrite != my_wsize ||
1248 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1249 				/*
1250 				 * On read or write calls, return
1251 				 * back to the vnode ops level if
1252 				 * the transfer size changed.
1253 				 */
1254 				clfree_impl(client, ch, nfscl);
1255 				return (ENFS_TRYAGAIN);
1256 			}
1257 		}
1258 	} while (tryagain);
1259 
1260 	if (status != RPC_SUCCESS) {
1261 		/*
1262 		 * Let soft mounts use the timed out message.
1263 		 */
1264 		if (status == RPC_INPROGRESS)
1265 			status = RPC_TIMEDOUT;
1266 		nfscl->nfscl_stat.badcalls.value.ui64++;
1267 		if (status != RPC_INTR) {
1268 			mutex_enter(&mi->mi_lock);
1269 			mi->mi_flags |= MI_DOWN;
1270 			mutex_exit(&mi->mi_lock);
1271 			CLNT_GETERR(client, &rpcerr);
1272 #ifdef DEBUG
1273 			bufp = clnt_sperror(client, svp->sv_hostname);
1274 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1275 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1276 			if (curproc->p_sessp->s_vp != NULL) {
1277 				if (!(mi->mi_flags & MI_NOPRINT)) {
1278 					uprintf("NFS%d %s failed for %s\n",
1279 					    mi->mi_vers, mi->mi_rfsnames[which],
1280 					    bufp);
1281 				}
1282 			}
1283 			kmem_free(bufp, MAXPATHLEN);
1284 #else
1285 			zprintf(zoneid,
1286 			    "NFS %s failed for server %s: error %d (%s)\n",
1287 			    mi->mi_rfsnames[which], svp->sv_hostname,
1288 			    status, clnt_sperrno(status));
1289 			if (curproc->p_sessp->s_vp != NULL) {
1290 				if (!(mi->mi_flags & MI_NOPRINT)) {
1291 					uprintf(
1292 				"NFS %s failed for server %s: error %d (%s)\n",
1293 					    mi->mi_rfsnames[which],
1294 					    svp->sv_hostname, status,
1295 					    clnt_sperrno(status));
1296 				}
1297 			}
1298 #endif
1299 			/*
1300 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1301 			 * re_errno is set appropriately depending on
1302 			 * the authentication error
1303 			 */
1304 			if (status == RPC_VERSMISMATCH ||
1305 			    status == RPC_PROGVERSMISMATCH)
1306 				rpcerr.re_errno = EIO;
1307 		}
1308 	} else {
1309 		/*
1310 		 * Test the value of mi_down and mi_printed without
1311 		 * holding the mi_lock mutex.  If they are both zero,
1312 		 * then it is okay to skip the down and printed
1313 		 * processing.  This saves on a mutex_enter and
1314 		 * mutex_exit pair for a normal, successful RPC.
1315 		 * This was just complete overhead.
1316 		 */
1317 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1318 			mutex_enter(&mi->mi_lock);
1319 			mi->mi_flags &= ~MI_DOWN;
1320 			if (mi->mi_flags & MI_PRINTED) {
1321 				mi->mi_flags &= ~MI_PRINTED;
1322 				mutex_exit(&mi->mi_lock);
1323 #ifdef DEBUG
1324 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1325 				zprintf(zoneid, "NFS%d server %s ok\n",
1326 				    mi->mi_vers, svp->sv_hostname);
1327 #else
1328 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1329 				zprintf(zoneid, "NFS server %s ok\n",
1330 				    svp->sv_hostname);
1331 #endif
1332 			} else
1333 				mutex_exit(&mi->mi_lock);
1334 		}
1335 
1336 		if (*douprintf == 0) {
1337 			if (!(mi->mi_flags & MI_NOPRINT))
1338 #ifdef DEBUG
1339 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1340 					uprintf("NFS%d server %s ok\n",
1341 					    mi->mi_vers, svp->sv_hostname);
1342 #else
1343 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1344 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1345 #endif
1346 			*douprintf = 1;
1347 		}
1348 	}
1349 
1350 	clfree_impl(client, ch, nfscl);
1351 
1352 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1353 
1354 	if (rpc_status != NULL)
1355 		*rpc_status = rpcerr.re_status;
1356 
1357 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1358 	    rpcerr.re_errno);
1359 
1360 	return (rpcerr.re_errno);
1361 }
1362 
1363 #ifdef DEBUG
1364 static int acl2call_hits = 0;
1365 static int acl2call_misses = 0;
1366 #endif
1367 
1368 int
1369 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1370     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1371     enum nfsstat *statusp, int flags, failinfo_t *fi)
1372 {
1373 	int rpcerror;
1374 
1375 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1376 	    cr, douprintf, flags, fi);
1377 	if (!rpcerror) {
1378 		/*
1379 		 * See comments with crnetadjust().
1380 		 */
1381 		if (*statusp == NFSERR_ACCES &&
1382 		    (cr = crnetadjust(cr)) != NULL) {
1383 #ifdef DEBUG
1384 			acl2call_hits++;
1385 #endif
1386 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1387 			    resp, cr, douprintf, flags, fi);
1388 			crfree(cr);
1389 #ifdef DEBUG
1390 			if (*statusp == NFSERR_ACCES)
1391 				acl2call_misses++;
1392 #endif
1393 		}
1394 	}
1395 
1396 	return (rpcerror);
1397 }
1398 
1399 #ifdef DEBUG
1400 static int acl3call_hits = 0;
1401 static int acl3call_misses = 0;
1402 #endif
1403 
1404 int
1405 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1406     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1407     nfsstat3 *statusp, int flags, failinfo_t *fi)
1408 {
1409 	int rpcerror;
1410 	int user_informed;
1411 
1412 	user_informed = 0;
1413 
1414 	do {
1415 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1416 		    cr, douprintf, flags, fi);
1417 		if (!rpcerror) {
1418 			cred_t *crr;
1419 			if (*statusp == NFS3ERR_JUKEBOX) {
1420 				if (!user_informed) {
1421 					user_informed = 1;
1422 					uprintf(
1423 		"file temporarily unavailable on the server, retrying...\n");
1424 				}
1425 				delay(nfs3_jukebox_delay);
1426 			}
1427 			/*
1428 			 * See crnetadjust() for comments.
1429 			 */
1430 			else if (*statusp == NFS3ERR_ACCES &&
1431 			    (crr = crnetadjust(cr)) != NULL) {
1432 #ifdef DEBUG
1433 				acl3call_hits++;
1434 #endif
1435 				rpcerror = aclcall(mi, which, xdrargs, argsp,
1436 				    xdrres, resp, crr, douprintf, flags, fi);
1437 
1438 				crfree(crr);
1439 #ifdef DEBUG
1440 				if (*statusp == NFS3ERR_ACCES)
1441 					acl3call_misses++;
1442 #endif
1443 			}
1444 		}
1445 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1446 
1447 	return (rpcerror);
1448 }
1449 
/*
 * aclcall - issue one NFS_ACL RPC for the mount `mi'.
 *
 * Procedure `which' is sent with CLNT_CALL() on a client handle obtained
 * from acl_clget() for the mount's current server; `argsp' is encoded with
 * `xdrargs' and the reply is decoded into `resp' with `xdrres'.
 *
 * The return value is an errno-style code taken from rpcerr.re_errno:
 *   - 0 on the normal path where acl_clget() and CLNT_CALL() succeed;
 *   - EIO when the initial FS_OR_ZONE_GONE() check fires;
 *   - EINTR when failover_wait() reports an interruption or the call ends
 *     with RPC_INTR;
 *   - the error from failover_remap() (when it is not retried) or from
 *     acl_clget();
 *   - otherwise whatever rpcerr holds after the post-loop processing.
 * Only the transport-level outcome is returned; this function does not
 * look at the protocol status stored in `resp'.
 *
 * Retry policy (default case of the switch): a failure that is not
 * IS_UNRECOVERABLE_RPC() is retried when the mount is MI_HARD, or
 * MI_SEMISOFT with a non-zero mi_acl_ss_call_type[which], and the caller
 * did not pass RFSCALL_SOFT.  On a FAILOVER_MOUNT() where failover_safe(fi)
 * holds, the retry releases the client handle and restarts at
 * failoverretry, after calling failover_newserver() if the current server
 * has not already changed.
 *
 * Other effects visible in this function: bumps the per-zone calls and
 * badcalls counters and mi_aclreqs[which]; sets and clears MI_DOWN and
 * MI_PRINTED; clears MI_ACL and/or MI_EXTATTR for the RPC statuses listed
 * in the switch; clears *douprintf once the "not responding" message has
 * been handled for the user and sets it back to 1 after the "ok" path.
 */
1450 static int
1451 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1452     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1453     int flags, failinfo_t *fi)
1454 {
1455 	CLIENT *client;
1456 	struct chtab *ch;
1457 	enum clnt_stat status;
1458 	struct rpc_err rpcerr;
1459 	struct timeval wait;
1460 	int timeo;		/* in units of hz */
1461 #if 0 /* notyet */
1462 	int my_rsize, my_wsize;
1463 #endif
1464 	bool_t tryagain;
1465 	k_sigset_t smask;
1466 	servinfo_t *svp;
1467 	struct nfs_clnt *nfscl;
1468 	zoneid_t zoneid = getzoneid();
1469 #ifdef DEBUG
1470 	char *bufp;
1471 #endif
1472 
1473 #if 0 /* notyet */
1474 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1475 		"rfscall_start:which %d mi %p", which, mi);
1476 #endif
1477 
1478 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1479 	ASSERT(nfscl != NULL);
1480 
	/*
	 * Count this call in the per-zone client statistics and in the
	 * mount's per-procedure NFS_ACL request counters.
	 */
1481 	nfscl->nfscl_stat.calls.value.ui64++;
1482 	mi->mi_aclreqs[which].value.ui64++;
1483 
1484 	rpcerr.re_status = RPC_SUCCESS;
1485 
1486 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1487 		rpcerr.re_status = RPC_FAILED;
1488 		rpcerr.re_errno = EIO;
1489 		return (rpcerr.re_errno);
1490 	}
1491 
1492 #if 0 /* notyet */
1493 	/*
1494 	 * Remember the transfer sizes in case
1495 	 * nfs_feedback changes them underneath us.
1496 	 */
1497 	my_rsize = mi->mi_curread;
1498 	my_wsize = mi->mi_curwrite;
1499 #endif
1500 
1501 	/*
1502 	 * NFS client failover support
1503 	 *
1504 	 * If this rnode is not in sync with the current server (VALID_FH),
1505 	 * we'd like to do a remap to get in sync.  We can be interrupted
1506 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1507 	 * use the best info we have to try the RPC.  Part of that is
1508 	 * unconditionally updating the filehandle copy kept for V3.
1509 	 *
1510 	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1511 	 * rw_enter(); we're trying to keep the current server from being
1512 	 * changed on us until we're done with the remapping and have a
1513 	 * matching client handle.  We don't want to send a filehandle
1514 	 * to the wrong host.
1515 	 */
1516 failoverretry:
1517 	if (FAILOVER_MOUNT(mi)) {
1518 		mutex_enter(&mi->mi_lock);
1519 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1520 			if (failover_wait(mi)) {
1521 				mutex_exit(&mi->mi_lock);
1522 				return (EINTR);
1523 			}
1524 		}
1525 		INC_READERS(mi);
1526 		mutex_exit(&mi->mi_lock);
1527 		if (fi) {
1528 			if (!VALID_FH(fi) &&
1529 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1530 				int remaperr;
1531 
1532 				svp = mi->mi_curr_serv;
1533 				remaperr = failover_remap(fi);
1534 				if (remaperr != 0) {
1535 #ifdef DEBUG
1536 					if (remaperr != EINTR)
1537 						nfs_cmn_err(remaperr, CE_WARN,
1538 					    "aclcall couldn't failover: %m");
1539 #endif
1540 					mutex_enter(&mi->mi_lock);
1541 					DEC_READERS(mi);
1542 					mutex_exit(&mi->mi_lock);
1543 
1544 					/*
1545 					 * If failover_remap returns ETIMEDOUT
1546 					 * and the filesystem is hard mounted
1547 					 * we have to retry the call with a new
1548 					 * server.
1549 					 */
1550 					if ((mi->mi_flags & MI_HARD) &&
1551 					    IS_RECOVERABLE_ERROR(remaperr)) {
1552 						if (svp == mi->mi_curr_serv)
1553 							failover_newserver(mi);
1554 						rpcerr.re_status = RPC_SUCCESS;
1555 						goto failoverretry;
1556 					}
1557 					return (remaperr);
1558 				}
1559 			}
1560 			if (fi->fhp && fi->copyproc)
1561 				(*fi->copyproc)(fi->fhp, fi->vp);
1562 		}
1563 	}
1564 
1565 	/*
1566 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1567 	 * are guaranteed to reprocess the retry as a new request.
1568 	 */
1569 	svp = mi->mi_curr_serv;
1570 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
	/*
	 * Drop the reader hold taken above.  If obtaining the handle failed
	 * with ETIMEDOUT or ECONNRESET and failing over is safe, move to a
	 * new server (unless the current server already changed) and start
	 * over; no client handle is held on that path.
	 */
1571 	if (FAILOVER_MOUNT(mi)) {
1572 		mutex_enter(&mi->mi_lock);
1573 		DEC_READERS(mi);
1574 		mutex_exit(&mi->mi_lock);
1575 
1576 		if ((rpcerr.re_errno == ETIMEDOUT ||
1577 				rpcerr.re_errno == ECONNRESET) &&
1578 				failover_safe(fi)) {
1579 			if (svp == mi->mi_curr_serv)
1580 				failover_newserver(mi);
1581 			goto failoverretry;
1582 		}
1583 	}
1584 	if (rpcerr.re_errno != 0)
1585 		return (rpcerr.re_errno);
1586 
1587 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1588 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1589 		timeo = (mi->mi_timeo * hz) / 10;
1590 	} else {
1591 		mutex_enter(&mi->mi_lock);
1592 		timeo = CLNT_SETTIMERS(client,
1593 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1594 		    &(mi->mi_timers[NFS_CALLTYPES]),
1595 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1596 		    (void (*)()) 0, (caddr_t)mi, 0);
1597 		mutex_exit(&mi->mi_lock);
1598 	}
1599 
1600 	/*
1601 	 * If hard mounted fs, retry call forever unless hard error occurs.
1602 	 */
1603 	do {
1604 		tryagain = FALSE;
1605 
1606 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1607 			status = RPC_FAILED;
1608 			rpcerr.re_status = RPC_FAILED;
1609 			rpcerr.re_errno = EIO;
1610 			break;
1611 		}
1612 
1613 		TICK_TO_TIMEVAL(timeo, &wait);
1614 
1615 		/*
1616 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1617 		 * and SIGTERM. (Preserving the existing masks).
1618 		 * Mask out SIGINT if mount option nointr is specified.
1619 		 */
1620 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1621 		if (!(mi->mi_flags & MI_INT))
1622 			client->cl_nosignal = TRUE;
1623 
1624 		/*
1625 		 * If there is a current signal, then don't bother
1626 		 * even trying to send out the request because we
1627 		 * won't be able to block waiting for the response.
1628 		 * Simply assume RPC_INTR and get on with it.
1629 		 */
1630 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1631 			status = RPC_INTR;
1632 		else {
1633 			status = CLNT_CALL(client, which, xdrargs, argsp,
1634 			    xdrres, resp, wait);
1635 		}
1636 
1637 		if (!(mi->mi_flags & MI_INT))
1638 			client->cl_nosignal = FALSE;
1639 		/*
1640 		 * restore original signal mask
1641 		 */
1642 		sigunintr(&smask);
1643 
1644 		switch (status) {
1645 		case RPC_SUCCESS:
1646 #if 0 /* notyet */
1647 			if ((mi->mi_flags & MI_DYNAMIC) &&
1648 			    mi->mi_timer_type[which] != 0 &&
1649 			    (mi->mi_curread != my_rsize ||
1650 			    mi->mi_curwrite != my_wsize))
1651 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1652 #endif
1653 			break;
1654 
1655 		/*
1656 		 * Unfortunately, there are servers in the world which
1657 		 * are not coded correctly.  They are not prepared to
1658 		 * handle RPC requests to the NFS port which are not
1659 		 * NFS requests.  Thus, they may try to process the
1660 		 * NFS_ACL request as if it were an NFS request.  This
1661 		 * does not work.  Generally, an error will be generated
1662 		 * on the client because it will not be able to decode
1663 		 * the response from the server.  However, it seems
1664 		 * possible that the server may not be able to decode
1665 		 * the arguments.  Thus, the criteria for deciding
1666 		 * whether the server supports NFS_ACL or not is whether
1667 		 * the following RPC errors are returned from CLNT_CALL.
1668 		 */
1669 		case RPC_CANTDECODERES:
1670 		case RPC_PROGUNAVAIL:
1671 		case RPC_CANTDECODEARGS:
1672 		case RPC_PROGVERSMISMATCH:
1673 			mutex_enter(&mi->mi_lock);
1674 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1675 			mutex_exit(&mi->mi_lock);
1676 			break;
1677 
1678 		/*
1679 		 * If the server supports NFS_ACL but not the new ops
1680 		 * for extended attributes, make sure we don't retry.
1681 		 */
1682 		case RPC_PROCUNAVAIL:
1683 			mutex_enter(&mi->mi_lock);
1684 			mi->mi_flags &= ~MI_EXTATTR;
1685 			mutex_exit(&mi->mi_lock);
1686 			break;
1687 
1688 		case RPC_INTR:
1689 			/*
1690 			 * There is no way to recover from this error,
1691 			 * even if mount option nointr is specified.
1692 			 * SIGKILL, for example, cannot be blocked.
1693 			 */
1694 			rpcerr.re_status = RPC_INTR;
1695 			rpcerr.re_errno = EINTR;
1696 			break;
1697 
1698 		case RPC_UDERROR:
1699 			/*
1700 			 * If the NFS server is local (vold) and
1701 			 * it goes away then we get RPC_UDERROR.
1702 			 * This is a retryable error, so we would
1703 			 * loop, so check to see if the specific
1704 			 * error was ECONNRESET, indicating that
1705 			 * target did not exist at all.  If so,
1706 			 * return with RPC_PROGUNAVAIL and
1707 			 * ECONNRESET to indicate why.
1708 			 */
1709 			CLNT_GETERR(client, &rpcerr);
1710 			if (rpcerr.re_errno == ECONNRESET) {
1711 				rpcerr.re_status = RPC_PROGUNAVAIL;
1712 				rpcerr.re_errno = ECONNRESET;
1713 				break;
1714 			}
1715 			/*FALLTHROUGH*/
1716 
1717 		default:		/* probably RPC_TIMEDOUT */
1718 			if (IS_UNRECOVERABLE_RPC(status))
1719 				break;
1720 
1721 			/*
1722 			 * increment server not responding count
1723 			 */
1724 			mutex_enter(&mi->mi_lock);
1725 			mi->mi_noresponse++;
1726 			mutex_exit(&mi->mi_lock);
1727 #ifdef DEBUG
1728 			nfscl->nfscl_stat.noresponse.value.ui64++;
1729 #endif
1730 
1731 			if (!(mi->mi_flags & MI_HARD)) {
1732 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1733 				    (mi->mi_acl_ss_call_type[which] == 0))
1734 					break;
1735 			}
1736 
1737 			/*
1738 			 * The call is in progress (over COTS).
1739 			 * Try the CLNT_CALL again, but don't
1740 			 * print a noisy error message.
1741 			 */
1742 			if (status == RPC_INPROGRESS) {
1743 				tryagain = TRUE;
1744 				break;
1745 			}
1746 
1747 			if (flags & RFSCALL_SOFT)
1748 				break;
1749 
1750 			/*
1751 			 * On zone shutdown, just move on.
1752 			 */
1753 			if (zone_status_get(curproc->p_zone) >=
1754 			    ZONE_IS_SHUTTING_DOWN) {
1755 				rpcerr.re_status = RPC_FAILED;
1756 				rpcerr.re_errno = EIO;
1757 				break;
1758 			}
1759 
1760 			/*
1761 			 * NFS client failover support
1762 			 *
1763 			 * If the current server just failed us, we'll
1764 			 * start the process of finding a new server.
1765 			 * After that, we can just retry.
1766 			 */
1767 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1768 				if (svp == mi->mi_curr_serv)
1769 					failover_newserver(mi);
1770 				clfree_impl(client, ch, nfscl);
1771 				goto failoverretry;
1772 			}
1773 
1774 			tryagain = TRUE;
1775 			timeo = backoff(timeo);
1776 			mutex_enter(&mi->mi_lock);
1777 			if (!(mi->mi_flags & MI_PRINTED)) {
1778 				mi->mi_flags |= MI_PRINTED;
1779 				mutex_exit(&mi->mi_lock);
1780 #ifdef DEBUG
1781 				zprintf(zoneid,
1782 			"NFS_ACL%d server %s not responding still trying\n",
1783 				    mi->mi_vers, svp->sv_hostname);
1784 #else
1785 				zprintf(zoneid,
1786 			    "NFS server %s not responding still trying\n",
1787 				    svp->sv_hostname);
1788 #endif
1789 			} else
1790 				mutex_exit(&mi->mi_lock);
1791 			if (*douprintf && curproc->p_sessp->s_vp != NULL) {
1792 				*douprintf = 0;
1793 				if (!(mi->mi_flags & MI_NOPRINT))
1794 #ifdef DEBUG
1795 					uprintf(
1796 			"NFS_ACL%d server %s not responding still trying\n",
1797 					    mi->mi_vers, svp->sv_hostname);
1798 #else
1799 					uprintf(
1800 			    "NFS server %s not responding still trying\n",
1801 					    svp->sv_hostname);
1802 #endif
1803 			}
1804 
1805 #if 0 /* notyet */
1806 			/*
1807 			 * If doing dynamic adjustment of transfer
1808 			 * size and if it's a read or write call
1809 			 * and if the transfer size changed while
1810 			 * retransmitting or if the feedback routine
1811 			 * changed the transfer size,
1812 			 * then exit rfscall so that the transfer
1813 			 * size can be adjusted at the vnops level.
1814 			 */
1815 			if ((mi->mi_flags & MI_DYNAMIC) &&
1816 			    mi->mi_acl_timer_type[which] != 0 &&
1817 			    (mi->mi_curread != my_rsize ||
1818 			    mi->mi_curwrite != my_wsize ||
1819 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1820 				/*
1821 				 * On read or write calls, return
1822 				 * back to the vnode ops level if
1823 				 * the transfer size changed.
1824 				 */
1825 				clfree_impl(client, ch, nfscl);
1826 				return (ENFS_TRYAGAIN);
1827 			}
1828 #endif
1829 		}
1830 	} while (tryagain);
1831 
1832 	if (status != RPC_SUCCESS) {
1833 		/*
1834 		 * Let soft mounts use the timed out message.
1835 		 */
1836 		if (status == RPC_INPROGRESS)
1837 			status = RPC_TIMEDOUT;
1838 		nfscl->nfscl_stat.badcalls.value.ui64++;
		/*
		 * The statuses handled in the switch as "server lacks
		 * NFS_ACL / extended attribute support" only fetch the
		 * error here: the mount is not marked MI_DOWN and nothing
		 * is printed for them.
		 *
		 * NOTE(review): RPC_PROGVERSMISMATCH is caught by this first
		 * test, so the RPC_PROGVERSMISMATCH comparison in the
		 * else-branch below cannot be true in this function.  Also,
		 * CLNT_GETERR() presumably refills rpcerr, which would
		 * replace the RPC_FAILED/EIO stored by the early exits
		 * inside the loop -- confirm that is intended.
		 */
1839 		if (status == RPC_CANTDECODERES ||
1840 		    status == RPC_PROGUNAVAIL ||
1841 		    status == RPC_PROCUNAVAIL ||
1842 		    status == RPC_CANTDECODEARGS ||
1843 		    status == RPC_PROGVERSMISMATCH)
1844 			CLNT_GETERR(client, &rpcerr);
1845 		else if (status != RPC_INTR) {
1846 			mutex_enter(&mi->mi_lock);
1847 			mi->mi_flags |= MI_DOWN;
1848 			mutex_exit(&mi->mi_lock);
1849 			CLNT_GETERR(client, &rpcerr);
1850 #ifdef DEBUG
1851 			bufp = clnt_sperror(client, svp->sv_hostname);
1852 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1853 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1854 			if (curproc->p_sessp->s_vp != NULL) {
1855 				if (!(mi->mi_flags & MI_NOPRINT)) {
1856 					uprintf("NFS_ACL%d %s failed for %s\n",
1857 					    mi->mi_vers, mi->mi_aclnames[which],
1858 					    bufp);
1859 				}
1860 			}
1861 			kmem_free(bufp, MAXPATHLEN);
1862 #else
1863 			zprintf(zoneid,
1864 			    "NFS %s failed for server %s: error %d (%s)\n",
1865 			    mi->mi_aclnames[which], svp->sv_hostname,
1866 			    status, clnt_sperrno(status));
1867 			if (curproc->p_sessp->s_vp != NULL) {
1868 				if (!(mi->mi_flags & MI_NOPRINT))
1869 					uprintf(
1870 				"NFS %s failed for server %s: error %d (%s)\n",
1871 					    mi->mi_aclnames[which],
1872 					    svp->sv_hostname, status,
1873 					    clnt_sperrno(status));
1874 			}
1875 #endif
1876 			/*
1877 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1878 			 * re_errno is set appropriately depending on
1879 			 * the authentication error
1880 			 */
1881 			if (status == RPC_VERSMISMATCH ||
1882 			    status == RPC_PROGVERSMISMATCH)
1883 				rpcerr.re_errno = EIO;
1884 		}
1885 	} else {
1886 		/*
1887 		 * Test the value of mi_down and mi_printed without
1888 		 * holding the mi_lock mutex.  If they are both zero,
1889 		 * then it is okay to skip the down and printed
1890 		 * processing.  This saves on a mutex_enter and
1891 		 * mutex_exit pair for a normal, successful RPC.
1892 		 * This was just complete overhead.
1893 		 */
1894 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1895 			mutex_enter(&mi->mi_lock);
1896 			mi->mi_flags &= ~MI_DOWN;
1897 			if (mi->mi_flags & MI_PRINTED) {
1898 				mi->mi_flags &= ~MI_PRINTED;
1899 				mutex_exit(&mi->mi_lock);
1900 #ifdef DEBUG
1901 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1902 				    mi->mi_vers, svp->sv_hostname);
1903 #else
1904 				zprintf(zoneid, "NFS server %s ok\n",
1905 				    svp->sv_hostname);
1906 #endif
1907 			} else
1908 				mutex_exit(&mi->mi_lock);
1909 		}
1910 
1911 		if (*douprintf == 0) {
1912 			if (!(mi->mi_flags & MI_NOPRINT))
1913 #ifdef DEBUG
1914 				uprintf("NFS_ACL%d server %s ok\n",
1915 				    mi->mi_vers, svp->sv_hostname);
1916 #else
1917 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1918 #endif
1919 			*douprintf = 1;
1920 		}
1921 	}
1922 
1923 	clfree_impl(client, ch, nfscl);
1924 
1925 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1926 
1927 #if 0 /* notyet */
1928 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1929 	    rpcerr.re_errno);
1930 #endif
1931 
1932 	return (rpcerr.re_errno);
1933 }
1934 
1935 int
1936 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1937 {
1938 	uint_t mask = vap->va_mask;
1939 
1940 	if (!(mask & AT_MODE))
1941 		sa->sa_mode = (uint32_t)-1;
1942 	else
1943 		sa->sa_mode = vap->va_mode;
1944 	if (!(mask & AT_UID))
1945 		sa->sa_uid = (uint32_t)-1;
1946 	else
1947 		sa->sa_uid = (uint32_t)vap->va_uid;
1948 	if (!(mask & AT_GID))
1949 		sa->sa_gid = (uint32_t)-1;
1950 	else
1951 		sa->sa_gid = (uint32_t)vap->va_gid;
1952 	if (!(mask & AT_SIZE))
1953 		sa->sa_size = (uint32_t)-1;
1954 	else
1955 		sa->sa_size = (uint32_t)vap->va_size;
1956 	if (!(mask & AT_ATIME))
1957 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
1958 	else {
1959 		/* check time validity */
1960 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1961 			return (EOVERFLOW);
1962 		}
1963 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
1964 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
1965 	}
1966 	if (!(mask & AT_MTIME))
1967 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
1968 	else {
1969 		/* check time validity */
1970 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
1971 			return (EOVERFLOW);
1972 		}
1973 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
1974 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
1975 	}
1976 	return (0);
1977 }
1978 
1979 int
1980 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
1981 {
1982 	uint_t mask = vap->va_mask;
1983 
1984 	if (!(mask & AT_MODE))
1985 		sa->mode.set_it = FALSE;
1986 	else {
1987 		sa->mode.set_it = TRUE;
1988 		sa->mode.mode = (mode3)vap->va_mode;
1989 	}
1990 	if (!(mask & AT_UID))
1991 		sa->uid.set_it = FALSE;
1992 	else {
1993 		sa->uid.set_it = TRUE;
1994 		sa->uid.uid = (uid3)vap->va_uid;
1995 	}
1996 	if (!(mask & AT_GID))
1997 		sa->gid.set_it = FALSE;
1998 	else {
1999 		sa->gid.set_it = TRUE;
2000 		sa->gid.gid = (gid3)vap->va_gid;
2001 	}
2002 	if (!(mask & AT_SIZE))
2003 		sa->size.set_it = FALSE;
2004 	else {
2005 		sa->size.set_it = TRUE;
2006 		sa->size.size = (size3)vap->va_size;
2007 	}
2008 	if (!(mask & AT_ATIME))
2009 		sa->atime.set_it = DONT_CHANGE;
2010 	else {
2011 		/* check time validity */
2012 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2013 			return (EOVERFLOW);
2014 		}
2015 		sa->atime.set_it = SET_TO_CLIENT_TIME;
2016 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2017 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2018 	}
2019 	if (!(mask & AT_MTIME))
2020 		sa->mtime.set_it = DONT_CHANGE;
2021 	else {
2022 		/* check time validity */
2023 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2024 			return (EOVERFLOW);
2025 		}
2026 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2027 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2028 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2029 	}
2030 	return (0);
2031 }
2032 
2033 void
2034 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2035 {
2036 
2037 	da->da_fhandle = VTOFH(dvp);
2038 	da->da_name = nm;
2039 	da->da_flags = 0;
2040 }
2041 
2042 void
2043 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2044 {
2045 
2046 	da->dirp = VTOFH3(dvp);
2047 	da->name = nm;
2048 }
2049 
2050 int
2051 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2052 {
2053 	int error;
2054 	rnode_t *rp;
2055 	struct vattr va;
2056 
2057 	va.va_mask = AT_MODE | AT_GID;
2058 	error = VOP_GETATTR(dvp, &va, 0, cr);
2059 	if (error)
2060 		return (error);
2061 
2062 	/*
2063 	 * To determine the expected group-id of the created file:
2064 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2065 	 *	GRPID option, and the directory's set-gid bit is clear,
2066 	 *	then use the process's gid.
2067 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2068 	 */
2069 	rp = VTOR(dvp);
2070 	mutex_enter(&rp->r_statelock);
2071 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2072 		*gidp = crgetgid(cr);
2073 	else
2074 		*gidp = va.va_gid;
2075 	mutex_exit(&rp->r_statelock);
2076 	return (0);
2077 }
2078 
2079 int
2080 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2081 {
2082 	int error;
2083 	struct vattr va;
2084 
2085 	va.va_mask = AT_MODE;
2086 	error = VOP_GETATTR(dvp, &va, 0, cr);
2087 	if (error)
2088 		return (error);
2089 
2090 	/*
2091 	 * Modify the expected mode (om) so that the set-gid bit matches
2092 	 * that of the parent directory (dvp).
2093 	 */
2094 	if (va.va_mode & VSGID)
2095 		*omp |= VSGID;
2096 	else
2097 		*omp &= ~VSGID;
2098 	return (0);
2099 }
2100 
2101 void
2102 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2103 {
2104 
2105 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2106 		if (!(vp->v_flag & VSWAPLIKE)) {
2107 			mutex_enter(&vp->v_lock);
2108 			vp->v_flag |= VSWAPLIKE;
2109 			mutex_exit(&vp->v_lock);
2110 		}
2111 	} else {
2112 		if (vp->v_flag & VSWAPLIKE) {
2113 			mutex_enter(&vp->v_lock);
2114 			vp->v_flag &= ~VSWAPLIKE;
2115 			mutex_exit(&vp->v_lock);
2116 		}
2117 	}
2118 }
2119 
2120 /*
2121  * Free the resources associated with an rnode.
2122  */
2123 static void
2124 rinactive(rnode_t *rp, cred_t *cr)
2125 {
2126 	vnode_t *vp;
2127 	cred_t *cred;
2128 	char *contents;
2129 	int size;
2130 	vsecattr_t *vsp;
2131 	int error;
2132 	nfs3_pathconf_info *info;
2133 
2134 	/*
2135 	 * Before freeing anything, wait until all asynchronous
2136 	 * activity is done on this rnode.  This will allow all
2137 	 * asynchronous read ahead and write behind i/o's to
2138 	 * finish.
2139 	 */
2140 	mutex_enter(&rp->r_statelock);
2141 	while (rp->r_count > 0)
2142 		cv_wait(&rp->r_cv, &rp->r_statelock);
2143 	mutex_exit(&rp->r_statelock);
2144 
2145 	/*
2146 	 * Flush and invalidate all pages associated with the vnode.
2147 	 */
2148 	vp = RTOV(rp);
2149 	if (vn_has_cached_data(vp)) {
2150 		ASSERT(vp->v_type != VCHR);
2151 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2152 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr);
2153 			if (error && (error == ENOSPC || error == EDQUOT)) {
2154 				mutex_enter(&rp->r_statelock);
2155 				if (!rp->r_error)
2156 					rp->r_error = error;
2157 				mutex_exit(&rp->r_statelock);
2158 			}
2159 		}
2160 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2161 	}
2162 
2163 	/*
2164 	 * Free any held credentials and caches which may be associated
2165 	 * with this rnode.
2166 	 */
2167 	mutex_enter(&rp->r_statelock);
2168 	cred = rp->r_cred;
2169 	rp->r_cred = NULL;
2170 	contents = rp->r_symlink.contents;
2171 	size = rp->r_symlink.size;
2172 	rp->r_symlink.contents = NULL;
2173 	vsp = rp->r_secattr;
2174 	rp->r_secattr = NULL;
2175 	info = rp->r_pathconf;
2176 	rp->r_pathconf = NULL;
2177 	mutex_exit(&rp->r_statelock);
2178 
2179 	/*
2180 	 * Free the held credential.
2181 	 */
2182 	if (cred != NULL)
2183 		crfree(cred);
2184 
2185 	/*
2186 	 * Free the access cache entries.
2187 	 */
2188 	(void) nfs_access_purge_rp(rp);
2189 
2190 	/*
2191 	 * Free the readdir cache entries.
2192 	 */
2193 	if (HAVE_RDDIR_CACHE(rp))
2194 		nfs_purge_rddir_cache(vp);
2195 
2196 	/*
2197 	 * Free the symbolic link cache.
2198 	 */
2199 	if (contents != NULL) {
2200 
2201 		kmem_free((void *)contents, size);
2202 	}
2203 
2204 	/*
2205 	 * Free any cached ACL.
2206 	 */
2207 	if (vsp != NULL)
2208 		nfs_acl_free(vsp);
2209 
2210 	/*
2211 	 * Free any cached pathconf information.
2212 	 */
2213 	if (info != NULL)
2214 		kmem_free(info, sizeof (*info));
2215 }
2216 
2217 /*
2218  * Return a vnode for the given NFS Version 2 file handle.
2219  * If no rnode exists for this fhandle, create one and put it
2220  * into the hash queues.  If the rnode for this fhandle
2221  * already exists, return it.
2222  *
2223  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2224  */
2225 vnode_t *
2226 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2227     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2228 {
2229 	int newnode;
2230 	int index;
2231 	vnode_t *vp;
2232 	nfs_fhandle nfh;
2233 	vattr_t va;
2234 
2235 	nfh.fh_len = NFS_FHSIZE;
2236 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2237 
2238 	index = rtablehash(&nfh);
2239 	rw_enter(&rtable[index].r_lock, RW_READER);
2240 
2241 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2242 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2243 
2244 	if (attr != NULL) {
2245 		if (!newnode) {
2246 			rw_exit(&rtable[index].r_lock);
2247 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2248 		} else {
2249 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2250 				vp->v_type = VBAD;
2251 			else
2252 				vp->v_type = n2v_type(attr);
2253 			/*
2254 			 * A translation here seems to be necessary
2255 			 * because this function can be called
2256 			 * with `attr' that has come from the wire,
2257 			 * and been operated on by vattr_to_nattr().
2258 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2259 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2260 			 * ->makenfsnode().
2261 			 */
2262 			if ((attr->na_rdev & 0xffff0000) == 0)
2263 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2264 			else
2265 				vp->v_rdev = expldev(n2v_rdev(attr));
2266 			nfs_attrcache(vp, attr, t);
2267 			rw_exit(&rtable[index].r_lock);
2268 		}
2269 	} else {
2270 		if (newnode) {
2271 			PURGE_ATTRCACHE(vp);
2272 		}
2273 		rw_exit(&rtable[index].r_lock);
2274 	}
2275 
2276 	return (vp);
2277 }
2278 
2279 /*
2280  * Return a vnode for the given NFS Version 3 file handle.
2281  * If no rnode exists for this fhandle, create one and put it
2282  * into the hash queues.  If the rnode for this fhandle
2283  * already exists, return it.
2284  *
2285  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2286  */
2287 vnode_t *
2288 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2289     cred_t *cr, char *dnm, char *nm)
2290 {
2291 	int newnode;
2292 	int index;
2293 	vnode_t *vp;
2294 
2295 	index = rtablehash((nfs_fhandle *)fh);
2296 	rw_enter(&rtable[index].r_lock, RW_READER);
2297 
2298 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2299 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2300 	    dnm, nm);
2301 
2302 	if (vap == NULL) {
2303 		if (newnode) {
2304 			PURGE_ATTRCACHE(vp);
2305 		}
2306 		rw_exit(&rtable[index].r_lock);
2307 		return (vp);
2308 	}
2309 
2310 	if (!newnode) {
2311 		rw_exit(&rtable[index].r_lock);
2312 		nfs_attr_cache(vp, vap, t, cr);
2313 	} else {
2314 		rnode_t *rp = VTOR(vp);
2315 
2316 		vp->v_type = vap->va_type;
2317 		vp->v_rdev = vap->va_rdev;
2318 
2319 		mutex_enter(&rp->r_statelock);
2320 		if (rp->r_mtime <= t)
2321 			nfs_attrcache_va(vp, vap);
2322 		mutex_exit(&rp->r_statelock);
2323 		rw_exit(&rtable[index].r_lock);
2324 	}
2325 
2326 	return (vp);
2327 }
2328 
2329 vnode_t *
2330 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2331     cred_t *cr, char *dnm, char *nm)
2332 {
2333 	int newnode;
2334 	int index;
2335 	vnode_t *vp;
2336 	vattr_t va;
2337 
2338 	index = rtablehash((nfs_fhandle *)fh);
2339 	rw_enter(&rtable[index].r_lock, RW_READER);
2340 
2341 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2342 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2343 	    dnm, nm);
2344 
2345 	if (attr == NULL) {
2346 		if (newnode) {
2347 			PURGE_ATTRCACHE(vp);
2348 		}
2349 		rw_exit(&rtable[index].r_lock);
2350 		return (vp);
2351 	}
2352 
2353 	if (!newnode) {
2354 		rw_exit(&rtable[index].r_lock);
2355 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2356 	} else {
2357 		if (attr->type < NF3REG || attr->type > NF3FIFO)
2358 			vp->v_type = VBAD;
2359 		else
2360 			vp->v_type = nf3_to_vt[attr->type];
2361 		vp->v_rdev = makedevice(attr->rdev.specdata1,
2362 			    attr->rdev.specdata2);
2363 		nfs3_attrcache(vp, attr, t);
2364 		rw_exit(&rtable[index].r_lock);
2365 	}
2366 
2367 	return (vp);
2368 }
2369 
2370 /*
2371  * Read this comment before making changes to rtablehash()!
2372  * This is a hash function in which seemingly obvious and harmless
2373  * changes can cause escalations costing million dollars!
2374  * Know what you are doing.
2375  *
2376  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2377  * algorithm is currently detailed here:
2378  *
2379  *   http://burtleburtle.net/bob/hash/doobs.html
2380  *
2381  * Of course, the above link may not be valid by the time you are reading
2382  * this, but suffice it to say that the one-at-a-time algorithm works well in
2383  * almost all cases.  If you are changing the algorithm be sure to verify that
2384  * the hash algorithm still provides even distribution in all cases and with
2385  * any server returning filehandles in whatever order (sequential or random).
2386  */
2387 static int
2388 rtablehash(nfs_fhandle *fh)
2389 {
2390 	ulong_t hash, len, i;
2391 	char *key;
2392 
2393 	key = fh->fh_buf;
2394 	len = (ulong_t)fh->fh_len;
2395 	for (hash = 0, i = 0; i < len; i++) {
2396 		hash += key[i];
2397 		hash += (hash << 10);
2398 		hash ^= (hash >> 6);
2399 	}
2400 	hash += (hash << 3);
2401 	hash ^= (hash >> 11);
2402 	hash += (hash << 15);
2403 	return (hash & rtablemask);
2404 }
2405 
/*
 * Find the rnode for file handle `fh' on `vfsp' in hash bucket `rhtp',
 * or build a new one and insert it into that bucket.
 *
 * The caller must hold rhtp->r_lock as a reader (asserted below).  The
 * lock is dropped while an rnode is recycled or allocated, so the bucket
 * is searched again before the new rnode is inserted.
 *
 * On return:
 *	*newnode == 0	an existing rnode was found; rhtp->r_lock is held
 *			as a reader.
 *	*newnode == 1	a new rnode was hashed; rhtp->r_lock is held as a
 *			writer.
 * In both cases the caller is expected to rw_exit() the lock.
 *
 * `vops', `putapage' and `compar' supply the vnode operations, the
 * r_putapage routine and the r_dir AVL comparison function for a new
 * rnode.  `dnm' and `nm' are only used on failover mounts, to record
 * the path "dnm/nm" (or "." when either is NULL) in r_path.
 */
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int (*compar)(const void *, const void *),
    int *newnode, cred_t *cr, char *dnm, char *nm)
{
	rnode_t *rp;
	rnode_t *trp;
	vnode_t *vp;
	mntinfo_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI(vfsp);
start:
	/* Fast path: the rnode is already in the hash bucket. */
	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV(rp);
		nfs_set_vroot(vp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	/*
	 * Recycle the rnode at the head of the freelist once the
	 * number of allocated rnodes has reached nrnode; otherwise
	 * allocate a brand new rnode and vnode.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist != NULL && rnew >= nrnode) {
		rp = rpfreelist;
		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);

		vp = RTOV(rp);

		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Another reference exists, so this rnode cannot
			 * be recycled: give up our reference and retry.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the reference count now that rinactive() has
		 * run; if a reference was gained, give up ours and retry.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);
		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		nfs_rw_destroy(&rp->r_rwlock);
		nfs_rw_destroy(&rp->r_lkserlock);
		mutex_destroy(&rp->r_statelock);
		cv_destroy(&rp->r_cv);
		cv_destroy(&rp->r_commit.c_cv);
		nfs_free_r_path(rp);
		avl_destroy(&rp->r_dir);
		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rpfreelist_lock);

		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		/* account for the newly allocated rnode */
		atomic_add_long((ulong_t *)&rnew, 1);
#ifdef DEBUG
		clstat_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	/*
	 * (Re)initialize the rnode from scratch: zero it, create its
	 * locks and condition variables, and copy in the file handle.
	 */
	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_fh.fh_len = fh->fh_len;
	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
	rp->r_server = mi->mi_curr_serv;
	if (FAILOVER_MOUNT(mi)) {
		/*
		 * If replicated servers, stash pathnames
		 */
		if (dnm != NULL && nm != NULL) {
			char *s, *p;
			uint_t len;

			/* room for "dnm" + '/' + "nm" + terminating NUL */
			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
			rp->r_path = kmem_alloc(len, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += len;
#endif
			s = rp->r_path;
			for (p = dnm; *p; p++)
				*s++ = *p;
			*s++ = '/';
			for (p = nm; *p; p++)
				*s++ = *p;
			*s = '\0';
		} else {
			/* special case for root */
			rp->r_path = kmem_alloc(2, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += 2;
#endif
			*rp->r_path = '.';
			*(rp->r_path + 1) = '\0';
		}
	}
	/* the rnode holds a reference on its vfs */
	VFS_HOLD(vfsp);
	rp->r_putapage = putapage;
	rp->r_hashq = rhtp;
	rp->r_flags = RREADDIRPLUS;
	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	vp->v_type = VNON;
	nfs_set_vroot(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV(trp);
		nfs_set_vroot(vp);
		*newnode = 0;
		/*
		 * Lost the race: hand the rnode built above to
		 * rp_addfree() and return the one that was found,
		 * with the bucket lock held as a reader again.
		 */
		rw_exit(&rhtp->r_lock);
		rp_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	/* Insert the new rnode; the writer lock is still held on return. */
	rp_addhash(rp);
	*newnode = 1;
	return (vp);
}
2568 
2569 static void
2570 nfs_set_vroot(vnode_t *vp)
2571 {
2572 	rnode_t *rp;
2573 	nfs_fhandle *rootfh;
2574 
2575 	rp = VTOR(vp);
2576 	rootfh = &rp->r_server->sv_fhandle;
2577 	if (rootfh->fh_len == rp->r_fh.fh_len &&
2578 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2579 		if (!(vp->v_flag & VROOT)) {
2580 			mutex_enter(&vp->v_lock);
2581 			vp->v_flag |= VROOT;
2582 			mutex_exit(&vp->v_lock);
2583 		}
2584 	}
2585 }
2586 
2587 static void
2588 nfs_free_r_path(rnode_t *rp)
2589 {
2590 	char *path;
2591 	size_t len;
2592 
2593 	path = rp->r_path;
2594 	if (path) {
2595 		rp->r_path = NULL;
2596 		len = strlen(path) + 1;
2597 		kmem_free(path, len);
2598 #ifdef DEBUG
2599 		clstat_debug.rpath.value.ui64 -= len;
2600 #endif
2601 	}
2602 }
2603 
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 *
 * The caller passes in a reference on the vnode (v_count >= 1 is
 * asserted).  If another reference is found at any of the rechecks
 * below, this one is simply dropped and the rnode is left alone.
 * `cr' is passed through to rinactive() when the rnode is freed.
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * or if the file system has been unmounted,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/* someone else holds a reference; just drop ours */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with RDIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist == NULL) {
		/* empty list: the rnode becomes a one-element ring */
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rpfreelist = rp;
	} else {
		/*
		 * Link the rnode in just behind the current head, i.e.
		 * at the tail of the circular list; moving the head
		 * pointer to it then makes it the front instead.
		 */
		rp->r_freef = rpfreelist;
		rp->r_freeb = rpfreelist->r_freeb;
		rpfreelist->r_freeb->r_freef = rp;
		rpfreelist->r_freeb = rp;
		if (!vn_has_cached_data(vp) &&
		    !HAVE_RDDIR_CACHE(rp) &&
		    rp->r_symlink.contents == NULL &&
		    rp->r_secattr == NULL &&
		    rp->r_pathconf == NULL)
			rpfreelist = rp;
	}
	mutex_exit(&rpfreelist_lock);

	rw_exit(&rp->r_hashq->r_lock);
}
2724 
2725 /*
2726  * Remove an rnode from the free list.
2727  *
2728  * The caller must be holding rpfreelist_lock and the rnode
2729  * must be on the freelist.
2730  */
2731 static void
2732 rp_rmfree(rnode_t *rp)
2733 {
2734 
2735 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2736 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2737 
2738 	if (rp == rpfreelist) {
2739 		rpfreelist = rp->r_freef;
2740 		if (rp == rpfreelist)
2741 			rpfreelist = NULL;
2742 	}
2743 
2744 	rp->r_freeb->r_freef = rp->r_freef;
2745 	rp->r_freef->r_freeb = rp->r_freeb;
2746 
2747 	rp->r_freef = rp->r_freeb = NULL;
2748 }
2749 
2750 /*
2751  * Put a rnode in the hash table.
2752  *
2753  * The caller must be holding the exclusive hash queue lock.
2754  */
2755 static void
2756 rp_addhash(rnode_t *rp)
2757 {
2758 
2759 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2760 	ASSERT(!(rp->r_flags & RHASHED));
2761 
2762 	rp->r_hashf = rp->r_hashq->r_hashf;
2763 	rp->r_hashq->r_hashf = rp;
2764 	rp->r_hashb = (rnode_t *)rp->r_hashq;
2765 	rp->r_hashf->r_hashb = rp;
2766 
2767 	mutex_enter(&rp->r_statelock);
2768 	rp->r_flags |= RHASHED;
2769 	mutex_exit(&rp->r_statelock);
2770 }
2771 
2772 /*
2773  * Remove a rnode from the hash table.
2774  *
2775  * The caller must be holding the hash queue lock.
2776  */
2777 static void
2778 rp_rmhash_locked(rnode_t *rp)
2779 {
2780 
2781 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2782 	ASSERT(rp->r_flags & RHASHED);
2783 
2784 	rp->r_hashb->r_hashf = rp->r_hashf;
2785 	rp->r_hashf->r_hashb = rp->r_hashb;
2786 
2787 	mutex_enter(&rp->r_statelock);
2788 	rp->r_flags &= ~RHASHED;
2789 	mutex_exit(&rp->r_statelock);
2790 }
2791 
2792 /*
2793  * Remove a rnode from the hash table.
2794  *
2795  * The caller must not be holding the hash queue lock.
2796  */
2797 void
2798 rp_rmhash(rnode_t *rp)
2799 {
2800 
2801 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2802 	rp_rmhash_locked(rp);
2803 	rw_exit(&rp->r_hashq->r_lock);
2804 }
2805 
2806 /*
2807  * Lookup a rnode by fhandle.
2808  *
2809  * The caller must be holding the hash queue lock, either shared or exclusive.
2810  */
2811 static rnode_t *
2812 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2813 {
2814 	rnode_t *rp;
2815 	vnode_t *vp;
2816 
2817 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2818 
2819 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2820 		vp = RTOV(rp);
2821 		if (vp->v_vfsp == vfsp &&
2822 		    rp->r_fh.fh_len == fh->fh_len &&
2823 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2824 			/*
2825 			 * remove rnode from free list, if necessary.
2826 			 */
2827 			if (rp->r_freef != NULL) {
2828 				mutex_enter(&rpfreelist_lock);
2829 				/*
2830 				 * If the rnode is on the freelist,
2831 				 * then remove it and use that reference
2832 				 * as the new reference.  Otherwise,
2833 				 * need to increment the reference count.
2834 				 */
2835 				if (rp->r_freef != NULL) {
2836 					rp_rmfree(rp);
2837 					mutex_exit(&rpfreelist_lock);
2838 				} else {
2839 					mutex_exit(&rpfreelist_lock);
2840 					VN_HOLD(vp);
2841 				}
2842 			} else
2843 				VN_HOLD(vp);
2844 			return (rp);
2845 		}
2846 	}
2847 	return (NULL);
2848 }
2849 
2850 /*
2851  * Return 1 if there is a active vnode belonging to this vfs in the
2852  * rtable cache.
2853  *
2854  * Several of these checks are done without holding the usual
2855  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2856  * etc. will redo the necessary checks before actually destroying
2857  * any rnodes.
2858  */
2859 int
2860 check_rtable(struct vfs *vfsp)
2861 {
2862 	int index;
2863 	rnode_t *rp;
2864 	vnode_t *vp;
2865 
2866 	for (index = 0; index < rtablesize; index++) {
2867 		rw_enter(&rtable[index].r_lock, RW_READER);
2868 		for (rp = rtable[index].r_hashf;
2869 		    rp != (rnode_t *)(&rtable[index]);
2870 		    rp = rp->r_hashf) {
2871 			vp = RTOV(rp);
2872 			if (vp->v_vfsp == vfsp) {
2873 				if (rp->r_freef == NULL ||
2874 				    (vn_has_cached_data(vp) &&
2875 				    (rp->r_flags & RDIRTY)) ||
2876 				    rp->r_count > 0) {
2877 					rw_exit(&rtable[index].r_lock);
2878 					return (1);
2879 				}
2880 			}
2881 		}
2882 		rw_exit(&rtable[index].r_lock);
2883 	}
2884 	return (0);
2885 }
2886 
2887 /*
2888  * Destroy inactive vnodes from the hash queues which belong to this
2889  * vfs.  It is essential that we destroy all inactive vnodes during a
2890  * forced unmount as well as during a normal unmount.
2891  */
2892 void
2893 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2894 {
2895 	int index;
2896 	rnode_t *rp;
2897 	rnode_t *rlist;
2898 	rnode_t *r_hashf;
2899 	vnode_t *vp;
2900 
2901 	rlist = NULL;
2902 
2903 	for (index = 0; index < rtablesize; index++) {
2904 		rw_enter(&rtable[index].r_lock, RW_WRITER);
2905 		for (rp = rtable[index].r_hashf;
2906 		    rp != (rnode_t *)(&rtable[index]);
2907 		    rp = r_hashf) {
2908 			/* save the hash pointer before destroying */
2909 			r_hashf = rp->r_hashf;
2910 			vp = RTOV(rp);
2911 			if (vp->v_vfsp == vfsp) {
2912 				mutex_enter(&rpfreelist_lock);
2913 				if (rp->r_freef != NULL) {
2914 					rp_rmfree(rp);
2915 					mutex_exit(&rpfreelist_lock);
2916 					rp_rmhash_locked(rp);
2917 					rp->r_hashf = rlist;
2918 					rlist = rp;
2919 				} else
2920 					mutex_exit(&rpfreelist_lock);
2921 			}
2922 		}
2923 		rw_exit(&rtable[index].r_lock);
2924 	}
2925 
2926 	for (rp = rlist; rp != NULL; rp = rlist) {
2927 		rlist = rp->r_hashf;
2928 		/*
2929 		 * This call to rp_addfree will end up destroying the
2930 		 * rnode, but in a safe way with the appropriate set
2931 		 * of checks done.
2932 		 */
2933 		rp_addfree(rp, cr);
2934 	}
2935 
2936 }
2937 
2938 /*
2939  * This routine destroys all the resources associated with the rnode
2940  * and then the rnode itself.
2941  */
2942 static void
2943 destroy_rnode(rnode_t *rp)
2944 {
2945 	vnode_t *vp;
2946 	vfs_t *vfsp;
2947 
2948 	vp = RTOV(rp);
2949 	vfsp = vp->v_vfsp;
2950 
2951 	ASSERT(vp->v_count == 1);
2952 	ASSERT(rp->r_count == 0);
2953 	ASSERT(rp->r_lmpl == NULL);
2954 	ASSERT(rp->r_mapcnt == 0);
2955 	ASSERT(!(rp->r_flags & RHASHED));
2956 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2957 	atomic_add_long((ulong_t *)&rnew, -1);
2958 #ifdef DEBUG
2959 	clstat_debug.nrnode.value.ui64--;
2960 #endif
2961 	nfs_rw_destroy(&rp->r_rwlock);
2962 	nfs_rw_destroy(&rp->r_lkserlock);
2963 	mutex_destroy(&rp->r_statelock);
2964 	cv_destroy(&rp->r_cv);
2965 	cv_destroy(&rp->r_commit.c_cv);
2966 	if (rp->r_flags & RDELMAPLIST)
2967 		list_destroy(&rp->r_indelmap);
2968 	nfs_free_r_path(rp);
2969 	avl_destroy(&rp->r_dir);
2970 	vn_invalid(vp);
2971 	vn_free(vp);
2972 	kmem_cache_free(rnode_cache, rp);
2973 	VFS_RELE(vfsp);
2974 }
2975 
2976 /*
2977  * Flush all vnodes in this (or every) vfs.
2978  * Used by nfs_sync and by nfs_unmount.
2979  */
2980 void
2981 rflush(struct vfs *vfsp, cred_t *cr)
2982 {
2983 	int index;
2984 	rnode_t *rp;
2985 	vnode_t *vp, **vplist;
2986 	long num, cnt;
2987 
2988 	/*
2989 	 * Check to see whether there is anything to do.
2990 	 */
2991 	num = rnew;
2992 	if (num == 0)
2993 		return;
2994 
2995 	/*
2996 	 * Allocate a slot for all currently active rnodes on the
2997 	 * supposition that they all may need flushing.
2998 	 */
2999 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3000 	cnt = 0;
3001 
3002 	/*
3003 	 * Walk the hash queues looking for rnodes with page
3004 	 * lists associated with them.  Make a list of these
3005 	 * files.
3006 	 */
3007 	for (index = 0; index < rtablesize; index++) {
3008 		rw_enter(&rtable[index].r_lock, RW_READER);
3009 		for (rp = rtable[index].r_hashf;
3010 		    rp != (rnode_t *)(&rtable[index]);
3011 		    rp = rp->r_hashf) {
3012 			vp = RTOV(rp);
3013 			/*
3014 			 * Don't bother sync'ing a vp if it
3015 			 * is part of virtual swap device or
3016 			 * if VFS is read-only
3017 			 */
3018 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3019 				continue;
3020 			/*
3021 			 * If flushing all mounted file systems or
3022 			 * the vnode belongs to this vfs, has pages
3023 			 * and is marked as either dirty or mmap'd,
3024 			 * hold and add this vnode to the list of
3025 			 * vnodes to flush.
3026 			 */
3027 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3028 			    vn_has_cached_data(vp) &&
3029 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3030 				VN_HOLD(vp);
3031 				vplist[cnt++] = vp;
3032 				if (cnt == num) {
3033 					rw_exit(&rtable[index].r_lock);
3034 					goto toomany;
3035 				}
3036 			}
3037 		}
3038 		rw_exit(&rtable[index].r_lock);
3039 	}
3040 toomany:
3041 
3042 	/*
3043 	 * Flush and release all of the files on the list.
3044 	 */
3045 	while (cnt-- > 0) {
3046 		vp = vplist[cnt];
3047 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr);
3048 		VN_RELE(vp);
3049 	}
3050 
3051 	/*
3052 	 * Free the space allocated to hold the list.
3053 	 */
3054 	kmem_free(vplist, num * sizeof (*vplist));
3055 }
3056 
3057 /*
3058  * This probably needs to be larger than or equal to
3059  * log2(sizeof (struct rnode)) due to the way that rnodes are
3060  * allocated.
3061  */
3062 #define	ACACHE_SHIFT_BITS	9
3063 
3064 static int
3065 acachehash(rnode_t *rp, cred_t *cr)
3066 {
3067 
3068 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3069 	    acachemask);
3070 }
3071 
3072 #ifdef DEBUG
3073 static long nfs_access_cache_hits = 0;
3074 static long nfs_access_cache_misses = 0;
3075 #endif
3076 
3077 nfs_access_type_t
3078 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3079 {
3080 	vnode_t *vp;
3081 	acache_t *ap;
3082 	acache_hash_t *hp;
3083 	nfs_access_type_t all;
3084 
3085 	vp = RTOV(rp);
3086 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3087 		return (NFS_ACCESS_UNKNOWN);
3088 
3089 	if (rp->r_acache != NULL) {
3090 		hp = &acache[acachehash(rp, cr)];
3091 		rw_enter(&hp->lock, RW_READER);
3092 		ap = hp->next;
3093 		while (ap != (acache_t *)hp) {
3094 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3095 				if ((ap->known & acc) == acc) {
3096 #ifdef DEBUG
3097 					nfs_access_cache_hits++;
3098 #endif
3099 					if ((ap->allowed & acc) == acc)
3100 						all = NFS_ACCESS_ALLOWED;
3101 					else
3102 						all = NFS_ACCESS_DENIED;
3103 				} else {
3104 #ifdef DEBUG
3105 					nfs_access_cache_misses++;
3106 #endif
3107 					all = NFS_ACCESS_UNKNOWN;
3108 				}
3109 				rw_exit(&hp->lock);
3110 				return (all);
3111 			}
3112 			ap = ap->next;
3113 		}
3114 		rw_exit(&hp->lock);
3115 	}
3116 
3117 #ifdef DEBUG
3118 	nfs_access_cache_misses++;
3119 #endif
3120 	return (NFS_ACCESS_UNKNOWN);
3121 }
3122 
3123 void
3124 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3125 {
3126 	acache_t *ap;
3127 	acache_t *nap;
3128 	acache_hash_t *hp;
3129 
3130 	hp = &acache[acachehash(rp, cr)];
3131 
3132 	/*
3133 	 * Allocate now assuming that mostly an allocation will be
3134 	 * required.  This allows the allocation to happen without
3135 	 * holding the hash bucket locked.
3136 	 */
3137 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3138 	if (nap != NULL) {
3139 		nap->known = acc;
3140 		nap->allowed = resacc;
3141 		nap->rnode = rp;
3142 		crhold(cr);
3143 		nap->cred = cr;
3144 		nap->hashq = hp;
3145 	}
3146 
3147 	rw_enter(&hp->lock, RW_WRITER);
3148 
3149 	if (rp->r_acache != NULL) {
3150 		ap = hp->next;
3151 		while (ap != (acache_t *)hp) {
3152 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3153 				ap->known |= acc;
3154 				ap->allowed &= ~acc;
3155 				ap->allowed |= resacc;
3156 				rw_exit(&hp->lock);
3157 				if (nap != NULL) {
3158 					crfree(nap->cred);
3159 					kmem_cache_free(acache_cache, nap);
3160 				}
3161 				return;
3162 			}
3163 			ap = ap->next;
3164 		}
3165 	}
3166 
3167 	if (nap != NULL) {
3168 #ifdef DEBUG
3169 		clstat_debug.access.value.ui64++;
3170 #endif
3171 		nap->next = hp->next;
3172 		hp->next = nap;
3173 		nap->next->prev = nap;
3174 		nap->prev = (acache_t *)hp;
3175 
3176 		mutex_enter(&rp->r_statelock);
3177 		nap->list = rp->r_acache;
3178 		rp->r_acache = nap;
3179 		mutex_exit(&rp->r_statelock);
3180 	}
3181 
3182 	rw_exit(&hp->lock);
3183 }
3184 
3185 int
3186 nfs_access_purge_rp(rnode_t *rp)
3187 {
3188 	acache_t *ap;
3189 	acache_t *tmpap;
3190 	acache_t *rplist;
3191 
3192 	/*
3193 	 * If there aren't any cached entries, then there is nothing
3194 	 * to free.
3195 	 */
3196 	if (rp->r_acache == NULL)
3197 		return (0);
3198 
3199 	mutex_enter(&rp->r_statelock);
3200 	rplist = rp->r_acache;
3201 	rp->r_acache = NULL;
3202 	mutex_exit(&rp->r_statelock);
3203 
3204 	/*
3205 	 * Loop through each entry in the list pointed to in the
3206 	 * rnode.  Remove each of these entries from the hash
3207 	 * queue that it is on and remove it from the list in
3208 	 * the rnode.
3209 	 */
3210 	for (ap = rplist; ap != NULL; ap = tmpap) {
3211 		rw_enter(&ap->hashq->lock, RW_WRITER);
3212 		ap->prev->next = ap->next;
3213 		ap->next->prev = ap->prev;
3214 		rw_exit(&ap->hashq->lock);
3215 
3216 		tmpap = ap->list;
3217 		crfree(ap->cred);
3218 		kmem_cache_free(acache_cache, ap);
3219 #ifdef DEBUG
3220 		clstat_debug.access.value.ui64--;
3221 #endif
3222 	}
3223 
3224 	return (1);
3225 }
3226 
3227 static const char prefix[] = ".nfs";
3228 
3229 static kmutex_t newnum_lock;
3230 
3231 int
3232 newnum(void)
3233 {
3234 	static uint_t newnum = 0;
3235 	uint_t id;
3236 
3237 	mutex_enter(&newnum_lock);
3238 	if (newnum == 0)
3239 		newnum = gethrestime_sec() & 0xffff;
3240 	id = newnum++;
3241 	mutex_exit(&newnum_lock);
3242 	return (id);
3243 }
3244 
3245 char *
3246 newname(void)
3247 {
3248 	char *news;
3249 	char *s;
3250 	const char *p;
3251 	uint_t id;
3252 
3253 	id = newnum();
3254 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3255 	s = news;
3256 	p = prefix;
3257 	while (*p != '\0')
3258 		*s++ = *p++;
3259 	while (id != 0) {
3260 		*s++ = "0123456789ABCDEF"[id & 0x0f];
3261 		id >>= 4;
3262 	}
3263 	*s = '\0';
3264 	return (news);
3265 }
3266 
/*
 * Convert a string of decimal digits to an int.
 *
 * Every character up to the terminating NUL is treated as a digit; no
 * sign, whitespace, validation or overflow handling is performed.  An
 * empty string yields 0.
 */
int
nfs_atoi(char *cp)
{
	int value = 0;
	char *p;

	for (p = cp; *p != '\0'; p++)
		value = (value * 10) + (*p - '0');

	return (value);
}
3280 
3281 /*
3282  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3283  * framework.
3284  */
3285 static int
3286 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3287 {
3288 	ksp->ks_snaptime = gethrtime();
3289 	if (rw == KSTAT_WRITE) {
3290 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3291 #ifdef DEBUG
3292 		/*
3293 		 * Currently only the global zone can write to kstats, but we
3294 		 * add the check just for paranoia.
3295 		 */
3296 		if (INGLOBALZONE(curproc))
3297 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3298 			    sizeof (clstat_debug));
3299 #endif
3300 	} else {
3301 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3302 #ifdef DEBUG
3303 		/*
3304 		 * If we're displaying the "global" debug kstat values, we
3305 		 * display them as-is to all zones since in fact they apply to
3306 		 * the system as a whole.
3307 		 */
3308 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3309 		    sizeof (clstat_debug));
3310 #endif
3311 	}
3312 	return (0);
3313 }
3314 
3315 static void *
3316 clinit_zone(zoneid_t zoneid)
3317 {
3318 	kstat_t *nfs_client_kstat;
3319 	struct nfs_clnt *nfscl;
3320 	uint_t ndata;
3321 
3322 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3323 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3324 	nfscl->nfscl_chtable = NULL;
3325 	nfscl->nfscl_zoneid = zoneid;
3326 
3327 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3328 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3329 #ifdef DEBUG
3330 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3331 #endif
3332 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3333 	    "misc", KSTAT_TYPE_NAMED, ndata,
3334 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3335 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3336 		nfs_client_kstat->ks_snapshot = cl_snapshot;
3337 		kstat_install(nfs_client_kstat);
3338 	}
3339 	mutex_enter(&nfs_clnt_list_lock);
3340 	list_insert_head(&nfs_clnt_list, nfscl);
3341 	mutex_exit(&nfs_clnt_list_lock);
3342 	return (nfscl);
3343 }
3344 
3345 /*ARGSUSED*/
3346 static void
3347 clfini_zone(zoneid_t zoneid, void *arg)
3348 {
3349 	struct nfs_clnt *nfscl = arg;
3350 	chhead_t *chp, *next;
3351 
3352 	if (nfscl == NULL)
3353 		return;
3354 	mutex_enter(&nfs_clnt_list_lock);
3355 	list_remove(&nfs_clnt_list, nfscl);
3356 	mutex_exit(&nfs_clnt_list_lock);
3357 	clreclaim_zone(nfscl, 0);
3358 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3359 		ASSERT(chp->ch_list == NULL);
3360 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3361 		next = chp->ch_next;
3362 		kmem_free(chp, sizeof (*chp));
3363 	}
3364 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3365 	mutex_destroy(&nfscl->nfscl_chtable_lock);
3366 	kmem_free(nfscl, sizeof (*nfscl));
3367 }
3368 
3369 /*
3370  * Called by endpnt_destructor to make sure the client handles are
3371  * cleaned up before the RPC endpoints.  This becomes a no-op if
3372  * clfini_zone (above) is called first.  This function is needed
3373  * (rather than relying on clfini_zone to clean up) because the ZSD
3374  * callbacks have no ordering mechanism, so we have no way to ensure
3375  * that clfini_zone is called before endpnt_destructor.
3376  */
3377 void
3378 clcleanup_zone(zoneid_t zoneid)
3379 {
3380 	struct nfs_clnt *nfscl;
3381 
3382 	mutex_enter(&nfs_clnt_list_lock);
3383 	nfscl = list_head(&nfs_clnt_list);
3384 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3385 		if (nfscl->nfscl_zoneid == zoneid) {
3386 			clreclaim_zone(nfscl, 0);
3387 			break;
3388 		}
3389 	}
3390 	mutex_exit(&nfs_clnt_list_lock);
3391 }
3392 
3393 int
3394 nfs_subrinit(void)
3395 {
3396 	int i;
3397 	ulong_t nrnode_max;
3398 
3399 	/*
3400 	 * Allocate and initialize the rnode hash queues
3401 	 */
3402 	if (nrnode <= 0)
3403 		nrnode = ncsize;
3404 	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3405 	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3406 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3407 		    "setting nrnode to max value of %ld", nrnode_max);
3408 		nrnode = nrnode_max;
3409 	}
3410 
3411 	rtablesize = 1 << highbit(nrnode / hashlen);
3412 	rtablemask = rtablesize - 1;
3413 	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3414 	for (i = 0; i < rtablesize; i++) {
3415 		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3416 		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3417 		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3418 	}
3419 	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3420 	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3421 
3422 	/*
3423 	 * Allocate and initialize the access cache
3424 	 */
3425 
3426 	/*
3427 	 * Initial guess is one access cache entry per rnode unless
3428 	 * nacache is set to a non-zero value and then it is used to
3429 	 * indicate a guess at the number of access cache entries.
3430 	 */
3431 	if (nacache > 0)
3432 		acachesize = 1 << highbit(nacache / hashlen);
3433 	else
3434 		acachesize = rtablesize;
3435 	acachemask = acachesize - 1;
3436 	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3437 	for (i = 0; i < acachesize; i++) {
3438 		acache[i].next = (acache_t *)&acache[i];
3439 		acache[i].prev = (acache_t *)&acache[i];
3440 		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3441 	}
3442 	acache_cache = kmem_cache_create("nfs_access_cache",
3443 	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3444 	/*
3445 	 * Allocate and initialize the client handle cache
3446 	 */
3447 	chtab_cache = kmem_cache_create("client_handle_cache",
3448 		sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL,
3449 		NULL, 0);
3450 	/*
3451 	 * Initialize the list of per-zone client handles (and associated data).
3452 	 * This needs to be done before we call zone_key_create().
3453 	 */
3454 	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3455 	    offsetof(struct nfs_clnt, nfscl_node));
3456 	/*
3457 	 * Initialize the zone_key for per-zone client handle lists.
3458 	 */
3459 	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3460 	/*
3461 	 * Initialize the various mutexes and reader/writer locks
3462 	 */
3463 	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3464 	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3465 	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3466 
3467 	/*
3468 	 * Assign unique major number for all nfs mounts
3469 	 */
3470 	if ((nfs_major = getudev()) == -1) {
3471 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
3472 		    "nfs: init: can't get unique device number");
3473 		nfs_major = 0;
3474 	}
3475 	nfs_minor = 0;
3476 
3477 	if (nfs3_jukebox_delay == 0)
3478 		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3479 
3480 	return (0);
3481 }
3482 
3483 void
3484 nfs_subrfini(void)
3485 {
3486 	int i;
3487 
3488 	/*
3489 	 * Deallocate the rnode hash queues
3490 	 */
3491 	kmem_cache_destroy(rnode_cache);
3492 
3493 	for (i = 0; i < rtablesize; i++)
3494 		rw_destroy(&rtable[i].r_lock);
3495 	kmem_free(rtable, rtablesize * sizeof (*rtable));
3496 
3497 	/*
3498 	 * Deallocated the access cache
3499 	 */
3500 	kmem_cache_destroy(acache_cache);
3501 
3502 	for (i = 0; i < acachesize; i++)
3503 		rw_destroy(&acache[i].lock);
3504 	kmem_free(acache, acachesize * sizeof (*acache));
3505 
3506 	/*
3507 	 * Deallocate the client handle cache
3508 	 */
3509 	kmem_cache_destroy(chtab_cache);
3510 
3511 	/*
3512 	 * Destroy the various mutexes and reader/writer locks
3513 	 */
3514 	mutex_destroy(&rpfreelist_lock);
3515 	mutex_destroy(&newnum_lock);
3516 	mutex_destroy(&nfs_minor_lock);
3517 	(void) zone_key_delete(nfsclnt_zone_key);
3518 }
3519 
3520 enum nfsstat
3521 puterrno(int error)
3522 {
3523 
3524 	switch (error) {
3525 	case EOPNOTSUPP:
3526 		return (NFSERR_OPNOTSUPP);
3527 	case ENAMETOOLONG:
3528 		return (NFSERR_NAMETOOLONG);
3529 	case ENOTEMPTY:
3530 		return (NFSERR_NOTEMPTY);
3531 	case EDQUOT:
3532 		return (NFSERR_DQUOT);
3533 	case ESTALE:
3534 		return (NFSERR_STALE);
3535 	case EREMOTE:
3536 		return (NFSERR_REMOTE);
3537 	case ENOSYS:
3538 		return (NFSERR_OPNOTSUPP);
3539 	case EOVERFLOW:
3540 		return (NFSERR_INVAL);
3541 	default:
3542 		return ((enum nfsstat)error);
3543 	}
3544 	/* NOTREACHED */
3545 }
3546 
3547 int
3548 geterrno(enum nfsstat status)
3549 {
3550 
3551 	switch (status) {
3552 	case NFSERR_OPNOTSUPP:
3553 		return (EOPNOTSUPP);
3554 	case NFSERR_NAMETOOLONG:
3555 		return (ENAMETOOLONG);
3556 	case NFSERR_NOTEMPTY:
3557 		return (ENOTEMPTY);
3558 	case NFSERR_DQUOT:
3559 		return (EDQUOT);
3560 	case NFSERR_STALE:
3561 		return (ESTALE);
3562 	case NFSERR_REMOTE:
3563 		return (EREMOTE);
3564 	case NFSERR_WFLUSH:
3565 		return (EIO);
3566 	default:
3567 		return ((int)status);
3568 	}
3569 	/* NOTREACHED */
3570 }
3571 
3572 enum nfsstat3
3573 puterrno3(int error)
3574 {
3575 
3576 #ifdef DEBUG
3577 	switch (error) {
3578 	case 0:
3579 		return (NFS3_OK);
3580 	case EPERM:
3581 		return (NFS3ERR_PERM);
3582 	case ENOENT:
3583 		return (NFS3ERR_NOENT);
3584 	case EIO:
3585 		return (NFS3ERR_IO);
3586 	case ENXIO:
3587 		return (NFS3ERR_NXIO);
3588 	case EACCES:
3589 		return (NFS3ERR_ACCES);
3590 	case EEXIST:
3591 		return (NFS3ERR_EXIST);
3592 	case EXDEV:
3593 		return (NFS3ERR_XDEV);
3594 	case ENODEV:
3595 		return (NFS3ERR_NODEV);
3596 	case ENOTDIR:
3597 		return (NFS3ERR_NOTDIR);
3598 	case EISDIR:
3599 		return (NFS3ERR_ISDIR);
3600 	case EINVAL:
3601 		return (NFS3ERR_INVAL);
3602 	case EFBIG:
3603 		return (NFS3ERR_FBIG);
3604 	case ENOSPC:
3605 		return (NFS3ERR_NOSPC);
3606 	case EROFS:
3607 		return (NFS3ERR_ROFS);
3608 	case EMLINK:
3609 		return (NFS3ERR_MLINK);
3610 	case ENAMETOOLONG:
3611 		return (NFS3ERR_NAMETOOLONG);
3612 	case ENOTEMPTY:
3613 		return (NFS3ERR_NOTEMPTY);
3614 	case EDQUOT:
3615 		return (NFS3ERR_DQUOT);
3616 	case ESTALE:
3617 		return (NFS3ERR_STALE);
3618 	case EREMOTE:
3619 		return (NFS3ERR_REMOTE);
3620 	case EOPNOTSUPP:
3621 		return (NFS3ERR_NOTSUPP);
3622 	case EOVERFLOW:
3623 		return (NFS3ERR_INVAL);
3624 	default:
3625 		zcmn_err(getzoneid(), CE_WARN,
3626 		    "puterrno3: got error %d", error);
3627 		return ((enum nfsstat3)error);
3628 	}
3629 #else
3630 	switch (error) {
3631 	case ENAMETOOLONG:
3632 		return (NFS3ERR_NAMETOOLONG);
3633 	case ENOTEMPTY:
3634 		return (NFS3ERR_NOTEMPTY);
3635 	case EDQUOT:
3636 		return (NFS3ERR_DQUOT);
3637 	case ESTALE:
3638 		return (NFS3ERR_STALE);
3639 	case EOPNOTSUPP:
3640 		return (NFS3ERR_NOTSUPP);
3641 	case EREMOTE:
3642 		return (NFS3ERR_REMOTE);
3643 	case EOVERFLOW:
3644 		return (NFS3ERR_INVAL);
3645 	default:
3646 		return ((enum nfsstat3)error);
3647 	}
3648 #endif
3649 }
3650 
3651 int
3652 geterrno3(enum nfsstat3 status)
3653 {
3654 
3655 #ifdef DEBUG
3656 	switch (status) {
3657 	case NFS3_OK:
3658 		return (0);
3659 	case NFS3ERR_PERM:
3660 		return (EPERM);
3661 	case NFS3ERR_NOENT:
3662 		return (ENOENT);
3663 	case NFS3ERR_IO:
3664 		return (EIO);
3665 	case NFS3ERR_NXIO:
3666 		return (ENXIO);
3667 	case NFS3ERR_ACCES:
3668 		return (EACCES);
3669 	case NFS3ERR_EXIST:
3670 		return (EEXIST);
3671 	case NFS3ERR_XDEV:
3672 		return (EXDEV);
3673 	case NFS3ERR_NODEV:
3674 		return (ENODEV);
3675 	case NFS3ERR_NOTDIR:
3676 		return (ENOTDIR);
3677 	case NFS3ERR_ISDIR:
3678 		return (EISDIR);
3679 	case NFS3ERR_INVAL:
3680 		return (EINVAL);
3681 	case NFS3ERR_FBIG:
3682 		return (EFBIG);
3683 	case NFS3ERR_NOSPC:
3684 		return (ENOSPC);
3685 	case NFS3ERR_ROFS:
3686 		return (EROFS);
3687 	case NFS3ERR_MLINK:
3688 		return (EMLINK);
3689 	case NFS3ERR_NAMETOOLONG:
3690 		return (ENAMETOOLONG);
3691 	case NFS3ERR_NOTEMPTY:
3692 		return (ENOTEMPTY);
3693 	case NFS3ERR_DQUOT:
3694 		return (EDQUOT);
3695 	case NFS3ERR_STALE:
3696 		return (ESTALE);
3697 	case NFS3ERR_REMOTE:
3698 		return (EREMOTE);
3699 	case NFS3ERR_BADHANDLE:
3700 		return (ESTALE);
3701 	case NFS3ERR_NOT_SYNC:
3702 		return (EINVAL);
3703 	case NFS3ERR_BAD_COOKIE:
3704 		return (ENOENT);
3705 	case NFS3ERR_NOTSUPP:
3706 		return (EOPNOTSUPP);
3707 	case NFS3ERR_TOOSMALL:
3708 		return (EINVAL);
3709 	case NFS3ERR_SERVERFAULT:
3710 		return (EIO);
3711 	case NFS3ERR_BADTYPE:
3712 		return (EINVAL);
3713 	case NFS3ERR_JUKEBOX:
3714 		return (ENXIO);
3715 	default:
3716 		zcmn_err(getzoneid(), CE_WARN,
3717 		    "geterrno3: got status %d", status);
3718 		return ((int)status);
3719 	}
3720 #else
3721 	switch (status) {
3722 	case NFS3ERR_NAMETOOLONG:
3723 		return (ENAMETOOLONG);
3724 	case NFS3ERR_NOTEMPTY:
3725 		return (ENOTEMPTY);
3726 	case NFS3ERR_DQUOT:
3727 		return (EDQUOT);
3728 	case NFS3ERR_STALE:
3729 	case NFS3ERR_BADHANDLE:
3730 		return (ESTALE);
3731 	case NFS3ERR_NOTSUPP:
3732 		return (EOPNOTSUPP);
3733 	case NFS3ERR_REMOTE:
3734 		return (EREMOTE);
3735 	case NFS3ERR_NOT_SYNC:
3736 	case NFS3ERR_TOOSMALL:
3737 	case NFS3ERR_BADTYPE:
3738 		return (EINVAL);
3739 	case NFS3ERR_BAD_COOKIE:
3740 		return (ENOENT);
3741 	case NFS3ERR_SERVERFAULT:
3742 		return (EIO);
3743 	case NFS3ERR_JUKEBOX:
3744 		return (ENXIO);
3745 	default:
3746 		return ((int)status);
3747 	}
3748 #endif
3749 }
3750 
3751 rddir_cache *
3752 rddir_cache_alloc(int flags)
3753 {
3754 	rddir_cache *rc;
3755 
3756 	rc = kmem_alloc(sizeof (*rc), flags);
3757 	if (rc != NULL) {
3758 		rc->entries = NULL;
3759 		rc->flags = RDDIR;
3760 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3761 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3762 		rc->count = 1;
3763 #ifdef DEBUG
3764 		atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3765 #endif
3766 	}
3767 	return (rc);
3768 }
3769 
3770 static void
3771 rddir_cache_free(rddir_cache *rc)
3772 {
3773 
3774 #ifdef DEBUG
3775 	atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3776 #endif
3777 	if (rc->entries != NULL) {
3778 #ifdef DEBUG
3779 		rddir_cache_buf_free(rc->entries, rc->buflen);
3780 #else
3781 		kmem_free(rc->entries, rc->buflen);
3782 #endif
3783 	}
3784 	cv_destroy(&rc->cv);
3785 	mutex_destroy(&rc->lock);
3786 	kmem_free(rc, sizeof (*rc));
3787 }
3788 
3789 void
3790 rddir_cache_hold(rddir_cache *rc)
3791 {
3792 
3793 	mutex_enter(&rc->lock);
3794 	rc->count++;
3795 	mutex_exit(&rc->lock);
3796 }
3797 
3798 void
3799 rddir_cache_rele(rddir_cache *rc)
3800 {
3801 
3802 	mutex_enter(&rc->lock);
3803 	ASSERT(rc->count > 0);
3804 	if (--rc->count == 0) {
3805 		mutex_exit(&rc->lock);
3806 		rddir_cache_free(rc);
3807 	} else
3808 		mutex_exit(&rc->lock);
3809 }
3810 
3811 #ifdef DEBUG
3812 char *
3813 rddir_cache_buf_alloc(size_t size, int flags)
3814 {
3815 	char *rc;
3816 
3817 	rc = kmem_alloc(size, flags);
3818 	if (rc != NULL)
3819 		atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3820 	return (rc);
3821 }
3822 
3823 void
3824 rddir_cache_buf_free(void *addr, size_t size)
3825 {
3826 
3827 	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3828 	kmem_free(addr, size);
3829 }
3830 #endif
3831 
3832 static int
3833 nfs_free_data_reclaim(rnode_t *rp)
3834 {
3835 	char *contents;
3836 	int size;
3837 	vsecattr_t *vsp;
3838 	nfs3_pathconf_info *info;
3839 	int freed;
3840 	cred_t *cred;
3841 
3842 	/*
3843 	 * Free any held credentials and caches which
3844 	 * may be associated with this rnode.
3845 	 */
3846 	mutex_enter(&rp->r_statelock);
3847 	cred = rp->r_cred;
3848 	rp->r_cred = NULL;
3849 	contents = rp->r_symlink.contents;
3850 	size = rp->r_symlink.size;
3851 	rp->r_symlink.contents = NULL;
3852 	vsp = rp->r_secattr;
3853 	rp->r_secattr = NULL;
3854 	info = rp->r_pathconf;
3855 	rp->r_pathconf = NULL;
3856 	mutex_exit(&rp->r_statelock);
3857 
3858 	if (cred != NULL)
3859 		crfree(cred);
3860 
3861 	/*
3862 	 * Free the access cache entries.
3863 	 */
3864 	freed = nfs_access_purge_rp(rp);
3865 
3866 	if (!HAVE_RDDIR_CACHE(rp) &&
3867 	    contents == NULL &&
3868 	    vsp == NULL &&
3869 	    info == NULL)
3870 		return (freed);
3871 
3872 	/*
3873 	 * Free the readdir cache entries
3874 	 */
3875 	if (HAVE_RDDIR_CACHE(rp))
3876 		nfs_purge_rddir_cache(RTOV(rp));
3877 
3878 	/*
3879 	 * Free the symbolic link cache.
3880 	 */
3881 	if (contents != NULL) {
3882 
3883 		kmem_free((void *)contents, size);
3884 	}
3885 
3886 	/*
3887 	 * Free any cached ACL.
3888 	 */
3889 	if (vsp != NULL)
3890 		nfs_acl_free(vsp);
3891 
3892 	/*
3893 	 * Free any cached pathconf information.
3894 	 */
3895 	if (info != NULL)
3896 		kmem_free(info, sizeof (*info));
3897 
3898 	return (1);
3899 }
3900 
3901 static int
3902 nfs_active_data_reclaim(rnode_t *rp)
3903 {
3904 	char *contents;
3905 	int size;
3906 	vsecattr_t *vsp;
3907 	nfs3_pathconf_info *info;
3908 	int freed;
3909 
3910 	/*
3911 	 * Free any held credentials and caches which
3912 	 * may be associated with this rnode.
3913 	 */
3914 	if (!mutex_tryenter(&rp->r_statelock))
3915 		return (0);
3916 	contents = rp->r_symlink.contents;
3917 	size = rp->r_symlink.size;
3918 	rp->r_symlink.contents = NULL;
3919 	vsp = rp->r_secattr;
3920 	rp->r_secattr = NULL;
3921 	info = rp->r_pathconf;
3922 	rp->r_pathconf = NULL;
3923 	mutex_exit(&rp->r_statelock);
3924 
3925 	/*
3926 	 * Free the access cache entries.
3927 	 */
3928 	freed = nfs_access_purge_rp(rp);
3929 
3930 	if (!HAVE_RDDIR_CACHE(rp) &&
3931 	    contents == NULL &&
3932 	    vsp == NULL &&
3933 	    info == NULL)
3934 		return (freed);
3935 
3936 	/*
3937 	 * Free the readdir cache entries
3938 	 */
3939 	if (HAVE_RDDIR_CACHE(rp))
3940 		nfs_purge_rddir_cache(RTOV(rp));
3941 
3942 	/*
3943 	 * Free the symbolic link cache.
3944 	 */
3945 	if (contents != NULL) {
3946 
3947 		kmem_free((void *)contents, size);
3948 	}
3949 
3950 	/*
3951 	 * Free any cached ACL.
3952 	 */
3953 	if (vsp != NULL)
3954 		nfs_acl_free(vsp);
3955 
3956 	/*
3957 	 * Free any cached pathconf information.
3958 	 */
3959 	if (info != NULL)
3960 		kmem_free(info, sizeof (*info));
3961 
3962 	return (1);
3963 }
3964 
3965 static int
3966 nfs_free_reclaim(void)
3967 {
3968 	int freed;
3969 	rnode_t *rp;
3970 
3971 #ifdef DEBUG
3972 	clstat_debug.f_reclaim.value.ui64++;
3973 #endif
3974 	freed = 0;
3975 	mutex_enter(&rpfreelist_lock);
3976 	rp = rpfreelist;
3977 	if (rp != NULL) {
3978 		do {
3979 			if (nfs_free_data_reclaim(rp))
3980 				freed = 1;
3981 		} while ((rp = rp->r_freef) != rpfreelist);
3982 	}
3983 	mutex_exit(&rpfreelist_lock);
3984 	return (freed);
3985 }
3986 
3987 static int
3988 nfs_active_reclaim(void)
3989 {
3990 	int freed;
3991 	int index;
3992 	rnode_t *rp;
3993 
3994 #ifdef DEBUG
3995 	clstat_debug.a_reclaim.value.ui64++;
3996 #endif
3997 	freed = 0;
3998 	for (index = 0; index < rtablesize; index++) {
3999 		rw_enter(&rtable[index].r_lock, RW_READER);
4000 		for (rp = rtable[index].r_hashf;
4001 		    rp != (rnode_t *)(&rtable[index]);
4002 		    rp = rp->r_hashf) {
4003 			if (nfs_active_data_reclaim(rp))
4004 				freed = 1;
4005 		}
4006 		rw_exit(&rtable[index].r_lock);
4007 	}
4008 	return (freed);
4009 }
4010 
4011 static int
4012 nfs_rnode_reclaim(void)
4013 {
4014 	int freed;
4015 	rnode_t *rp;
4016 	vnode_t *vp;
4017 
4018 #ifdef DEBUG
4019 	clstat_debug.r_reclaim.value.ui64++;
4020 #endif
4021 	freed = 0;
4022 	mutex_enter(&rpfreelist_lock);
4023 	while ((rp = rpfreelist) != NULL) {
4024 		rp_rmfree(rp);
4025 		mutex_exit(&rpfreelist_lock);
4026 		if (rp->r_flags & RHASHED) {
4027 			vp = RTOV(rp);
4028 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4029 			mutex_enter(&vp->v_lock);
4030 			if (vp->v_count > 1) {
4031 				vp->v_count--;
4032 				mutex_exit(&vp->v_lock);
4033 				rw_exit(&rp->r_hashq->r_lock);
4034 				mutex_enter(&rpfreelist_lock);
4035 				continue;
4036 			}
4037 			mutex_exit(&vp->v_lock);
4038 			rp_rmhash_locked(rp);
4039 			rw_exit(&rp->r_hashq->r_lock);
4040 		}
4041 		/*
4042 		 * This call to rp_addfree will end up destroying the
4043 		 * rnode, but in a safe way with the appropriate set
4044 		 * of checks done.
4045 		 */
4046 		rp_addfree(rp, CRED());
4047 		mutex_enter(&rpfreelist_lock);
4048 	}
4049 	mutex_exit(&rpfreelist_lock);
4050 	return (freed);
4051 }
4052 
/*
 * Reclaim callback registered with the rnode kmem cache.
 *
 * Escalates through three stages, stopping at the first one that
 * reports having released something: cached data of rnodes on the
 * freelist, cached data of rnodes in the hash table, and finally the
 * freelist rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat_debug.reclaim.value.ui64++;
#endif
	if (nfs_free_reclaim() || nfs_active_reclaim())
		return;

	(void) nfs_rnode_reclaim();
}
4069 
4070 /*
4071  * NFS client failover support
4072  *
4073  * Routines to copy filehandles
4074  */
4075 void
4076 nfscopyfh(caddr_t fhp, vnode_t *vp)
4077 {
4078 	fhandle_t *dest = (fhandle_t *)fhp;
4079 
4080 	if (dest != NULL)
4081 		*dest = *VTOFH(vp);
4082 }
4083 
4084 void
4085 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4086 {
4087 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4088 
4089 	if (dest != NULL)
4090 		*dest = *VTOFH3(vp);
4091 }
4092 
4093 /*
4094  * NFS client failover support
4095  *
4096  * failover_safe() will test various conditions to ensure that
4097  * failover is permitted for this vnode.  It will be denied
4098  * if:
4099  *	1) the operation in progress does not support failover (NULL fi)
4100  *	2) there are no available replicas (NULL mi_servers->sv_next)
4101  *	3) any locks are outstanding on this file
4102  */
4103 static int
4104 failover_safe(failinfo_t *fi)
4105 {
4106 
4107 	/*
4108 	 * Does this op permit failover?
4109 	 */
4110 	if (fi == NULL || fi->vp == NULL)
4111 		return (0);
4112 
4113 	/*
4114 	 * Are there any alternates to failover to?
4115 	 */
4116 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4117 		return (0);
4118 
4119 	/*
4120 	 * Disable check; we've forced local locking
4121 	 *
4122 	 * if (flk_has_remote_locks(fi->vp))
4123 	 *	return (0);
4124 	 */
4125 
4126 	/*
4127 	 * If we have no partial path, we can't do anything
4128 	 */
4129 	if (VTOR(fi->vp)->r_path == NULL)
4130 		return (0);
4131 
4132 	return (1);
4133 }
4134 
4135 #include <sys/thread.h>
4136 
4137 /*
4138  * NFS client failover support
4139  *
4140  * failover_newserver() will start a search for a new server,
4141  * preferably by starting an async thread to do the work.  If
4142  * someone is already doing this (recognizable by MI_BINDINPROG
4143  * being set), it will simply return and the calling thread
4144  * will queue on the mi_failover_cv condition variable.
4145  */
4146 static void
4147 failover_newserver(mntinfo_t *mi)
4148 {
4149 	/*
4150 	 * Check if someone else is doing this already
4151 	 */
4152 	mutex_enter(&mi->mi_lock);
4153 	if (mi->mi_flags & MI_BINDINPROG) {
4154 		mutex_exit(&mi->mi_lock);
4155 		return;
4156 	}
4157 	mi->mi_flags |= MI_BINDINPROG;
4158 
4159 	/*
4160 	 * Need to hold the vfs struct so that it can't be released
4161 	 * while the failover thread is selecting a new server.
4162 	 */
4163 	VFS_HOLD(mi->mi_vfsp);
4164 
4165 	/*
4166 	 * Start a thread to do the real searching.
4167 	 */
4168 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4169 
4170 	mutex_exit(&mi->mi_lock);
4171 }
4172 
4173 /*
4174  * NFS client failover support
4175  *
4176  * failover_thread() will find a new server to replace the one
4177  * currently in use, wake up other threads waiting on this mount
4178  * point, and die.  It will start at the head of the server list
4179  * and poll servers until it finds one with an NFS server which is
4180  * registered and responds to a NULL procedure ping.
4181  *
4182  * XXX failover_thread is unsafe within the scope of the
4183  * present model defined for cpr to suspend the system.
4184  * Specifically, over-the-wire calls made by the thread
4185  * are unsafe. The thread needs to be reevaluated in case of
4186  * future updates to the cpr suspend model.
4187  */
4188 static void
4189 failover_thread(mntinfo_t *mi)
4190 {
4191 	servinfo_t *svp = NULL;
4192 	CLIENT *cl;
4193 	enum clnt_stat status;
4194 	struct timeval tv;
4195 	int error;
4196 	int oncethru = 0;
4197 	callb_cpr_t cprinfo;
4198 	rnode_t *rp;
4199 	int index;
4200 	char *srvnames;
4201 	size_t srvnames_len;
4202 	struct nfs_clnt *nfscl = NULL;
4203 	zoneid_t zoneid = getzoneid();
4204 
4205 #ifdef DEBUG
4206 	/*
4207 	 * This is currently only needed to access counters which exist on
4208 	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4209 	 * on non-DEBUG kernels.
4210 	 */
4211 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4212 	ASSERT(nfscl != NULL);
4213 #endif
4214 
4215 	/*
4216 	 * Its safe to piggyback on the mi_lock since failover_newserver()
4217 	 * code guarantees that there will be only one failover thread
4218 	 * per mountinfo at any instance.
4219 	 */
4220 	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4221 	    "failover_thread");
4222 
4223 	mutex_enter(&mi->mi_lock);
4224 	while (mi->mi_readers) {
4225 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4226 		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4227 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4228 	}
4229 	mutex_exit(&mi->mi_lock);
4230 
4231 	tv.tv_sec = 2;
4232 	tv.tv_usec = 0;
4233 
4234 	/*
4235 	 * Ping the null NFS procedure of every server in
4236 	 * the list until one responds.  We always start
4237 	 * at the head of the list and always skip the one
4238 	 * that is current, since it's caused us a problem.
4239 	 */
4240 	while (svp == NULL) {
4241 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4242 			if (!oncethru && svp == mi->mi_curr_serv)
4243 				continue;
4244 
4245 			/*
4246 			 * If the file system was forcibly umounted
4247 			 * while trying to do a failover, then just
4248 			 * give up on the failover.  It won't matter
4249 			 * what the server is.
4250 			 */
4251 			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4252 				svp = NULL;
4253 				goto done;
4254 			}
4255 
4256 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4257 			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4258 			if (error)
4259 				continue;
4260 
4261 			if (!(mi->mi_flags & MI_INT))
4262 				cl->cl_nosignal = TRUE;
4263 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4264 			    xdr_void, NULL, tv);
4265 			if (!(mi->mi_flags & MI_INT))
4266 				cl->cl_nosignal = FALSE;
4267 			AUTH_DESTROY(cl->cl_auth);
4268 			CLNT_DESTROY(cl);
4269 			if (status == RPC_SUCCESS) {
4270 				if (svp == mi->mi_curr_serv) {
4271 #ifdef DEBUG
4272 					zcmn_err(zoneid, CE_NOTE,
4273 			"NFS%d: failing over: selecting original server %s",
4274 					    mi->mi_vers, svp->sv_hostname);
4275 #else
4276 					zcmn_err(zoneid, CE_NOTE,
4277 			"NFS: failing over: selecting original server %s",
4278 					    svp->sv_hostname);
4279 #endif
4280 				} else {
4281 #ifdef DEBUG
4282 					zcmn_err(zoneid, CE_NOTE,
4283 				    "NFS%d: failing over from %s to %s",
4284 					    mi->mi_vers,
4285 					    mi->mi_curr_serv->sv_hostname,
4286 					    svp->sv_hostname);
4287 #else
4288 					zcmn_err(zoneid, CE_NOTE,
4289 				    "NFS: failing over from %s to %s",
4290 					    mi->mi_curr_serv->sv_hostname,
4291 					    svp->sv_hostname);
4292 #endif
4293 				}
4294 				break;
4295 			}
4296 		}
4297 
4298 		if (svp == NULL) {
4299 			if (!oncethru) {
4300 				srvnames = nfs_getsrvnames(mi, &srvnames_len);
4301 #ifdef DEBUG
4302 				zprintf(zoneid,
4303 				    "NFS%d servers %s not responding "
4304 				    "still trying\n", mi->mi_vers, srvnames);
4305 #else
4306 				zprintf(zoneid, "NFS servers %s not responding "
4307 				    "still trying\n", srvnames);
4308 #endif
4309 				oncethru = 1;
4310 			}
4311 			mutex_enter(&mi->mi_lock);
4312 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
4313 			mutex_exit(&mi->mi_lock);
4314 			delay(hz);
4315 			mutex_enter(&mi->mi_lock);
4316 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4317 			mutex_exit(&mi->mi_lock);
4318 		}
4319 	}
4320 
4321 	if (oncethru) {
4322 #ifdef DEBUG
4323 		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4324 #else
4325 		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4326 #endif
4327 	}
4328 
4329 	if (svp != mi->mi_curr_serv) {
4330 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4331 		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4332 		rw_enter(&rtable[index].r_lock, RW_WRITER);
4333 		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4334 		    mi->mi_vfsp);
4335 		if (rp != NULL) {
4336 			if (rp->r_flags & RHASHED)
4337 				rp_rmhash_locked(rp);
4338 			rw_exit(&rtable[index].r_lock);
4339 			rp->r_server = svp;
4340 			rp->r_fh = svp->sv_fhandle;
4341 			(void) nfs_free_data_reclaim(rp);
4342 			index = rtablehash(&rp->r_fh);
4343 			rp->r_hashq = &rtable[index];
4344 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4345 			vn_exists(RTOV(rp));
4346 			rp_addhash(rp);
4347 			rw_exit(&rp->r_hashq->r_lock);
4348 			VN_RELE(RTOV(rp));
4349 		} else
4350 			rw_exit(&rtable[index].r_lock);
4351 	}
4352 
4353 done:
4354 	if (oncethru)
4355 		kmem_free(srvnames, srvnames_len);
4356 	mutex_enter(&mi->mi_lock);
4357 	mi->mi_flags &= ~MI_BINDINPROG;
4358 	if (svp != NULL) {
4359 		mi->mi_curr_serv = svp;
4360 		mi->mi_failover++;
4361 #ifdef DEBUG
4362 	nfscl->nfscl_stat.failover.value.ui64++;
4363 #endif
4364 	}
4365 	cv_broadcast(&mi->mi_failover_cv);
4366 	CALLB_CPR_EXIT(&cprinfo);
4367 	VFS_RELE(mi->mi_vfsp);
4368 	zthread_exit();
4369 	/* NOTREACHED */
4370 }
4371 
4372 /*
4373  * NFS client failover support
4374  *
4375  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4376  * is cleared, meaning that failover is complete.  Called with
4377  * mi_lock mutex held.
4378  */
4379 static int
4380 failover_wait(mntinfo_t *mi)
4381 {
4382 	k_sigset_t smask;
4383 
4384 	/*
4385 	 * If someone else is hunting for a living server,
4386 	 * sleep until it's done.  After our sleep, we may
4387 	 * be bound to the right server and get off cheaply.
4388 	 */
4389 	while (mi->mi_flags & MI_BINDINPROG) {
4390 		/*
4391 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4392 		 * and SIGTERM. (Preserving the existing masks).
4393 		 * Mask out SIGINT if mount option nointr is specified.
4394 		 */
4395 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4396 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4397 			/*
4398 			 * restore original signal mask
4399 			 */
4400 			sigunintr(&smask);
4401 			return (EINTR);
4402 		}
4403 		/*
4404 		 * restore original signal mask
4405 		 */
4406 		sigunintr(&smask);
4407 	}
4408 	return (0);
4409 }
4410 
4411 /*
4412  * NFS client failover support
4413  *
4414  * failover_remap() will do a partial pathname lookup and find the
4415  * desired vnode on the current server.  The interim vnode will be
4416  * discarded after we pilfer the new filehandle.
4417  *
4418  * Side effects:
4419  * - This routine will also update the filehandle in the args structure
4420  *    pointed to by the fi->fhp pointer if it is non-NULL.
4421  */
4422 
4423 static int
4424 failover_remap(failinfo_t *fi)
4425 {
4426 	vnode_t *vp, *nvp, *rootvp;
4427 	rnode_t *rp, *nrp;
4428 	mntinfo_t *mi;
4429 	int error;
4430 #ifdef DEBUG
4431 	struct nfs_clnt *nfscl;
4432 
4433 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4434 	ASSERT(nfscl != NULL);
4435 #endif
4436 	/*
4437 	 * Sanity check
4438 	 */
4439 	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4440 		return (EINVAL);
4441 	vp = fi->vp;
4442 	rp = VTOR(vp);
4443 	mi = VTOMI(vp);
4444 
4445 	if (!(vp->v_flag & VROOT)) {
4446 		/*
4447 		 * Given the root fh, use the path stored in
4448 		 * the rnode to find the fh for the new server.
4449 		 */
4450 		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4451 		if (error)
4452 			return (error);
4453 
4454 		error = failover_lookup(rp->r_path, rootvp,
4455 		    fi->lookupproc, fi->xattrdirproc, &nvp);
4456 
4457 		VN_RELE(rootvp);
4458 
4459 		if (error)
4460 			return (error);
4461 
4462 		/*
4463 		 * If we found the same rnode, we're done now
4464 		 */
4465 		if (nvp == vp) {
4466 			/*
4467 			 * Failed and the new server may physically be same
4468 			 * OR may share a same disk subsystem. In this case
4469 			 * file handle for a particular file path is not going
4470 			 * to change, given the same filehandle lookup will
4471 			 * always locate the same rnode as the existing one.
4472 			 * All we might need to do is to update the r_server
4473 			 * with the current servinfo.
4474 			 */
4475 			if (!VALID_FH(fi)) {
4476 				rp->r_server = mi->mi_curr_serv;
4477 			}
4478 			VN_RELE(nvp);
4479 			return (0);
4480 		}
4481 
4482 		/*
4483 		 * Try to make it so that no one else will find this
4484 		 * vnode because it is just a temporary to hold the
4485 		 * new file handle until that file handle can be
4486 		 * copied to the original vnode/rnode.
4487 		 */
4488 		nrp = VTOR(nvp);
4489 		mutex_enter(&mi->mi_remap_lock);
4490 		/*
4491 		 * Some other thread could have raced in here and could
4492 		 * have done the remap for this particular rnode before
4493 		 * this thread here. Check for rp->r_server and
4494 		 * mi->mi_curr_serv and return if they are same.
4495 		 */
4496 		if (VALID_FH(fi)) {
4497 			mutex_exit(&mi->mi_remap_lock);
4498 			VN_RELE(nvp);
4499 			return (0);
4500 		}
4501 
4502 		if (nrp->r_flags & RHASHED)
4503 			rp_rmhash(nrp);
4504 
4505 		/*
4506 		 * As a heuristic check on the validity of the new
4507 		 * file, check that the size and type match against
4508 		 * that we remember from the old version.
4509 		 */
4510 		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4511 			mutex_exit(&mi->mi_remap_lock);
4512 			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4513 			    "NFS replicas %s and %s: file %s not same.",
4514 			    rp->r_server->sv_hostname,
4515 			    nrp->r_server->sv_hostname, rp->r_path);
4516 			VN_RELE(nvp);
4517 			return (EINVAL);
4518 		}
4519 
4520 		/*
4521 		 * snarf the filehandle from the new rnode
4522 		 * then release it, again while updating the
4523 		 * hash queues for the rnode.
4524 		 */
4525 		if (rp->r_flags & RHASHED)
4526 			rp_rmhash(rp);
4527 		rp->r_server = mi->mi_curr_serv;
4528 		rp->r_fh = nrp->r_fh;
4529 		rp->r_hashq = nrp->r_hashq;
4530 		/*
4531 		 * Copy the attributes from the new rnode to the old
4532 		 * rnode.  This will help to reduce unnecessary page
4533 		 * cache flushes.
4534 		 */
4535 		rp->r_attr = nrp->r_attr;
4536 		rp->r_attrtime = nrp->r_attrtime;
4537 		rp->r_mtime = nrp->r_mtime;
4538 		(void) nfs_free_data_reclaim(rp);
4539 		nfs_setswaplike(vp, &rp->r_attr);
4540 		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4541 		rp_addhash(rp);
4542 		rw_exit(&rp->r_hashq->r_lock);
4543 		mutex_exit(&mi->mi_remap_lock);
4544 		VN_RELE(nvp);
4545 	}
4546 
4547 	/*
4548 	 * Update successful failover remap count
4549 	 */
4550 	mutex_enter(&mi->mi_lock);
4551 	mi->mi_remap++;
4552 	mutex_exit(&mi->mi_lock);
4553 #ifdef DEBUG
4554 	nfscl->nfscl_stat.remap.value.ui64++;
4555 #endif
4556 
4557 	/*
4558 	 * If we have a copied filehandle to update, do it now.
4559 	 */
4560 	if (fi->fhp != NULL && fi->copyproc != NULL)
4561 		(*fi->copyproc)(fi->fhp, vp);
4562 
4563 	return (0);
4564 }
4565 
4566 /*
4567  * NFS client failover support
4568  *
4569  * We want a simple pathname lookup routine to parse the pieces
4570  * of path in rp->r_path.  We know that the path was a created
4571  * as rnodes were made, so we know we have only to deal with
4572  * paths that look like:
4573  *	dir1/dir2/dir3/file
4574  * Any evidence of anything like .., symlinks, and ENOTDIR
4575  * are hard errors, because they mean something in this filesystem
4576  * is different from the one we came from, or has changed under
4577  * us in some way.  If this is true, we want the failure.
4578  *
4579  * Extended attributes: if the filesystem is mounted with extended
4580  * attributes enabled (-o xattr), the attribute directory will be
4581  * represented in the r_path as the magic name XATTR_RPATH. So if
4582  * we see that name in the pathname, is must be because this node
4583  * is an extended attribute.  Therefore, look it up that way.
4584  */
4585 static int
4586 failover_lookup(char *path, vnode_t *root,
4587     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4588 	vnode_t *, cred_t *, int),
4589     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4590     vnode_t **new)
4591 {
4592 	vnode_t *dvp, *nvp;
4593 	int error = EINVAL;
4594 	char *s, *p, *tmppath;
4595 	size_t len;
4596 	mntinfo_t *mi;
4597 	bool_t xattr;
4598 
4599 	/* Make local copy of path */
4600 	len = strlen(path) + 1;
4601 	tmppath = kmem_alloc(len, KM_SLEEP);
4602 	(void) strcpy(tmppath, path);
4603 	s = tmppath;
4604 
4605 	dvp = root;
4606 	VN_HOLD(dvp);
4607 	mi = VTOMI(root);
4608 	xattr = mi->mi_flags & MI_EXTATTR;
4609 
4610 	do {
4611 		p = strchr(s, '/');
4612 		if (p != NULL)
4613 			*p = '\0';
4614 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4615 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4616 			    RFSCALL_SOFT);
4617 		} else {
4618 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4619 			    CRED(), RFSCALL_SOFT);
4620 		}
4621 		if (p != NULL)
4622 			*p++ = '/';
4623 		if (error) {
4624 			VN_RELE(dvp);
4625 			kmem_free(tmppath, len);
4626 			return (error);
4627 		}
4628 		s = p;
4629 		VN_RELE(dvp);
4630 		dvp = nvp;
4631 	} while (p != NULL);
4632 
4633 	if (nvp != NULL && new != NULL)
4634 		*new = nvp;
4635 	kmem_free(tmppath, len);
4636 	return (0);
4637 }
4638 
4639 /*
4640  * NFS client failover support
4641  *
4642  * sv_free() frees the malloc'd portion of a "servinfo_t".
4643  */
4644 void
4645 sv_free(servinfo_t *svp)
4646 {
4647 	servinfo_t *next;
4648 	struct knetconfig *knconf;
4649 
4650 	while (svp != NULL) {
4651 		next = svp->sv_next;
4652 		if (svp->sv_secdata)
4653 			sec_clnt_freeinfo(svp->sv_secdata);
4654 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4655 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4656 		knconf = svp->sv_knconf;
4657 		if (knconf != NULL) {
4658 			if (knconf->knc_protofmly != NULL)
4659 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4660 			if (knconf->knc_proto != NULL)
4661 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4662 			kmem_free(knconf, sizeof (*knconf));
4663 		}
4664 		knconf = svp->sv_origknconf;
4665 		if (knconf != NULL) {
4666 			if (knconf->knc_protofmly != NULL)
4667 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4668 			if (knconf->knc_proto != NULL)
4669 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4670 			kmem_free(knconf, sizeof (*knconf));
4671 		}
4672 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4673 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4674 		mutex_destroy(&svp->sv_lock);
4675 		kmem_free(svp, sizeof (*svp));
4676 		svp = next;
4677 	}
4678 }
4679 
4680 /*
4681  * Only can return non-zero if intr != 0.
4682  */
4683 int
4684 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4685 {
4686 
4687 	mutex_enter(&l->lock);
4688 
4689 	/*
4690 	 * If this is a nested enter, then allow it.  There
4691 	 * must be as many exits as enters through.
4692 	 */
4693 	if (l->owner == curthread) {
4694 		/* lock is held for writing by current thread */
4695 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4696 		l->count--;
4697 	} else if (rw == RW_READER) {
4698 		/*
4699 		 * While there is a writer active or writers waiting,
4700 		 * then wait for them to finish up and move on.  Then,
4701 		 * increment the count to indicate that a reader is
4702 		 * active.
4703 		 */
4704 		while (l->count < 0 || l->waiters > 0) {
4705 			if (intr) {
4706 				klwp_t *lwp = ttolwp(curthread);
4707 
4708 				if (lwp != NULL)
4709 					lwp->lwp_nostop++;
4710 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4711 					if (lwp != NULL)
4712 						lwp->lwp_nostop--;
4713 					mutex_exit(&l->lock);
4714 					return (EINTR);
4715 				}
4716 				if (lwp != NULL)
4717 					lwp->lwp_nostop--;
4718 			} else
4719 				cv_wait(&l->cv, &l->lock);
4720 		}
4721 		ASSERT(l->count < INT_MAX);
4722 #ifdef	DEBUG
4723 		if ((l->count % 10000) == 9999)
4724 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4725 				"rwlock @ %p\n", l->count, (void *)&l);
4726 #endif
4727 		l->count++;
4728 	} else {
4729 		ASSERT(rw == RW_WRITER);
4730 		/*
4731 		 * While there are readers active or a writer
4732 		 * active, then wait for all of the readers
4733 		 * to finish or for the writer to finish.
4734 		 * Then, set the owner field to curthread and
4735 		 * decrement count to indicate that a writer
4736 		 * is active.
4737 		 */
4738 		while (l->count > 0 || l->owner != NULL) {
4739 			l->waiters++;
4740 			if (intr) {
4741 				klwp_t *lwp = ttolwp(curthread);
4742 
4743 				if (lwp != NULL)
4744 					lwp->lwp_nostop++;
4745 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4746 					if (lwp != NULL)
4747 						lwp->lwp_nostop--;
4748 					l->waiters--;
4749 					cv_broadcast(&l->cv);
4750 					mutex_exit(&l->lock);
4751 					return (EINTR);
4752 				}
4753 				if (lwp != NULL)
4754 					lwp->lwp_nostop--;
4755 			} else
4756 				cv_wait(&l->cv, &l->lock);
4757 			l->waiters--;
4758 		}
4759 		l->owner = curthread;
4760 		l->count--;
4761 	}
4762 
4763 	mutex_exit(&l->lock);
4764 
4765 	return (0);
4766 }
4767 
4768 /*
4769  * If the lock is available, obtain it and return non-zero.  If there is
4770  * already a conflicting lock, return 0 immediately.
4771  */
4772 
4773 int
4774 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4775 {
4776 	mutex_enter(&l->lock);
4777 
4778 	/*
4779 	 * If this is a nested enter, then allow it.  There
4780 	 * must be as many exits as enters through.
4781 	 */
4782 	if (l->owner == curthread) {
4783 		/* lock is held for writing by current thread */
4784 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4785 		l->count--;
4786 	} else if (rw == RW_READER) {
4787 		/*
4788 		 * If there is a writer active or writers waiting, deny the
4789 		 * lock.  Otherwise, bump the count of readers.
4790 		 */
4791 		if (l->count < 0 || l->waiters > 0) {
4792 			mutex_exit(&l->lock);
4793 			return (0);
4794 		}
4795 		l->count++;
4796 	} else {
4797 		ASSERT(rw == RW_WRITER);
4798 		/*
4799 		 * If there are readers active or a writer active, deny the
4800 		 * lock.  Otherwise, set the owner field to curthread and
4801 		 * decrement count to indicate that a writer is active.
4802 		 */
4803 		if (l->count > 0 || l->owner != NULL) {
4804 			mutex_exit(&l->lock);
4805 			return (0);
4806 		}
4807 		l->owner = curthread;
4808 		l->count--;
4809 	}
4810 
4811 	mutex_exit(&l->lock);
4812 
4813 	return (1);
4814 }
4815 
4816 void
4817 nfs_rw_exit(nfs_rwlock_t *l)
4818 {
4819 
4820 	mutex_enter(&l->lock);
4821 	/*
4822 	 * If this is releasing a writer lock, then increment count to
4823 	 * indicate that there is one less writer active.  If this was
4824 	 * the last of possibly nested writer locks, then clear the owner
4825 	 * field as well to indicate that there is no writer active
4826 	 * and wakeup any possible waiting writers or readers.
4827 	 *
4828 	 * If releasing a reader lock, then just decrement count to
4829 	 * indicate that there is one less reader active.  If this was
4830 	 * the last active reader and there are writer(s) waiting,
4831 	 * then wake up the first.
4832 	 */
4833 	if (l->owner != NULL) {
4834 		ASSERT(l->owner == curthread);
4835 		l->count++;
4836 		if (l->count == 0) {
4837 			l->owner = NULL;
4838 			cv_broadcast(&l->cv);
4839 		}
4840 	} else {
4841 		ASSERT(l->count > 0);
4842 		l->count--;
4843 		if (l->count == 0 && l->waiters > 0)
4844 			cv_broadcast(&l->cv);
4845 	}
4846 	mutex_exit(&l->lock);
4847 }
4848 
4849 int
4850 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4851 {
4852 
4853 	if (rw == RW_READER)
4854 		return (l->count > 0);
4855 	ASSERT(rw == RW_WRITER);
4856 	return (l->count < 0);
4857 }
4858 
4859 /* ARGSUSED */
4860 void
4861 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4862 {
4863 
4864 	l->count = 0;
4865 	l->waiters = 0;
4866 	l->owner = NULL;
4867 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4868 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4869 }
4870 
4871 void
4872 nfs_rw_destroy(nfs_rwlock_t *l)
4873 {
4874 
4875 	mutex_destroy(&l->lock);
4876 	cv_destroy(&l->cv);
4877 }
4878 
4879 int
4880 nfs3_rddir_compar(const void *x, const void *y)
4881 {
4882 	rddir_cache *a = (rddir_cache *)x;
4883 	rddir_cache *b = (rddir_cache *)y;
4884 
4885 	if (a->nfs3_cookie == b->nfs3_cookie) {
4886 		if (a->buflen == b->buflen)
4887 			return (0);
4888 		if (a->buflen < b->buflen)
4889 			return (-1);
4890 		return (1);
4891 	}
4892 
4893 	if (a->nfs3_cookie < b->nfs3_cookie)
4894 		return (-1);
4895 
4896 	return (1);
4897 }
4898 
4899 int
4900 nfs_rddir_compar(const void *x, const void *y)
4901 {
4902 	rddir_cache *a = (rddir_cache *)x;
4903 	rddir_cache *b = (rddir_cache *)y;
4904 
4905 	if (a->nfs_cookie == b->nfs_cookie) {
4906 		if (a->buflen == b->buflen)
4907 			return (0);
4908 		if (a->buflen < b->buflen)
4909 			return (-1);
4910 		return (1);
4911 	}
4912 
4913 	if (a->nfs_cookie < b->nfs_cookie)
4914 		return (-1);
4915 
4916 	return (1);
4917 }
4918 
4919 static char *
4920 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4921 {
4922 	servinfo_t *s;
4923 	char *srvnames;
4924 	char *namep;
4925 	size_t length;
4926 
4927 	/*
4928 	 * Calculate the length of the string required to hold all
4929 	 * of the server names plus either a comma or a null
4930 	 * character following each individual one.
4931 	 */
4932 	length = 0;
4933 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4934 		length += s->sv_hostnamelen;
4935 
4936 	srvnames = kmem_alloc(length, KM_SLEEP);
4937 
4938 	namep = srvnames;
4939 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4940 		(void) strcpy(namep, s->sv_hostname);
4941 		namep += s->sv_hostnamelen - 1;
4942 		*namep++ = ',';
4943 	}
4944 	*--namep = '\0';
4945 
4946 	*len = length;
4947 
4948 	return (srvnames);
4949 }
4950 
4951 /*
4952  * These two functions are temporary and designed for the upgrade-workaround
4953  * only.  They cannot be used for general zone-crossing NFS client support, and
4954  * will be removed shortly.
4955  *
4956  * When the workaround is enabled, all NFS traffic is forced into the global
4957  * zone.  These functions are called when the code needs to refer to the state
4958  * of the underlying network connection.  They're not called when the function
4959  * needs to refer to the state of the process that invoked the system call.
4960  * (E.g., when checking whether the zone is shutting down during the mount()
4961  * call.)
4962  */
4963 
4964 struct zone *
4965 nfs_zone(void)
4966 {
4967 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
4968 }
4969 
4970 zoneid_t
4971 nfs_zoneid(void)
4972 {
4973 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
4974 }
4975