xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs_subr.c (revision 726fad2a65f16c200a03969c29cb5c86c2d427db)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/param.h>
27 #include <sys/types.h>
28 #include <sys/systm.h>
29 #include <sys/cred.h>
30 #include <sys/proc.h>
31 #include <sys/user.h>
32 #include <sys/time.h>
33 #include <sys/buf.h>
34 #include <sys/vfs.h>
35 #include <sys/vnode.h>
36 #include <sys/socket.h>
37 #include <sys/uio.h>
38 #include <sys/tiuser.h>
39 #include <sys/swap.h>
40 #include <sys/errno.h>
41 #include <sys/debug.h>
42 #include <sys/kmem.h>
43 #include <sys/kstat.h>
44 #include <sys/cmn_err.h>
45 #include <sys/vtrace.h>
46 #include <sys/session.h>
47 #include <sys/dnlc.h>
48 #include <sys/bitmap.h>
49 #include <sys/acl.h>
50 #include <sys/ddi.h>
51 #include <sys/pathname.h>
52 #include <sys/flock.h>
53 #include <sys/dirent.h>
54 #include <sys/flock.h>
55 #include <sys/callb.h>
56 #include <sys/atomic.h>
57 #include <sys/list.h>
58 #include <sys/tsol/tnet.h>
59 #include <sys/priv.h>
60 #include <sys/sdt.h>
61 #include <sys/attr.h>
62 
63 #include <inet/ip6.h>
64 
65 #include <rpc/types.h>
66 #include <rpc/xdr.h>
67 #include <rpc/auth.h>
68 #include <rpc/clnt.h>
69 
70 #include <nfs/nfs.h>
71 #include <nfs/nfs4.h>
72 #include <nfs/nfs_clnt.h>
73 #include <nfs/rnode.h>
74 #include <nfs/nfs_acl.h>
75 
76 #include <sys/tsol/label.h>
77 
78 /*
79  * The hash queues for the access to active and cached rnodes
80  * are organized as doubly linked lists.  A reader/writer lock
81  * for each hash bucket is used to control access and to synchronize
82  * lookups, additions, and deletions from the hash queue.
83  *
84  * The rnode freelist is organized as a doubly linked list with
85  * a head pointer.  Additions and deletions are synchronized via
86  * a single mutex.
87  *
88  * In order to add an rnode to the free list, it must be hashed into
89  * a hash queue and the exclusive lock to the hash queue be held.
90  * If an rnode is not hashed into a hash queue, then it is destroyed
91  * because it represents no valuable information that can be reused
92  * about the file.  The exclusive lock to the hash queue must be
93  * held in order to prevent a lookup in the hash queue from finding
94  * the rnode and using it and assuming that the rnode is not on the
95  * freelist.  The lookup in the hash queue will have the hash queue
96  * locked, either exclusive or shared.
97  *
98  * The vnode reference count for each rnode is not allowed to drop
99  * below 1.  This prevents external entities, such as the VM
100  * subsystem, from acquiring references to vnodes already on the
101  * freelist and then trying to place them back on the freelist
102  * when their reference is released.  This means that when an
103  * rnode is looked up in the hash queues, then either the rnode
104  * is removed from the freelist and that reference is transferred to
105  * the new reference or the vnode reference count must be incremented
106  * accordingly.  The mutex for the freelist must be held in order to
107  * accurately test to see if the rnode is on the freelist or not.
108  * The hash queue lock might be held shared and it is possible that
109  * two different threads may race to remove the rnode from the
110  * freelist.  This race can be resolved by holding the mutex for the
111  * freelist.  Please note that the mutex for the freelist does not
112  * need to be held if the rnode is not on the freelist.  It can not be
113  * placed on the freelist due to the requirement that the thread
114  * putting the rnode on the freelist must hold the exclusive lock
115  * to the hash queue and the thread doing the lookup in the hash
116  * queue is holding either a shared or exclusive lock to the hash
117  * queue.
118  *
119  * The lock ordering is:
120  *
121  *	hash bucket lock -> vnode lock
122  *	hash bucket lock -> freelist lock
123  */
124 static rhashq_t *rtable;
125 
126 static kmutex_t rpfreelist_lock;
127 static rnode_t *rpfreelist = NULL;
128 static long rnew = 0;
129 long nrnode = 0;
130 
131 static int rtablesize;
132 static int rtablemask;
133 
134 static int hashlen = 4;
135 
136 static struct kmem_cache *rnode_cache;
137 
138 /*
139  * Mutex to protect the following variables:
140  *	nfs_major
141  *	nfs_minor
142  */
143 kmutex_t nfs_minor_lock;
144 int nfs_major;
145 int nfs_minor;
146 
147 /* Do we allow preepoch (negative) time values otw? */
148 bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
149 
150 /*
151  * Access cache
152  */
153 static acache_hash_t *acache;
154 static long nacache;	/* used strictly to size the number of hash queues */
155 
156 static int acachesize;
157 static int acachemask;
158 static struct kmem_cache *acache_cache;
159 
160 /*
161  * Client side utilities
162  */
163 
164 /*
165  * client side statistics
166  */
167 static const struct clstat clstat_tmpl = {
168 	{ "calls",	KSTAT_DATA_UINT64 },
169 	{ "badcalls",	KSTAT_DATA_UINT64 },
170 	{ "clgets",	KSTAT_DATA_UINT64 },
171 	{ "cltoomany",	KSTAT_DATA_UINT64 },
172 #ifdef DEBUG
173 	{ "clalloc",	KSTAT_DATA_UINT64 },
174 	{ "noresponse",	KSTAT_DATA_UINT64 },
175 	{ "failover",	KSTAT_DATA_UINT64 },
176 	{ "remap",	KSTAT_DATA_UINT64 },
177 #endif
178 };
179 
/*
 * The following statistics describe the behavior of the system as a
 * whole and do not correspond to any one particular zone.  They exist
 * only in DEBUG kernels.
 *
 * Note that the "rpath" member is published under the name "r_path".
 */
#ifdef DEBUG
static struct clstat_debug {
	kstat_named_t	nrnode;			/* number of allocated rnodes */
	kstat_named_t	access;			/* size of access cache */
	kstat_named_t	dirent;			/* size of readdir cache */
	kstat_named_t	dirents;		/* size of readdir buf cache */
	kstat_named_t	reclaim;		/* number of reclaims */
	kstat_named_t	clreclaim;		/* number of cl reclaims */
	kstat_named_t	f_reclaim;		/* number of free reclaims */
	kstat_named_t	a_reclaim;		/* number of active reclaims */
	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
	kstat_named_t	rpath;			/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */
209 
210 /*
211  * We keep a global list of per-zone client data, so we can clean up all zones
212  * if we get low on memory.
213  */
214 static list_t nfs_clnt_list;
215 static kmutex_t nfs_clnt_list_lock;
216 static zone_key_t nfsclnt_zone_key;
217 
218 static struct kmem_cache *chtab_cache;
219 
220 /*
221  * Some servers do not properly update the attributes of the
222  * directory when changes are made.  To allow interoperability
223  * with these broken servers, the nfs_disable_rddir_cache
224  * parameter must be set in /etc/system
225  */
226 int nfs_disable_rddir_cache = 0;
227 
228 int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
229 		    struct chtab **);
230 void		clfree(CLIENT *, struct chtab *);
231 static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
232 		    struct chtab **, struct nfs_clnt *);
233 static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
234 		    struct chtab **, struct nfs_clnt *);
235 static void	clreclaim(void *);
236 static int	nfs_feedback(int, int, mntinfo_t *);
237 static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
238 		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
239 		    failinfo_t *);
240 static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
241 		    caddr_t, cred_t *, int *, int, failinfo_t *);
242 static void	rinactive(rnode_t *, cred_t *);
243 static int	rtablehash(nfs_fhandle *);
244 static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
245 		    struct vnodeops *,
246 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
247 			cred_t *),
248 		    int (*)(const void *, const void *), int *, cred_t *,
249 		    char *, char *);
250 static void	rp_rmfree(rnode_t *);
251 static void	rp_addhash(rnode_t *);
252 static void	rp_rmhash_locked(rnode_t *);
253 static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
254 static void	destroy_rnode(rnode_t *);
255 static void	rddir_cache_free(rddir_cache *);
256 static int	nfs_free_data_reclaim(rnode_t *);
257 static int	nfs_active_data_reclaim(rnode_t *);
258 static int	nfs_free_reclaim(void);
259 static int	nfs_active_reclaim(void);
260 static int	nfs_rnode_reclaim(void);
261 static void	nfs_reclaim(void *);
262 static int	failover_safe(failinfo_t *);
263 static void	failover_newserver(mntinfo_t *mi);
264 static void	failover_thread(mntinfo_t *mi);
265 static int	failover_wait(mntinfo_t *);
266 static int	failover_remap(failinfo_t *);
267 static int	failover_lookup(char *, vnode_t *,
268 		    int (*)(vnode_t *, char *, vnode_t **,
269 			struct pathname *, int, vnode_t *, cred_t *, int),
270 		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
271 		    vnode_t **);
272 static void	nfs_free_r_path(rnode_t *);
273 static void	nfs_set_vroot(vnode_t *);
274 static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
275 
276 /*
277  * from rpcsec module (common/rpcsec)
278  */
279 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
280 extern void sec_clnt_freeh(AUTH *);
281 extern void sec_clnt_freeinfo(struct sec_data *);
282 
283 /*
284  * used in mount policy
285  */
286 extern ts_label_t *getflabel_cipso(vfs_t *);
287 
/*
 * EIO or EINTR are not recoverable errors.
 *
 * Evaluates to nonzero for every other error value.  Both the argument
 * and the whole expansion are parenthesized so that the macro gives the
 * right answer for an arbitrary expression argument and when it is
 * embedded in a larger expression.  The argument is evaluated twice, so
 * it must not have side effects.
 */
#define	IS_RECOVERABLE_ERROR(error)	\
	(!(((error) == EINTR) || ((error) == EIO)))
292 
/*
 * Message format strings.  The DEBUG variants take an additional %d
 * right after "NFS" (presumably the protocol version -- confirm at the
 * call sites) ahead of the %s server name.
 */
#ifndef DEBUG
#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
#else
#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
#endif
300 /*
301  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
302  */
303 static int
304 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
305     struct chtab **chp, struct nfs_clnt *nfscl)
306 {
307 	struct chhead *ch, *newch;
308 	struct chhead **plistp;
309 	struct chtab *cp;
310 	int error;
311 	k_sigset_t smask;
312 
313 	if (newcl == NULL || chp == NULL || ci == NULL)
314 		return (EINVAL);
315 
316 	*newcl = NULL;
317 	*chp = NULL;
318 
319 	/*
320 	 * Find an unused handle or create one
321 	 */
322 	newch = NULL;
323 	nfscl->nfscl_stat.clgets.value.ui64++;
324 top:
325 	/*
326 	 * Find the correct entry in the cache to check for free
327 	 * client handles.  The search is based on the RPC program
328 	 * number, program version number, dev_t for the transport
329 	 * device, and the protocol family.
330 	 */
331 	mutex_enter(&nfscl->nfscl_chtable_lock);
332 	plistp = &nfscl->nfscl_chtable;
333 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
334 		if (ch->ch_prog == ci->cl_prog &&
335 		    ch->ch_vers == ci->cl_vers &&
336 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
337 		    (strcmp(ch->ch_protofmly,
338 		    svp->sv_knconf->knc_protofmly) == 0))
339 			break;
340 		plistp = &ch->ch_next;
341 	}
342 
343 	/*
344 	 * If we didn't find a cache entry for this quadruple, then
345 	 * create one.  If we don't have one already preallocated,
346 	 * then drop the cache lock, create one, and then start over.
347 	 * If we did have a preallocated entry, then just add it to
348 	 * the front of the list.
349 	 */
350 	if (ch == NULL) {
351 		if (newch == NULL) {
352 			mutex_exit(&nfscl->nfscl_chtable_lock);
353 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
354 			newch->ch_timesused = 0;
355 			newch->ch_prog = ci->cl_prog;
356 			newch->ch_vers = ci->cl_vers;
357 			newch->ch_dev = svp->sv_knconf->knc_rdev;
358 			newch->ch_protofmly = kmem_alloc(
359 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
360 			    KM_SLEEP);
361 			(void) strcpy(newch->ch_protofmly,
362 			    svp->sv_knconf->knc_protofmly);
363 			newch->ch_list = NULL;
364 			goto top;
365 		}
366 		ch = newch;
367 		newch = NULL;
368 		ch->ch_next = nfscl->nfscl_chtable;
369 		nfscl->nfscl_chtable = ch;
370 	/*
371 	 * We found a cache entry, but if it isn't on the front of the
372 	 * list, then move it to the front of the list to try to take
373 	 * advantage of locality of operations.
374 	 */
375 	} else if (ch != nfscl->nfscl_chtable) {
376 		*plistp = ch->ch_next;
377 		ch->ch_next = nfscl->nfscl_chtable;
378 		nfscl->nfscl_chtable = ch;
379 	}
380 
381 	/*
382 	 * If there was a free client handle cached, then remove it
383 	 * from the list, init it, and use it.
384 	 */
385 	if (ch->ch_list != NULL) {
386 		cp = ch->ch_list;
387 		ch->ch_list = cp->ch_list;
388 		mutex_exit(&nfscl->nfscl_chtable_lock);
389 		if (newch != NULL) {
390 			kmem_free(newch->ch_protofmly,
391 			    strlen(newch->ch_protofmly) + 1);
392 			kmem_free(newch, sizeof (*newch));
393 		}
394 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
395 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
396 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
397 		    &cp->ch_client->cl_auth);
398 		if (error || cp->ch_client->cl_auth == NULL) {
399 			CLNT_DESTROY(cp->ch_client);
400 			kmem_cache_free(chtab_cache, cp);
401 			return ((error != 0) ? error : EINTR);
402 		}
403 		ch->ch_timesused++;
404 		*newcl = cp->ch_client;
405 		*chp = cp;
406 		return (0);
407 	}
408 
409 	/*
410 	 * There weren't any free client handles which fit, so allocate
411 	 * a new one and use that.
412 	 */
413 #ifdef DEBUG
414 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
415 #endif
416 	mutex_exit(&nfscl->nfscl_chtable_lock);
417 
418 	nfscl->nfscl_stat.cltoomany.value.ui64++;
419 	if (newch != NULL) {
420 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
421 		kmem_free(newch, sizeof (*newch));
422 	}
423 
424 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
425 	cp->ch_head = ch;
426 
427 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
428 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
429 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
430 	sigunintr(&smask);
431 
432 	if (error != 0) {
433 		kmem_cache_free(chtab_cache, cp);
434 #ifdef DEBUG
435 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
436 #endif
437 		/*
438 		 * Warning is unnecessary if error is EINTR.
439 		 */
440 		if (error != EINTR) {
441 			nfs_cmn_err(error, CE_WARN,
442 			    "clget: couldn't create handle: %m\n");
443 		}
444 		return (error);
445 	}
446 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
447 	auth_destroy(cp->ch_client->cl_auth);
448 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
449 	    &cp->ch_client->cl_auth);
450 	if (error || cp->ch_client->cl_auth == NULL) {
451 		CLNT_DESTROY(cp->ch_client);
452 		kmem_cache_free(chtab_cache, cp);
453 #ifdef DEBUG
454 		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
455 #endif
456 		return ((error != 0) ? error : EINTR);
457 	}
458 	ch->ch_timesused++;
459 	*newcl = cp->ch_client;
460 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
461 	*chp = cp;
462 	return (0);
463 }
464 
465 int
466 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
467     struct chtab **chp)
468 {
469 	struct nfs_clnt *nfscl;
470 
471 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
472 	ASSERT(nfscl != NULL);
473 
474 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
475 }
476 
477 static int
478 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
479     struct chtab **chp, struct nfs_clnt *nfscl)
480 {
481 	clinfo_t ci;
482 	int error;
483 
484 	/*
485 	 * Set read buffer size to rsize
486 	 * and add room for RPC headers.
487 	 */
488 	ci.cl_readsize = mi->mi_tsize;
489 	if (ci.cl_readsize != 0)
490 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
491 
492 	/*
493 	 * If soft mount and server is down just try once.
494 	 * meaning: do not retransmit.
495 	 */
496 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
497 		ci.cl_retrans = 0;
498 	else
499 		ci.cl_retrans = mi->mi_retrans;
500 
501 	ci.cl_prog = NFS_ACL_PROGRAM;
502 	ci.cl_vers = mi->mi_vers;
503 	ci.cl_flags = mi->mi_flags;
504 
505 	/*
506 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
507 	 * security flavor, the client tries to establish a security context
508 	 * by contacting the server. If the connection is timed out or reset,
509 	 * e.g. server reboot, we will try again.
510 	 */
511 	do {
512 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
513 
514 		if (error == 0)
515 			break;
516 
517 		/*
518 		 * For forced unmount or zone shutdown, bail out, no retry.
519 		 */
520 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
521 			error = EIO;
522 			break;
523 		}
524 
525 		/* do not retry for softmount */
526 		if (!(mi->mi_flags & MI_HARD))
527 			break;
528 
529 		/* let the caller deal with the failover case */
530 		if (FAILOVER_MOUNT(mi))
531 			break;
532 
533 	} while (error == ETIMEDOUT || error == ECONNRESET);
534 
535 	return (error);
536 }
537 
538 static int
539 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
540     struct chtab **chp, struct nfs_clnt *nfscl)
541 {
542 	clinfo_t ci;
543 	int error;
544 
545 	/*
546 	 * Set read buffer size to rsize
547 	 * and add room for RPC headers.
548 	 */
549 	ci.cl_readsize = mi->mi_tsize;
550 	if (ci.cl_readsize != 0)
551 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
552 
553 	/*
554 	 * If soft mount and server is down just try once.
555 	 * meaning: do not retransmit.
556 	 */
557 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
558 		ci.cl_retrans = 0;
559 	else
560 		ci.cl_retrans = mi->mi_retrans;
561 
562 	ci.cl_prog = mi->mi_prog;
563 	ci.cl_vers = mi->mi_vers;
564 	ci.cl_flags = mi->mi_flags;
565 
566 	/*
567 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
568 	 * security flavor, the client tries to establish a security context
569 	 * by contacting the server. If the connection is timed out or reset,
570 	 * e.g. server reboot, we will try again.
571 	 */
572 	do {
573 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
574 
575 		if (error == 0)
576 			break;
577 
578 		/*
579 		 * For forced unmount or zone shutdown, bail out, no retry.
580 		 */
581 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
582 			error = EIO;
583 			break;
584 		}
585 
586 		/* do not retry for softmount */
587 		if (!(mi->mi_flags & MI_HARD))
588 			break;
589 
590 		/* let the caller deal with the failover case */
591 		if (FAILOVER_MOUNT(mi))
592 			break;
593 
594 	} while (error == ETIMEDOUT || error == ECONNRESET);
595 
596 	return (error);
597 }
598 
599 static void
600 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
601 {
602 	if (cl->cl_auth != NULL) {
603 		sec_clnt_freeh(cl->cl_auth);
604 		cl->cl_auth = NULL;
605 	}
606 
607 	/*
608 	 * Timestamp this cache entry so that we know when it was last
609 	 * used.
610 	 */
611 	cp->ch_freed = gethrestime_sec();
612 
613 	/*
614 	 * Add the free client handle to the front of the list.
615 	 * This way, the list will be sorted in youngest to oldest
616 	 * order.
617 	 */
618 	mutex_enter(&nfscl->nfscl_chtable_lock);
619 	cp->ch_list = cp->ch_head->ch_list;
620 	cp->ch_head->ch_list = cp;
621 	mutex_exit(&nfscl->nfscl_chtable_lock);
622 }
623 
624 void
625 clfree(CLIENT *cl, struct chtab *cp)
626 {
627 	struct nfs_clnt *nfscl;
628 
629 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
630 	ASSERT(nfscl != NULL);
631 
632 	clfree_impl(cl, cp, nfscl);
633 }
634 
635 #define	CL_HOLDTIME	60	/* time to hold client handles */
636 
637 static void
638 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
639 {
640 	struct chhead *ch;
641 	struct chtab *cp;	/* list of objects that can be reclaimed */
642 	struct chtab *cpe;
643 	struct chtab *cpl;
644 	struct chtab **cpp;
645 #ifdef DEBUG
646 	int n = 0;
647 #endif
648 
649 	/*
650 	 * Need to reclaim some memory, so step through the cache
651 	 * looking through the lists for entries which can be freed.
652 	 */
653 	cp = NULL;
654 
655 	mutex_enter(&nfscl->nfscl_chtable_lock);
656 
657 	/*
658 	 * Here we step through each non-NULL quadruple and start to
659 	 * construct the reclaim list pointed to by cp.  Note that
660 	 * cp will contain all eligible chtab entries.  When this traversal
661 	 * completes, chtab entries from the last quadruple will be at the
662 	 * front of cp and entries from previously inspected quadruples have
663 	 * been appended to the rear of cp.
664 	 */
665 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
666 		if (ch->ch_list == NULL)
667 			continue;
668 		/*
669 		 * Search each list for entries older then
670 		 * cl_holdtime seconds.  The lists are maintained
671 		 * in youngest to oldest order so that when the
672 		 * first entry is found which is old enough, then
673 		 * all of the rest of the entries on the list will
674 		 * be old enough as well.
675 		 */
676 		cpl = ch->ch_list;
677 		cpp = &ch->ch_list;
678 		while (cpl != NULL &&
679 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
680 			cpp = &cpl->ch_list;
681 			cpl = cpl->ch_list;
682 		}
683 		if (cpl != NULL) {
684 			*cpp = NULL;
685 			if (cp != NULL) {
686 				cpe = cpl;
687 				while (cpe->ch_list != NULL)
688 					cpe = cpe->ch_list;
689 				cpe->ch_list = cp;
690 			}
691 			cp = cpl;
692 		}
693 	}
694 
695 	mutex_exit(&nfscl->nfscl_chtable_lock);
696 
697 	/*
698 	 * If cp is empty, then there is nothing to reclaim here.
699 	 */
700 	if (cp == NULL)
701 		return;
702 
703 	/*
704 	 * Step through the list of entries to free, destroying each client
705 	 * handle and kmem_free'ing the memory for each entry.
706 	 */
707 	while (cp != NULL) {
708 #ifdef DEBUG
709 		n++;
710 #endif
711 		CLNT_DESTROY(cp->ch_client);
712 		cpl = cp->ch_list;
713 		kmem_cache_free(chtab_cache, cp);
714 		cp = cpl;
715 	}
716 
717 #ifdef DEBUG
718 	/*
719 	 * Update clalloc so that nfsstat shows the current number
720 	 * of allocated client handles.
721 	 */
722 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
723 #endif
724 }
725 
726 /* ARGSUSED */
727 static void
728 clreclaim(void *all)
729 {
730 	struct nfs_clnt *nfscl;
731 
732 #ifdef DEBUG
733 	clstat_debug.clreclaim.value.ui64++;
734 #endif
735 	/*
736 	 * The system is low on memory; go through and try to reclaim some from
737 	 * every zone on the system.
738 	 */
739 	mutex_enter(&nfs_clnt_list_lock);
740 	nfscl = list_head(&nfs_clnt_list);
741 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
742 		clreclaim_zone(nfscl, CL_HOLDTIME);
743 	mutex_exit(&nfs_clnt_list_lock);
744 }
745 
/*
 * Minimum time-out values indexed by call type.
 * These units are in "eighths" of a second to avoid multiplies.
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * Back off for retransmission timeout.  MAXTIMO is 20 seconds expressed
 * in clock ticks (hz ticks per second).  backoff() doubles a timeout
 * that is still below MAXTIMO, never going past MAXTIMO, and leaves any
 * other timeout unchanged.
 */
#define	MAXTIMO	(20*hz)
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))

/*
 * Limits used by nfs_feedback() when it adjusts the transfer sizes.
 */
#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
764 
765 /*
766  * Function called when rfscall notices that we have been
767  * re-transmitting, or when we get a response without retransmissions.
768  * Return 1 if the transfer size was adjusted down - 0 if no change.
769  */
770 static int
771 nfs_feedback(int flag, int which, mntinfo_t *mi)
772 {
773 	int kind;
774 	int r = 0;
775 
776 	mutex_enter(&mi->mi_lock);
777 	if (flag == FEEDBACK_REXMIT1) {
778 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
779 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
780 			goto done;
781 		if (mi->mi_curread > MIN_NFS_TSIZE) {
782 			mi->mi_curread /= 2;
783 			if (mi->mi_curread < MIN_NFS_TSIZE)
784 				mi->mi_curread = MIN_NFS_TSIZE;
785 			r = 1;
786 		}
787 
788 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
789 			mi->mi_curwrite /= 2;
790 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
791 				mi->mi_curwrite = MIN_NFS_TSIZE;
792 			r = 1;
793 		}
794 	} else if (flag == FEEDBACK_OK) {
795 		kind = mi->mi_timer_type[which];
796 		if (kind == 0 ||
797 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
798 			goto done;
799 		if (kind == 1) {
800 			if (mi->mi_curread >= mi->mi_tsize)
801 				goto done;
802 			mi->mi_curread +=  MIN_NFS_TSIZE;
803 			if (mi->mi_curread > mi->mi_tsize/2)
804 				mi->mi_curread = mi->mi_tsize;
805 		} else if (kind == 2) {
806 			if (mi->mi_curwrite >= mi->mi_stsize)
807 				goto done;
808 			mi->mi_curwrite += MIN_NFS_TSIZE;
809 			if (mi->mi_curwrite > mi->mi_stsize/2)
810 				mi->mi_curwrite = mi->mi_stsize;
811 		}
812 	}
813 done:
814 	mutex_exit(&mi->mi_lock);
815 	return (r);
816 }
817 
818 #ifdef DEBUG
819 static int rfs2call_hits = 0;
820 static int rfs2call_misses = 0;
821 #endif
822 
823 int
824 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
825     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
826     enum nfsstat *statusp, int flags, failinfo_t *fi)
827 {
828 	int rpcerror;
829 	enum clnt_stat rpc_status;
830 
831 	ASSERT(statusp != NULL);
832 
833 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
834 	    cr, douprintf, &rpc_status, flags, fi);
835 	if (!rpcerror) {
836 		/*
837 		 * See crnetadjust() for comments.
838 		 */
839 		if (*statusp == NFSERR_ACCES &&
840 		    (cr = crnetadjust(cr)) != NULL) {
841 #ifdef DEBUG
842 			rfs2call_hits++;
843 #endif
844 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
845 			    resp, cr, douprintf, NULL, flags, fi);
846 			crfree(cr);
847 #ifdef DEBUG
848 			if (*statusp == NFSERR_ACCES)
849 				rfs2call_misses++;
850 #endif
851 		}
852 	} else if (rpc_status == RPC_PROCUNAVAIL) {
853 		*statusp = NFSERR_OPNOTSUPP;
854 		rpcerror = 0;
855 	}
856 
857 	return (rpcerror);
858 }
859 
/*
 * Ten seconds expressed in clock ticks.  The expansion is parenthesized
 * so that it stays a single operand when the macro is used inside a
 * larger expression.
 */
#define	NFS3_JUKEBOX_DELAY	(10 * hz)

/*
 * Number of ticks rfs3call() passes to delay() before it retries after
 * NFS3ERR_JUKEBOX.  NOTE(review): starts out as 0 here; presumably set
 * from NFS3_JUKEBOX_DELAY during initialization elsewhere -- confirm.
 */
static clock_t nfs3_jukebox_delay = 0;
863 
864 #ifdef DEBUG
865 static int rfs3call_hits = 0;
866 static int rfs3call_misses = 0;
867 #endif
868 
869 int
870 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
871     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
872     nfsstat3 *statusp, int flags, failinfo_t *fi)
873 {
874 	int rpcerror;
875 	int user_informed;
876 
877 	user_informed = 0;
878 	do {
879 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
880 		    cr, douprintf, NULL, flags, fi);
881 		if (!rpcerror) {
882 			cred_t *crr;
883 			if (*statusp == NFS3ERR_JUKEBOX) {
884 				if (ttoproc(curthread) == &p0) {
885 					rpcerror = EAGAIN;
886 					break;
887 				}
888 				if (!user_informed) {
889 					user_informed = 1;
890 					uprintf(
891 		"file temporarily unavailable on the server, retrying...\n");
892 				}
893 				delay(nfs3_jukebox_delay);
894 			}
895 			/*
896 			 * See crnetadjust() for comments.
897 			 */
898 			else if (*statusp == NFS3ERR_ACCES &&
899 			    (crr = crnetadjust(cr)) != NULL) {
900 #ifdef DEBUG
901 				rfs3call_hits++;
902 #endif
903 				rpcerror = rfscall(mi, which, xdrargs, argsp,
904 				    xdrres, resp, crr, douprintf,
905 				    NULL, flags, fi);
906 
907 				crfree(crr);
908 #ifdef DEBUG
909 				if (*statusp == NFS3ERR_ACCES)
910 					rfs3call_misses++;
911 #endif
912 			}
913 		}
914 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
915 
916 	return (rpcerror);
917 }
918 
/*
 * VALID_FH(fi) is true when the server recorded in the rnode of fi->vp
 * is the mount's current server.
 *
 * INC_READERS(mi) and DEC_READERS(mi) maintain mi_readers;
 * DEC_READERS() also broadcasts on mi_failover_cv when the count drops
 * to zero.  The visible callers hold mi_lock around both.
 *
 * Every use of a macro argument is parenthesized so that an expression
 * argument expands correctly.  The arguments are evaluated more than
 * once and so must not have side effects.
 */
#define	VALID_FH(fi)	\
	(VTOR((fi)->vp)->r_server == VTOMI((fi)->vp)->mi_curr_serv)
#define	INC_READERS(mi)		{ \
	(mi)->mi_readers++; \
}
#define	DEC_READERS(mi)		{ \
	(mi)->mi_readers--; \
	if ((mi)->mi_readers == 0) \
		cv_broadcast(&(mi)->mi_failover_cv); \
}
928 
929 static int
930 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
931     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
932     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
933 {
934 	CLIENT *client;
935 	struct chtab *ch;
936 	cred_t *cr = icr;
937 	enum clnt_stat status;
938 	struct rpc_err rpcerr, rpcerr_tmp;
939 	struct timeval wait;
940 	int timeo;		/* in units of hz */
941 	int my_rsize, my_wsize;
942 	bool_t tryagain;
943 	bool_t cred_cloned = FALSE;
944 	k_sigset_t smask;
945 	servinfo_t *svp;
946 	struct nfs_clnt *nfscl;
947 	zoneid_t zoneid = getzoneid();
948 	char *msg;
949 #ifdef DEBUG
950 	char *bufp;
951 #endif
952 
953 
954 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
955 	    "rfscall_start:which %d mi %p", which, mi);
956 
957 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
958 	ASSERT(nfscl != NULL);
959 
960 	nfscl->nfscl_stat.calls.value.ui64++;
961 	mi->mi_reqs[which].value.ui64++;
962 
963 	rpcerr.re_status = RPC_SUCCESS;
964 
965 	/*
966 	 * In case of forced unmount or zone shutdown, return EIO.
967 	 */
968 
969 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
970 		rpcerr.re_status = RPC_FAILED;
971 		rpcerr.re_errno = EIO;
972 		return (rpcerr.re_errno);
973 	}
974 
975 	/*
976 	 * Remember the transfer sizes in case
977 	 * nfs_feedback changes them underneath us.
978 	 */
979 	my_rsize = mi->mi_curread;
980 	my_wsize = mi->mi_curwrite;
981 
982 	/*
983 	 * NFS client failover support
984 	 *
985 	 * If this rnode is not in sync with the current server (VALID_FH),
986 	 * we'd like to do a remap to get in sync.  We can be interrupted
987 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
988 	 * use the best info we have to try the RPC.  Part of that is
989 	 * unconditionally updating the filehandle copy kept for V3.
990 	 *
991 	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
992 	 * rw_enter(); we're trying to keep the current server from being
993 	 * changed on us until we're done with the remapping and have a
994 	 * matching client handle.  We don't want to send a filehandle
995 	 * to the wrong host.
996 	 */
997 failoverretry:
998 	if (FAILOVER_MOUNT(mi)) {
999 		mutex_enter(&mi->mi_lock);
1000 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1001 			if (failover_wait(mi)) {
1002 				mutex_exit(&mi->mi_lock);
1003 				return (EINTR);
1004 			}
1005 		}
1006 		INC_READERS(mi);
1007 		mutex_exit(&mi->mi_lock);
1008 		if (fi) {
1009 			if (!VALID_FH(fi) &&
1010 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1011 				int remaperr;
1012 
1013 				svp = mi->mi_curr_serv;
1014 				remaperr = failover_remap(fi);
1015 				if (remaperr != 0) {
1016 #ifdef DEBUG
1017 					if (remaperr != EINTR)
1018 						nfs_cmn_err(remaperr, CE_WARN,
1019 					    "rfscall couldn't failover: %m");
1020 #endif
1021 					mutex_enter(&mi->mi_lock);
1022 					DEC_READERS(mi);
1023 					mutex_exit(&mi->mi_lock);
1024 					/*
1025 					 * If failover_remap returns ETIMEDOUT
1026 					 * and the filesystem is hard mounted
1027 					 * we have to retry the call with a new
1028 					 * server.
1029 					 */
1030 					if ((mi->mi_flags & MI_HARD) &&
1031 					    IS_RECOVERABLE_ERROR(remaperr)) {
1032 						if (svp == mi->mi_curr_serv)
1033 							failover_newserver(mi);
1034 						rpcerr.re_status = RPC_SUCCESS;
1035 						goto failoverretry;
1036 					}
1037 					rpcerr.re_errno = remaperr;
1038 					return (remaperr);
1039 				}
1040 			}
1041 			if (fi->fhp && fi->copyproc)
1042 				(*fi->copyproc)(fi->fhp, fi->vp);
1043 		}
1044 	}
1045 
1046 	/* For TSOL, use a new cred which has net_mac_aware flag */
1047 	if (!cred_cloned && is_system_labeled()) {
1048 		cred_cloned = TRUE;
1049 		cr = crdup(icr);
1050 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1051 	}
1052 
1053 	/*
1054 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1055 	 * are guaranteed to reprocess the retry as a new request.
1056 	 */
1057 	svp = mi->mi_curr_serv;
1058 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1059 
1060 	if (FAILOVER_MOUNT(mi)) {
1061 		mutex_enter(&mi->mi_lock);
1062 		DEC_READERS(mi);
1063 		mutex_exit(&mi->mi_lock);
1064 
1065 		if ((rpcerr.re_errno == ETIMEDOUT ||
1066 		    rpcerr.re_errno == ECONNRESET) &&
1067 		    failover_safe(fi)) {
1068 			if (svp == mi->mi_curr_serv)
1069 				failover_newserver(mi);
1070 			goto failoverretry;
1071 		}
1072 	}
1073 	if (rpcerr.re_errno != 0)
1074 		return (rpcerr.re_errno);
1075 
1076 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1077 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1078 		timeo = (mi->mi_timeo * hz) / 10;
1079 	} else {
1080 		mutex_enter(&mi->mi_lock);
1081 		timeo = CLNT_SETTIMERS(client,
1082 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1083 		    &(mi->mi_timers[NFS_CALLTYPES]),
1084 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1085 		    (void (*)())NULL, (caddr_t)mi, 0);
1086 		mutex_exit(&mi->mi_lock);
1087 	}
1088 
1089 	/*
1090 	 * If hard mounted fs, retry call forever unless hard error occurs.
1091 	 */
1092 	do {
1093 		tryagain = FALSE;
1094 
1095 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1096 			status = RPC_FAILED;
1097 			rpcerr.re_status = RPC_FAILED;
1098 			rpcerr.re_errno = EIO;
1099 			break;
1100 		}
1101 
1102 		TICK_TO_TIMEVAL(timeo, &wait);
1103 
1104 		/*
1105 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1106 		 * and SIGTERM. (Preserving the existing masks).
1107 		 * Mask out SIGINT if mount option nointr is specified.
1108 		 */
1109 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1110 		if (!(mi->mi_flags & MI_INT))
1111 			client->cl_nosignal = TRUE;
1112 
1113 		/*
1114 		 * If there is a current signal, then don't bother
1115 		 * even trying to send out the request because we
1116 		 * won't be able to block waiting for the response.
1117 		 * Simply assume RPC_INTR and get on with it.
1118 		 */
1119 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1120 			status = RPC_INTR;
1121 		else {
1122 			status = CLNT_CALL(client, which, xdrargs, argsp,
1123 			    xdrres, resp, wait);
1124 		}
1125 
1126 		if (!(mi->mi_flags & MI_INT))
1127 			client->cl_nosignal = FALSE;
1128 		/*
1129 		 * restore original signal mask
1130 		 */
1131 		sigunintr(&smask);
1132 
1133 		switch (status) {
1134 		case RPC_SUCCESS:
1135 			if ((mi->mi_flags & MI_DYNAMIC) &&
1136 			    mi->mi_timer_type[which] != 0 &&
1137 			    (mi->mi_curread != my_rsize ||
1138 			    mi->mi_curwrite != my_wsize))
1139 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1140 			break;
1141 
1142 		case RPC_INTR:
1143 			/*
1144 			 * There is no way to recover from this error,
1145 			 * even if mount option nointr is specified.
1146 			 * SIGKILL, for example, cannot be blocked.
1147 			 */
1148 			rpcerr.re_status = RPC_INTR;
1149 			rpcerr.re_errno = EINTR;
1150 			break;
1151 
1152 		case RPC_UDERROR:
1153 			/*
1154 			 * If the NFS server is local (vold) and
1155 			 * it goes away then we get RPC_UDERROR.
1156 			 * This is a retryable error, so we would
1157 			 * loop, so check to see if the specific
1158 			 * error was ECONNRESET, indicating that
1159 			 * target did not exist at all.  If so,
1160 			 * return with RPC_PROGUNAVAIL and
1161 			 * ECONNRESET to indicate why.
1162 			 */
1163 			CLNT_GETERR(client, &rpcerr);
1164 			if (rpcerr.re_errno == ECONNRESET) {
1165 				rpcerr.re_status = RPC_PROGUNAVAIL;
1166 				rpcerr.re_errno = ECONNRESET;
1167 				break;
1168 			}
1169 			/*FALLTHROUGH*/
1170 
1171 		default:		/* probably RPC_TIMEDOUT */
1172 			if (IS_UNRECOVERABLE_RPC(status))
1173 				break;
1174 
1175 			/*
1176 			 * increment server not responding count
1177 			 */
1178 			mutex_enter(&mi->mi_lock);
1179 			mi->mi_noresponse++;
1180 			mutex_exit(&mi->mi_lock);
1181 #ifdef DEBUG
1182 			nfscl->nfscl_stat.noresponse.value.ui64++;
1183 #endif
1184 
1185 			if (!(mi->mi_flags & MI_HARD)) {
1186 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1187 				    (mi->mi_ss_call_type[which] == 0))
1188 					break;
1189 			}
1190 
1191 			/*
1192 			 * The call is in progress (over COTS).
1193 			 * Try the CLNT_CALL again, but don't
1194 			 * print a noisy error message.
1195 			 */
1196 			if (status == RPC_INPROGRESS) {
1197 				tryagain = TRUE;
1198 				break;
1199 			}
1200 
1201 			if (flags & RFSCALL_SOFT)
1202 				break;
1203 
1204 			/*
1205 			 * On zone shutdown, just move on.
1206 			 */
1207 			if (zone_status_get(curproc->p_zone) >=
1208 			    ZONE_IS_SHUTTING_DOWN) {
1209 				rpcerr.re_status = RPC_FAILED;
1210 				rpcerr.re_errno = EIO;
1211 				break;
1212 			}
1213 
1214 			/*
1215 			 * NFS client failover support
1216 			 *
1217 			 * If the current server just failed us, we'll
1218 			 * start the process of finding a new server.
1219 			 * After that, we can just retry.
1220 			 */
1221 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1222 				if (svp == mi->mi_curr_serv)
1223 					failover_newserver(mi);
1224 				clfree_impl(client, ch, nfscl);
1225 				goto failoverretry;
1226 			}
1227 
1228 			tryagain = TRUE;
1229 			timeo = backoff(timeo);
1230 
1231 			CLNT_GETERR(client, &rpcerr_tmp);
1232 			if ((status == RPC_CANTSEND) &&
1233 			    (rpcerr_tmp.re_errno == ENOBUFS))
1234 				msg = SRV_QFULL_MSG;
1235 			else
1236 				msg = SRV_NOTRESP_MSG;
1237 
1238 			mutex_enter(&mi->mi_lock);
1239 			if (!(mi->mi_flags & MI_PRINTED)) {
1240 				mi->mi_flags |= MI_PRINTED;
1241 				mutex_exit(&mi->mi_lock);
1242 #ifdef DEBUG
1243 				zprintf(zoneid, msg, mi->mi_vers,
1244 				    svp->sv_hostname);
1245 #else
1246 				zprintf(zoneid, msg, svp->sv_hostname);
1247 #endif
1248 			} else
1249 				mutex_exit(&mi->mi_lock);
1250 			if (*douprintf && nfs_has_ctty()) {
1251 				*douprintf = 0;
1252 				if (!(mi->mi_flags & MI_NOPRINT))
1253 #ifdef DEBUG
1254 					uprintf(msg, mi->mi_vers,
1255 					    svp->sv_hostname);
1256 #else
1257 					uprintf(msg, svp->sv_hostname);
1258 #endif
1259 			}
1260 
1261 			/*
1262 			 * If doing dynamic adjustment of transfer
1263 			 * size and if it's a read or write call
1264 			 * and if the transfer size changed while
1265 			 * retransmitting or if the feedback routine
1266 			 * changed the transfer size,
1267 			 * then exit rfscall so that the transfer
1268 			 * size can be adjusted at the vnops level.
1269 			 */
1270 			if ((mi->mi_flags & MI_DYNAMIC) &&
1271 			    mi->mi_timer_type[which] != 0 &&
1272 			    (mi->mi_curread != my_rsize ||
1273 			    mi->mi_curwrite != my_wsize ||
1274 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1275 				/*
1276 				 * On read or write calls, return
1277 				 * back to the vnode ops level if
1278 				 * the transfer size changed.
1279 				 */
1280 				clfree_impl(client, ch, nfscl);
1281 				if (cred_cloned)
1282 					crfree(cr);
1283 				return (ENFS_TRYAGAIN);
1284 			}
1285 		}
1286 	} while (tryagain);
1287 
1288 	if (status != RPC_SUCCESS) {
1289 		/*
1290 		 * Let soft mounts use the timed out message.
1291 		 */
1292 		if (status == RPC_INPROGRESS)
1293 			status = RPC_TIMEDOUT;
1294 		nfscl->nfscl_stat.badcalls.value.ui64++;
1295 		if (status != RPC_INTR) {
1296 			mutex_enter(&mi->mi_lock);
1297 			mi->mi_flags |= MI_DOWN;
1298 			mutex_exit(&mi->mi_lock);
1299 			CLNT_GETERR(client, &rpcerr);
1300 #ifdef DEBUG
1301 			bufp = clnt_sperror(client, svp->sv_hostname);
1302 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1303 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1304 			if (nfs_has_ctty()) {
1305 				if (!(mi->mi_flags & MI_NOPRINT)) {
1306 					uprintf("NFS%d %s failed for %s\n",
1307 					    mi->mi_vers, mi->mi_rfsnames[which],
1308 					    bufp);
1309 				}
1310 			}
1311 			kmem_free(bufp, MAXPATHLEN);
1312 #else
1313 			zprintf(zoneid,
1314 			    "NFS %s failed for server %s: error %d (%s)\n",
1315 			    mi->mi_rfsnames[which], svp->sv_hostname,
1316 			    status, clnt_sperrno(status));
1317 			if (nfs_has_ctty()) {
1318 				if (!(mi->mi_flags & MI_NOPRINT)) {
1319 					uprintf(
1320 				"NFS %s failed for server %s: error %d (%s)\n",
1321 					    mi->mi_rfsnames[which],
1322 					    svp->sv_hostname, status,
1323 					    clnt_sperrno(status));
1324 				}
1325 			}
1326 #endif
1327 			/*
1328 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1329 			 * re_errno is set appropriately depending on
1330 			 * the authentication error
1331 			 */
1332 			if (status == RPC_VERSMISMATCH ||
1333 			    status == RPC_PROGVERSMISMATCH)
1334 				rpcerr.re_errno = EIO;
1335 		}
1336 	} else {
1337 		/*
1338 		 * Test the value of mi_down and mi_printed without
1339 		 * holding the mi_lock mutex.  If they are both zero,
1340 		 * then it is okay to skip the down and printed
1341 		 * processing.  This saves on a mutex_enter and
1342 		 * mutex_exit pair for a normal, successful RPC.
1343 		 * This was just complete overhead.
1344 		 */
1345 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1346 			mutex_enter(&mi->mi_lock);
1347 			mi->mi_flags &= ~MI_DOWN;
1348 			if (mi->mi_flags & MI_PRINTED) {
1349 				mi->mi_flags &= ~MI_PRINTED;
1350 				mutex_exit(&mi->mi_lock);
1351 #ifdef DEBUG
1352 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1353 				zprintf(zoneid, "NFS%d server %s ok\n",
1354 				    mi->mi_vers, svp->sv_hostname);
1355 #else
1356 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1357 				zprintf(zoneid, "NFS server %s ok\n",
1358 				    svp->sv_hostname);
1359 #endif
1360 			} else
1361 				mutex_exit(&mi->mi_lock);
1362 		}
1363 
1364 		if (*douprintf == 0) {
1365 			if (!(mi->mi_flags & MI_NOPRINT))
1366 #ifdef DEBUG
1367 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1368 					uprintf("NFS%d server %s ok\n",
1369 					    mi->mi_vers, svp->sv_hostname);
1370 #else
1371 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1372 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1373 #endif
1374 			*douprintf = 1;
1375 		}
1376 	}
1377 
1378 	clfree_impl(client, ch, nfscl);
1379 	if (cred_cloned)
1380 		crfree(cr);
1381 
1382 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1383 
1384 	if (rpc_status != NULL)
1385 		*rpc_status = rpcerr.re_status;
1386 
1387 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1388 	    rpcerr.re_errno);
1389 
1390 	return (rpcerr.re_errno);
1391 }
1392 
1393 #ifdef DEBUG
1394 static int acl2call_hits = 0;
1395 static int acl2call_misses = 0;
1396 #endif
1397 
1398 int
1399 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1400     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1401     enum nfsstat *statusp, int flags, failinfo_t *fi)
1402 {
1403 	int rpcerror;
1404 
1405 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1406 	    cr, douprintf, flags, fi);
1407 	if (!rpcerror) {
1408 		/*
1409 		 * See comments with crnetadjust().
1410 		 */
1411 		if (*statusp == NFSERR_ACCES &&
1412 		    (cr = crnetadjust(cr)) != NULL) {
1413 #ifdef DEBUG
1414 			acl2call_hits++;
1415 #endif
1416 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1417 			    resp, cr, douprintf, flags, fi);
1418 			crfree(cr);
1419 #ifdef DEBUG
1420 			if (*statusp == NFSERR_ACCES)
1421 				acl2call_misses++;
1422 #endif
1423 		}
1424 	}
1425 
1426 	return (rpcerror);
1427 }
1428 
1429 #ifdef DEBUG
1430 static int acl3call_hits = 0;
1431 static int acl3call_misses = 0;
1432 #endif
1433 
1434 int
1435 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1436     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1437     nfsstat3 *statusp, int flags, failinfo_t *fi)
1438 {
1439 	int rpcerror;
1440 	int user_informed;
1441 
1442 	user_informed = 0;
1443 
1444 	do {
1445 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1446 		    cr, douprintf, flags, fi);
1447 		if (!rpcerror) {
1448 			cred_t *crr;
1449 			if (*statusp == NFS3ERR_JUKEBOX) {
1450 				if (!user_informed) {
1451 					user_informed = 1;
1452 					uprintf(
1453 		"file temporarily unavailable on the server, retrying...\n");
1454 				}
1455 				delay(nfs3_jukebox_delay);
1456 			}
1457 			/*
1458 			 * See crnetadjust() for comments.
1459 			 */
1460 			else if (*statusp == NFS3ERR_ACCES &&
1461 			    (crr = crnetadjust(cr)) != NULL) {
1462 #ifdef DEBUG
1463 				acl3call_hits++;
1464 #endif
1465 				rpcerror = aclcall(mi, which, xdrargs, argsp,
1466 				    xdrres, resp, crr, douprintf, flags, fi);
1467 
1468 				crfree(crr);
1469 #ifdef DEBUG
1470 				if (*statusp == NFS3ERR_ACCES)
1471 					acl3call_misses++;
1472 #endif
1473 			}
1474 		}
1475 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1476 
1477 	return (rpcerror);
1478 }
1479 
1480 static int
1481 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1482     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1483     int flags, failinfo_t *fi)
1484 {
1485 	CLIENT *client;
1486 	struct chtab *ch;
1487 	cred_t *cr = icr;
1488 	bool_t cred_cloned = FALSE;
1489 	enum clnt_stat status;
1490 	struct rpc_err rpcerr;
1491 	struct timeval wait;
1492 	int timeo;		/* in units of hz */
1493 #if 0 /* notyet */
1494 	int my_rsize, my_wsize;
1495 #endif
1496 	bool_t tryagain;
1497 	k_sigset_t smask;
1498 	servinfo_t *svp;
1499 	struct nfs_clnt *nfscl;
1500 	zoneid_t zoneid = getzoneid();
1501 #ifdef DEBUG
1502 	char *bufp;
1503 #endif
1504 
1505 #if 0 /* notyet */
1506 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1507 	    "rfscall_start:which %d mi %p", which, mi);
1508 #endif
1509 
1510 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1511 	ASSERT(nfscl != NULL);
1512 
1513 	nfscl->nfscl_stat.calls.value.ui64++;
1514 	mi->mi_aclreqs[which].value.ui64++;
1515 
1516 	rpcerr.re_status = RPC_SUCCESS;
1517 
1518 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1519 		rpcerr.re_status = RPC_FAILED;
1520 		rpcerr.re_errno = EIO;
1521 		return (rpcerr.re_errno);
1522 	}
1523 
1524 #if 0 /* notyet */
1525 	/*
1526 	 * Remember the transfer sizes in case
1527 	 * nfs_feedback changes them underneath us.
1528 	 */
1529 	my_rsize = mi->mi_curread;
1530 	my_wsize = mi->mi_curwrite;
1531 #endif
1532 
1533 	/*
1534 	 * NFS client failover support
1535 	 *
1536 	 * If this rnode is not in sync with the current server (VALID_FH),
1537 	 * we'd like to do a remap to get in sync.  We can be interrupted
1538 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1539 	 * use the best info we have to try the RPC.  Part of that is
1540 	 * unconditionally updating the filehandle copy kept for V3.
1541 	 *
1542 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1543 	 * rw_enter(); we're trying to keep the current server from being
1544 	 * changed on us until we're done with the remapping and have a
1545 	 * matching client handle.  We don't want to sending a filehandle
1546 	 * to the wrong host.
1547 	 */
1548 failoverretry:
1549 	if (FAILOVER_MOUNT(mi)) {
1550 		mutex_enter(&mi->mi_lock);
1551 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1552 			if (failover_wait(mi)) {
1553 				mutex_exit(&mi->mi_lock);
1554 				return (EINTR);
1555 			}
1556 		}
1557 		INC_READERS(mi);
1558 		mutex_exit(&mi->mi_lock);
1559 		if (fi) {
1560 			if (!VALID_FH(fi) &&
1561 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1562 				int remaperr;
1563 
1564 				svp = mi->mi_curr_serv;
1565 				remaperr = failover_remap(fi);
1566 				if (remaperr != 0) {
1567 #ifdef DEBUG
1568 					if (remaperr != EINTR)
1569 						nfs_cmn_err(remaperr, CE_WARN,
1570 					    "aclcall couldn't failover: %m");
1571 #endif
1572 					mutex_enter(&mi->mi_lock);
1573 					DEC_READERS(mi);
1574 					mutex_exit(&mi->mi_lock);
1575 
1576 					/*
1577 					 * If failover_remap returns ETIMEDOUT
1578 					 * and the filesystem is hard mounted
1579 					 * we have to retry the call with a new
1580 					 * server.
1581 					 */
1582 					if ((mi->mi_flags & MI_HARD) &&
1583 					    IS_RECOVERABLE_ERROR(remaperr)) {
1584 						if (svp == mi->mi_curr_serv)
1585 							failover_newserver(mi);
1586 						rpcerr.re_status = RPC_SUCCESS;
1587 						goto failoverretry;
1588 					}
1589 					return (remaperr);
1590 				}
1591 			}
1592 			if (fi->fhp && fi->copyproc)
1593 				(*fi->copyproc)(fi->fhp, fi->vp);
1594 		}
1595 	}
1596 
1597 	/* For TSOL, use a new cred which has net_mac_aware flag */
1598 	if (!cred_cloned && is_system_labeled()) {
1599 		cred_cloned = TRUE;
1600 		cr = crdup(icr);
1601 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1602 	}
1603 
1604 	/*
1605 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1606 	 * are guaranteed to reprocess the retry as a new request.
1607 	 */
1608 	svp = mi->mi_curr_serv;
1609 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1610 	if (FAILOVER_MOUNT(mi)) {
1611 		mutex_enter(&mi->mi_lock);
1612 		DEC_READERS(mi);
1613 		mutex_exit(&mi->mi_lock);
1614 
1615 		if ((rpcerr.re_errno == ETIMEDOUT ||
1616 		    rpcerr.re_errno == ECONNRESET) &&
1617 		    failover_safe(fi)) {
1618 			if (svp == mi->mi_curr_serv)
1619 				failover_newserver(mi);
1620 			goto failoverretry;
1621 		}
1622 	}
1623 	if (rpcerr.re_errno != 0) {
1624 		if (cred_cloned)
1625 			crfree(cr);
1626 		return (rpcerr.re_errno);
1627 	}
1628 
1629 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1630 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1631 		timeo = (mi->mi_timeo * hz) / 10;
1632 	} else {
1633 		mutex_enter(&mi->mi_lock);
1634 		timeo = CLNT_SETTIMERS(client,
1635 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1636 		    &(mi->mi_timers[NFS_CALLTYPES]),
1637 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1638 		    (void (*)()) 0, (caddr_t)mi, 0);
1639 		mutex_exit(&mi->mi_lock);
1640 	}
1641 
1642 	/*
1643 	 * If hard mounted fs, retry call forever unless hard error occurs.
1644 	 */
1645 	do {
1646 		tryagain = FALSE;
1647 
1648 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1649 			status = RPC_FAILED;
1650 			rpcerr.re_status = RPC_FAILED;
1651 			rpcerr.re_errno = EIO;
1652 			break;
1653 		}
1654 
1655 		TICK_TO_TIMEVAL(timeo, &wait);
1656 
1657 		/*
1658 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1659 		 * and SIGTERM. (Preserving the existing masks).
1660 		 * Mask out SIGINT if mount option nointr is specified.
1661 		 */
1662 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1663 		if (!(mi->mi_flags & MI_INT))
1664 			client->cl_nosignal = TRUE;
1665 
1666 		/*
1667 		 * If there is a current signal, then don't bother
1668 		 * even trying to send out the request because we
1669 		 * won't be able to block waiting for the response.
1670 		 * Simply assume RPC_INTR and get on with it.
1671 		 */
1672 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1673 			status = RPC_INTR;
1674 		else {
1675 			status = CLNT_CALL(client, which, xdrargs, argsp,
1676 			    xdrres, resp, wait);
1677 		}
1678 
1679 		if (!(mi->mi_flags & MI_INT))
1680 			client->cl_nosignal = FALSE;
1681 		/*
1682 		 * restore original signal mask
1683 		 */
1684 		sigunintr(&smask);
1685 
1686 		switch (status) {
1687 		case RPC_SUCCESS:
1688 #if 0 /* notyet */
1689 			if ((mi->mi_flags & MI_DYNAMIC) &&
1690 			    mi->mi_timer_type[which] != 0 &&
1691 			    (mi->mi_curread != my_rsize ||
1692 			    mi->mi_curwrite != my_wsize))
1693 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1694 #endif
1695 			break;
1696 
1697 		/*
1698 		 * Unfortunately, there are servers in the world which
1699 		 * are not coded correctly.  They are not prepared to
1700 		 * handle RPC requests to the NFS port which are not
1701 		 * NFS requests.  Thus, they may try to process the
1702 		 * NFS_ACL request as if it were an NFS request.  This
1703 		 * does not work.  Generally, an error will be generated
1704 		 * on the client because it will not be able to decode
1705 		 * the response from the server.  However, it seems
1706 		 * possible that the server may not be able to decode
1707 		 * the arguments.  Thus, the criteria for deciding
1708 		 * whether the server supports NFS_ACL or not is whether
1709 		 * the following RPC errors are returned from CLNT_CALL.
1710 		 */
1711 		case RPC_CANTDECODERES:
1712 		case RPC_PROGUNAVAIL:
1713 		case RPC_CANTDECODEARGS:
1714 		case RPC_PROGVERSMISMATCH:
1715 			mutex_enter(&mi->mi_lock);
1716 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1717 			mutex_exit(&mi->mi_lock);
1718 			break;
1719 
1720 		/*
1721 		 * If the server supports NFS_ACL but not the new ops
1722 		 * for extended attributes, make sure we don't retry.
1723 		 */
1724 		case RPC_PROCUNAVAIL:
1725 			mutex_enter(&mi->mi_lock);
1726 			mi->mi_flags &= ~MI_EXTATTR;
1727 			mutex_exit(&mi->mi_lock);
1728 			break;
1729 
1730 		case RPC_INTR:
1731 			/*
1732 			 * There is no way to recover from this error,
1733 			 * even if mount option nointr is specified.
1734 			 * SIGKILL, for example, cannot be blocked.
1735 			 */
1736 			rpcerr.re_status = RPC_INTR;
1737 			rpcerr.re_errno = EINTR;
1738 			break;
1739 
1740 		case RPC_UDERROR:
1741 			/*
1742 			 * If the NFS server is local (vold) and
1743 			 * it goes away then we get RPC_UDERROR.
1744 			 * This is a retryable error, so we would
1745 			 * loop, so check to see if the specific
1746 			 * error was ECONNRESET, indicating that
1747 			 * target did not exist at all.  If so,
1748 			 * return with RPC_PROGUNAVAIL and
1749 			 * ECONNRESET to indicate why.
1750 			 */
1751 			CLNT_GETERR(client, &rpcerr);
1752 			if (rpcerr.re_errno == ECONNRESET) {
1753 				rpcerr.re_status = RPC_PROGUNAVAIL;
1754 				rpcerr.re_errno = ECONNRESET;
1755 				break;
1756 			}
1757 			/*FALLTHROUGH*/
1758 
1759 		default:		/* probably RPC_TIMEDOUT */
1760 			if (IS_UNRECOVERABLE_RPC(status))
1761 				break;
1762 
1763 			/*
1764 			 * increment server not responding count
1765 			 */
1766 			mutex_enter(&mi->mi_lock);
1767 			mi->mi_noresponse++;
1768 			mutex_exit(&mi->mi_lock);
1769 #ifdef DEBUG
1770 			nfscl->nfscl_stat.noresponse.value.ui64++;
1771 #endif
1772 
1773 			if (!(mi->mi_flags & MI_HARD)) {
1774 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1775 				    (mi->mi_acl_ss_call_type[which] == 0))
1776 					break;
1777 			}
1778 
1779 			/*
1780 			 * The call is in progress (over COTS).
1781 			 * Try the CLNT_CALL again, but don't
1782 			 * print a noisy error message.
1783 			 */
1784 			if (status == RPC_INPROGRESS) {
1785 				tryagain = TRUE;
1786 				break;
1787 			}
1788 
1789 			if (flags & RFSCALL_SOFT)
1790 				break;
1791 
1792 			/*
1793 			 * On zone shutdown, just move on.
1794 			 */
1795 			if (zone_status_get(curproc->p_zone) >=
1796 			    ZONE_IS_SHUTTING_DOWN) {
1797 				rpcerr.re_status = RPC_FAILED;
1798 				rpcerr.re_errno = EIO;
1799 				break;
1800 			}
1801 
1802 			/*
1803 			 * NFS client failover support
1804 			 *
1805 			 * If the current server just failed us, we'll
1806 			 * start the process of finding a new server.
1807 			 * After that, we can just retry.
1808 			 */
1809 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1810 				if (svp == mi->mi_curr_serv)
1811 					failover_newserver(mi);
1812 				clfree_impl(client, ch, nfscl);
1813 				goto failoverretry;
1814 			}
1815 
1816 			tryagain = TRUE;
1817 			timeo = backoff(timeo);
1818 			mutex_enter(&mi->mi_lock);
1819 			if (!(mi->mi_flags & MI_PRINTED)) {
1820 				mi->mi_flags |= MI_PRINTED;
1821 				mutex_exit(&mi->mi_lock);
1822 #ifdef DEBUG
1823 				zprintf(zoneid,
1824 			"NFS_ACL%d server %s not responding still trying\n",
1825 				    mi->mi_vers, svp->sv_hostname);
1826 #else
1827 				zprintf(zoneid,
1828 			    "NFS server %s not responding still trying\n",
1829 				    svp->sv_hostname);
1830 #endif
1831 			} else
1832 				mutex_exit(&mi->mi_lock);
1833 			if (*douprintf && nfs_has_ctty()) {
1834 				*douprintf = 0;
1835 				if (!(mi->mi_flags & MI_NOPRINT))
1836 #ifdef DEBUG
1837 					uprintf(
1838 			"NFS_ACL%d server %s not responding still trying\n",
1839 					    mi->mi_vers, svp->sv_hostname);
1840 #else
1841 					uprintf(
1842 			    "NFS server %s not responding still trying\n",
1843 					    svp->sv_hostname);
1844 #endif
1845 			}
1846 
1847 #if 0 /* notyet */
1848 			/*
1849 			 * If doing dynamic adjustment of transfer
1850 			 * size and if it's a read or write call
1851 			 * and if the transfer size changed while
1852 			 * retransmitting or if the feedback routine
1853 			 * changed the transfer size,
1854 			 * then exit rfscall so that the transfer
1855 			 * size can be adjusted at the vnops level.
1856 			 */
1857 			if ((mi->mi_flags & MI_DYNAMIC) &&
1858 			    mi->mi_acl_timer_type[which] != 0 &&
1859 			    (mi->mi_curread != my_rsize ||
1860 			    mi->mi_curwrite != my_wsize ||
1861 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1862 				/*
1863 				 * On read or write calls, return
1864 				 * back to the vnode ops level if
1865 				 * the transfer size changed.
1866 				 */
1867 				clfree_impl(client, ch, nfscl);
1868 				if (cred_cloned)
1869 					crfree(cr);
1870 				return (ENFS_TRYAGAIN);
1871 			}
1872 #endif
1873 		}
1874 	} while (tryagain);
1875 
1876 	if (status != RPC_SUCCESS) {
1877 		/*
1878 		 * Let soft mounts use the timed out message.
1879 		 */
1880 		if (status == RPC_INPROGRESS)
1881 			status = RPC_TIMEDOUT;
1882 		nfscl->nfscl_stat.badcalls.value.ui64++;
1883 		if (status == RPC_CANTDECODERES ||
1884 		    status == RPC_PROGUNAVAIL ||
1885 		    status == RPC_PROCUNAVAIL ||
1886 		    status == RPC_CANTDECODEARGS ||
1887 		    status == RPC_PROGVERSMISMATCH)
1888 			CLNT_GETERR(client, &rpcerr);
1889 		else if (status != RPC_INTR) {
1890 			mutex_enter(&mi->mi_lock);
1891 			mi->mi_flags |= MI_DOWN;
1892 			mutex_exit(&mi->mi_lock);
1893 			CLNT_GETERR(client, &rpcerr);
1894 #ifdef DEBUG
1895 			bufp = clnt_sperror(client, svp->sv_hostname);
1896 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1897 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1898 			if (nfs_has_ctty()) {
1899 				if (!(mi->mi_flags & MI_NOPRINT)) {
1900 					uprintf("NFS_ACL%d %s failed for %s\n",
1901 					    mi->mi_vers, mi->mi_aclnames[which],
1902 					    bufp);
1903 				}
1904 			}
1905 			kmem_free(bufp, MAXPATHLEN);
1906 #else
1907 			zprintf(zoneid,
1908 			    "NFS %s failed for server %s: error %d (%s)\n",
1909 			    mi->mi_aclnames[which], svp->sv_hostname,
1910 			    status, clnt_sperrno(status));
1911 			if (nfs_has_ctty()) {
1912 				if (!(mi->mi_flags & MI_NOPRINT))
1913 					uprintf(
1914 				"NFS %s failed for server %s: error %d (%s)\n",
1915 					    mi->mi_aclnames[which],
1916 					    svp->sv_hostname, status,
1917 					    clnt_sperrno(status));
1918 			}
1919 #endif
1920 			/*
1921 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1922 			 * re_errno is set appropriately depending on
1923 			 * the authentication error
1924 			 */
1925 			if (status == RPC_VERSMISMATCH ||
1926 			    status == RPC_PROGVERSMISMATCH)
1927 				rpcerr.re_errno = EIO;
1928 		}
1929 	} else {
1930 		/*
1931 		 * Test the value of mi_down and mi_printed without
1932 		 * holding the mi_lock mutex.  If they are both zero,
1933 		 * then it is okay to skip the down and printed
1934 		 * processing.  This saves on a mutex_enter and
1935 		 * mutex_exit pair for a normal, successful RPC.
1936 		 * This was just complete overhead.
1937 		 */
1938 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1939 			mutex_enter(&mi->mi_lock);
1940 			mi->mi_flags &= ~MI_DOWN;
1941 			if (mi->mi_flags & MI_PRINTED) {
1942 				mi->mi_flags &= ~MI_PRINTED;
1943 				mutex_exit(&mi->mi_lock);
1944 #ifdef DEBUG
1945 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1946 				    mi->mi_vers, svp->sv_hostname);
1947 #else
1948 				zprintf(zoneid, "NFS server %s ok\n",
1949 				    svp->sv_hostname);
1950 #endif
1951 			} else
1952 				mutex_exit(&mi->mi_lock);
1953 		}
1954 
1955 		if (*douprintf == 0) {
1956 			if (!(mi->mi_flags & MI_NOPRINT))
1957 #ifdef DEBUG
1958 				uprintf("NFS_ACL%d server %s ok\n",
1959 				    mi->mi_vers, svp->sv_hostname);
1960 #else
1961 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1962 #endif
1963 			*douprintf = 1;
1964 		}
1965 	}
1966 
1967 	clfree_impl(client, ch, nfscl);
1968 	if (cred_cloned)
1969 		crfree(cr);
1970 
1971 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1972 
1973 #if 0 /* notyet */
1974 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1975 	    rpcerr.re_errno);
1976 #endif
1977 
1978 	return (rpcerr.re_errno);
1979 }
1980 
1981 int
1982 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1983 {
1984 	uint_t mask = vap->va_mask;
1985 
1986 	if (!(mask & AT_MODE))
1987 		sa->sa_mode = (uint32_t)-1;
1988 	else
1989 		sa->sa_mode = vap->va_mode;
1990 	if (!(mask & AT_UID))
1991 		sa->sa_uid = (uint32_t)-1;
1992 	else
1993 		sa->sa_uid = (uint32_t)vap->va_uid;
1994 	if (!(mask & AT_GID))
1995 		sa->sa_gid = (uint32_t)-1;
1996 	else
1997 		sa->sa_gid = (uint32_t)vap->va_gid;
1998 	if (!(mask & AT_SIZE))
1999 		sa->sa_size = (uint32_t)-1;
2000 	else
2001 		sa->sa_size = (uint32_t)vap->va_size;
2002 	if (!(mask & AT_ATIME))
2003 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2004 	else {
2005 		/* check time validity */
2006 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2007 			return (EOVERFLOW);
2008 		}
2009 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2010 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2011 	}
2012 	if (!(mask & AT_MTIME))
2013 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2014 	else {
2015 		/* check time validity */
2016 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2017 			return (EOVERFLOW);
2018 		}
2019 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2020 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2021 	}
2022 	return (0);
2023 }
2024 
2025 int
2026 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2027 {
2028 	uint_t mask = vap->va_mask;
2029 
2030 	if (!(mask & AT_MODE))
2031 		sa->mode.set_it = FALSE;
2032 	else {
2033 		sa->mode.set_it = TRUE;
2034 		sa->mode.mode = (mode3)vap->va_mode;
2035 	}
2036 	if (!(mask & AT_UID))
2037 		sa->uid.set_it = FALSE;
2038 	else {
2039 		sa->uid.set_it = TRUE;
2040 		sa->uid.uid = (uid3)vap->va_uid;
2041 	}
2042 	if (!(mask & AT_GID))
2043 		sa->gid.set_it = FALSE;
2044 	else {
2045 		sa->gid.set_it = TRUE;
2046 		sa->gid.gid = (gid3)vap->va_gid;
2047 	}
2048 	if (!(mask & AT_SIZE))
2049 		sa->size.set_it = FALSE;
2050 	else {
2051 		sa->size.set_it = TRUE;
2052 		sa->size.size = (size3)vap->va_size;
2053 	}
2054 	if (!(mask & AT_ATIME))
2055 		sa->atime.set_it = DONT_CHANGE;
2056 	else {
2057 		/* check time validity */
2058 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2059 			return (EOVERFLOW);
2060 		}
2061 		sa->atime.set_it = SET_TO_CLIENT_TIME;
2062 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2063 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2064 	}
2065 	if (!(mask & AT_MTIME))
2066 		sa->mtime.set_it = DONT_CHANGE;
2067 	else {
2068 		/* check time validity */
2069 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2070 			return (EOVERFLOW);
2071 		}
2072 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2073 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2074 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2075 	}
2076 	return (0);
2077 }
2078 
2079 void
2080 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2081 {
2082 
2083 	da->da_fhandle = VTOFH(dvp);
2084 	da->da_name = nm;
2085 	da->da_flags = 0;
2086 }
2087 
2088 void
2089 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2090 {
2091 
2092 	da->dirp = VTOFH3(dvp);
2093 	da->name = nm;
2094 }
2095 
2096 int
2097 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2098 {
2099 	int error;
2100 	rnode_t *rp;
2101 	struct vattr va;
2102 
2103 	va.va_mask = AT_MODE | AT_GID;
2104 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2105 	if (error)
2106 		return (error);
2107 
2108 	/*
2109 	 * To determine the expected group-id of the created file:
2110 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2111 	 *	GRPID option, and the directory's set-gid bit is clear,
2112 	 *	then use the process's gid.
2113 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2114 	 */
2115 	rp = VTOR(dvp);
2116 	mutex_enter(&rp->r_statelock);
2117 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2118 		*gidp = crgetgid(cr);
2119 	else
2120 		*gidp = va.va_gid;
2121 	mutex_exit(&rp->r_statelock);
2122 	return (0);
2123 }
2124 
2125 int
2126 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2127 {
2128 	int error;
2129 	struct vattr va;
2130 
2131 	va.va_mask = AT_MODE;
2132 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2133 	if (error)
2134 		return (error);
2135 
2136 	/*
2137 	 * Modify the expected mode (om) so that the set-gid bit matches
2138 	 * that of the parent directory (dvp).
2139 	 */
2140 	if (va.va_mode & VSGID)
2141 		*omp |= VSGID;
2142 	else
2143 		*omp &= ~VSGID;
2144 	return (0);
2145 }
2146 
2147 void
2148 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2149 {
2150 
2151 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2152 		if (!(vp->v_flag & VSWAPLIKE)) {
2153 			mutex_enter(&vp->v_lock);
2154 			vp->v_flag |= VSWAPLIKE;
2155 			mutex_exit(&vp->v_lock);
2156 		}
2157 	} else {
2158 		if (vp->v_flag & VSWAPLIKE) {
2159 			mutex_enter(&vp->v_lock);
2160 			vp->v_flag &= ~VSWAPLIKE;
2161 			mutex_exit(&vp->v_lock);
2162 		}
2163 	}
2164 }
2165 
2166 /*
2167  * Free the resources associated with an rnode.
2168  */
2169 static void
2170 rinactive(rnode_t *rp, cred_t *cr)
2171 {
2172 	vnode_t *vp;
2173 	cred_t *cred;
2174 	char *contents;
2175 	int size;
2176 	vsecattr_t *vsp;
2177 	int error;
2178 	nfs3_pathconf_info *info;
2179 
2180 	/*
2181 	 * Before freeing anything, wait until all asynchronous
2182 	 * activity is done on this rnode.  This will allow all
2183 	 * asynchronous read ahead and write behind i/o's to
2184 	 * finish.
2185 	 */
2186 	mutex_enter(&rp->r_statelock);
2187 	while (rp->r_count > 0)
2188 		cv_wait(&rp->r_cv, &rp->r_statelock);
2189 	mutex_exit(&rp->r_statelock);
2190 
2191 	/*
2192 	 * Flush and invalidate all pages associated with the vnode.
2193 	 */
2194 	vp = RTOV(rp);
2195 	if (vn_has_cached_data(vp)) {
2196 		ASSERT(vp->v_type != VCHR);
2197 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2198 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2199 			if (error && (error == ENOSPC || error == EDQUOT)) {
2200 				mutex_enter(&rp->r_statelock);
2201 				if (!rp->r_error)
2202 					rp->r_error = error;
2203 				mutex_exit(&rp->r_statelock);
2204 			}
2205 		}
2206 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2207 	}
2208 
2209 	/*
2210 	 * Free any held credentials and caches which may be associated
2211 	 * with this rnode.
2212 	 */
2213 	mutex_enter(&rp->r_statelock);
2214 	cred = rp->r_cred;
2215 	rp->r_cred = NULL;
2216 	contents = rp->r_symlink.contents;
2217 	size = rp->r_symlink.size;
2218 	rp->r_symlink.contents = NULL;
2219 	vsp = rp->r_secattr;
2220 	rp->r_secattr = NULL;
2221 	info = rp->r_pathconf;
2222 	rp->r_pathconf = NULL;
2223 	mutex_exit(&rp->r_statelock);
2224 
2225 	/*
2226 	 * Free the held credential.
2227 	 */
2228 	if (cred != NULL)
2229 		crfree(cred);
2230 
2231 	/*
2232 	 * Free the access cache entries.
2233 	 */
2234 	(void) nfs_access_purge_rp(rp);
2235 
2236 	/*
2237 	 * Free the readdir cache entries.
2238 	 */
2239 	if (HAVE_RDDIR_CACHE(rp))
2240 		nfs_purge_rddir_cache(vp);
2241 
2242 	/*
2243 	 * Free the symbolic link cache.
2244 	 */
2245 	if (contents != NULL) {
2246 
2247 		kmem_free((void *)contents, size);
2248 	}
2249 
2250 	/*
2251 	 * Free any cached ACL.
2252 	 */
2253 	if (vsp != NULL)
2254 		nfs_acl_free(vsp);
2255 
2256 	/*
2257 	 * Free any cached pathconf information.
2258 	 */
2259 	if (info != NULL)
2260 		kmem_free(info, sizeof (*info));
2261 }
2262 
2263 /*
2264  * Return a vnode for the given NFS Version 2 file handle.
2265  * If no rnode exists for this fhandle, create one and put it
2266  * into the hash queues.  If the rnode for this fhandle
2267  * already exists, return it.
2268  *
2269  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2270  */
2271 vnode_t *
2272 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2273     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2274 {
2275 	int newnode;
2276 	int index;
2277 	vnode_t *vp;
2278 	nfs_fhandle nfh;
2279 	vattr_t va;
2280 
2281 	nfh.fh_len = NFS_FHSIZE;
2282 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2283 
2284 	index = rtablehash(&nfh);
2285 	rw_enter(&rtable[index].r_lock, RW_READER);
2286 
2287 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2288 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2289 
2290 	if (attr != NULL) {
2291 		if (!newnode) {
2292 			rw_exit(&rtable[index].r_lock);
2293 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2294 		} else {
2295 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2296 				vp->v_type = VBAD;
2297 			else
2298 				vp->v_type = n2v_type(attr);
2299 			/*
2300 			 * A translation here seems to be necessary
2301 			 * because this function can be called
2302 			 * with `attr' that has come from the wire,
2303 			 * and been operated on by vattr_to_nattr().
2304 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2305 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2306 			 * ->makenfsnode().
2307 			 */
2308 			if ((attr->na_rdev & 0xffff0000) == 0)
2309 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2310 			else
2311 				vp->v_rdev = expldev(n2v_rdev(attr));
2312 			nfs_attrcache(vp, attr, t);
2313 			rw_exit(&rtable[index].r_lock);
2314 		}
2315 	} else {
2316 		if (newnode) {
2317 			PURGE_ATTRCACHE(vp);
2318 		}
2319 		rw_exit(&rtable[index].r_lock);
2320 	}
2321 
2322 	return (vp);
2323 }
2324 
2325 /*
2326  * Return a vnode for the given NFS Version 3 file handle.
2327  * If no rnode exists for this fhandle, create one and put it
2328  * into the hash queues.  If the rnode for this fhandle
2329  * already exists, return it.
2330  *
2331  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2332  */
2333 vnode_t *
2334 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2335     cred_t *cr, char *dnm, char *nm)
2336 {
2337 	int newnode;
2338 	int index;
2339 	vnode_t *vp;
2340 
2341 	index = rtablehash((nfs_fhandle *)fh);
2342 	rw_enter(&rtable[index].r_lock, RW_READER);
2343 
2344 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2345 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2346 	    dnm, nm);
2347 
2348 	if (vap == NULL) {
2349 		if (newnode) {
2350 			PURGE_ATTRCACHE(vp);
2351 		}
2352 		rw_exit(&rtable[index].r_lock);
2353 		return (vp);
2354 	}
2355 
2356 	if (!newnode) {
2357 		rw_exit(&rtable[index].r_lock);
2358 		nfs_attr_cache(vp, vap, t, cr);
2359 	} else {
2360 		rnode_t *rp = VTOR(vp);
2361 
2362 		vp->v_type = vap->va_type;
2363 		vp->v_rdev = vap->va_rdev;
2364 
2365 		mutex_enter(&rp->r_statelock);
2366 		if (rp->r_mtime <= t)
2367 			nfs_attrcache_va(vp, vap);
2368 		mutex_exit(&rp->r_statelock);
2369 		rw_exit(&rtable[index].r_lock);
2370 	}
2371 
2372 	return (vp);
2373 }
2374 
2375 vnode_t *
2376 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2377     cred_t *cr, char *dnm, char *nm)
2378 {
2379 	int newnode;
2380 	int index;
2381 	vnode_t *vp;
2382 	vattr_t va;
2383 
2384 	index = rtablehash((nfs_fhandle *)fh);
2385 	rw_enter(&rtable[index].r_lock, RW_READER);
2386 
2387 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2388 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2389 	    dnm, nm);
2390 
2391 	if (attr == NULL) {
2392 		if (newnode) {
2393 			PURGE_ATTRCACHE(vp);
2394 		}
2395 		rw_exit(&rtable[index].r_lock);
2396 		return (vp);
2397 	}
2398 
2399 	if (!newnode) {
2400 		rw_exit(&rtable[index].r_lock);
2401 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2402 	} else {
2403 		if (attr->type < NF3REG || attr->type > NF3FIFO)
2404 			vp->v_type = VBAD;
2405 		else
2406 			vp->v_type = nf3_to_vt[attr->type];
2407 		vp->v_rdev = makedevice(attr->rdev.specdata1,
2408 		    attr->rdev.specdata2);
2409 		nfs3_attrcache(vp, attr, t);
2410 		rw_exit(&rtable[index].r_lock);
2411 	}
2412 
2413 	return (vp);
2414 }
2415 
2416 /*
2417  * Read this comment before making changes to rtablehash()!
2418  * This is a hash function in which seemingly obvious and harmless
2419  * changes can cause escalations costing million dollars!
2420  * Know what you are doing.
2421  *
2422  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2423  * algorithm is currently detailed here:
2424  *
2425  *   http://burtleburtle.net/bob/hash/doobs.html
2426  *
2427  * Of course, the above link may not be valid by the time you are reading
2428  * this, but suffice it to say that the one-at-a-time algorithm works well in
2429  * almost all cases.  If you are changing the algorithm be sure to verify that
2430  * the hash algorithm still provides even distribution in all cases and with
2431  * any server returning filehandles in whatever order (sequential or random).
2432  */
2433 static int
2434 rtablehash(nfs_fhandle *fh)
2435 {
2436 	ulong_t hash, len, i;
2437 	char *key;
2438 
2439 	key = fh->fh_buf;
2440 	len = (ulong_t)fh->fh_len;
2441 	for (hash = 0, i = 0; i < len; i++) {
2442 		hash += key[i];
2443 		hash += (hash << 10);
2444 		hash ^= (hash >> 6);
2445 	}
2446 	hash += (hash << 3);
2447 	hash ^= (hash >> 11);
2448 	hash += (hash << 15);
2449 	return (hash & rtablemask);
2450 }
2451 
2452 static vnode_t *
2453 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2454     struct vnodeops *vops,
2455     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2456     int (*compar)(const void *, const void *),
2457     int *newnode, cred_t *cr, char *dnm, char *nm)
2458 {
2459 	rnode_t *rp;
2460 	rnode_t *trp;
2461 	vnode_t *vp;
2462 	mntinfo_t *mi;
2463 
2464 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
2465 
2466 	mi = VFTOMI(vfsp);
2467 start:
2468 	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2469 		vp = RTOV(rp);
2470 		nfs_set_vroot(vp);
2471 		*newnode = 0;
2472 		return (vp);
2473 	}
2474 	rw_exit(&rhtp->r_lock);
2475 
2476 	mutex_enter(&rpfreelist_lock);
2477 	if (rpfreelist != NULL && rnew >= nrnode) {
2478 		rp = rpfreelist;
2479 		rp_rmfree(rp);
2480 		mutex_exit(&rpfreelist_lock);
2481 
2482 		vp = RTOV(rp);
2483 
2484 		if (rp->r_flags & RHASHED) {
2485 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2486 			mutex_enter(&vp->v_lock);
2487 			if (vp->v_count > 1) {
2488 				vp->v_count--;
2489 				mutex_exit(&vp->v_lock);
2490 				rw_exit(&rp->r_hashq->r_lock);
2491 				rw_enter(&rhtp->r_lock, RW_READER);
2492 				goto start;
2493 			}
2494 			mutex_exit(&vp->v_lock);
2495 			rp_rmhash_locked(rp);
2496 			rw_exit(&rp->r_hashq->r_lock);
2497 		}
2498 
2499 		rinactive(rp, cr);
2500 
2501 		mutex_enter(&vp->v_lock);
2502 		if (vp->v_count > 1) {
2503 			vp->v_count--;
2504 			mutex_exit(&vp->v_lock);
2505 			rw_enter(&rhtp->r_lock, RW_READER);
2506 			goto start;
2507 		}
2508 		mutex_exit(&vp->v_lock);
2509 		vn_invalid(vp);
2510 		/*
2511 		 * destroy old locks before bzero'ing and
2512 		 * recreating the locks below.
2513 		 */
2514 		nfs_rw_destroy(&rp->r_rwlock);
2515 		nfs_rw_destroy(&rp->r_lkserlock);
2516 		mutex_destroy(&rp->r_statelock);
2517 		cv_destroy(&rp->r_cv);
2518 		cv_destroy(&rp->r_commit.c_cv);
2519 		nfs_free_r_path(rp);
2520 		avl_destroy(&rp->r_dir);
2521 		/*
2522 		 * Make sure that if rnode is recycled then
2523 		 * VFS count is decremented properly before
2524 		 * reuse.
2525 		 */
2526 		VFS_RELE(vp->v_vfsp);
2527 		vn_reinit(vp);
2528 	} else {
2529 		vnode_t *new_vp;
2530 
2531 		mutex_exit(&rpfreelist_lock);
2532 
2533 		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2534 		new_vp = vn_alloc(KM_SLEEP);
2535 
2536 		atomic_add_long((ulong_t *)&rnew, 1);
2537 #ifdef DEBUG
2538 		clstat_debug.nrnode.value.ui64++;
2539 #endif
2540 		vp = new_vp;
2541 	}
2542 
2543 	bzero(rp, sizeof (*rp));
2544 	rp->r_vnode = vp;
2545 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2546 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2547 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2548 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2549 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2550 	rp->r_fh.fh_len = fh->fh_len;
2551 	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2552 	rp->r_server = mi->mi_curr_serv;
2553 	if (FAILOVER_MOUNT(mi)) {
2554 		/*
2555 		 * If replicated servers, stash pathnames
2556 		 */
2557 		if (dnm != NULL && nm != NULL) {
2558 			char *s, *p;
2559 			uint_t len;
2560 
2561 			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2562 			rp->r_path = kmem_alloc(len, KM_SLEEP);
2563 #ifdef DEBUG
2564 			clstat_debug.rpath.value.ui64 += len;
2565 #endif
2566 			s = rp->r_path;
2567 			for (p = dnm; *p; p++)
2568 				*s++ = *p;
2569 			*s++ = '/';
2570 			for (p = nm; *p; p++)
2571 				*s++ = *p;
2572 			*s = '\0';
2573 		} else {
2574 			/* special case for root */
2575 			rp->r_path = kmem_alloc(2, KM_SLEEP);
2576 #ifdef DEBUG
2577 			clstat_debug.rpath.value.ui64 += 2;
2578 #endif
2579 			*rp->r_path = '.';
2580 			*(rp->r_path + 1) = '\0';
2581 		}
2582 	}
2583 	VFS_HOLD(vfsp);
2584 	rp->r_putapage = putapage;
2585 	rp->r_hashq = rhtp;
2586 	rp->r_flags = RREADDIRPLUS;
2587 	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2588 	    offsetof(rddir_cache, tree));
2589 	vn_setops(vp, vops);
2590 	vp->v_data = (caddr_t)rp;
2591 	vp->v_vfsp = vfsp;
2592 	vp->v_type = VNON;
2593 	vp->v_flag |= VMODSORT;
2594 	nfs_set_vroot(vp);
2595 
2596 	/*
2597 	 * There is a race condition if someone else
2598 	 * alloc's the rnode while no locks are held, so we
2599 	 * check again and recover if found.
2600 	 */
2601 	rw_enter(&rhtp->r_lock, RW_WRITER);
2602 	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2603 		vp = RTOV(trp);
2604 		nfs_set_vroot(vp);
2605 		*newnode = 0;
2606 		rw_exit(&rhtp->r_lock);
2607 		rp_addfree(rp, cr);
2608 		rw_enter(&rhtp->r_lock, RW_READER);
2609 		return (vp);
2610 	}
2611 	rp_addhash(rp);
2612 	*newnode = 1;
2613 	return (vp);
2614 }
2615 
2616 /*
2617  * Callback function to check if the page should be marked as
2618  * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2619  */
2620 int
2621 nfs_setmod_check(page_t *pp)
2622 {
2623 	if (pp->p_fsdata != C_NOCOMMIT) {
2624 		pp->p_fsdata = C_NOCOMMIT;
2625 		return (1);
2626 	}
2627 	return (0);
2628 }
2629 
2630 static void
2631 nfs_set_vroot(vnode_t *vp)
2632 {
2633 	rnode_t *rp;
2634 	nfs_fhandle *rootfh;
2635 
2636 	rp = VTOR(vp);
2637 	rootfh = &rp->r_server->sv_fhandle;
2638 	if (rootfh->fh_len == rp->r_fh.fh_len &&
2639 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2640 		if (!(vp->v_flag & VROOT)) {
2641 			mutex_enter(&vp->v_lock);
2642 			vp->v_flag |= VROOT;
2643 			mutex_exit(&vp->v_lock);
2644 		}
2645 	}
2646 }
2647 
2648 static void
2649 nfs_free_r_path(rnode_t *rp)
2650 {
2651 	char *path;
2652 	size_t len;
2653 
2654 	path = rp->r_path;
2655 	if (path) {
2656 		rp->r_path = NULL;
2657 		len = strlen(path) + 1;
2658 		kmem_free(path, len);
2659 #ifdef DEBUG
2660 		clstat_debug.rpath.value.ui64 -= len;
2661 #endif
2662 	}
2663 }
2664 
2665 /*
2666  * Put an rnode on the free list.
2667  *
2668  * Rnodes which were allocated above and beyond the normal limit
2669  * are immediately freed.
2670  */
2671 void
2672 rp_addfree(rnode_t *rp, cred_t *cr)
2673 {
2674 	vnode_t *vp;
2675 	struct vfs *vfsp;
2676 
2677 	vp = RTOV(rp);
2678 	ASSERT(vp->v_count >= 1);
2679 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2680 
2681 	/*
2682 	 * If we have too many rnodes allocated and there are no
2683 	 * references to this rnode, or if the rnode is no longer
2684 	 * accessible by it does not reside in the hash queues,
2685 	 * or if an i/o error occurred while writing to the file,
2686 	 * then just free it instead of putting it on the rnode
2687 	 * freelist.
2688 	 */
2689 	vfsp = vp->v_vfsp;
2690 	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2691 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2692 		if (rp->r_flags & RHASHED) {
2693 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2694 			mutex_enter(&vp->v_lock);
2695 			if (vp->v_count > 1) {
2696 				vp->v_count--;
2697 				mutex_exit(&vp->v_lock);
2698 				rw_exit(&rp->r_hashq->r_lock);
2699 				return;
2700 			}
2701 			mutex_exit(&vp->v_lock);
2702 			rp_rmhash_locked(rp);
2703 			rw_exit(&rp->r_hashq->r_lock);
2704 		}
2705 
2706 		rinactive(rp, cr);
2707 
2708 		/*
2709 		 * Recheck the vnode reference count.  We need to
2710 		 * make sure that another reference has not been
2711 		 * acquired while we were not holding v_lock.  The
2712 		 * rnode is not in the rnode hash queues, so the
2713 		 * only way for a reference to have been acquired
2714 		 * is for a VOP_PUTPAGE because the rnode was marked
2715 		 * with RDIRTY or for a modified page.  This
2716 		 * reference may have been acquired before our call
2717 		 * to rinactive.  The i/o may have been completed,
2718 		 * thus allowing rinactive to complete, but the
2719 		 * reference to the vnode may not have been released
2720 		 * yet.  In any case, the rnode can not be destroyed
2721 		 * until the other references to this vnode have been
2722 		 * released.  The other references will take care of
2723 		 * either destroying the rnode or placing it on the
2724 		 * rnode freelist.  If there are no other references,
2725 		 * then the rnode may be safely destroyed.
2726 		 */
2727 		mutex_enter(&vp->v_lock);
2728 		if (vp->v_count > 1) {
2729 			vp->v_count--;
2730 			mutex_exit(&vp->v_lock);
2731 			return;
2732 		}
2733 		mutex_exit(&vp->v_lock);
2734 
2735 		destroy_rnode(rp);
2736 		return;
2737 	}
2738 
2739 	/*
2740 	 * Lock the hash queue and then recheck the reference count
2741 	 * to ensure that no other threads have acquired a reference
2742 	 * to indicate that the rnode should not be placed on the
2743 	 * freelist.  If another reference has been acquired, then
2744 	 * just release this one and let the other thread complete
2745 	 * the processing of adding this rnode to the freelist.
2746 	 */
2747 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2748 
2749 	mutex_enter(&vp->v_lock);
2750 	if (vp->v_count > 1) {
2751 		vp->v_count--;
2752 		mutex_exit(&vp->v_lock);
2753 		rw_exit(&rp->r_hashq->r_lock);
2754 		return;
2755 	}
2756 	mutex_exit(&vp->v_lock);
2757 
2758 	/*
2759 	 * If there is no cached data or metadata for this file, then
2760 	 * put the rnode on the front of the freelist so that it will
2761 	 * be reused before other rnodes which may have cached data or
2762 	 * metadata associated with them.
2763 	 */
2764 	mutex_enter(&rpfreelist_lock);
2765 	if (rpfreelist == NULL) {
2766 		rp->r_freef = rp;
2767 		rp->r_freeb = rp;
2768 		rpfreelist = rp;
2769 	} else {
2770 		rp->r_freef = rpfreelist;
2771 		rp->r_freeb = rpfreelist->r_freeb;
2772 		rpfreelist->r_freeb->r_freef = rp;
2773 		rpfreelist->r_freeb = rp;
2774 		if (!vn_has_cached_data(vp) &&
2775 		    !HAVE_RDDIR_CACHE(rp) &&
2776 		    rp->r_symlink.contents == NULL &&
2777 		    rp->r_secattr == NULL &&
2778 		    rp->r_pathconf == NULL)
2779 			rpfreelist = rp;
2780 	}
2781 	mutex_exit(&rpfreelist_lock);
2782 
2783 	rw_exit(&rp->r_hashq->r_lock);
2784 }
2785 
2786 /*
2787  * Remove an rnode from the free list.
2788  *
2789  * The caller must be holding rpfreelist_lock and the rnode
2790  * must be on the freelist.
2791  */
2792 static void
2793 rp_rmfree(rnode_t *rp)
2794 {
2795 
2796 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2797 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2798 
2799 	if (rp == rpfreelist) {
2800 		rpfreelist = rp->r_freef;
2801 		if (rp == rpfreelist)
2802 			rpfreelist = NULL;
2803 	}
2804 
2805 	rp->r_freeb->r_freef = rp->r_freef;
2806 	rp->r_freef->r_freeb = rp->r_freeb;
2807 
2808 	rp->r_freef = rp->r_freeb = NULL;
2809 }
2810 
2811 /*
2812  * Put a rnode in the hash table.
2813  *
2814  * The caller must be holding the exclusive hash queue lock.
2815  */
2816 static void
2817 rp_addhash(rnode_t *rp)
2818 {
2819 
2820 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2821 	ASSERT(!(rp->r_flags & RHASHED));
2822 
2823 	rp->r_hashf = rp->r_hashq->r_hashf;
2824 	rp->r_hashq->r_hashf = rp;
2825 	rp->r_hashb = (rnode_t *)rp->r_hashq;
2826 	rp->r_hashf->r_hashb = rp;
2827 
2828 	mutex_enter(&rp->r_statelock);
2829 	rp->r_flags |= RHASHED;
2830 	mutex_exit(&rp->r_statelock);
2831 }
2832 
2833 /*
2834  * Remove a rnode from the hash table.
2835  *
2836  * The caller must be holding the hash queue lock.
2837  */
2838 static void
2839 rp_rmhash_locked(rnode_t *rp)
2840 {
2841 
2842 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2843 	ASSERT(rp->r_flags & RHASHED);
2844 
2845 	rp->r_hashb->r_hashf = rp->r_hashf;
2846 	rp->r_hashf->r_hashb = rp->r_hashb;
2847 
2848 	mutex_enter(&rp->r_statelock);
2849 	rp->r_flags &= ~RHASHED;
2850 	mutex_exit(&rp->r_statelock);
2851 }
2852 
2853 /*
2854  * Remove a rnode from the hash table.
2855  *
2856  * The caller must not be holding the hash queue lock.
2857  */
2858 void
2859 rp_rmhash(rnode_t *rp)
2860 {
2861 
2862 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2863 	rp_rmhash_locked(rp);
2864 	rw_exit(&rp->r_hashq->r_lock);
2865 }
2866 
2867 /*
2868  * Lookup a rnode by fhandle.
2869  *
2870  * The caller must be holding the hash queue lock, either shared or exclusive.
2871  */
2872 static rnode_t *
2873 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2874 {
2875 	rnode_t *rp;
2876 	vnode_t *vp;
2877 
2878 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2879 
2880 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2881 		vp = RTOV(rp);
2882 		if (vp->v_vfsp == vfsp &&
2883 		    rp->r_fh.fh_len == fh->fh_len &&
2884 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2885 			/*
2886 			 * remove rnode from free list, if necessary.
2887 			 */
2888 			if (rp->r_freef != NULL) {
2889 				mutex_enter(&rpfreelist_lock);
2890 				/*
2891 				 * If the rnode is on the freelist,
2892 				 * then remove it and use that reference
2893 				 * as the new reference.  Otherwise,
2894 				 * need to increment the reference count.
2895 				 */
2896 				if (rp->r_freef != NULL) {
2897 					rp_rmfree(rp);
2898 					mutex_exit(&rpfreelist_lock);
2899 				} else {
2900 					mutex_exit(&rpfreelist_lock);
2901 					VN_HOLD(vp);
2902 				}
2903 			} else
2904 				VN_HOLD(vp);
2905 			return (rp);
2906 		}
2907 	}
2908 	return (NULL);
2909 }
2910 
2911 /*
2912  * Return 1 if there is a active vnode belonging to this vfs in the
2913  * rtable cache.
2914  *
2915  * Several of these checks are done without holding the usual
2916  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2917  * etc. will redo the necessary checks before actually destroying
2918  * any rnodes.
2919  */
2920 int
2921 check_rtable(struct vfs *vfsp)
2922 {
2923 	int index;
2924 	rnode_t *rp;
2925 	vnode_t *vp;
2926 
2927 	for (index = 0; index < rtablesize; index++) {
2928 		rw_enter(&rtable[index].r_lock, RW_READER);
2929 		for (rp = rtable[index].r_hashf;
2930 		    rp != (rnode_t *)(&rtable[index]);
2931 		    rp = rp->r_hashf) {
2932 			vp = RTOV(rp);
2933 			if (vp->v_vfsp == vfsp) {
2934 				if (rp->r_freef == NULL ||
2935 				    (vn_has_cached_data(vp) &&
2936 				    (rp->r_flags & RDIRTY)) ||
2937 				    rp->r_count > 0) {
2938 					rw_exit(&rtable[index].r_lock);
2939 					return (1);
2940 				}
2941 			}
2942 		}
2943 		rw_exit(&rtable[index].r_lock);
2944 	}
2945 	return (0);
2946 }
2947 
2948 /*
2949  * Destroy inactive vnodes from the hash queues which belong to this
2950  * vfs.  It is essential that we destroy all inactive vnodes during a
2951  * forced unmount as well as during a normal unmount.
2952  */
2953 void
2954 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2955 {
2956 	int index;
2957 	rnode_t *rp;
2958 	rnode_t *rlist;
2959 	rnode_t *r_hashf;
2960 	vnode_t *vp;
2961 
2962 	rlist = NULL;
2963 
2964 	for (index = 0; index < rtablesize; index++) {
2965 		rw_enter(&rtable[index].r_lock, RW_WRITER);
2966 		for (rp = rtable[index].r_hashf;
2967 		    rp != (rnode_t *)(&rtable[index]);
2968 		    rp = r_hashf) {
2969 			/* save the hash pointer before destroying */
2970 			r_hashf = rp->r_hashf;
2971 			vp = RTOV(rp);
2972 			if (vp->v_vfsp == vfsp) {
2973 				mutex_enter(&rpfreelist_lock);
2974 				if (rp->r_freef != NULL) {
2975 					rp_rmfree(rp);
2976 					mutex_exit(&rpfreelist_lock);
2977 					rp_rmhash_locked(rp);
2978 					rp->r_hashf = rlist;
2979 					rlist = rp;
2980 				} else
2981 					mutex_exit(&rpfreelist_lock);
2982 			}
2983 		}
2984 		rw_exit(&rtable[index].r_lock);
2985 	}
2986 
2987 	for (rp = rlist; rp != NULL; rp = rlist) {
2988 		rlist = rp->r_hashf;
2989 		/*
2990 		 * This call to rp_addfree will end up destroying the
2991 		 * rnode, but in a safe way with the appropriate set
2992 		 * of checks done.
2993 		 */
2994 		rp_addfree(rp, cr);
2995 	}
2996 
2997 }
2998 
2999 /*
3000  * This routine destroys all the resources associated with the rnode
3001  * and then the rnode itself.
3002  */
3003 static void
3004 destroy_rnode(rnode_t *rp)
3005 {
3006 	vnode_t *vp;
3007 	vfs_t *vfsp;
3008 
3009 	vp = RTOV(rp);
3010 	vfsp = vp->v_vfsp;
3011 
3012 	ASSERT(vp->v_count == 1);
3013 	ASSERT(rp->r_count == 0);
3014 	ASSERT(rp->r_lmpl == NULL);
3015 	ASSERT(rp->r_mapcnt == 0);
3016 	ASSERT(!(rp->r_flags & RHASHED));
3017 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3018 	atomic_add_long((ulong_t *)&rnew, -1);
3019 #ifdef DEBUG
3020 	clstat_debug.nrnode.value.ui64--;
3021 #endif
3022 	nfs_rw_destroy(&rp->r_rwlock);
3023 	nfs_rw_destroy(&rp->r_lkserlock);
3024 	mutex_destroy(&rp->r_statelock);
3025 	cv_destroy(&rp->r_cv);
3026 	cv_destroy(&rp->r_commit.c_cv);
3027 	if (rp->r_flags & RDELMAPLIST)
3028 		list_destroy(&rp->r_indelmap);
3029 	nfs_free_r_path(rp);
3030 	avl_destroy(&rp->r_dir);
3031 	vn_invalid(vp);
3032 	vn_free(vp);
3033 	kmem_cache_free(rnode_cache, rp);
3034 	VFS_RELE(vfsp);
3035 }
3036 
3037 /*
3038  * Flush all vnodes in this (or every) vfs.
3039  * Used by nfs_sync and by nfs_unmount.
3040  */
3041 void
3042 rflush(struct vfs *vfsp, cred_t *cr)
3043 {
3044 	int index;
3045 	rnode_t *rp;
3046 	vnode_t *vp, **vplist;
3047 	long num, cnt;
3048 
3049 	/*
3050 	 * Check to see whether there is anything to do.
3051 	 */
3052 	num = rnew;
3053 	if (num == 0)
3054 		return;
3055 
3056 	/*
3057 	 * Allocate a slot for all currently active rnodes on the
3058 	 * supposition that they all may need flushing.
3059 	 */
3060 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3061 	cnt = 0;
3062 
3063 	/*
3064 	 * Walk the hash queues looking for rnodes with page
3065 	 * lists associated with them.  Make a list of these
3066 	 * files.
3067 	 */
3068 	for (index = 0; index < rtablesize; index++) {
3069 		rw_enter(&rtable[index].r_lock, RW_READER);
3070 		for (rp = rtable[index].r_hashf;
3071 		    rp != (rnode_t *)(&rtable[index]);
3072 		    rp = rp->r_hashf) {
3073 			vp = RTOV(rp);
3074 			/*
3075 			 * Don't bother sync'ing a vp if it
3076 			 * is part of virtual swap device or
3077 			 * if VFS is read-only
3078 			 */
3079 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3080 				continue;
3081 			/*
3082 			 * If flushing all mounted file systems or
3083 			 * the vnode belongs to this vfs, has pages
3084 			 * and is marked as either dirty or mmap'd,
3085 			 * hold and add this vnode to the list of
3086 			 * vnodes to flush.
3087 			 */
3088 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3089 			    vn_has_cached_data(vp) &&
3090 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3091 				VN_HOLD(vp);
3092 				vplist[cnt++] = vp;
3093 				if (cnt == num) {
3094 					rw_exit(&rtable[index].r_lock);
3095 					goto toomany;
3096 				}
3097 			}
3098 		}
3099 		rw_exit(&rtable[index].r_lock);
3100 	}
3101 toomany:
3102 
3103 	/*
3104 	 * Flush and release all of the files on the list.
3105 	 */
3106 	while (cnt-- > 0) {
3107 		vp = vplist[cnt];
3108 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3109 		VN_RELE(vp);
3110 	}
3111 
3112 	/*
3113 	 * Free the space allocated to hold the list.
3114 	 */
3115 	kmem_free(vplist, num * sizeof (*vplist));
3116 }
3117 
3118 /*
3119  * This probably needs to be larger than or equal to
3120  * log2(sizeof (struct rnode)) due to the way that rnodes are
3121  * allocated.
3122  */
3123 #define	ACACHE_SHIFT_BITS	9
3124 
3125 static int
3126 acachehash(rnode_t *rp, cred_t *cr)
3127 {
3128 
3129 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3130 	    acachemask);
3131 }
3132 
3133 #ifdef DEBUG
3134 static long nfs_access_cache_hits = 0;
3135 static long nfs_access_cache_misses = 0;
3136 #endif
3137 
3138 nfs_access_type_t
3139 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3140 {
3141 	vnode_t *vp;
3142 	acache_t *ap;
3143 	acache_hash_t *hp;
3144 	nfs_access_type_t all;
3145 
3146 	vp = RTOV(rp);
3147 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3148 		return (NFS_ACCESS_UNKNOWN);
3149 
3150 	if (rp->r_acache != NULL) {
3151 		hp = &acache[acachehash(rp, cr)];
3152 		rw_enter(&hp->lock, RW_READER);
3153 		ap = hp->next;
3154 		while (ap != (acache_t *)hp) {
3155 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3156 				if ((ap->known & acc) == acc) {
3157 #ifdef DEBUG
3158 					nfs_access_cache_hits++;
3159 #endif
3160 					if ((ap->allowed & acc) == acc)
3161 						all = NFS_ACCESS_ALLOWED;
3162 					else
3163 						all = NFS_ACCESS_DENIED;
3164 				} else {
3165 #ifdef DEBUG
3166 					nfs_access_cache_misses++;
3167 #endif
3168 					all = NFS_ACCESS_UNKNOWN;
3169 				}
3170 				rw_exit(&hp->lock);
3171 				return (all);
3172 			}
3173 			ap = ap->next;
3174 		}
3175 		rw_exit(&hp->lock);
3176 	}
3177 
3178 #ifdef DEBUG
3179 	nfs_access_cache_misses++;
3180 #endif
3181 	return (NFS_ACCESS_UNKNOWN);
3182 }
3183 
3184 void
3185 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3186 {
3187 	acache_t *ap;
3188 	acache_t *nap;
3189 	acache_hash_t *hp;
3190 
3191 	hp = &acache[acachehash(rp, cr)];
3192 
3193 	/*
3194 	 * Allocate now assuming that mostly an allocation will be
3195 	 * required.  This allows the allocation to happen without
3196 	 * holding the hash bucket locked.
3197 	 */
3198 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3199 	if (nap != NULL) {
3200 		nap->known = acc;
3201 		nap->allowed = resacc;
3202 		nap->rnode = rp;
3203 		crhold(cr);
3204 		nap->cred = cr;
3205 		nap->hashq = hp;
3206 	}
3207 
3208 	rw_enter(&hp->lock, RW_WRITER);
3209 
3210 	if (rp->r_acache != NULL) {
3211 		ap = hp->next;
3212 		while (ap != (acache_t *)hp) {
3213 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3214 				ap->known |= acc;
3215 				ap->allowed &= ~acc;
3216 				ap->allowed |= resacc;
3217 				rw_exit(&hp->lock);
3218 				if (nap != NULL) {
3219 					crfree(nap->cred);
3220 					kmem_cache_free(acache_cache, nap);
3221 				}
3222 				return;
3223 			}
3224 			ap = ap->next;
3225 		}
3226 	}
3227 
3228 	if (nap != NULL) {
3229 #ifdef DEBUG
3230 		clstat_debug.access.value.ui64++;
3231 #endif
3232 		nap->next = hp->next;
3233 		hp->next = nap;
3234 		nap->next->prev = nap;
3235 		nap->prev = (acache_t *)hp;
3236 
3237 		mutex_enter(&rp->r_statelock);
3238 		nap->list = rp->r_acache;
3239 		rp->r_acache = nap;
3240 		mutex_exit(&rp->r_statelock);
3241 	}
3242 
3243 	rw_exit(&hp->lock);
3244 }
3245 
3246 int
3247 nfs_access_purge_rp(rnode_t *rp)
3248 {
3249 	acache_t *ap;
3250 	acache_t *tmpap;
3251 	acache_t *rplist;
3252 
3253 	/*
3254 	 * If there aren't any cached entries, then there is nothing
3255 	 * to free.
3256 	 */
3257 	if (rp->r_acache == NULL)
3258 		return (0);
3259 
3260 	mutex_enter(&rp->r_statelock);
3261 	rplist = rp->r_acache;
3262 	rp->r_acache = NULL;
3263 	mutex_exit(&rp->r_statelock);
3264 
3265 	/*
3266 	 * Loop through each entry in the list pointed to in the
3267 	 * rnode.  Remove each of these entries from the hash
3268 	 * queue that it is on and remove it from the list in
3269 	 * the rnode.
3270 	 */
3271 	for (ap = rplist; ap != NULL; ap = tmpap) {
3272 		rw_enter(&ap->hashq->lock, RW_WRITER);
3273 		ap->prev->next = ap->next;
3274 		ap->next->prev = ap->prev;
3275 		rw_exit(&ap->hashq->lock);
3276 
3277 		tmpap = ap->list;
3278 		crfree(ap->cred);
3279 		kmem_cache_free(acache_cache, ap);
3280 #ifdef DEBUG
3281 		clstat_debug.access.value.ui64--;
3282 #endif
3283 	}
3284 
3285 	return (1);
3286 }
3287 
/* Leading characters of every name built by newname(). */
static const char prefix[] = ".nfs";

/* Held by newnum() while it reads and advances its static counter. */
static kmutex_t newnum_lock;
3291 
3292 int
3293 newnum(void)
3294 {
3295 	static uint_t newnum = 0;
3296 	uint_t id;
3297 
3298 	mutex_enter(&newnum_lock);
3299 	if (newnum == 0)
3300 		newnum = gethrestime_sec() & 0xffff;
3301 	id = newnum++;
3302 	mutex_exit(&newnum_lock);
3303 	return (id);
3304 }
3305 
3306 char *
3307 newname(void)
3308 {
3309 	char *news;
3310 	char *s;
3311 	const char *p;
3312 	uint_t id;
3313 
3314 	id = newnum();
3315 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3316 	s = news;
3317 	p = prefix;
3318 	while (*p != '\0')
3319 		*s++ = *p++;
3320 	while (id != 0) {
3321 		*s++ = "0123456789ABCDEF"[id & 0x0f];
3322 		id >>= 4;
3323 	}
3324 	*s = '\0';
3325 	return (news);
3326 }
3327 
3328 /*
3329  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3330  * framework.
3331  */
3332 static int
3333 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3334 {
3335 	ksp->ks_snaptime = gethrtime();
3336 	if (rw == KSTAT_WRITE) {
3337 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3338 #ifdef DEBUG
3339 		/*
3340 		 * Currently only the global zone can write to kstats, but we
3341 		 * add the check just for paranoia.
3342 		 */
3343 		if (INGLOBALZONE(curproc))
3344 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3345 			    sizeof (clstat_debug));
3346 #endif
3347 	} else {
3348 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3349 #ifdef DEBUG
3350 		/*
3351 		 * If we're displaying the "global" debug kstat values, we
3352 		 * display them as-is to all zones since in fact they apply to
3353 		 * the system as a whole.
3354 		 */
3355 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3356 		    sizeof (clstat_debug));
3357 #endif
3358 	}
3359 	return (0);
3360 }
3361 
3362 static void *
3363 clinit_zone(zoneid_t zoneid)
3364 {
3365 	kstat_t *nfs_client_kstat;
3366 	struct nfs_clnt *nfscl;
3367 	uint_t ndata;
3368 
3369 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3370 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3371 	nfscl->nfscl_chtable = NULL;
3372 	nfscl->nfscl_zoneid = zoneid;
3373 
3374 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3375 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3376 #ifdef DEBUG
3377 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3378 #endif
3379 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3380 	    "misc", KSTAT_TYPE_NAMED, ndata,
3381 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3382 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3383 		nfs_client_kstat->ks_snapshot = cl_snapshot;
3384 		kstat_install(nfs_client_kstat);
3385 	}
3386 	mutex_enter(&nfs_clnt_list_lock);
3387 	list_insert_head(&nfs_clnt_list, nfscl);
3388 	mutex_exit(&nfs_clnt_list_lock);
3389 	return (nfscl);
3390 }
3391 
3392 /*ARGSUSED*/
3393 static void
3394 clfini_zone(zoneid_t zoneid, void *arg)
3395 {
3396 	struct nfs_clnt *nfscl = arg;
3397 	chhead_t *chp, *next;
3398 
3399 	if (nfscl == NULL)
3400 		return;
3401 	mutex_enter(&nfs_clnt_list_lock);
3402 	list_remove(&nfs_clnt_list, nfscl);
3403 	mutex_exit(&nfs_clnt_list_lock);
3404 	clreclaim_zone(nfscl, 0);
3405 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3406 		ASSERT(chp->ch_list == NULL);
3407 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3408 		next = chp->ch_next;
3409 		kmem_free(chp, sizeof (*chp));
3410 	}
3411 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3412 	mutex_destroy(&nfscl->nfscl_chtable_lock);
3413 	kmem_free(nfscl, sizeof (*nfscl));
3414 }
3415 
3416 /*
3417  * Called by endpnt_destructor to make sure the client handles are
3418  * cleaned up before the RPC endpoints.  This becomes a no-op if
3419  * clfini_zone (above) is called first.  This function is needed
3420  * (rather than relying on clfini_zone to clean up) because the ZSD
3421  * callbacks have no ordering mechanism, so we have no way to ensure
3422  * that clfini_zone is called before endpnt_destructor.
3423  */
3424 void
3425 clcleanup_zone(zoneid_t zoneid)
3426 {
3427 	struct nfs_clnt *nfscl;
3428 
3429 	mutex_enter(&nfs_clnt_list_lock);
3430 	nfscl = list_head(&nfs_clnt_list);
3431 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3432 		if (nfscl->nfscl_zoneid == zoneid) {
3433 			clreclaim_zone(nfscl, 0);
3434 			break;
3435 		}
3436 	}
3437 	mutex_exit(&nfs_clnt_list_lock);
3438 }
3439 
3440 int
3441 nfs_subrinit(void)
3442 {
3443 	int i;
3444 	ulong_t nrnode_max;
3445 
3446 	/*
3447 	 * Allocate and initialize the rnode hash queues
3448 	 */
3449 	if (nrnode <= 0)
3450 		nrnode = ncsize;
3451 	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3452 	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3453 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3454 		    "setting nrnode to max value of %ld", nrnode_max);
3455 		nrnode = nrnode_max;
3456 	}
3457 
3458 	rtablesize = 1 << highbit(nrnode / hashlen);
3459 	rtablemask = rtablesize - 1;
3460 	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3461 	for (i = 0; i < rtablesize; i++) {
3462 		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3463 		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3464 		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3465 	}
3466 	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3467 	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3468 
3469 	/*
3470 	 * Allocate and initialize the access cache
3471 	 */
3472 
3473 	/*
3474 	 * Initial guess is one access cache entry per rnode unless
3475 	 * nacache is set to a non-zero value and then it is used to
3476 	 * indicate a guess at the number of access cache entries.
3477 	 */
3478 	if (nacache > 0)
3479 		acachesize = 1 << highbit(nacache / hashlen);
3480 	else
3481 		acachesize = rtablesize;
3482 	acachemask = acachesize - 1;
3483 	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3484 	for (i = 0; i < acachesize; i++) {
3485 		acache[i].next = (acache_t *)&acache[i];
3486 		acache[i].prev = (acache_t *)&acache[i];
3487 		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3488 	}
3489 	acache_cache = kmem_cache_create("nfs_access_cache",
3490 	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3491 	/*
3492 	 * Allocate and initialize the client handle cache
3493 	 */
3494 	chtab_cache = kmem_cache_create("client_handle_cache",
3495 	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3496 	/*
3497 	 * Initialize the list of per-zone client handles (and associated data).
3498 	 * This needs to be done before we call zone_key_create().
3499 	 */
3500 	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3501 	    offsetof(struct nfs_clnt, nfscl_node));
3502 	/*
3503 	 * Initialize the zone_key for per-zone client handle lists.
3504 	 */
3505 	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3506 	/*
3507 	 * Initialize the various mutexes and reader/writer locks
3508 	 */
3509 	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3510 	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3511 	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3512 
3513 	/*
3514 	 * Assign unique major number for all nfs mounts
3515 	 */
3516 	if ((nfs_major = getudev()) == -1) {
3517 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
3518 		    "nfs: init: can't get unique device number");
3519 		nfs_major = 0;
3520 	}
3521 	nfs_minor = 0;
3522 
3523 	if (nfs3_jukebox_delay == 0)
3524 		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3525 
3526 	return (0);
3527 }
3528 
3529 void
3530 nfs_subrfini(void)
3531 {
3532 	int i;
3533 
3534 	/*
3535 	 * Deallocate the rnode hash queues
3536 	 */
3537 	kmem_cache_destroy(rnode_cache);
3538 
3539 	for (i = 0; i < rtablesize; i++)
3540 		rw_destroy(&rtable[i].r_lock);
3541 	kmem_free(rtable, rtablesize * sizeof (*rtable));
3542 
3543 	/*
3544 	 * Deallocated the access cache
3545 	 */
3546 	kmem_cache_destroy(acache_cache);
3547 
3548 	for (i = 0; i < acachesize; i++)
3549 		rw_destroy(&acache[i].lock);
3550 	kmem_free(acache, acachesize * sizeof (*acache));
3551 
3552 	/*
3553 	 * Deallocate the client handle cache
3554 	 */
3555 	kmem_cache_destroy(chtab_cache);
3556 
3557 	/*
3558 	 * Destroy the various mutexes and reader/writer locks
3559 	 */
3560 	mutex_destroy(&rpfreelist_lock);
3561 	mutex_destroy(&newnum_lock);
3562 	mutex_destroy(&nfs_minor_lock);
3563 	(void) zone_key_delete(nfsclnt_zone_key);
3564 }
3565 
3566 enum nfsstat
3567 puterrno(int error)
3568 {
3569 
3570 	switch (error) {
3571 	case EOPNOTSUPP:
3572 		return (NFSERR_OPNOTSUPP);
3573 	case ENAMETOOLONG:
3574 		return (NFSERR_NAMETOOLONG);
3575 	case ENOTEMPTY:
3576 		return (NFSERR_NOTEMPTY);
3577 	case EDQUOT:
3578 		return (NFSERR_DQUOT);
3579 	case ESTALE:
3580 		return (NFSERR_STALE);
3581 	case EREMOTE:
3582 		return (NFSERR_REMOTE);
3583 	case ENOSYS:
3584 		return (NFSERR_OPNOTSUPP);
3585 	case EOVERFLOW:
3586 		return (NFSERR_INVAL);
3587 	default:
3588 		return ((enum nfsstat)error);
3589 	}
3590 	/* NOTREACHED */
3591 }
3592 
3593 int
3594 geterrno(enum nfsstat status)
3595 {
3596 
3597 	switch (status) {
3598 	case NFSERR_OPNOTSUPP:
3599 		return (EOPNOTSUPP);
3600 	case NFSERR_NAMETOOLONG:
3601 		return (ENAMETOOLONG);
3602 	case NFSERR_NOTEMPTY:
3603 		return (ENOTEMPTY);
3604 	case NFSERR_DQUOT:
3605 		return (EDQUOT);
3606 	case NFSERR_STALE:
3607 		return (ESTALE);
3608 	case NFSERR_REMOTE:
3609 		return (EREMOTE);
3610 	case NFSERR_WFLUSH:
3611 		return (EIO);
3612 	default:
3613 		return ((int)status);
3614 	}
3615 	/* NOTREACHED */
3616 }
3617 
3618 enum nfsstat3
3619 puterrno3(int error)
3620 {
3621 
3622 #ifdef DEBUG
3623 	switch (error) {
3624 	case 0:
3625 		return (NFS3_OK);
3626 	case EPERM:
3627 		return (NFS3ERR_PERM);
3628 	case ENOENT:
3629 		return (NFS3ERR_NOENT);
3630 	case EIO:
3631 		return (NFS3ERR_IO);
3632 	case ENXIO:
3633 		return (NFS3ERR_NXIO);
3634 	case EACCES:
3635 		return (NFS3ERR_ACCES);
3636 	case EEXIST:
3637 		return (NFS3ERR_EXIST);
3638 	case EXDEV:
3639 		return (NFS3ERR_XDEV);
3640 	case ENODEV:
3641 		return (NFS3ERR_NODEV);
3642 	case ENOTDIR:
3643 		return (NFS3ERR_NOTDIR);
3644 	case EISDIR:
3645 		return (NFS3ERR_ISDIR);
3646 	case EINVAL:
3647 		return (NFS3ERR_INVAL);
3648 	case EFBIG:
3649 		return (NFS3ERR_FBIG);
3650 	case ENOSPC:
3651 		return (NFS3ERR_NOSPC);
3652 	case EROFS:
3653 		return (NFS3ERR_ROFS);
3654 	case EMLINK:
3655 		return (NFS3ERR_MLINK);
3656 	case ENAMETOOLONG:
3657 		return (NFS3ERR_NAMETOOLONG);
3658 	case ENOTEMPTY:
3659 		return (NFS3ERR_NOTEMPTY);
3660 	case EDQUOT:
3661 		return (NFS3ERR_DQUOT);
3662 	case ESTALE:
3663 		return (NFS3ERR_STALE);
3664 	case EREMOTE:
3665 		return (NFS3ERR_REMOTE);
3666 	case ENOSYS:
3667 	case EOPNOTSUPP:
3668 		return (NFS3ERR_NOTSUPP);
3669 	case EOVERFLOW:
3670 		return (NFS3ERR_INVAL);
3671 	default:
3672 		zcmn_err(getzoneid(), CE_WARN,
3673 		    "puterrno3: got error %d", error);
3674 		return ((enum nfsstat3)error);
3675 	}
3676 #else
3677 	switch (error) {
3678 	case ENAMETOOLONG:
3679 		return (NFS3ERR_NAMETOOLONG);
3680 	case ENOTEMPTY:
3681 		return (NFS3ERR_NOTEMPTY);
3682 	case EDQUOT:
3683 		return (NFS3ERR_DQUOT);
3684 	case ESTALE:
3685 		return (NFS3ERR_STALE);
3686 	case ENOSYS:
3687 	case EOPNOTSUPP:
3688 		return (NFS3ERR_NOTSUPP);
3689 	case EREMOTE:
3690 		return (NFS3ERR_REMOTE);
3691 	case EOVERFLOW:
3692 		return (NFS3ERR_INVAL);
3693 	default:
3694 		return ((enum nfsstat3)error);
3695 	}
3696 #endif
3697 }
3698 
3699 int
3700 geterrno3(enum nfsstat3 status)
3701 {
3702 
3703 #ifdef DEBUG
3704 	switch (status) {
3705 	case NFS3_OK:
3706 		return (0);
3707 	case NFS3ERR_PERM:
3708 		return (EPERM);
3709 	case NFS3ERR_NOENT:
3710 		return (ENOENT);
3711 	case NFS3ERR_IO:
3712 		return (EIO);
3713 	case NFS3ERR_NXIO:
3714 		return (ENXIO);
3715 	case NFS3ERR_ACCES:
3716 		return (EACCES);
3717 	case NFS3ERR_EXIST:
3718 		return (EEXIST);
3719 	case NFS3ERR_XDEV:
3720 		return (EXDEV);
3721 	case NFS3ERR_NODEV:
3722 		return (ENODEV);
3723 	case NFS3ERR_NOTDIR:
3724 		return (ENOTDIR);
3725 	case NFS3ERR_ISDIR:
3726 		return (EISDIR);
3727 	case NFS3ERR_INVAL:
3728 		return (EINVAL);
3729 	case NFS3ERR_FBIG:
3730 		return (EFBIG);
3731 	case NFS3ERR_NOSPC:
3732 		return (ENOSPC);
3733 	case NFS3ERR_ROFS:
3734 		return (EROFS);
3735 	case NFS3ERR_MLINK:
3736 		return (EMLINK);
3737 	case NFS3ERR_NAMETOOLONG:
3738 		return (ENAMETOOLONG);
3739 	case NFS3ERR_NOTEMPTY:
3740 		return (ENOTEMPTY);
3741 	case NFS3ERR_DQUOT:
3742 		return (EDQUOT);
3743 	case NFS3ERR_STALE:
3744 		return (ESTALE);
3745 	case NFS3ERR_REMOTE:
3746 		return (EREMOTE);
3747 	case NFS3ERR_BADHANDLE:
3748 		return (ESTALE);
3749 	case NFS3ERR_NOT_SYNC:
3750 		return (EINVAL);
3751 	case NFS3ERR_BAD_COOKIE:
3752 		return (ENOENT);
3753 	case NFS3ERR_NOTSUPP:
3754 		return (EOPNOTSUPP);
3755 	case NFS3ERR_TOOSMALL:
3756 		return (EINVAL);
3757 	case NFS3ERR_SERVERFAULT:
3758 		return (EIO);
3759 	case NFS3ERR_BADTYPE:
3760 		return (EINVAL);
3761 	case NFS3ERR_JUKEBOX:
3762 		return (ENXIO);
3763 	default:
3764 		zcmn_err(getzoneid(), CE_WARN,
3765 		    "geterrno3: got status %d", status);
3766 		return ((int)status);
3767 	}
3768 #else
3769 	switch (status) {
3770 	case NFS3ERR_NAMETOOLONG:
3771 		return (ENAMETOOLONG);
3772 	case NFS3ERR_NOTEMPTY:
3773 		return (ENOTEMPTY);
3774 	case NFS3ERR_DQUOT:
3775 		return (EDQUOT);
3776 	case NFS3ERR_STALE:
3777 	case NFS3ERR_BADHANDLE:
3778 		return (ESTALE);
3779 	case NFS3ERR_NOTSUPP:
3780 		return (EOPNOTSUPP);
3781 	case NFS3ERR_REMOTE:
3782 		return (EREMOTE);
3783 	case NFS3ERR_NOT_SYNC:
3784 	case NFS3ERR_TOOSMALL:
3785 	case NFS3ERR_BADTYPE:
3786 		return (EINVAL);
3787 	case NFS3ERR_BAD_COOKIE:
3788 		return (ENOENT);
3789 	case NFS3ERR_SERVERFAULT:
3790 		return (EIO);
3791 	case NFS3ERR_JUKEBOX:
3792 		return (ENXIO);
3793 	default:
3794 		return ((int)status);
3795 	}
3796 #endif
3797 }
3798 
3799 rddir_cache *
3800 rddir_cache_alloc(int flags)
3801 {
3802 	rddir_cache *rc;
3803 
3804 	rc = kmem_alloc(sizeof (*rc), flags);
3805 	if (rc != NULL) {
3806 		rc->entries = NULL;
3807 		rc->flags = RDDIR;
3808 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3809 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3810 		rc->count = 1;
3811 #ifdef DEBUG
3812 		atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3813 #endif
3814 	}
3815 	return (rc);
3816 }
3817 
3818 static void
3819 rddir_cache_free(rddir_cache *rc)
3820 {
3821 
3822 #ifdef DEBUG
3823 	atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3824 #endif
3825 	if (rc->entries != NULL) {
3826 #ifdef DEBUG
3827 		rddir_cache_buf_free(rc->entries, rc->buflen);
3828 #else
3829 		kmem_free(rc->entries, rc->buflen);
3830 #endif
3831 	}
3832 	cv_destroy(&rc->cv);
3833 	mutex_destroy(&rc->lock);
3834 	kmem_free(rc, sizeof (*rc));
3835 }
3836 
3837 void
3838 rddir_cache_hold(rddir_cache *rc)
3839 {
3840 
3841 	mutex_enter(&rc->lock);
3842 	rc->count++;
3843 	mutex_exit(&rc->lock);
3844 }
3845 
3846 void
3847 rddir_cache_rele(rddir_cache *rc)
3848 {
3849 
3850 	mutex_enter(&rc->lock);
3851 	ASSERT(rc->count > 0);
3852 	if (--rc->count == 0) {
3853 		mutex_exit(&rc->lock);
3854 		rddir_cache_free(rc);
3855 	} else
3856 		mutex_exit(&rc->lock);
3857 }
3858 
3859 #ifdef DEBUG
3860 char *
3861 rddir_cache_buf_alloc(size_t size, int flags)
3862 {
3863 	char *rc;
3864 
3865 	rc = kmem_alloc(size, flags);
3866 	if (rc != NULL)
3867 		atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3868 	return (rc);
3869 }
3870 
3871 void
3872 rddir_cache_buf_free(void *addr, size_t size)
3873 {
3874 
3875 	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3876 	kmem_free(addr, size);
3877 }
3878 #endif
3879 
3880 static int
3881 nfs_free_data_reclaim(rnode_t *rp)
3882 {
3883 	char *contents;
3884 	int size;
3885 	vsecattr_t *vsp;
3886 	nfs3_pathconf_info *info;
3887 	int freed;
3888 	cred_t *cred;
3889 
3890 	/*
3891 	 * Free any held credentials and caches which
3892 	 * may be associated with this rnode.
3893 	 */
3894 	mutex_enter(&rp->r_statelock);
3895 	cred = rp->r_cred;
3896 	rp->r_cred = NULL;
3897 	contents = rp->r_symlink.contents;
3898 	size = rp->r_symlink.size;
3899 	rp->r_symlink.contents = NULL;
3900 	vsp = rp->r_secattr;
3901 	rp->r_secattr = NULL;
3902 	info = rp->r_pathconf;
3903 	rp->r_pathconf = NULL;
3904 	mutex_exit(&rp->r_statelock);
3905 
3906 	if (cred != NULL)
3907 		crfree(cred);
3908 
3909 	/*
3910 	 * Free the access cache entries.
3911 	 */
3912 	freed = nfs_access_purge_rp(rp);
3913 
3914 	if (!HAVE_RDDIR_CACHE(rp) &&
3915 	    contents == NULL &&
3916 	    vsp == NULL &&
3917 	    info == NULL)
3918 		return (freed);
3919 
3920 	/*
3921 	 * Free the readdir cache entries
3922 	 */
3923 	if (HAVE_RDDIR_CACHE(rp))
3924 		nfs_purge_rddir_cache(RTOV(rp));
3925 
3926 	/*
3927 	 * Free the symbolic link cache.
3928 	 */
3929 	if (contents != NULL) {
3930 
3931 		kmem_free((void *)contents, size);
3932 	}
3933 
3934 	/*
3935 	 * Free any cached ACL.
3936 	 */
3937 	if (vsp != NULL)
3938 		nfs_acl_free(vsp);
3939 
3940 	/*
3941 	 * Free any cached pathconf information.
3942 	 */
3943 	if (info != NULL)
3944 		kmem_free(info, sizeof (*info));
3945 
3946 	return (1);
3947 }
3948 
3949 static int
3950 nfs_active_data_reclaim(rnode_t *rp)
3951 {
3952 	char *contents;
3953 	int size;
3954 	vsecattr_t *vsp;
3955 	nfs3_pathconf_info *info;
3956 	int freed;
3957 
3958 	/*
3959 	 * Free any held credentials and caches which
3960 	 * may be associated with this rnode.
3961 	 */
3962 	if (!mutex_tryenter(&rp->r_statelock))
3963 		return (0);
3964 	contents = rp->r_symlink.contents;
3965 	size = rp->r_symlink.size;
3966 	rp->r_symlink.contents = NULL;
3967 	vsp = rp->r_secattr;
3968 	rp->r_secattr = NULL;
3969 	info = rp->r_pathconf;
3970 	rp->r_pathconf = NULL;
3971 	mutex_exit(&rp->r_statelock);
3972 
3973 	/*
3974 	 * Free the access cache entries.
3975 	 */
3976 	freed = nfs_access_purge_rp(rp);
3977 
3978 	if (!HAVE_RDDIR_CACHE(rp) &&
3979 	    contents == NULL &&
3980 	    vsp == NULL &&
3981 	    info == NULL)
3982 		return (freed);
3983 
3984 	/*
3985 	 * Free the readdir cache entries
3986 	 */
3987 	if (HAVE_RDDIR_CACHE(rp))
3988 		nfs_purge_rddir_cache(RTOV(rp));
3989 
3990 	/*
3991 	 * Free the symbolic link cache.
3992 	 */
3993 	if (contents != NULL) {
3994 
3995 		kmem_free((void *)contents, size);
3996 	}
3997 
3998 	/*
3999 	 * Free any cached ACL.
4000 	 */
4001 	if (vsp != NULL)
4002 		nfs_acl_free(vsp);
4003 
4004 	/*
4005 	 * Free any cached pathconf information.
4006 	 */
4007 	if (info != NULL)
4008 		kmem_free(info, sizeof (*info));
4009 
4010 	return (1);
4011 }
4012 
4013 static int
4014 nfs_free_reclaim(void)
4015 {
4016 	int freed;
4017 	rnode_t *rp;
4018 
4019 #ifdef DEBUG
4020 	clstat_debug.f_reclaim.value.ui64++;
4021 #endif
4022 	freed = 0;
4023 	mutex_enter(&rpfreelist_lock);
4024 	rp = rpfreelist;
4025 	if (rp != NULL) {
4026 		do {
4027 			if (nfs_free_data_reclaim(rp))
4028 				freed = 1;
4029 		} while ((rp = rp->r_freef) != rpfreelist);
4030 	}
4031 	mutex_exit(&rpfreelist_lock);
4032 	return (freed);
4033 }
4034 
4035 static int
4036 nfs_active_reclaim(void)
4037 {
4038 	int freed;
4039 	int index;
4040 	rnode_t *rp;
4041 
4042 #ifdef DEBUG
4043 	clstat_debug.a_reclaim.value.ui64++;
4044 #endif
4045 	freed = 0;
4046 	for (index = 0; index < rtablesize; index++) {
4047 		rw_enter(&rtable[index].r_lock, RW_READER);
4048 		for (rp = rtable[index].r_hashf;
4049 		    rp != (rnode_t *)(&rtable[index]);
4050 		    rp = rp->r_hashf) {
4051 			if (nfs_active_data_reclaim(rp))
4052 				freed = 1;
4053 		}
4054 		rw_exit(&rtable[index].r_lock);
4055 	}
4056 	return (freed);
4057 }
4058 
4059 static int
4060 nfs_rnode_reclaim(void)
4061 {
4062 	int freed;
4063 	rnode_t *rp;
4064 	vnode_t *vp;
4065 
4066 #ifdef DEBUG
4067 	clstat_debug.r_reclaim.value.ui64++;
4068 #endif
4069 	freed = 0;
4070 	mutex_enter(&rpfreelist_lock);
4071 	while ((rp = rpfreelist) != NULL) {
4072 		rp_rmfree(rp);
4073 		mutex_exit(&rpfreelist_lock);
4074 		if (rp->r_flags & RHASHED) {
4075 			vp = RTOV(rp);
4076 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4077 			mutex_enter(&vp->v_lock);
4078 			if (vp->v_count > 1) {
4079 				vp->v_count--;
4080 				mutex_exit(&vp->v_lock);
4081 				rw_exit(&rp->r_hashq->r_lock);
4082 				mutex_enter(&rpfreelist_lock);
4083 				continue;
4084 			}
4085 			mutex_exit(&vp->v_lock);
4086 			rp_rmhash_locked(rp);
4087 			rw_exit(&rp->r_hashq->r_lock);
4088 		}
4089 		/*
4090 		 * This call to rp_addfree will end up destroying the
4091 		 * rnode, but in a safe way with the appropriate set
4092 		 * of checks done.
4093 		 */
4094 		rp_addfree(rp, CRED());
4095 		mutex_enter(&rpfreelist_lock);
4096 	}
4097 	mutex_exit(&rpfreelist_lock);
4098 	return (freed);
4099 }
4100 
/*
 * Reclaim callback passed to kmem_cache_create() for rnode_cache in
 * nfs_subrinit().  The reclaim steps are tried in order and the
 * function stops after the first one that reports success: cached data
 * on freelist rnodes, then cached data on hashed rnodes, and finally
 * the freelist rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat_debug.reclaim.value.ui64++;
#endif
	if (!nfs_free_reclaim() && !nfs_active_reclaim())
		(void) nfs_rnode_reclaim();
}
4117 
4118 /*
4119  * NFS client failover support
4120  *
4121  * Routines to copy filehandles
4122  */
4123 void
4124 nfscopyfh(caddr_t fhp, vnode_t *vp)
4125 {
4126 	fhandle_t *dest = (fhandle_t *)fhp;
4127 
4128 	if (dest != NULL)
4129 		*dest = *VTOFH(vp);
4130 }
4131 
4132 void
4133 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4134 {
4135 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4136 
4137 	if (dest != NULL)
4138 		*dest = *VTOFH3(vp);
4139 }
4140 
4141 /*
4142  * NFS client failover support
4143  *
4144  * failover_safe() will test various conditions to ensure that
4145  * failover is permitted for this vnode.  It will be denied
4146  * if:
4147  *	1) the operation in progress does not support failover (NULL fi)
4148  *	2) there are no available replicas (NULL mi_servers->sv_next)
4149  *	3) any locks are outstanding on this file
4150  */
4151 static int
4152 failover_safe(failinfo_t *fi)
4153 {
4154 
4155 	/*
4156 	 * Does this op permit failover?
4157 	 */
4158 	if (fi == NULL || fi->vp == NULL)
4159 		return (0);
4160 
4161 	/*
4162 	 * Are there any alternates to failover to?
4163 	 */
4164 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4165 		return (0);
4166 
4167 	/*
4168 	 * Disable check; we've forced local locking
4169 	 *
4170 	 * if (flk_has_remote_locks(fi->vp))
4171 	 *	return (0);
4172 	 */
4173 
4174 	/*
4175 	 * If we have no partial path, we can't do anything
4176 	 */
4177 	if (VTOR(fi->vp)->r_path == NULL)
4178 		return (0);
4179 
4180 	return (1);
4181 }
4182 
4183 #include <sys/thread.h>
4184 
4185 /*
4186  * NFS client failover support
4187  *
4188  * failover_newserver() will start a search for a new server,
4189  * preferably by starting an async thread to do the work.  If
4190  * someone is already doing this (recognizable by MI_BINDINPROG
4191  * being set), it will simply return and the calling thread
4192  * will queue on the mi_failover_cv condition variable.
4193  */
4194 static void
4195 failover_newserver(mntinfo_t *mi)
4196 {
4197 	/*
4198 	 * Check if someone else is doing this already
4199 	 */
4200 	mutex_enter(&mi->mi_lock);
4201 	if (mi->mi_flags & MI_BINDINPROG) {
4202 		mutex_exit(&mi->mi_lock);
4203 		return;
4204 	}
4205 	mi->mi_flags |= MI_BINDINPROG;
4206 
4207 	/*
4208 	 * Need to hold the vfs struct so that it can't be released
4209 	 * while the failover thread is selecting a new server.
4210 	 */
4211 	VFS_HOLD(mi->mi_vfsp);
4212 
4213 	/*
4214 	 * Start a thread to do the real searching.
4215 	 */
4216 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4217 
4218 	mutex_exit(&mi->mi_lock);
4219 }
4220 
/*
 * NFS client failover support
 *
 * failover_thread() will find a new server to replace the one
 * currently in use, wake up other threads waiting on this mount
 * point, and die.  It will start at the head of the server list
 * and poll servers until it finds one with an NFS server which is
 * registered and responds to a NULL procedure ping.
 *
 * XXX failover_thread is unsafe within the scope of the
 * present model defined for cpr to suspend the system.
 * Specifically, over-the-wire calls made by the thread
 * are unsafe. The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 *
 * The thread is created by failover_newserver(), which sets
 * MI_BINDINPROG and takes a hold on mi->mi_vfsp beforehand; both are
 * undone here before the thread exits.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;	/* set after the first pass finds no server */
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	char *srvnames;		/* only valid once oncethru is set */
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * It's safe to piggyback on the mi_lock since failover_newserver()
	 * code guarantees that there will be only one failover thread
	 * per mountinfo at any instance.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	/*
	 * Wait on mi_failover_cv until mi_readers has dropped to zero.
	 */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	/* Timeout handed to each null procedure ping below. */
	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 * (Once a complete pass has failed, oncethru is set and
	 * the current server is no longer skipped.)
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
			 */
			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
				svp = NULL;
				goto done;
			}

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
			if (error)
				continue;

			/*
			 * cl_nosignal is set around the call unless the
			 * mount has MI_INT set.
			 */
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
			"NFS%d: failing over: selecting original server %s",
					    mi->mi_vers, svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
			"NFS: failing over: selecting original server %s",
					    svp->sv_hostname);
#endif
				} else {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
				    "NFS%d: failing over from %s to %s",
					    mi->mi_vers,
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
				    "NFS: failing over from %s to %s",
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#endif
				}
				/* svp is non-NULL, which ends the outer loop. */
				break;
			}
		}

		if (svp == NULL) {
			/*
			 * No server answered on this pass.  The "not
			 * responding" message is printed on the first
			 * failed pass only; then wait for hz ticks,
			 * marked as CPR-safe, before trying again.
			 */
			if (!oncethru) {
				srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
				zprintf(zoneid,
				    "NFS%d servers %s not responding "
				    "still trying\n", mi->mi_vers, srvnames);
#else
				zprintf(zoneid, "NFS servers %s not responding "
				    "still trying\n", srvnames);
#endif
				oncethru = 1;
			}
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			mutex_exit(&mi->mi_lock);
			delay(hz);
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
			mutex_exit(&mi->mi_lock);
		}
	}

	/* Matches the "not responding" message printed earlier. */
	if (oncethru) {
#ifdef DEBUG
		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
	}

	/*
	 * When a different server was chosen, purge the DNLC for this
	 * vfs and, if an rnode exists for the old server's sv_fhandle,
	 * move it over to the new server: take it off its hash queue,
	 * point it at the new server and filehandle, discard its cached
	 * data and insert it on the hash queue for the new filehandle.
	 */
	if (svp != mi->mi_curr_serv) {
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
		    mi->mi_vfsp);
		if (rp != NULL) {
			if (rp->r_flags & RHASHED)
				rp_rmhash_locked(rp);
			rw_exit(&rtable[index].r_lock);
			rp->r_server = svp;
			rp->r_fh = svp->sv_fhandle;
			(void) nfs_free_data_reclaim(rp);
			index = rtablehash(&rp->r_fh);
			rp->r_hashq = &rtable[index];
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			vn_exists(RTOV(rp));
			rp_addhash(rp);
			rw_exit(&rp->r_hashq->r_lock);
			/* presumably drops a hold taken by rfind() -- confirm */
			VN_RELE(RTOV(rp));
		} else
			rw_exit(&rtable[index].r_lock);
	}

done:
	/* srvnames was only allocated if a full pass failed. */
	if (oncethru)
		kmem_free(srvnames, srvnames_len);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags &= ~MI_BINDINPROG;
	/* svp is NULL here only when the search was abandoned above. */
	if (svp != NULL) {
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
#ifdef DEBUG
	nfscl->nfscl_stat.failover.value.ui64++;
#endif
	}
	cv_broadcast(&mi->mi_failover_cv);
	/*
	 * NOTE(review): mi_lock is not released explicitly after this
	 * point; presumably CALLB_CPR_EXIT() drops it -- confirm.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	/* Releases the hold taken by failover_newserver(). */
	VFS_RELE(mi->mi_vfsp);
	zthread_exit();
	/* NOTREACHED */
}
4419 
4420 /*
4421  * NFS client failover support
4422  *
4423  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4424  * is cleared, meaning that failover is complete.  Called with
4425  * mi_lock mutex held.
4426  */
4427 static int
4428 failover_wait(mntinfo_t *mi)
4429 {
4430 	k_sigset_t smask;
4431 
4432 	/*
4433 	 * If someone else is hunting for a living server,
4434 	 * sleep until it's done.  After our sleep, we may
4435 	 * be bound to the right server and get off cheaply.
4436 	 */
4437 	while (mi->mi_flags & MI_BINDINPROG) {
4438 		/*
4439 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4440 		 * and SIGTERM. (Preserving the existing masks).
4441 		 * Mask out SIGINT if mount option nointr is specified.
4442 		 */
4443 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4444 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4445 			/*
4446 			 * restore original signal mask
4447 			 */
4448 			sigunintr(&smask);
4449 			return (EINTR);
4450 		}
4451 		/*
4452 		 * restore original signal mask
4453 		 */
4454 		sigunintr(&smask);
4455 	}
4456 	return (0);
4457 }
4458 
/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Arguments:
 * - fi: failover description; fi->vp is the vnode to remap and
 *   fi->lookupproc / fi->xattrdirproc are handed to failover_lookup().
 *
 * Returns:
 * - EINVAL if fi, fi->vp or fi->lookupproc is NULL, or if the file
 *   found on the new server differs in size or type from the old one.
 * - The error from VFS_ROOT() or failover_lookup() if either fails.
 * - 0 otherwise.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *    pointed to by the fi->fhp pointer if it is non-NULL.
 * - On the full remap path (and for the root vnode) mi->mi_remap is
 *    incremented.  The two early "return (0)" paths inside the non-root
 *    branch skip both the counter update and the fi->copyproc call.
 */

static int
failover_remap(failinfo_t *fi)
{
	vnode_t *vp, *nvp, *rootvp;
	rnode_t *rp, *nrp;
	mntinfo_t *mi;
	int error;
#ifdef DEBUG
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif
	/*
	 * Sanity check
	 */
	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
		return (EINVAL);
	vp = fi->vp;
	rp = VTOR(vp);
	mi = VTOMI(vp);

	/*
	 * The root vnode is not looked up again here; only non-root
	 * vnodes go through the path-based remap below.
	 */
	if (!(vp->v_flag & VROOT)) {
		/*
		 * Given the root fh, use the path stored in
		 * the rnode to find the fh for the new server.
		 */
		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (error)
			return (error);

		error = failover_lookup(rp->r_path, rootvp,
		    fi->lookupproc, fi->xattrdirproc, &nvp);

		/* The root hold was only needed for the lookup. */
		VN_RELE(rootvp);

		if (error)
			return (error);

		/*
		 * If we found the same rnode, we're done now
		 */
		if (nvp == vp) {
			/*
			 * Failed and the new server may physically be same
			 * OR may share a same disk subsystem. In this case
			 * file handle for a particular file path is not going
			 * to change, given the same filehandle lookup will
			 * always locate the same rnode as the existing one.
			 * All we might need to do is to update the r_server
			 * with the current servinfo.
			 */
			if (!VALID_FH(fi)) {
				rp->r_server = mi->mi_curr_serv;
			}
			VN_RELE(nvp);
			return (0);
		}

		/*
		 * Try to make it so that no one else will find this
		 * vnode because it is just a temporary to hold the
		 * new file handle until that file handle can be
		 * copied to the original vnode/rnode.
		 */
		nrp = VTOR(nvp);
		mutex_enter(&mi->mi_remap_lock);
		/*
		 * Some other thread could have raced in here and could
		 * have done the remap for this particular rnode before
		 * this thread here. Check for rp->r_server and
		 * mi->mi_curr_serv and return if they are same.
		 *
		 * NOTE(review): the check is done through VALID_FH(),
		 * which is defined outside this function; presumably it
		 * performs the r_server/mi_curr_serv comparison described
		 * above -- confirm against its definition.
		 */
		if (VALID_FH(fi)) {
			mutex_exit(&mi->mi_remap_lock);
			VN_RELE(nvp);
			return (0);
		}

		/* Take the temporary rnode off its hash queue. */
		if (nrp->r_flags & RHASHED)
			rp_rmhash(nrp);

		/*
		 * As a heuristic check on the validity of the new
		 * file, check that the size and type match against
		 * that we remember from the old version.
		 */
		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
			mutex_exit(&mi->mi_remap_lock);
			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
			    "NFS replicas %s and %s: file %s not same.",
			    rp->r_server->sv_hostname,
			    nrp->r_server->sv_hostname, rp->r_path);
			VN_RELE(nvp);
			return (EINVAL);
		}

		/*
		 * snarf the filehandle from the new rnode
		 * then release it, again while updating the
		 * hash queues for the rnode.
		 */
		if (rp->r_flags & RHASHED)
			rp_rmhash(rp);
		rp->r_server = mi->mi_curr_serv;
		rp->r_fh = nrp->r_fh;
		rp->r_hashq = nrp->r_hashq;
		/*
		 * Copy the attributes from the new rnode to the old
		 * rnode.  This will help to reduce unnecessary page
		 * cache flushes.
		 */
		rp->r_attr = nrp->r_attr;
		rp->r_attrtime = nrp->r_attrtime;
		rp->r_mtime = nrp->r_mtime;
		(void) nfs_free_data_reclaim(rp);
		nfs_setswaplike(vp, &rp->r_attr);
		/*
		 * Re-insert the original rnode on the hash queue taken
		 * over from the temporary rnode, under that queue's lock.
		 */
		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
		rp_addhash(rp);
		rw_exit(&rp->r_hashq->r_lock);
		mutex_exit(&mi->mi_remap_lock);
		VN_RELE(nvp);
	}

	/*
	 * Update successful failover remap count
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_remap++;
	mutex_exit(&mi->mi_lock);
#ifdef DEBUG
	nfscl->nfscl_stat.remap.value.ui64++;
#endif

	/*
	 * If we have a copied filehandle to update, do it now.
	 */
	if (fi->fhp != NULL && fi->copyproc != NULL)
		(*fi->copyproc)(fi->fhp, vp);

	return (0);
}
4613 
4614 /*
4615  * NFS client failover support
4616  *
4617  * We want a simple pathname lookup routine to parse the pieces
4618  * of path in rp->r_path.  We know that the path was a created
4619  * as rnodes were made, so we know we have only to deal with
4620  * paths that look like:
4621  *	dir1/dir2/dir3/file
4622  * Any evidence of anything like .., symlinks, and ENOTDIR
4623  * are hard errors, because they mean something in this filesystem
4624  * is different from the one we came from, or has changed under
4625  * us in some way.  If this is true, we want the failure.
4626  *
4627  * Extended attributes: if the filesystem is mounted with extended
4628  * attributes enabled (-o xattr), the attribute directory will be
4629  * represented in the r_path as the magic name XATTR_RPATH. So if
4630  * we see that name in the pathname, is must be because this node
4631  * is an extended attribute.  Therefore, look it up that way.
4632  */
4633 static int
4634 failover_lookup(char *path, vnode_t *root,
4635     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4636 	vnode_t *, cred_t *, int),
4637     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4638     vnode_t **new)
4639 {
4640 	vnode_t *dvp, *nvp;
4641 	int error = EINVAL;
4642 	char *s, *p, *tmppath;
4643 	size_t len;
4644 	mntinfo_t *mi;
4645 	bool_t xattr;
4646 
4647 	/* Make local copy of path */
4648 	len = strlen(path) + 1;
4649 	tmppath = kmem_alloc(len, KM_SLEEP);
4650 	(void) strcpy(tmppath, path);
4651 	s = tmppath;
4652 
4653 	dvp = root;
4654 	VN_HOLD(dvp);
4655 	mi = VTOMI(root);
4656 	xattr = mi->mi_flags & MI_EXTATTR;
4657 
4658 	do {
4659 		p = strchr(s, '/');
4660 		if (p != NULL)
4661 			*p = '\0';
4662 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4663 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4664 			    RFSCALL_SOFT);
4665 		} else {
4666 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4667 			    CRED(), RFSCALL_SOFT);
4668 		}
4669 		if (p != NULL)
4670 			*p++ = '/';
4671 		if (error) {
4672 			VN_RELE(dvp);
4673 			kmem_free(tmppath, len);
4674 			return (error);
4675 		}
4676 		s = p;
4677 		VN_RELE(dvp);
4678 		dvp = nvp;
4679 	} while (p != NULL);
4680 
4681 	if (nvp != NULL && new != NULL)
4682 		*new = nvp;
4683 	kmem_free(tmppath, len);
4684 	return (0);
4685 }
4686 
4687 /*
4688  * NFS client failover support
4689  *
4690  * sv_free() frees the malloc'd portion of a "servinfo_t".
4691  */
4692 void
4693 sv_free(servinfo_t *svp)
4694 {
4695 	servinfo_t *next;
4696 	struct knetconfig *knconf;
4697 
4698 	while (svp != NULL) {
4699 		next = svp->sv_next;
4700 		if (svp->sv_secdata)
4701 			sec_clnt_freeinfo(svp->sv_secdata);
4702 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4703 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4704 		knconf = svp->sv_knconf;
4705 		if (knconf != NULL) {
4706 			if (knconf->knc_protofmly != NULL)
4707 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4708 			if (knconf->knc_proto != NULL)
4709 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4710 			kmem_free(knconf, sizeof (*knconf));
4711 		}
4712 		knconf = svp->sv_origknconf;
4713 		if (knconf != NULL) {
4714 			if (knconf->knc_protofmly != NULL)
4715 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4716 			if (knconf->knc_proto != NULL)
4717 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4718 			kmem_free(knconf, sizeof (*knconf));
4719 		}
4720 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4721 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4722 		mutex_destroy(&svp->sv_lock);
4723 		kmem_free(svp, sizeof (*svp));
4724 		svp = next;
4725 	}
4726 }
4727 
4728 /*
4729  * Only can return non-zero if intr != 0.
4730  */
4731 int
4732 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4733 {
4734 
4735 	mutex_enter(&l->lock);
4736 
4737 	/*
4738 	 * If this is a nested enter, then allow it.  There
4739 	 * must be as many exits as enters through.
4740 	 */
4741 	if (l->owner == curthread) {
4742 		/* lock is held for writing by current thread */
4743 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4744 		l->count--;
4745 	} else if (rw == RW_READER) {
4746 		/*
4747 		 * While there is a writer active or writers waiting,
4748 		 * then wait for them to finish up and move on.  Then,
4749 		 * increment the count to indicate that a reader is
4750 		 * active.
4751 		 */
4752 		while (l->count < 0 || l->waiters > 0) {
4753 			if (intr) {
4754 				klwp_t *lwp = ttolwp(curthread);
4755 
4756 				if (lwp != NULL)
4757 					lwp->lwp_nostop++;
4758 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4759 					if (lwp != NULL)
4760 						lwp->lwp_nostop--;
4761 					mutex_exit(&l->lock);
4762 					return (EINTR);
4763 				}
4764 				if (lwp != NULL)
4765 					lwp->lwp_nostop--;
4766 			} else
4767 				cv_wait(&l->cv, &l->lock);
4768 		}
4769 		ASSERT(l->count < INT_MAX);
4770 #ifdef	DEBUG
4771 		if ((l->count % 10000) == 9999)
4772 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4773 			    "rwlock @ %p\n", l->count, (void *)&l);
4774 #endif
4775 		l->count++;
4776 	} else {
4777 		ASSERT(rw == RW_WRITER);
4778 		/*
4779 		 * While there are readers active or a writer
4780 		 * active, then wait for all of the readers
4781 		 * to finish or for the writer to finish.
4782 		 * Then, set the owner field to curthread and
4783 		 * decrement count to indicate that a writer
4784 		 * is active.
4785 		 */
4786 		while (l->count > 0 || l->owner != NULL) {
4787 			l->waiters++;
4788 			if (intr) {
4789 				klwp_t *lwp = ttolwp(curthread);
4790 
4791 				if (lwp != NULL)
4792 					lwp->lwp_nostop++;
4793 				if (!cv_wait_sig(&l->cv, &l->lock)) {
4794 					if (lwp != NULL)
4795 						lwp->lwp_nostop--;
4796 					l->waiters--;
4797 					cv_broadcast(&l->cv);
4798 					mutex_exit(&l->lock);
4799 					return (EINTR);
4800 				}
4801 				if (lwp != NULL)
4802 					lwp->lwp_nostop--;
4803 			} else
4804 				cv_wait(&l->cv, &l->lock);
4805 			l->waiters--;
4806 		}
4807 		l->owner = curthread;
4808 		l->count--;
4809 	}
4810 
4811 	mutex_exit(&l->lock);
4812 
4813 	return (0);
4814 }
4815 
4816 /*
4817  * If the lock is available, obtain it and return non-zero.  If there is
4818  * already a conflicting lock, return 0 immediately.
4819  */
4820 
4821 int
4822 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4823 {
4824 	mutex_enter(&l->lock);
4825 
4826 	/*
4827 	 * If this is a nested enter, then allow it.  There
4828 	 * must be as many exits as enters through.
4829 	 */
4830 	if (l->owner == curthread) {
4831 		/* lock is held for writing by current thread */
4832 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4833 		l->count--;
4834 	} else if (rw == RW_READER) {
4835 		/*
4836 		 * If there is a writer active or writers waiting, deny the
4837 		 * lock.  Otherwise, bump the count of readers.
4838 		 */
4839 		if (l->count < 0 || l->waiters > 0) {
4840 			mutex_exit(&l->lock);
4841 			return (0);
4842 		}
4843 		l->count++;
4844 	} else {
4845 		ASSERT(rw == RW_WRITER);
4846 		/*
4847 		 * If there are readers active or a writer active, deny the
4848 		 * lock.  Otherwise, set the owner field to curthread and
4849 		 * decrement count to indicate that a writer is active.
4850 		 */
4851 		if (l->count > 0 || l->owner != NULL) {
4852 			mutex_exit(&l->lock);
4853 			return (0);
4854 		}
4855 		l->owner = curthread;
4856 		l->count--;
4857 	}
4858 
4859 	mutex_exit(&l->lock);
4860 
4861 	return (1);
4862 }
4863 
4864 void
4865 nfs_rw_exit(nfs_rwlock_t *l)
4866 {
4867 
4868 	mutex_enter(&l->lock);
4869 	/*
4870 	 * If this is releasing a writer lock, then increment count to
4871 	 * indicate that there is one less writer active.  If this was
4872 	 * the last of possibly nested writer locks, then clear the owner
4873 	 * field as well to indicate that there is no writer active
4874 	 * and wakeup any possible waiting writers or readers.
4875 	 *
4876 	 * If releasing a reader lock, then just decrement count to
4877 	 * indicate that there is one less reader active.  If this was
4878 	 * the last active reader and there are writer(s) waiting,
4879 	 * then wake up the first.
4880 	 */
4881 	if (l->owner != NULL) {
4882 		ASSERT(l->owner == curthread);
4883 		l->count++;
4884 		if (l->count == 0) {
4885 			l->owner = NULL;
4886 			cv_broadcast(&l->cv);
4887 		}
4888 	} else {
4889 		ASSERT(l->count > 0);
4890 		l->count--;
4891 		if (l->count == 0 && l->waiters > 0)
4892 			cv_broadcast(&l->cv);
4893 	}
4894 	mutex_exit(&l->lock);
4895 }
4896 
4897 int
4898 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4899 {
4900 
4901 	if (rw == RW_READER)
4902 		return (l->count > 0);
4903 	ASSERT(rw == RW_WRITER);
4904 	return (l->count < 0);
4905 }
4906 
4907 /* ARGSUSED */
4908 void
4909 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4910 {
4911 
4912 	l->count = 0;
4913 	l->waiters = 0;
4914 	l->owner = NULL;
4915 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4916 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4917 }
4918 
4919 void
4920 nfs_rw_destroy(nfs_rwlock_t *l)
4921 {
4922 
4923 	mutex_destroy(&l->lock);
4924 	cv_destroy(&l->cv);
4925 }
4926 
4927 int
4928 nfs3_rddir_compar(const void *x, const void *y)
4929 {
4930 	rddir_cache *a = (rddir_cache *)x;
4931 	rddir_cache *b = (rddir_cache *)y;
4932 
4933 	if (a->nfs3_cookie == b->nfs3_cookie) {
4934 		if (a->buflen == b->buflen)
4935 			return (0);
4936 		if (a->buflen < b->buflen)
4937 			return (-1);
4938 		return (1);
4939 	}
4940 
4941 	if (a->nfs3_cookie < b->nfs3_cookie)
4942 		return (-1);
4943 
4944 	return (1);
4945 }
4946 
4947 int
4948 nfs_rddir_compar(const void *x, const void *y)
4949 {
4950 	rddir_cache *a = (rddir_cache *)x;
4951 	rddir_cache *b = (rddir_cache *)y;
4952 
4953 	if (a->nfs_cookie == b->nfs_cookie) {
4954 		if (a->buflen == b->buflen)
4955 			return (0);
4956 		if (a->buflen < b->buflen)
4957 			return (-1);
4958 		return (1);
4959 	}
4960 
4961 	if (a->nfs_cookie < b->nfs_cookie)
4962 		return (-1);
4963 
4964 	return (1);
4965 }
4966 
4967 static char *
4968 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4969 {
4970 	servinfo_t *s;
4971 	char *srvnames;
4972 	char *namep;
4973 	size_t length;
4974 
4975 	/*
4976 	 * Calculate the length of the string required to hold all
4977 	 * of the server names plus either a comma or a null
4978 	 * character following each individual one.
4979 	 */
4980 	length = 0;
4981 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4982 		length += s->sv_hostnamelen;
4983 
4984 	srvnames = kmem_alloc(length, KM_SLEEP);
4985 
4986 	namep = srvnames;
4987 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4988 		(void) strcpy(namep, s->sv_hostname);
4989 		namep += s->sv_hostnamelen - 1;
4990 		*namep++ = ',';
4991 	}
4992 	*--namep = '\0';
4993 
4994 	*len = length;
4995 
4996 	return (srvnames);
4997 }
4998 
4999 /*
5000  * These two functions are temporary and designed for the upgrade-workaround
5001  * only.  They cannot be used for general zone-crossing NFS client support, and
5002  * will be removed shortly.
5003  *
5004  * When the workaround is enabled, all NFS traffic is forced into the global
5005  * zone.  These functions are called when the code needs to refer to the state
5006  * of the underlying network connection.  They're not called when the function
5007  * needs to refer to the state of the process that invoked the system call.
5008  * (E.g., when checking whether the zone is shutting down during the mount()
5009  * call.)
5010  */
5011 
5012 struct zone *
5013 nfs_zone(void)
5014 {
5015 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5016 }
5017 
5018 zoneid_t
5019 nfs_zoneid(void)
5020 {
5021 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5022 }
5023 
/*
 * nfs_mount_label_policy:
 *	Determine whether the mount is allowed according to MAC check,
 *	by comparing (where appropriate) label of the remote server
 *	against the label of the zone being mounted into.
 *
 *	Arguments:
 *		vfsp   :	vfs being mounted; its mount point selects
 *				the zone whose label is used
 *		addr   :	network address of the remote server
 *		knconf :	transport config; knc_protofmly selects how
 *				addr is interpreted (NC_INET / NC_INET6)
 *		cr     :	credentials of the requestor
 *
 *	Returns:
 *		 0 :	access allowed
 *		-1 :	read-only access allowed (i.e., read-down)
 *		>0 :	error code, such as EACCES
 *
 *	For a protocol family other than NC_INET or NC_INET6 no label
 *	comparison is made and 0 is returned.
 */
int
nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
    struct knetconfig *knconf, cred_t *cr)
{
	int		addr_type;
	void		*ipaddr;
	bslabel_t	*server_sl, *mntlabel;
	zone_t		*mntzone = NULL;
	ts_label_t	*zlabel;
	tsol_tpc_t	*tp;
	ts_label_t	*tsl = NULL;
	int		retv;

	/*
	 * Get the zone's label.  Each zone on a labeled system has a label.
	 *
	 * NOTE(review): mntzone is dereferenced immediately without a NULL
	 * check, although the cleanup code at "out" does test it;
	 * presumably zone_find_by_any_path() never returns NULL -- confirm.
	 */
	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
	zlabel = mntzone->zone_slabel;
	ASSERT(zlabel != NULL);
	label_hold(zlabel);

	/* Pick out the server's IP address according to the transport. */
	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
		addr_type = IPV4_VERSION;
		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
		addr_type = IPV6_VERSION;
		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
	} else {
		/* Not an IP transport: nothing to compare, allow access. */
		retv = 0;
		goto out;
	}

	retv = EACCES;				/* assume the worst */

	/*
	 * Next, get the assigned label of the remote server.
	 */
	tp = find_tpc(ipaddr, addr_type, B_FALSE);
	if (tp == NULL)
		goto out;			/* error getting host entry */

	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
		goto rel_tpc;			/* invalid domain */
	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
	    (tp->tpc_tp.host_type != UNLABELED))
		goto rel_tpc;			/* invalid hosttype */

	if (tp->tpc_tp.host_type == SUN_CIPSO) {
		/* tsl is released after the comparison below. */
		tsl = getflabel_cipso(vfsp);
		if (tsl == NULL)
			goto rel_tpc;		/* error getting server lbl */

		server_sl = label2bslabel(tsl);
	} else {	/* UNLABELED */
		server_sl = &tp->tpc_tp.tp_def_label;
	}

	mntlabel = label2bslabel(zlabel);

	/*
	 * Now compare labels to complete the MAC check.
	 *
	 * If the labels are equal, allow read-write access, except for
	 * mounts into the global zone itself, which are restricted to
	 * read-only.
	 *
	 * If the labels differ but the requestor is in the global zone
	 * and has NET_MAC_AWARE, allow read-only access.
	 *
	 * If neither of the above applies, but the label of the zone
	 * being mounted into dominates the server's, allow read-down.
	 *
	 * Otherwise, access is denied.
	 */
	if (blequal(mntlabel, server_sl) ||
	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
	    getpflags(NET_MAC_AWARE, cr) != 0)) {
		if ((mntzone == global_zone) ||
		    !blequal(mntlabel, server_sl))
			retv = -1;		/* read-only */
		else
			retv = 0;		/* access OK */
	} else if (bldominates(mntlabel, server_sl)) {
		retv = -1;			/* read-only */
	} else {
		retv = EACCES;
	}

	if (tsl != NULL)
		label_rele(tsl);

	/* Cleanup: each label below releases what was taken before it. */
rel_tpc:
	TPC_RELE(tp);
out:
	if (mntzone)
		zone_rele(mntzone);
	label_rele(zlabel);
	return (retv);
}
5131 
5132 boolean_t
5133 nfs_has_ctty(void)
5134 {
5135 	boolean_t rv;
5136 	mutex_enter(&curproc->p_splock);
5137 	rv = (curproc->p_sessp->s_vp != NULL);
5138 	mutex_exit(&curproc->p_splock);
5139 	return (rv);
5140 }
5141 
5142 /*
5143  * See if xattr directory to see if it has any generic user attributes
5144  */
5145 int
5146 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5147 {
5148 	struct uio uio;
5149 	struct iovec iov;
5150 	char *dbuf;
5151 	struct dirent64 *dp;
5152 	size_t dlen = 8 * 1024;
5153 	size_t dbuflen;
5154 	int eof = 0;
5155 	int error;
5156 
5157 	*valp = 0;
5158 	dbuf = kmem_alloc(dlen, KM_SLEEP);
5159 	uio.uio_iov = &iov;
5160 	uio.uio_iovcnt = 1;
5161 	uio.uio_segflg = UIO_SYSSPACE;
5162 	uio.uio_fmode = 0;
5163 	uio.uio_extflg = UIO_COPY_CACHED;
5164 	uio.uio_loffset = 0;
5165 	uio.uio_resid = dlen;
5166 	iov.iov_base = dbuf;
5167 	iov.iov_len = dlen;
5168 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5169 	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5170 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5171 
5172 	dbuflen = dlen - uio.uio_resid;
5173 
5174 	if (error || dbuflen == 0) {
5175 		kmem_free(dbuf, dlen);
5176 		return (error);
5177 	}
5178 
5179 	dp = (dirent64_t *)dbuf;
5180 
5181 	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5182 		if (strcmp(dp->d_name, ".") == 0 ||
5183 		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5184 		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5185 		    VIEW_READONLY) == 0) {
5186 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5187 			continue;
5188 		}
5189 
5190 		*valp = 1;
5191 		break;
5192 	}
5193 	kmem_free(dbuf, dlen);
5194 	return (0);
5195 }
5196