xref: /titanic_44/usr/src/uts/common/fs/nfs/nfs_subr.c (revision 77b65ce69d04f1ba0eceb747081964672b718796)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28  */
29 
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/systm.h>
33 #include <sys/cred.h>
34 #include <sys/proc.h>
35 #include <sys/user.h>
36 #include <sys/time.h>
37 #include <sys/buf.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/socket.h>
41 #include <sys/uio.h>
42 #include <sys/tiuser.h>
43 #include <sys/swap.h>
44 #include <sys/errno.h>
45 #include <sys/debug.h>
46 #include <sys/kmem.h>
47 #include <sys/kstat.h>
48 #include <sys/cmn_err.h>
49 #include <sys/vtrace.h>
50 #include <sys/session.h>
51 #include <sys/dnlc.h>
52 #include <sys/bitmap.h>
53 #include <sys/acl.h>
54 #include <sys/ddi.h>
55 #include <sys/pathname.h>
56 #include <sys/flock.h>
57 #include <sys/dirent.h>
58 #include <sys/flock.h>
59 #include <sys/callb.h>
60 #include <sys/atomic.h>
61 #include <sys/list.h>
62 #include <sys/tsol/tnet.h>
63 #include <sys/priv.h>
64 #include <sys/sdt.h>
65 #include <sys/attr.h>
66 
67 #include <inet/ip6.h>
68 
69 #include <rpc/types.h>
70 #include <rpc/xdr.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73 
74 #include <nfs/nfs.h>
75 #include <nfs/nfs4.h>
76 #include <nfs/nfs_clnt.h>
77 #include <nfs/rnode.h>
78 #include <nfs/nfs_acl.h>
79 
80 #include <sys/tsol/label.h>
81 
82 /*
83  * The hash queues for the access to active and cached rnodes
84  * are organized as doubly linked lists.  A reader/writer lock
85  * for each hash bucket is used to control access and to synchronize
86  * lookups, additions, and deletions from the hash queue.
87  *
88  * The rnode freelist is organized as a doubly linked list with
89  * a head pointer.  Additions and deletions are synchronized via
90  * a single mutex.
91  *
92  * In order to add an rnode to the free list, it must be hashed into
93  * a hash queue and the exclusive lock to the hash queue be held.
94  * If an rnode is not hashed into a hash queue, then it is destroyed
95  * because it represents no valuable information that can be reused
96  * about the file.  The exclusive lock to the hash queue must be
97  * held in order to prevent a lookup in the hash queue from finding
98  * the rnode and using it and assuming that the rnode is not on the
99  * freelist.  The lookup in the hash queue will have the hash queue
100  * locked, either exclusive or shared.
101  *
102  * The vnode reference count for each rnode is not allowed to drop
103  * below 1.  This prevents external entities, such as the VM
104  * subsystem, from acquiring references to vnodes already on the
105  * freelist and then trying to place them back on the freelist
106  * when their reference is released.  This means that when an
107  * rnode is looked up in the hash queues, then either the rnode
108  * is removed from the freelist and that reference is transferred to
109  * the new reference or the vnode reference count must be incremented
110  * accordingly.  The mutex for the freelist must be held in order to
111  * accurately test to see if the rnode is on the freelist or not.
112  * The hash queue lock might be held shared and it is possible that
113  * two different threads may race to remove the rnode from the
114  * freelist.  This race can be resolved by holding the mutex for the
115  * freelist.  Please note that the mutex for the freelist does not
116  * need to be held if the rnode is not on the freelist.  It can not be
117  * placed on the freelist due to the requirement that the thread
118  * putting the rnode on the freelist must hold the exclusive lock
119  * to the hash queue and the thread doing the lookup in the hash
120  * queue is holding either a shared or exclusive lock to the hash
121  * queue.
122  *
123  * The lock ordering is:
124  *
125  *	hash bucket lock -> vnode lock
126  *	hash bucket lock -> freelist lock
127  */
128 static rhashq_t *rtable;
129 
130 static kmutex_t rpfreelist_lock;
131 static rnode_t *rpfreelist = NULL;
132 static long rnew = 0;
133 long nrnode = 0;
134 
135 static int rtablesize;
136 static int rtablemask;
137 
138 static int hashlen = 4;
139 
140 static struct kmem_cache *rnode_cache;
141 
142 /*
143  * Mutex to protect the following variables:
144  *	nfs_major
145  *	nfs_minor
146  */
147 kmutex_t nfs_minor_lock;
148 int nfs_major;
149 int nfs_minor;
150 
151 /* Do we allow preepoch (negative) time values otw? */
152 bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
153 
154 /*
155  * Access cache
156  */
157 static acache_hash_t *acache;
158 static long nacache;	/* used strictly to size the number of hash queues */
159 
160 static int acachesize;
161 static int acachemask;
162 static struct kmem_cache *acache_cache;
163 
164 /*
165  * Client side utilities
166  */
167 
168 /*
169  * client side statistics
170  */
171 static const struct clstat clstat_tmpl = {
172 	{ "calls",	KSTAT_DATA_UINT64 },
173 	{ "badcalls",	KSTAT_DATA_UINT64 },
174 	{ "clgets",	KSTAT_DATA_UINT64 },
175 	{ "cltoomany",	KSTAT_DATA_UINT64 },
176 #ifdef DEBUG
177 	{ "clalloc",	KSTAT_DATA_UINT64 },
178 	{ "noresponse",	KSTAT_DATA_UINT64 },
179 	{ "failover",	KSTAT_DATA_UINT64 },
180 	{ "remap",	KSTAT_DATA_UINT64 },
181 #endif
182 };
183 
184 /*
185  * The following are statistics that describe behavior of the system as a whole
186  * and doesn't correspond to any one particular zone.
187  */
188 #ifdef DEBUG
189 static struct clstat_debug {
190 	kstat_named_t	nrnode;			/* number of allocated rnodes */
191 	kstat_named_t	access;			/* size of access cache */
192 	kstat_named_t	dirent;			/* size of readdir cache */
193 	kstat_named_t	dirents;		/* size of readdir buf cache */
194 	kstat_named_t	reclaim;		/* number of reclaims */
195 	kstat_named_t	clreclaim;		/* number of cl reclaims */
196 	kstat_named_t	f_reclaim;		/* number of free reclaims */
197 	kstat_named_t	a_reclaim;		/* number of active reclaims */
198 	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
199 	kstat_named_t	rpath;			/* bytes used to store rpaths */
200 } clstat_debug = {
201 	{ "nrnode",	KSTAT_DATA_UINT64 },
202 	{ "access",	KSTAT_DATA_UINT64 },
203 	{ "dirent",	KSTAT_DATA_UINT64 },
204 	{ "dirents",	KSTAT_DATA_UINT64 },
205 	{ "reclaim",	KSTAT_DATA_UINT64 },
206 	{ "clreclaim",	KSTAT_DATA_UINT64 },
207 	{ "f_reclaim",	KSTAT_DATA_UINT64 },
208 	{ "a_reclaim",	KSTAT_DATA_UINT64 },
209 	{ "r_reclaim",	KSTAT_DATA_UINT64 },
210 	{ "r_path",	KSTAT_DATA_UINT64 },
211 };
212 #endif	/* DEBUG */
213 
214 /*
215  * We keep a global list of per-zone client data, so we can clean up all zones
216  * if we get low on memory.
217  */
218 static list_t nfs_clnt_list;
219 static kmutex_t nfs_clnt_list_lock;
220 static zone_key_t nfsclnt_zone_key;
221 
222 static struct kmem_cache *chtab_cache;
223 
224 /*
225  * Some servers do not properly update the attributes of the
226  * directory when changes are made.  To allow interoperability
227  * with these broken servers, the nfs_disable_rddir_cache
228  * parameter must be set in /etc/system
229  */
230 int nfs_disable_rddir_cache = 0;
231 
232 int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
233 		    struct chtab **);
234 void		clfree(CLIENT *, struct chtab *);
235 static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
236 		    struct chtab **, struct nfs_clnt *);
237 static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
238 		    struct chtab **, struct nfs_clnt *);
239 static void	clreclaim(void *);
240 static int	nfs_feedback(int, int, mntinfo_t *);
241 static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
242 		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
243 		    failinfo_t *);
244 static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
245 		    caddr_t, cred_t *, int *, int, failinfo_t *);
246 static void	rinactive(rnode_t *, cred_t *);
247 static int	rtablehash(nfs_fhandle *);
248 static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
249 		    struct vnodeops *,
250 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
251 			cred_t *),
252 		    int (*)(const void *, const void *), int *, cred_t *,
253 		    char *, char *);
254 static void	rp_rmfree(rnode_t *);
255 static void	rp_addhash(rnode_t *);
256 static void	rp_rmhash_locked(rnode_t *);
257 static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
258 static void	destroy_rnode(rnode_t *);
259 static void	rddir_cache_free(rddir_cache *);
260 static int	nfs_free_data_reclaim(rnode_t *);
261 static int	nfs_active_data_reclaim(rnode_t *);
262 static int	nfs_free_reclaim(void);
263 static int	nfs_active_reclaim(void);
264 static int	nfs_rnode_reclaim(void);
265 static void	nfs_reclaim(void *);
266 static int	failover_safe(failinfo_t *);
267 static void	failover_newserver(mntinfo_t *mi);
268 static void	failover_thread(mntinfo_t *mi);
269 static int	failover_wait(mntinfo_t *);
270 static int	failover_remap(failinfo_t *);
271 static int	failover_lookup(char *, vnode_t *,
272 		    int (*)(vnode_t *, char *, vnode_t **,
273 			struct pathname *, int, vnode_t *, cred_t *, int),
274 		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
275 		    vnode_t **);
276 static void	nfs_free_r_path(rnode_t *);
277 static void	nfs_set_vroot(vnode_t *);
278 static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
279 
280 /*
281  * from rpcsec module (common/rpcsec)
282  */
283 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
284 extern void sec_clnt_freeh(AUTH *);
285 extern void sec_clnt_freeinfo(struct sec_data *);
286 
287 /*
288  * used in mount policy
289  */
290 extern ts_label_t *getflabel_cipso(vfs_t *);
291 
/*
 * EIO or EINTR are not recoverable errors.
 *
 * Evaluates to non-zero for every other error value.  The argument and
 * the whole expansion are parenthesized so that the macro behaves the
 * same for any argument expression; note that `error' is evaluated twice.
 */
#define	IS_RECOVERABLE_ERROR(error)	\
	(!(((error) == EINTR) || ((error) == EIO)))
296 
297 #ifdef DEBUG
298 #define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
299 #define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
300 #else
301 #define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
302 #define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
303 #endif
304 /*
305  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
306  */
307 static int
308 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
309     struct chtab **chp, struct nfs_clnt *nfscl)
310 {
311 	struct chhead *ch, *newch;
312 	struct chhead **plistp;
313 	struct chtab *cp;
314 	int error;
315 	k_sigset_t smask;
316 
317 	if (newcl == NULL || chp == NULL || ci == NULL)
318 		return (EINVAL);
319 
320 	*newcl = NULL;
321 	*chp = NULL;
322 
323 	/*
324 	 * Find an unused handle or create one
325 	 */
326 	newch = NULL;
327 	nfscl->nfscl_stat.clgets.value.ui64++;
328 top:
329 	/*
330 	 * Find the correct entry in the cache to check for free
331 	 * client handles.  The search is based on the RPC program
332 	 * number, program version number, dev_t for the transport
333 	 * device, and the protocol family.
334 	 */
335 	mutex_enter(&nfscl->nfscl_chtable_lock);
336 	plistp = &nfscl->nfscl_chtable;
337 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
338 		if (ch->ch_prog == ci->cl_prog &&
339 		    ch->ch_vers == ci->cl_vers &&
340 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
341 		    (strcmp(ch->ch_protofmly,
342 		    svp->sv_knconf->knc_protofmly) == 0))
343 			break;
344 		plistp = &ch->ch_next;
345 	}
346 
347 	/*
348 	 * If we didn't find a cache entry for this quadruple, then
349 	 * create one.  If we don't have one already preallocated,
350 	 * then drop the cache lock, create one, and then start over.
351 	 * If we did have a preallocated entry, then just add it to
352 	 * the front of the list.
353 	 */
354 	if (ch == NULL) {
355 		if (newch == NULL) {
356 			mutex_exit(&nfscl->nfscl_chtable_lock);
357 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
358 			newch->ch_timesused = 0;
359 			newch->ch_prog = ci->cl_prog;
360 			newch->ch_vers = ci->cl_vers;
361 			newch->ch_dev = svp->sv_knconf->knc_rdev;
362 			newch->ch_protofmly = kmem_alloc(
363 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
364 			    KM_SLEEP);
365 			(void) strcpy(newch->ch_protofmly,
366 			    svp->sv_knconf->knc_protofmly);
367 			newch->ch_list = NULL;
368 			goto top;
369 		}
370 		ch = newch;
371 		newch = NULL;
372 		ch->ch_next = nfscl->nfscl_chtable;
373 		nfscl->nfscl_chtable = ch;
374 	/*
375 	 * We found a cache entry, but if it isn't on the front of the
376 	 * list, then move it to the front of the list to try to take
377 	 * advantage of locality of operations.
378 	 */
379 	} else if (ch != nfscl->nfscl_chtable) {
380 		*plistp = ch->ch_next;
381 		ch->ch_next = nfscl->nfscl_chtable;
382 		nfscl->nfscl_chtable = ch;
383 	}
384 
385 	/*
386 	 * If there was a free client handle cached, then remove it
387 	 * from the list, init it, and use it.
388 	 */
389 	if (ch->ch_list != NULL) {
390 		cp = ch->ch_list;
391 		ch->ch_list = cp->ch_list;
392 		mutex_exit(&nfscl->nfscl_chtable_lock);
393 		if (newch != NULL) {
394 			kmem_free(newch->ch_protofmly,
395 			    strlen(newch->ch_protofmly) + 1);
396 			kmem_free(newch, sizeof (*newch));
397 		}
398 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
399 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
400 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
401 		    &cp->ch_client->cl_auth);
402 		if (error || cp->ch_client->cl_auth == NULL) {
403 			CLNT_DESTROY(cp->ch_client);
404 			kmem_cache_free(chtab_cache, cp);
405 			return ((error != 0) ? error : EINTR);
406 		}
407 		ch->ch_timesused++;
408 		*newcl = cp->ch_client;
409 		*chp = cp;
410 		return (0);
411 	}
412 
413 	/*
414 	 * There weren't any free client handles which fit, so allocate
415 	 * a new one and use that.
416 	 */
417 #ifdef DEBUG
418 	atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
419 #endif
420 	mutex_exit(&nfscl->nfscl_chtable_lock);
421 
422 	nfscl->nfscl_stat.cltoomany.value.ui64++;
423 	if (newch != NULL) {
424 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
425 		kmem_free(newch, sizeof (*newch));
426 	}
427 
428 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
429 	cp->ch_head = ch;
430 
431 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
432 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
433 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
434 	sigunintr(&smask);
435 
436 	if (error != 0) {
437 		kmem_cache_free(chtab_cache, cp);
438 #ifdef DEBUG
439 		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
440 #endif
441 		/*
442 		 * Warning is unnecessary if error is EINTR.
443 		 */
444 		if (error != EINTR) {
445 			nfs_cmn_err(error, CE_WARN,
446 			    "clget: couldn't create handle: %m\n");
447 		}
448 		return (error);
449 	}
450 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
451 	auth_destroy(cp->ch_client->cl_auth);
452 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
453 	    &cp->ch_client->cl_auth);
454 	if (error || cp->ch_client->cl_auth == NULL) {
455 		CLNT_DESTROY(cp->ch_client);
456 		kmem_cache_free(chtab_cache, cp);
457 #ifdef DEBUG
458 		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
459 #endif
460 		return ((error != 0) ? error : EINTR);
461 	}
462 	ch->ch_timesused++;
463 	*newcl = cp->ch_client;
464 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
465 	*chp = cp;
466 	return (0);
467 }
468 
469 int
470 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
471     struct chtab **chp)
472 {
473 	struct nfs_clnt *nfscl;
474 
475 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
476 	ASSERT(nfscl != NULL);
477 
478 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
479 }
480 
481 static int
482 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
483     struct chtab **chp, struct nfs_clnt *nfscl)
484 {
485 	clinfo_t ci;
486 	int error;
487 
488 	/*
489 	 * Set read buffer size to rsize
490 	 * and add room for RPC headers.
491 	 */
492 	ci.cl_readsize = mi->mi_tsize;
493 	if (ci.cl_readsize != 0)
494 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
495 
496 	/*
497 	 * If soft mount and server is down just try once.
498 	 * meaning: do not retransmit.
499 	 */
500 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
501 		ci.cl_retrans = 0;
502 	else
503 		ci.cl_retrans = mi->mi_retrans;
504 
505 	ci.cl_prog = NFS_ACL_PROGRAM;
506 	ci.cl_vers = mi->mi_vers;
507 	ci.cl_flags = mi->mi_flags;
508 
509 	/*
510 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
511 	 * security flavor, the client tries to establish a security context
512 	 * by contacting the server. If the connection is timed out or reset,
513 	 * e.g. server reboot, we will try again.
514 	 */
515 	do {
516 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
517 
518 		if (error == 0)
519 			break;
520 
521 		/*
522 		 * For forced unmount or zone shutdown, bail out, no retry.
523 		 */
524 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
525 			error = EIO;
526 			break;
527 		}
528 
529 		/* do not retry for softmount */
530 		if (!(mi->mi_flags & MI_HARD))
531 			break;
532 
533 		/* let the caller deal with the failover case */
534 		if (FAILOVER_MOUNT(mi))
535 			break;
536 
537 	} while (error == ETIMEDOUT || error == ECONNRESET);
538 
539 	return (error);
540 }
541 
542 static int
543 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
544     struct chtab **chp, struct nfs_clnt *nfscl)
545 {
546 	clinfo_t ci;
547 	int error;
548 
549 	/*
550 	 * Set read buffer size to rsize
551 	 * and add room for RPC headers.
552 	 */
553 	ci.cl_readsize = mi->mi_tsize;
554 	if (ci.cl_readsize != 0)
555 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
556 
557 	/*
558 	 * If soft mount and server is down just try once.
559 	 * meaning: do not retransmit.
560 	 */
561 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
562 		ci.cl_retrans = 0;
563 	else
564 		ci.cl_retrans = mi->mi_retrans;
565 
566 	ci.cl_prog = mi->mi_prog;
567 	ci.cl_vers = mi->mi_vers;
568 	ci.cl_flags = mi->mi_flags;
569 
570 	/*
571 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
572 	 * security flavor, the client tries to establish a security context
573 	 * by contacting the server. If the connection is timed out or reset,
574 	 * e.g. server reboot, we will try again.
575 	 */
576 	do {
577 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
578 
579 		if (error == 0)
580 			break;
581 
582 		/*
583 		 * For forced unmount or zone shutdown, bail out, no retry.
584 		 */
585 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
586 			error = EIO;
587 			break;
588 		}
589 
590 		/* do not retry for softmount */
591 		if (!(mi->mi_flags & MI_HARD))
592 			break;
593 
594 		/* let the caller deal with the failover case */
595 		if (FAILOVER_MOUNT(mi))
596 			break;
597 
598 	} while (error == ETIMEDOUT || error == ECONNRESET);
599 
600 	return (error);
601 }
602 
603 static void
604 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
605 {
606 	if (cl->cl_auth != NULL) {
607 		sec_clnt_freeh(cl->cl_auth);
608 		cl->cl_auth = NULL;
609 	}
610 
611 	/*
612 	 * Timestamp this cache entry so that we know when it was last
613 	 * used.
614 	 */
615 	cp->ch_freed = gethrestime_sec();
616 
617 	/*
618 	 * Add the free client handle to the front of the list.
619 	 * This way, the list will be sorted in youngest to oldest
620 	 * order.
621 	 */
622 	mutex_enter(&nfscl->nfscl_chtable_lock);
623 	cp->ch_list = cp->ch_head->ch_list;
624 	cp->ch_head->ch_list = cp;
625 	mutex_exit(&nfscl->nfscl_chtable_lock);
626 }
627 
628 void
629 clfree(CLIENT *cl, struct chtab *cp)
630 {
631 	struct nfs_clnt *nfscl;
632 
633 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
634 	ASSERT(nfscl != NULL);
635 
636 	clfree_impl(cl, cp, nfscl);
637 }
638 
639 #define	CL_HOLDTIME	60	/* time to hold client handles */
640 
641 static void
642 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
643 {
644 	struct chhead *ch;
645 	struct chtab *cp;	/* list of objects that can be reclaimed */
646 	struct chtab *cpe;
647 	struct chtab *cpl;
648 	struct chtab **cpp;
649 #ifdef DEBUG
650 	int n = 0;
651 #endif
652 
653 	/*
654 	 * Need to reclaim some memory, so step through the cache
655 	 * looking through the lists for entries which can be freed.
656 	 */
657 	cp = NULL;
658 
659 	mutex_enter(&nfscl->nfscl_chtable_lock);
660 
661 	/*
662 	 * Here we step through each non-NULL quadruple and start to
663 	 * construct the reclaim list pointed to by cp.  Note that
664 	 * cp will contain all eligible chtab entries.  When this traversal
665 	 * completes, chtab entries from the last quadruple will be at the
666 	 * front of cp and entries from previously inspected quadruples have
667 	 * been appended to the rear of cp.
668 	 */
669 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
670 		if (ch->ch_list == NULL)
671 			continue;
672 		/*
673 		 * Search each list for entries older then
674 		 * cl_holdtime seconds.  The lists are maintained
675 		 * in youngest to oldest order so that when the
676 		 * first entry is found which is old enough, then
677 		 * all of the rest of the entries on the list will
678 		 * be old enough as well.
679 		 */
680 		cpl = ch->ch_list;
681 		cpp = &ch->ch_list;
682 		while (cpl != NULL &&
683 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
684 			cpp = &cpl->ch_list;
685 			cpl = cpl->ch_list;
686 		}
687 		if (cpl != NULL) {
688 			*cpp = NULL;
689 			if (cp != NULL) {
690 				cpe = cpl;
691 				while (cpe->ch_list != NULL)
692 					cpe = cpe->ch_list;
693 				cpe->ch_list = cp;
694 			}
695 			cp = cpl;
696 		}
697 	}
698 
699 	mutex_exit(&nfscl->nfscl_chtable_lock);
700 
701 	/*
702 	 * If cp is empty, then there is nothing to reclaim here.
703 	 */
704 	if (cp == NULL)
705 		return;
706 
707 	/*
708 	 * Step through the list of entries to free, destroying each client
709 	 * handle and kmem_free'ing the memory for each entry.
710 	 */
711 	while (cp != NULL) {
712 #ifdef DEBUG
713 		n++;
714 #endif
715 		CLNT_DESTROY(cp->ch_client);
716 		cpl = cp->ch_list;
717 		kmem_cache_free(chtab_cache, cp);
718 		cp = cpl;
719 	}
720 
721 #ifdef DEBUG
722 	/*
723 	 * Update clalloc so that nfsstat shows the current number
724 	 * of allocated client handles.
725 	 */
726 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
727 #endif
728 }
729 
730 /* ARGSUSED */
731 static void
732 clreclaim(void *all)
733 {
734 	struct nfs_clnt *nfscl;
735 
736 #ifdef DEBUG
737 	clstat_debug.clreclaim.value.ui64++;
738 #endif
739 	/*
740 	 * The system is low on memory; go through and try to reclaim some from
741 	 * every zone on the system.
742 	 */
743 	mutex_enter(&nfs_clnt_list_lock);
744 	nfscl = list_head(&nfs_clnt_list);
745 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
746 		clreclaim_zone(nfscl, CL_HOLDTIME);
747 	mutex_exit(&nfs_clnt_list_lock);
748 }
749 
750 /*
751  * Minimum time-out values indexed by call type
752  * These units are in "eighths" of a second to avoid multiplies
753  */
754 static unsigned int minimum_timeo[] = {
755 	6, 7, 10
756 };
757 
758 /*
759  * Back off for retransmission timeout, MAXTIMO is in hz of a sec
760  */
761 #define	MAXTIMO	(20*hz)
762 #define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
763 #define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
764 
765 #define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
766 #define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
767 #define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
768 
769 /*
770  * Function called when rfscall notices that we have been
771  * re-transmitting, or when we get a response without retransmissions.
772  * Return 1 if the transfer size was adjusted down - 0 if no change.
773  */
774 static int
775 nfs_feedback(int flag, int which, mntinfo_t *mi)
776 {
777 	int kind;
778 	int r = 0;
779 
780 	mutex_enter(&mi->mi_lock);
781 	if (flag == FEEDBACK_REXMIT1) {
782 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
783 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
784 			goto done;
785 		if (mi->mi_curread > MIN_NFS_TSIZE) {
786 			mi->mi_curread /= 2;
787 			if (mi->mi_curread < MIN_NFS_TSIZE)
788 				mi->mi_curread = MIN_NFS_TSIZE;
789 			r = 1;
790 		}
791 
792 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
793 			mi->mi_curwrite /= 2;
794 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
795 				mi->mi_curwrite = MIN_NFS_TSIZE;
796 			r = 1;
797 		}
798 	} else if (flag == FEEDBACK_OK) {
799 		kind = mi->mi_timer_type[which];
800 		if (kind == 0 ||
801 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
802 			goto done;
803 		if (kind == 1) {
804 			if (mi->mi_curread >= mi->mi_tsize)
805 				goto done;
806 			mi->mi_curread +=  MIN_NFS_TSIZE;
807 			if (mi->mi_curread > mi->mi_tsize/2)
808 				mi->mi_curread = mi->mi_tsize;
809 		} else if (kind == 2) {
810 			if (mi->mi_curwrite >= mi->mi_stsize)
811 				goto done;
812 			mi->mi_curwrite += MIN_NFS_TSIZE;
813 			if (mi->mi_curwrite > mi->mi_stsize/2)
814 				mi->mi_curwrite = mi->mi_stsize;
815 		}
816 	}
817 done:
818 	mutex_exit(&mi->mi_lock);
819 	return (r);
820 }
821 
822 #ifdef DEBUG
823 static int rfs2call_hits = 0;
824 static int rfs2call_misses = 0;
825 #endif
826 
827 int
828 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
829     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
830     enum nfsstat *statusp, int flags, failinfo_t *fi)
831 {
832 	int rpcerror;
833 	enum clnt_stat rpc_status;
834 
835 	ASSERT(statusp != NULL);
836 
837 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
838 	    cr, douprintf, &rpc_status, flags, fi);
839 	if (!rpcerror) {
840 		/*
841 		 * See crnetadjust() for comments.
842 		 */
843 		if (*statusp == NFSERR_ACCES &&
844 		    (cr = crnetadjust(cr)) != NULL) {
845 #ifdef DEBUG
846 			rfs2call_hits++;
847 #endif
848 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
849 			    resp, cr, douprintf, NULL, flags, fi);
850 			crfree(cr);
851 #ifdef DEBUG
852 			if (*statusp == NFSERR_ACCES)
853 				rfs2call_misses++;
854 #endif
855 		}
856 	} else if (rpc_status == RPC_PROCUNAVAIL) {
857 		*statusp = NFSERR_OPNOTSUPP;
858 		rpcerror = 0;
859 	}
860 
861 	return (rpcerror);
862 }
863 
/*
 * Ten seconds expressed in clock ticks.  Presumably the default value
 * for nfs3_jukebox_delay -- confirm where that variable is initialized.
 * Parenthesized so the macro can be used inside a larger expression.
 */
#define	NFS3_JUKEBOX_DELAY	(10 * hz)
865 
866 static clock_t nfs3_jukebox_delay = 0;
867 
868 #ifdef DEBUG
869 static int rfs3call_hits = 0;
870 static int rfs3call_misses = 0;
871 #endif
872 
873 int
874 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
875     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
876     nfsstat3 *statusp, int flags, failinfo_t *fi)
877 {
878 	int rpcerror;
879 	int user_informed;
880 
881 	user_informed = 0;
882 	do {
883 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
884 		    cr, douprintf, NULL, flags, fi);
885 		if (!rpcerror) {
886 			cred_t *crr;
887 			if (*statusp == NFS3ERR_JUKEBOX) {
888 				if (ttoproc(curthread) == &p0) {
889 					rpcerror = EAGAIN;
890 					break;
891 				}
892 				if (!user_informed) {
893 					user_informed = 1;
894 					uprintf(
895 		"file temporarily unavailable on the server, retrying...\n");
896 				}
897 				delay(nfs3_jukebox_delay);
898 			}
899 			/*
900 			 * See crnetadjust() for comments.
901 			 */
902 			else if (*statusp == NFS3ERR_ACCES &&
903 			    (crr = crnetadjust(cr)) != NULL) {
904 #ifdef DEBUG
905 				rfs3call_hits++;
906 #endif
907 				rpcerror = rfscall(mi, which, xdrargs, argsp,
908 				    xdrres, resp, crr, douprintf,
909 				    NULL, flags, fi);
910 
911 				crfree(crr);
912 #ifdef DEBUG
913 				if (*statusp == NFS3ERR_ACCES)
914 					rfs3call_misses++;
915 #endif
916 			}
917 		}
918 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
919 
920 	return (rpcerror);
921 }
922 
/*
 * VALID_FH is true when the rnode behind fi->vp records the same server
 * as the mount's current server.
 *
 * INC_READERS and DEC_READERS adjust mi_readers; DEC_READERS wakes all
 * waiters on mi_failover_cv when the count drops to zero.  The callers
 * in rfscall() hold mi->mi_lock around both.
 *
 * The statement macros are wrapped in do { } while (0) so that each one
 * is a single statement, even in an unbraced if/else, and all arguments
 * are parenthesized.  Note that `mi' is evaluated more than once in
 * DEC_READERS.
 */
#define	VALID_FH(fi)	\
	(VTOR((fi)->vp)->r_server == VTOMI((fi)->vp)->mi_curr_serv)
#define	INC_READERS(mi)		do { \
	(mi)->mi_readers++; \
} while (0)
#define	DEC_READERS(mi)		do { \
	(mi)->mi_readers--; \
	if ((mi)->mi_readers == 0) \
		cv_broadcast(&(mi)->mi_failover_cv); \
} while (0)
932 
933 static int
934 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
935     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
936     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
937 {
938 	CLIENT *client;
939 	struct chtab *ch;
940 	cred_t *cr = icr;
941 	enum clnt_stat status;
942 	struct rpc_err rpcerr, rpcerr_tmp;
943 	struct timeval wait;
944 	int timeo;		/* in units of hz */
945 	int my_rsize, my_wsize;
946 	bool_t tryagain;
947 	bool_t cred_cloned = FALSE;
948 	k_sigset_t smask;
949 	servinfo_t *svp;
950 	struct nfs_clnt *nfscl;
951 	zoneid_t zoneid = getzoneid();
952 	char *msg;
953 #ifdef DEBUG
954 	char *bufp;
955 #endif
956 
957 
958 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
959 	    "rfscall_start:which %d mi %p", which, mi);
960 
961 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
962 	ASSERT(nfscl != NULL);
963 
964 	nfscl->nfscl_stat.calls.value.ui64++;
965 	mi->mi_reqs[which].value.ui64++;
966 
967 	rpcerr.re_status = RPC_SUCCESS;
968 
969 	/*
970 	 * In case of forced unmount or zone shutdown, return EIO.
971 	 */
972 
973 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
974 		rpcerr.re_status = RPC_FAILED;
975 		rpcerr.re_errno = EIO;
976 		return (rpcerr.re_errno);
977 	}
978 
979 	/*
980 	 * Remember the transfer sizes in case
981 	 * nfs_feedback changes them underneath us.
982 	 */
983 	my_rsize = mi->mi_curread;
984 	my_wsize = mi->mi_curwrite;
985 
986 	/*
987 	 * NFS client failover support
988 	 *
989 	 * If this rnode is not in sync with the current server (VALID_FH),
990 	 * we'd like to do a remap to get in sync.  We can be interrupted
991 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
992 	 * use the best info we have to try the RPC.  Part of that is
993 	 * unconditionally updating the filehandle copy kept for V3.
994 	 *
995 	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
996 	 * rw_enter(); we're trying to keep the current server from being
997 	 * changed on us until we're done with the remapping and have a
998 	 * matching client handle.  We don't want to send a filehandle
999 	 * to the wrong host.
1000 	 */
1001 failoverretry:
1002 	if (FAILOVER_MOUNT(mi)) {
1003 		mutex_enter(&mi->mi_lock);
1004 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1005 			if (failover_wait(mi)) {
1006 				mutex_exit(&mi->mi_lock);
1007 				return (EINTR);
1008 			}
1009 		}
1010 		INC_READERS(mi);
1011 		mutex_exit(&mi->mi_lock);
1012 		if (fi) {
1013 			if (!VALID_FH(fi) &&
1014 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1015 				int remaperr;
1016 
1017 				svp = mi->mi_curr_serv;
1018 				remaperr = failover_remap(fi);
1019 				if (remaperr != 0) {
1020 #ifdef DEBUG
1021 					if (remaperr != EINTR)
1022 						nfs_cmn_err(remaperr, CE_WARN,
1023 					    "rfscall couldn't failover: %m");
1024 #endif
1025 					mutex_enter(&mi->mi_lock);
1026 					DEC_READERS(mi);
1027 					mutex_exit(&mi->mi_lock);
1028 					/*
1029 					 * If failover_remap returns ETIMEDOUT
1030 					 * and the filesystem is hard mounted
1031 					 * we have to retry the call with a new
1032 					 * server.
1033 					 */
1034 					if ((mi->mi_flags & MI_HARD) &&
1035 					    IS_RECOVERABLE_ERROR(remaperr)) {
1036 						if (svp == mi->mi_curr_serv)
1037 							failover_newserver(mi);
1038 						rpcerr.re_status = RPC_SUCCESS;
1039 						goto failoverretry;
1040 					}
1041 					rpcerr.re_errno = remaperr;
1042 					return (remaperr);
1043 				}
1044 			}
1045 			if (fi->fhp && fi->copyproc)
1046 				(*fi->copyproc)(fi->fhp, fi->vp);
1047 		}
1048 	}
1049 
1050 	/* For TSOL, use a new cred which has net_mac_aware flag */
1051 	if (!cred_cloned && is_system_labeled()) {
1052 		cred_cloned = TRUE;
1053 		cr = crdup(icr);
1054 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1055 	}
1056 
1057 	/*
1058 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1059 	 * are guaranteed to reprocess the retry as a new request.
1060 	 */
1061 	svp = mi->mi_curr_serv;
1062 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1063 
1064 	if (FAILOVER_MOUNT(mi)) {
1065 		mutex_enter(&mi->mi_lock);
1066 		DEC_READERS(mi);
1067 		mutex_exit(&mi->mi_lock);
1068 
1069 		if ((rpcerr.re_errno == ETIMEDOUT ||
1070 		    rpcerr.re_errno == ECONNRESET) &&
1071 		    failover_safe(fi)) {
1072 			if (svp == mi->mi_curr_serv)
1073 				failover_newserver(mi);
1074 			goto failoverretry;
1075 		}
1076 	}
1077 	if (rpcerr.re_errno != 0)
1078 		return (rpcerr.re_errno);
1079 
1080 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1081 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1082 		timeo = (mi->mi_timeo * hz) / 10;
1083 	} else {
1084 		mutex_enter(&mi->mi_lock);
1085 		timeo = CLNT_SETTIMERS(client,
1086 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1087 		    &(mi->mi_timers[NFS_CALLTYPES]),
1088 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1089 		    (void (*)())NULL, (caddr_t)mi, 0);
1090 		mutex_exit(&mi->mi_lock);
1091 	}
1092 
1093 	/*
1094 	 * If hard mounted fs, retry call forever unless hard error occurs.
1095 	 */
1096 	do {
1097 		tryagain = FALSE;
1098 
1099 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1100 			status = RPC_FAILED;
1101 			rpcerr.re_status = RPC_FAILED;
1102 			rpcerr.re_errno = EIO;
1103 			break;
1104 		}
1105 
1106 		TICK_TO_TIMEVAL(timeo, &wait);
1107 
1108 		/*
1109 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1110 		 * and SIGTERM. (Preserving the existing masks).
1111 		 * Mask out SIGINT if mount option nointr is specified.
1112 		 */
1113 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1114 		if (!(mi->mi_flags & MI_INT))
1115 			client->cl_nosignal = TRUE;
1116 
1117 		/*
1118 		 * If there is a current signal, then don't bother
1119 		 * even trying to send out the request because we
1120 		 * won't be able to block waiting for the response.
1121 		 * Simply assume RPC_INTR and get on with it.
1122 		 */
1123 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1124 			status = RPC_INTR;
1125 		else {
1126 			status = CLNT_CALL(client, which, xdrargs, argsp,
1127 			    xdrres, resp, wait);
1128 		}
1129 
1130 		if (!(mi->mi_flags & MI_INT))
1131 			client->cl_nosignal = FALSE;
1132 		/*
1133 		 * restore original signal mask
1134 		 */
1135 		sigunintr(&smask);
1136 
1137 		switch (status) {
1138 		case RPC_SUCCESS:
1139 			if ((mi->mi_flags & MI_DYNAMIC) &&
1140 			    mi->mi_timer_type[which] != 0 &&
1141 			    (mi->mi_curread != my_rsize ||
1142 			    mi->mi_curwrite != my_wsize))
1143 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1144 			break;
1145 
1146 		case RPC_INTR:
1147 			/*
1148 			 * There is no way to recover from this error,
1149 			 * even if mount option nointr is specified.
1150 			 * SIGKILL, for example, cannot be blocked.
1151 			 */
1152 			rpcerr.re_status = RPC_INTR;
1153 			rpcerr.re_errno = EINTR;
1154 			break;
1155 
1156 		case RPC_UDERROR:
1157 			/*
1158 			 * If the NFS server is local (vold) and
1159 			 * it goes away then we get RPC_UDERROR.
1160 			 * This is a retryable error, so we would
1161 			 * loop, so check to see if the specific
1162 			 * error was ECONNRESET, indicating that
1163 			 * target did not exist at all.  If so,
1164 			 * return with RPC_PROGUNAVAIL and
1165 			 * ECONNRESET to indicate why.
1166 			 */
1167 			CLNT_GETERR(client, &rpcerr);
1168 			if (rpcerr.re_errno == ECONNRESET) {
1169 				rpcerr.re_status = RPC_PROGUNAVAIL;
1170 				rpcerr.re_errno = ECONNRESET;
1171 				break;
1172 			}
1173 			/*FALLTHROUGH*/
1174 
1175 		default:		/* probably RPC_TIMEDOUT */
1176 			if (IS_UNRECOVERABLE_RPC(status))
1177 				break;
1178 
1179 			/*
1180 			 * increment server not responding count
1181 			 */
1182 			mutex_enter(&mi->mi_lock);
1183 			mi->mi_noresponse++;
1184 			mutex_exit(&mi->mi_lock);
1185 #ifdef DEBUG
1186 			nfscl->nfscl_stat.noresponse.value.ui64++;
1187 #endif
1188 
1189 			if (!(mi->mi_flags & MI_HARD)) {
1190 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1191 				    (mi->mi_ss_call_type[which] == 0))
1192 					break;
1193 			}
1194 
1195 			/*
1196 			 * The call is in progress (over COTS).
1197 			 * Try the CLNT_CALL again, but don't
1198 			 * print a noisy error message.
1199 			 */
1200 			if (status == RPC_INPROGRESS) {
1201 				tryagain = TRUE;
1202 				break;
1203 			}
1204 
1205 			if (flags & RFSCALL_SOFT)
1206 				break;
1207 
1208 			/*
1209 			 * On zone shutdown, just move on.
1210 			 */
1211 			if (zone_status_get(curproc->p_zone) >=
1212 			    ZONE_IS_SHUTTING_DOWN) {
1213 				rpcerr.re_status = RPC_FAILED;
1214 				rpcerr.re_errno = EIO;
1215 				break;
1216 			}
1217 
1218 			/*
1219 			 * NFS client failover support
1220 			 *
1221 			 * If the current server just failed us, we'll
1222 			 * start the process of finding a new server.
1223 			 * After that, we can just retry.
1224 			 */
1225 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1226 				if (svp == mi->mi_curr_serv)
1227 					failover_newserver(mi);
1228 				clfree_impl(client, ch, nfscl);
1229 				goto failoverretry;
1230 			}
1231 
1232 			tryagain = TRUE;
1233 			timeo = backoff(timeo);
1234 
1235 			CLNT_GETERR(client, &rpcerr_tmp);
1236 			if ((status == RPC_CANTSEND) &&
1237 			    (rpcerr_tmp.re_errno == ENOBUFS))
1238 				msg = SRV_QFULL_MSG;
1239 			else
1240 				msg = SRV_NOTRESP_MSG;
1241 
1242 			mutex_enter(&mi->mi_lock);
1243 			if (!(mi->mi_flags & MI_PRINTED)) {
1244 				mi->mi_flags |= MI_PRINTED;
1245 				mutex_exit(&mi->mi_lock);
1246 #ifdef DEBUG
1247 				zprintf(zoneid, msg, mi->mi_vers,
1248 				    svp->sv_hostname);
1249 #else
1250 				zprintf(zoneid, msg, svp->sv_hostname);
1251 #endif
1252 			} else
1253 				mutex_exit(&mi->mi_lock);
1254 			if (*douprintf && nfs_has_ctty()) {
1255 				*douprintf = 0;
1256 				if (!(mi->mi_flags & MI_NOPRINT))
1257 #ifdef DEBUG
1258 					uprintf(msg, mi->mi_vers,
1259 					    svp->sv_hostname);
1260 #else
1261 					uprintf(msg, svp->sv_hostname);
1262 #endif
1263 			}
1264 
1265 			/*
1266 			 * If doing dynamic adjustment of transfer
1267 			 * size and if it's a read or write call
1268 			 * and if the transfer size changed while
1269 			 * retransmitting or if the feedback routine
1270 			 * changed the transfer size,
1271 			 * then exit rfscall so that the transfer
1272 			 * size can be adjusted at the vnops level.
1273 			 */
1274 			if ((mi->mi_flags & MI_DYNAMIC) &&
1275 			    mi->mi_timer_type[which] != 0 &&
1276 			    (mi->mi_curread != my_rsize ||
1277 			    mi->mi_curwrite != my_wsize ||
1278 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1279 				/*
1280 				 * On read or write calls, return
1281 				 * back to the vnode ops level if
1282 				 * the transfer size changed.
1283 				 */
1284 				clfree_impl(client, ch, nfscl);
1285 				if (cred_cloned)
1286 					crfree(cr);
1287 				return (ENFS_TRYAGAIN);
1288 			}
1289 		}
1290 	} while (tryagain);
1291 
1292 	if (status != RPC_SUCCESS) {
1293 		/*
1294 		 * Let soft mounts use the timed out message.
1295 		 */
1296 		if (status == RPC_INPROGRESS)
1297 			status = RPC_TIMEDOUT;
1298 		nfscl->nfscl_stat.badcalls.value.ui64++;
1299 		if (status != RPC_INTR) {
1300 			mutex_enter(&mi->mi_lock);
1301 			mi->mi_flags |= MI_DOWN;
1302 			mutex_exit(&mi->mi_lock);
1303 			CLNT_GETERR(client, &rpcerr);
1304 #ifdef DEBUG
1305 			bufp = clnt_sperror(client, svp->sv_hostname);
1306 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1307 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1308 			if (nfs_has_ctty()) {
1309 				if (!(mi->mi_flags & MI_NOPRINT)) {
1310 					uprintf("NFS%d %s failed for %s\n",
1311 					    mi->mi_vers, mi->mi_rfsnames[which],
1312 					    bufp);
1313 				}
1314 			}
1315 			kmem_free(bufp, MAXPATHLEN);
1316 #else
1317 			zprintf(zoneid,
1318 			    "NFS %s failed for server %s: error %d (%s)\n",
1319 			    mi->mi_rfsnames[which], svp->sv_hostname,
1320 			    status, clnt_sperrno(status));
1321 			if (nfs_has_ctty()) {
1322 				if (!(mi->mi_flags & MI_NOPRINT)) {
1323 					uprintf(
1324 				"NFS %s failed for server %s: error %d (%s)\n",
1325 					    mi->mi_rfsnames[which],
1326 					    svp->sv_hostname, status,
1327 					    clnt_sperrno(status));
1328 				}
1329 			}
1330 #endif
1331 			/*
1332 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1333 			 * re_errno is set appropriately depending on
1334 			 * the authentication error
1335 			 */
1336 			if (status == RPC_VERSMISMATCH ||
1337 			    status == RPC_PROGVERSMISMATCH)
1338 				rpcerr.re_errno = EIO;
1339 		}
1340 	} else {
1341 		/*
1342 		 * Test the value of mi_down and mi_printed without
1343 		 * holding the mi_lock mutex.  If they are both zero,
1344 		 * then it is okay to skip the down and printed
1345 		 * processing.  This saves on a mutex_enter and
1346 		 * mutex_exit pair for a normal, successful RPC.
1347 		 * This was just complete overhead.
1348 		 */
1349 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1350 			mutex_enter(&mi->mi_lock);
1351 			mi->mi_flags &= ~MI_DOWN;
1352 			if (mi->mi_flags & MI_PRINTED) {
1353 				mi->mi_flags &= ~MI_PRINTED;
1354 				mutex_exit(&mi->mi_lock);
1355 #ifdef DEBUG
1356 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1357 				zprintf(zoneid, "NFS%d server %s ok\n",
1358 				    mi->mi_vers, svp->sv_hostname);
1359 #else
1360 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1361 				zprintf(zoneid, "NFS server %s ok\n",
1362 				    svp->sv_hostname);
1363 #endif
1364 			} else
1365 				mutex_exit(&mi->mi_lock);
1366 		}
1367 
1368 		if (*douprintf == 0) {
1369 			if (!(mi->mi_flags & MI_NOPRINT))
1370 #ifdef DEBUG
1371 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1372 					uprintf("NFS%d server %s ok\n",
1373 					    mi->mi_vers, svp->sv_hostname);
1374 #else
1375 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1376 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1377 #endif
1378 			*douprintf = 1;
1379 		}
1380 	}
1381 
1382 	clfree_impl(client, ch, nfscl);
1383 	if (cred_cloned)
1384 		crfree(cr);
1385 
1386 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1387 
1388 	if (rpc_status != NULL)
1389 		*rpc_status = rpcerr.re_status;
1390 
1391 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1392 	    rpcerr.re_errno);
1393 
1394 	return (rpcerr.re_errno);
1395 }
1396 
1397 #ifdef DEBUG
1398 static int acl2call_hits = 0;
1399 static int acl2call_misses = 0;
1400 #endif
1401 
1402 int
1403 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1404     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1405     enum nfsstat *statusp, int flags, failinfo_t *fi)
1406 {
1407 	int rpcerror;
1408 
1409 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1410 	    cr, douprintf, flags, fi);
1411 	if (!rpcerror) {
1412 		/*
1413 		 * See comments with crnetadjust().
1414 		 */
1415 		if (*statusp == NFSERR_ACCES &&
1416 		    (cr = crnetadjust(cr)) != NULL) {
1417 #ifdef DEBUG
1418 			acl2call_hits++;
1419 #endif
1420 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1421 			    resp, cr, douprintf, flags, fi);
1422 			crfree(cr);
1423 #ifdef DEBUG
1424 			if (*statusp == NFSERR_ACCES)
1425 				acl2call_misses++;
1426 #endif
1427 		}
1428 	}
1429 
1430 	return (rpcerror);
1431 }
1432 
1433 #ifdef DEBUG
1434 static int acl3call_hits = 0;
1435 static int acl3call_misses = 0;
1436 #endif
1437 
1438 int
1439 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1440     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1441     nfsstat3 *statusp, int flags, failinfo_t *fi)
1442 {
1443 	int rpcerror;
1444 	int user_informed;
1445 
1446 	user_informed = 0;
1447 
1448 	do {
1449 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1450 		    cr, douprintf, flags, fi);
1451 		if (!rpcerror) {
1452 			cred_t *crr;
1453 			if (*statusp == NFS3ERR_JUKEBOX) {
1454 				if (!user_informed) {
1455 					user_informed = 1;
1456 					uprintf(
1457 		"file temporarily unavailable on the server, retrying...\n");
1458 				}
1459 				delay(nfs3_jukebox_delay);
1460 			}
1461 			/*
1462 			 * See crnetadjust() for comments.
1463 			 */
1464 			else if (*statusp == NFS3ERR_ACCES &&
1465 			    (crr = crnetadjust(cr)) != NULL) {
1466 #ifdef DEBUG
1467 				acl3call_hits++;
1468 #endif
1469 				rpcerror = aclcall(mi, which, xdrargs, argsp,
1470 				    xdrres, resp, crr, douprintf, flags, fi);
1471 
1472 				crfree(crr);
1473 #ifdef DEBUG
1474 				if (*statusp == NFS3ERR_ACCES)
1475 					acl3call_misses++;
1476 #endif
1477 			}
1478 		}
1479 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1480 
1481 	return (rpcerror);
1482 }
1483 
1484 static int
1485 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1486     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1487     int flags, failinfo_t *fi)
1488 {
1489 	CLIENT *client;
1490 	struct chtab *ch;
1491 	cred_t *cr = icr;
1492 	bool_t cred_cloned = FALSE;
1493 	enum clnt_stat status;
1494 	struct rpc_err rpcerr;
1495 	struct timeval wait;
1496 	int timeo;		/* in units of hz */
1497 #if 0 /* notyet */
1498 	int my_rsize, my_wsize;
1499 #endif
1500 	bool_t tryagain;
1501 	k_sigset_t smask;
1502 	servinfo_t *svp;
1503 	struct nfs_clnt *nfscl;
1504 	zoneid_t zoneid = getzoneid();
1505 #ifdef DEBUG
1506 	char *bufp;
1507 #endif
1508 
1509 #if 0 /* notyet */
1510 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1511 	    "rfscall_start:which %d mi %p", which, mi);
1512 #endif
1513 
1514 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1515 	ASSERT(nfscl != NULL);
1516 
1517 	nfscl->nfscl_stat.calls.value.ui64++;
1518 	mi->mi_aclreqs[which].value.ui64++;
1519 
1520 	rpcerr.re_status = RPC_SUCCESS;
1521 
1522 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1523 		rpcerr.re_status = RPC_FAILED;
1524 		rpcerr.re_errno = EIO;
1525 		return (rpcerr.re_errno);
1526 	}
1527 
1528 #if 0 /* notyet */
1529 	/*
1530 	 * Remember the transfer sizes in case
1531 	 * nfs_feedback changes them underneath us.
1532 	 */
1533 	my_rsize = mi->mi_curread;
1534 	my_wsize = mi->mi_curwrite;
1535 #endif
1536 
1537 	/*
1538 	 * NFS client failover support
1539 	 *
1540 	 * If this rnode is not in sync with the current server (VALID_FH),
1541 	 * we'd like to do a remap to get in sync.  We can be interrupted
1542 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1543 	 * use the best info we have to try the RPC.  Part of that is
1544 	 * unconditionally updating the filehandle copy kept for V3.
1545 	 *
1546 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1547 	 * rw_enter(); we're trying to keep the current server from being
1548 	 * changed on us until we're done with the remapping and have a
1549 	 * matching client handle.  We don't want to sending a filehandle
1550 	 * to the wrong host.
1551 	 */
1552 failoverretry:
1553 	if (FAILOVER_MOUNT(mi)) {
1554 		mutex_enter(&mi->mi_lock);
1555 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1556 			if (failover_wait(mi)) {
1557 				mutex_exit(&mi->mi_lock);
1558 				return (EINTR);
1559 			}
1560 		}
1561 		INC_READERS(mi);
1562 		mutex_exit(&mi->mi_lock);
1563 		if (fi) {
1564 			if (!VALID_FH(fi) &&
1565 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1566 				int remaperr;
1567 
1568 				svp = mi->mi_curr_serv;
1569 				remaperr = failover_remap(fi);
1570 				if (remaperr != 0) {
1571 #ifdef DEBUG
1572 					if (remaperr != EINTR)
1573 						nfs_cmn_err(remaperr, CE_WARN,
1574 					    "aclcall couldn't failover: %m");
1575 #endif
1576 					mutex_enter(&mi->mi_lock);
1577 					DEC_READERS(mi);
1578 					mutex_exit(&mi->mi_lock);
1579 
1580 					/*
1581 					 * If failover_remap returns ETIMEDOUT
1582 					 * and the filesystem is hard mounted
1583 					 * we have to retry the call with a new
1584 					 * server.
1585 					 */
1586 					if ((mi->mi_flags & MI_HARD) &&
1587 					    IS_RECOVERABLE_ERROR(remaperr)) {
1588 						if (svp == mi->mi_curr_serv)
1589 							failover_newserver(mi);
1590 						rpcerr.re_status = RPC_SUCCESS;
1591 						goto failoverretry;
1592 					}
1593 					return (remaperr);
1594 				}
1595 			}
1596 			if (fi->fhp && fi->copyproc)
1597 				(*fi->copyproc)(fi->fhp, fi->vp);
1598 		}
1599 	}
1600 
1601 	/* For TSOL, use a new cred which has net_mac_aware flag */
1602 	if (!cred_cloned && is_system_labeled()) {
1603 		cred_cloned = TRUE;
1604 		cr = crdup(icr);
1605 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1606 	}
1607 
1608 	/*
1609 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1610 	 * are guaranteed to reprocess the retry as a new request.
1611 	 */
1612 	svp = mi->mi_curr_serv;
1613 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1614 	if (FAILOVER_MOUNT(mi)) {
1615 		mutex_enter(&mi->mi_lock);
1616 		DEC_READERS(mi);
1617 		mutex_exit(&mi->mi_lock);
1618 
1619 		if ((rpcerr.re_errno == ETIMEDOUT ||
1620 		    rpcerr.re_errno == ECONNRESET) &&
1621 		    failover_safe(fi)) {
1622 			if (svp == mi->mi_curr_serv)
1623 				failover_newserver(mi);
1624 			goto failoverretry;
1625 		}
1626 	}
1627 	if (rpcerr.re_errno != 0) {
1628 		if (cred_cloned)
1629 			crfree(cr);
1630 		return (rpcerr.re_errno);
1631 	}
1632 
1633 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1634 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1635 		timeo = (mi->mi_timeo * hz) / 10;
1636 	} else {
1637 		mutex_enter(&mi->mi_lock);
1638 		timeo = CLNT_SETTIMERS(client,
1639 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1640 		    &(mi->mi_timers[NFS_CALLTYPES]),
1641 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1642 		    (void (*)()) 0, (caddr_t)mi, 0);
1643 		mutex_exit(&mi->mi_lock);
1644 	}
1645 
1646 	/*
1647 	 * If hard mounted fs, retry call forever unless hard error occurs.
1648 	 */
1649 	do {
1650 		tryagain = FALSE;
1651 
1652 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1653 			status = RPC_FAILED;
1654 			rpcerr.re_status = RPC_FAILED;
1655 			rpcerr.re_errno = EIO;
1656 			break;
1657 		}
1658 
1659 		TICK_TO_TIMEVAL(timeo, &wait);
1660 
1661 		/*
1662 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1663 		 * and SIGTERM. (Preserving the existing masks).
1664 		 * Mask out SIGINT if mount option nointr is specified.
1665 		 */
1666 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1667 		if (!(mi->mi_flags & MI_INT))
1668 			client->cl_nosignal = TRUE;
1669 
1670 		/*
1671 		 * If there is a current signal, then don't bother
1672 		 * even trying to send out the request because we
1673 		 * won't be able to block waiting for the response.
1674 		 * Simply assume RPC_INTR and get on with it.
1675 		 */
1676 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1677 			status = RPC_INTR;
1678 		else {
1679 			status = CLNT_CALL(client, which, xdrargs, argsp,
1680 			    xdrres, resp, wait);
1681 		}
1682 
1683 		if (!(mi->mi_flags & MI_INT))
1684 			client->cl_nosignal = FALSE;
1685 		/*
1686 		 * restore original signal mask
1687 		 */
1688 		sigunintr(&smask);
1689 
1690 		switch (status) {
1691 		case RPC_SUCCESS:
1692 #if 0 /* notyet */
1693 			if ((mi->mi_flags & MI_DYNAMIC) &&
1694 			    mi->mi_timer_type[which] != 0 &&
1695 			    (mi->mi_curread != my_rsize ||
1696 			    mi->mi_curwrite != my_wsize))
1697 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1698 #endif
1699 			break;
1700 
1701 		/*
1702 		 * Unfortunately, there are servers in the world which
1703 		 * are not coded correctly.  They are not prepared to
1704 		 * handle RPC requests to the NFS port which are not
1705 		 * NFS requests.  Thus, they may try to process the
1706 		 * NFS_ACL request as if it were an NFS request.  This
1707 		 * does not work.  Generally, an error will be generated
1708 		 * on the client because it will not be able to decode
1709 		 * the response from the server.  However, it seems
1710 		 * possible that the server may not be able to decode
1711 		 * the arguments.  Thus, the criteria for deciding
1712 		 * whether the server supports NFS_ACL or not is whether
1713 		 * the following RPC errors are returned from CLNT_CALL.
1714 		 */
1715 		case RPC_CANTDECODERES:
1716 		case RPC_PROGUNAVAIL:
1717 		case RPC_CANTDECODEARGS:
1718 		case RPC_PROGVERSMISMATCH:
1719 			mutex_enter(&mi->mi_lock);
1720 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1721 			mutex_exit(&mi->mi_lock);
1722 			break;
1723 
1724 		/*
1725 		 * If the server supports NFS_ACL but not the new ops
1726 		 * for extended attributes, make sure we don't retry.
1727 		 */
1728 		case RPC_PROCUNAVAIL:
1729 			mutex_enter(&mi->mi_lock);
1730 			mi->mi_flags &= ~MI_EXTATTR;
1731 			mutex_exit(&mi->mi_lock);
1732 			break;
1733 
1734 		case RPC_INTR:
1735 			/*
1736 			 * There is no way to recover from this error,
1737 			 * even if mount option nointr is specified.
1738 			 * SIGKILL, for example, cannot be blocked.
1739 			 */
1740 			rpcerr.re_status = RPC_INTR;
1741 			rpcerr.re_errno = EINTR;
1742 			break;
1743 
1744 		case RPC_UDERROR:
1745 			/*
1746 			 * If the NFS server is local (vold) and
1747 			 * it goes away then we get RPC_UDERROR.
1748 			 * This is a retryable error, so we would
1749 			 * loop, so check to see if the specific
1750 			 * error was ECONNRESET, indicating that
1751 			 * target did not exist at all.  If so,
1752 			 * return with RPC_PROGUNAVAIL and
1753 			 * ECONNRESET to indicate why.
1754 			 */
1755 			CLNT_GETERR(client, &rpcerr);
1756 			if (rpcerr.re_errno == ECONNRESET) {
1757 				rpcerr.re_status = RPC_PROGUNAVAIL;
1758 				rpcerr.re_errno = ECONNRESET;
1759 				break;
1760 			}
1761 			/*FALLTHROUGH*/
1762 
1763 		default:		/* probably RPC_TIMEDOUT */
1764 			if (IS_UNRECOVERABLE_RPC(status))
1765 				break;
1766 
1767 			/*
1768 			 * increment server not responding count
1769 			 */
1770 			mutex_enter(&mi->mi_lock);
1771 			mi->mi_noresponse++;
1772 			mutex_exit(&mi->mi_lock);
1773 #ifdef DEBUG
1774 			nfscl->nfscl_stat.noresponse.value.ui64++;
1775 #endif
1776 
1777 			if (!(mi->mi_flags & MI_HARD)) {
1778 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1779 				    (mi->mi_acl_ss_call_type[which] == 0))
1780 					break;
1781 			}
1782 
1783 			/*
1784 			 * The call is in progress (over COTS).
1785 			 * Try the CLNT_CALL again, but don't
1786 			 * print a noisy error message.
1787 			 */
1788 			if (status == RPC_INPROGRESS) {
1789 				tryagain = TRUE;
1790 				break;
1791 			}
1792 
1793 			if (flags & RFSCALL_SOFT)
1794 				break;
1795 
1796 			/*
1797 			 * On zone shutdown, just move on.
1798 			 */
1799 			if (zone_status_get(curproc->p_zone) >=
1800 			    ZONE_IS_SHUTTING_DOWN) {
1801 				rpcerr.re_status = RPC_FAILED;
1802 				rpcerr.re_errno = EIO;
1803 				break;
1804 			}
1805 
1806 			/*
1807 			 * NFS client failover support
1808 			 *
1809 			 * If the current server just failed us, we'll
1810 			 * start the process of finding a new server.
1811 			 * After that, we can just retry.
1812 			 */
1813 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1814 				if (svp == mi->mi_curr_serv)
1815 					failover_newserver(mi);
1816 				clfree_impl(client, ch, nfscl);
1817 				goto failoverretry;
1818 			}
1819 
1820 			tryagain = TRUE;
1821 			timeo = backoff(timeo);
1822 			mutex_enter(&mi->mi_lock);
1823 			if (!(mi->mi_flags & MI_PRINTED)) {
1824 				mi->mi_flags |= MI_PRINTED;
1825 				mutex_exit(&mi->mi_lock);
1826 #ifdef DEBUG
1827 				zprintf(zoneid,
1828 			"NFS_ACL%d server %s not responding still trying\n",
1829 				    mi->mi_vers, svp->sv_hostname);
1830 #else
1831 				zprintf(zoneid,
1832 			    "NFS server %s not responding still trying\n",
1833 				    svp->sv_hostname);
1834 #endif
1835 			} else
1836 				mutex_exit(&mi->mi_lock);
1837 			if (*douprintf && nfs_has_ctty()) {
1838 				*douprintf = 0;
1839 				if (!(mi->mi_flags & MI_NOPRINT))
1840 #ifdef DEBUG
1841 					uprintf(
1842 			"NFS_ACL%d server %s not responding still trying\n",
1843 					    mi->mi_vers, svp->sv_hostname);
1844 #else
1845 					uprintf(
1846 			    "NFS server %s not responding still trying\n",
1847 					    svp->sv_hostname);
1848 #endif
1849 			}
1850 
1851 #if 0 /* notyet */
1852 			/*
1853 			 * If doing dynamic adjustment of transfer
1854 			 * size and if it's a read or write call
1855 			 * and if the transfer size changed while
1856 			 * retransmitting or if the feedback routine
1857 			 * changed the transfer size,
1858 			 * then exit rfscall so that the transfer
1859 			 * size can be adjusted at the vnops level.
1860 			 */
1861 			if ((mi->mi_flags & MI_DYNAMIC) &&
1862 			    mi->mi_acl_timer_type[which] != 0 &&
1863 			    (mi->mi_curread != my_rsize ||
1864 			    mi->mi_curwrite != my_wsize ||
1865 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1866 				/*
1867 				 * On read or write calls, return
1868 				 * back to the vnode ops level if
1869 				 * the transfer size changed.
1870 				 */
1871 				clfree_impl(client, ch, nfscl);
1872 				if (cred_cloned)
1873 					crfree(cr);
1874 				return (ENFS_TRYAGAIN);
1875 			}
1876 #endif
1877 		}
1878 	} while (tryagain);
1879 
1880 	if (status != RPC_SUCCESS) {
1881 		/*
1882 		 * Let soft mounts use the timed out message.
1883 		 */
1884 		if (status == RPC_INPROGRESS)
1885 			status = RPC_TIMEDOUT;
1886 		nfscl->nfscl_stat.badcalls.value.ui64++;
1887 		if (status == RPC_CANTDECODERES ||
1888 		    status == RPC_PROGUNAVAIL ||
1889 		    status == RPC_PROCUNAVAIL ||
1890 		    status == RPC_CANTDECODEARGS ||
1891 		    status == RPC_PROGVERSMISMATCH)
1892 			CLNT_GETERR(client, &rpcerr);
1893 		else if (status != RPC_INTR) {
1894 			mutex_enter(&mi->mi_lock);
1895 			mi->mi_flags |= MI_DOWN;
1896 			mutex_exit(&mi->mi_lock);
1897 			CLNT_GETERR(client, &rpcerr);
1898 #ifdef DEBUG
1899 			bufp = clnt_sperror(client, svp->sv_hostname);
1900 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1901 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1902 			if (nfs_has_ctty()) {
1903 				if (!(mi->mi_flags & MI_NOPRINT)) {
1904 					uprintf("NFS_ACL%d %s failed for %s\n",
1905 					    mi->mi_vers, mi->mi_aclnames[which],
1906 					    bufp);
1907 				}
1908 			}
1909 			kmem_free(bufp, MAXPATHLEN);
1910 #else
1911 			zprintf(zoneid,
1912 			    "NFS %s failed for server %s: error %d (%s)\n",
1913 			    mi->mi_aclnames[which], svp->sv_hostname,
1914 			    status, clnt_sperrno(status));
1915 			if (nfs_has_ctty()) {
1916 				if (!(mi->mi_flags & MI_NOPRINT))
1917 					uprintf(
1918 				"NFS %s failed for server %s: error %d (%s)\n",
1919 					    mi->mi_aclnames[which],
1920 					    svp->sv_hostname, status,
1921 					    clnt_sperrno(status));
1922 			}
1923 #endif
1924 			/*
1925 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1926 			 * re_errno is set appropriately depending on
1927 			 * the authentication error
1928 			 */
1929 			if (status == RPC_VERSMISMATCH ||
1930 			    status == RPC_PROGVERSMISMATCH)
1931 				rpcerr.re_errno = EIO;
1932 		}
1933 	} else {
1934 		/*
1935 		 * Test the value of mi_down and mi_printed without
1936 		 * holding the mi_lock mutex.  If they are both zero,
1937 		 * then it is okay to skip the down and printed
1938 		 * processing.  This saves on a mutex_enter and
1939 		 * mutex_exit pair for a normal, successful RPC.
1940 		 * This was just complete overhead.
1941 		 */
1942 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1943 			mutex_enter(&mi->mi_lock);
1944 			mi->mi_flags &= ~MI_DOWN;
1945 			if (mi->mi_flags & MI_PRINTED) {
1946 				mi->mi_flags &= ~MI_PRINTED;
1947 				mutex_exit(&mi->mi_lock);
1948 #ifdef DEBUG
1949 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1950 				    mi->mi_vers, svp->sv_hostname);
1951 #else
1952 				zprintf(zoneid, "NFS server %s ok\n",
1953 				    svp->sv_hostname);
1954 #endif
1955 			} else
1956 				mutex_exit(&mi->mi_lock);
1957 		}
1958 
1959 		if (*douprintf == 0) {
1960 			if (!(mi->mi_flags & MI_NOPRINT))
1961 #ifdef DEBUG
1962 				uprintf("NFS_ACL%d server %s ok\n",
1963 				    mi->mi_vers, svp->sv_hostname);
1964 #else
1965 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1966 #endif
1967 			*douprintf = 1;
1968 		}
1969 	}
1970 
1971 	clfree_impl(client, ch, nfscl);
1972 	if (cred_cloned)
1973 		crfree(cr);
1974 
1975 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1976 
1977 #if 0 /* notyet */
1978 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1979 	    rpcerr.re_errno);
1980 #endif
1981 
1982 	return (rpcerr.re_errno);
1983 }
1984 
1985 int
1986 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1987 {
1988 	uint_t mask = vap->va_mask;
1989 
1990 	if (!(mask & AT_MODE))
1991 		sa->sa_mode = (uint32_t)-1;
1992 	else
1993 		sa->sa_mode = vap->va_mode;
1994 	if (!(mask & AT_UID))
1995 		sa->sa_uid = (uint32_t)-1;
1996 	else
1997 		sa->sa_uid = (uint32_t)vap->va_uid;
1998 	if (!(mask & AT_GID))
1999 		sa->sa_gid = (uint32_t)-1;
2000 	else
2001 		sa->sa_gid = (uint32_t)vap->va_gid;
2002 	if (!(mask & AT_SIZE))
2003 		sa->sa_size = (uint32_t)-1;
2004 	else
2005 		sa->sa_size = (uint32_t)vap->va_size;
2006 	if (!(mask & AT_ATIME))
2007 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2008 	else {
2009 		/* check time validity */
2010 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2011 			return (EOVERFLOW);
2012 		}
2013 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2014 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2015 	}
2016 	if (!(mask & AT_MTIME))
2017 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2018 	else {
2019 		/* check time validity */
2020 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2021 			return (EOVERFLOW);
2022 		}
2023 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2024 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2025 	}
2026 	return (0);
2027 }
2028 
2029 int
2030 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2031 {
2032 	uint_t mask = vap->va_mask;
2033 
2034 	if (!(mask & AT_MODE))
2035 		sa->mode.set_it = FALSE;
2036 	else {
2037 		sa->mode.set_it = TRUE;
2038 		sa->mode.mode = (mode3)vap->va_mode;
2039 	}
2040 	if (!(mask & AT_UID))
2041 		sa->uid.set_it = FALSE;
2042 	else {
2043 		sa->uid.set_it = TRUE;
2044 		sa->uid.uid = (uid3)vap->va_uid;
2045 	}
2046 	if (!(mask & AT_GID))
2047 		sa->gid.set_it = FALSE;
2048 	else {
2049 		sa->gid.set_it = TRUE;
2050 		sa->gid.gid = (gid3)vap->va_gid;
2051 	}
2052 	if (!(mask & AT_SIZE))
2053 		sa->size.set_it = FALSE;
2054 	else {
2055 		sa->size.set_it = TRUE;
2056 		sa->size.size = (size3)vap->va_size;
2057 	}
2058 	if (!(mask & AT_ATIME))
2059 		sa->atime.set_it = DONT_CHANGE;
2060 	else {
2061 		/* check time validity */
2062 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2063 			return (EOVERFLOW);
2064 		}
2065 		sa->atime.set_it = SET_TO_CLIENT_TIME;
2066 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2067 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2068 	}
2069 	if (!(mask & AT_MTIME))
2070 		sa->mtime.set_it = DONT_CHANGE;
2071 	else {
2072 		/* check time validity */
2073 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2074 			return (EOVERFLOW);
2075 		}
2076 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2077 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2078 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2079 	}
2080 	return (0);
2081 }
2082 
2083 void
2084 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2085 {
2086 
2087 	da->da_fhandle = VTOFH(dvp);
2088 	da->da_name = nm;
2089 	da->da_flags = 0;
2090 }
2091 
2092 void
2093 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2094 {
2095 
2096 	da->dirp = VTOFH3(dvp);
2097 	da->name = nm;
2098 }
2099 
2100 int
2101 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2102 {
2103 	int error;
2104 	rnode_t *rp;
2105 	struct vattr va;
2106 
2107 	va.va_mask = AT_MODE | AT_GID;
2108 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2109 	if (error)
2110 		return (error);
2111 
2112 	/*
2113 	 * To determine the expected group-id of the created file:
2114 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2115 	 *	GRPID option, and the directory's set-gid bit is clear,
2116 	 *	then use the process's gid.
2117 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2118 	 */
2119 	rp = VTOR(dvp);
2120 	mutex_enter(&rp->r_statelock);
2121 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2122 		*gidp = crgetgid(cr);
2123 	else
2124 		*gidp = va.va_gid;
2125 	mutex_exit(&rp->r_statelock);
2126 	return (0);
2127 }
2128 
2129 int
2130 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2131 {
2132 	int error;
2133 	struct vattr va;
2134 
2135 	va.va_mask = AT_MODE;
2136 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2137 	if (error)
2138 		return (error);
2139 
2140 	/*
2141 	 * Modify the expected mode (om) so that the set-gid bit matches
2142 	 * that of the parent directory (dvp).
2143 	 */
2144 	if (va.va_mode & VSGID)
2145 		*omp |= VSGID;
2146 	else
2147 		*omp &= ~VSGID;
2148 	return (0);
2149 }
2150 
2151 void
2152 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2153 {
2154 
2155 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2156 		if (!(vp->v_flag & VSWAPLIKE)) {
2157 			mutex_enter(&vp->v_lock);
2158 			vp->v_flag |= VSWAPLIKE;
2159 			mutex_exit(&vp->v_lock);
2160 		}
2161 	} else {
2162 		if (vp->v_flag & VSWAPLIKE) {
2163 			mutex_enter(&vp->v_lock);
2164 			vp->v_flag &= ~VSWAPLIKE;
2165 			mutex_exit(&vp->v_lock);
2166 		}
2167 	}
2168 }
2169 
2170 /*
2171  * Free the resources associated with an rnode.
2172  */
2173 static void
2174 rinactive(rnode_t *rp, cred_t *cr)
2175 {
2176 	vnode_t *vp;
2177 	cred_t *cred;
2178 	char *contents;
2179 	int size;
2180 	vsecattr_t *vsp;
2181 	int error;
2182 	nfs3_pathconf_info *info;
2183 
2184 	/*
2185 	 * Before freeing anything, wait until all asynchronous
2186 	 * activity is done on this rnode.  This will allow all
2187 	 * asynchronous read ahead and write behind i/o's to
2188 	 * finish.
2189 	 */
2190 	mutex_enter(&rp->r_statelock);
2191 	while (rp->r_count > 0)
2192 		cv_wait(&rp->r_cv, &rp->r_statelock);
2193 	mutex_exit(&rp->r_statelock);
2194 
2195 	/*
2196 	 * Flush and invalidate all pages associated with the vnode.
2197 	 */
2198 	vp = RTOV(rp);
2199 	if (vn_has_cached_data(vp)) {
2200 		ASSERT(vp->v_type != VCHR);
2201 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2202 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2203 			if (error && (error == ENOSPC || error == EDQUOT)) {
2204 				mutex_enter(&rp->r_statelock);
2205 				if (!rp->r_error)
2206 					rp->r_error = error;
2207 				mutex_exit(&rp->r_statelock);
2208 			}
2209 		}
2210 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2211 	}
2212 
2213 	/*
2214 	 * Free any held credentials and caches which may be associated
2215 	 * with this rnode.
2216 	 */
2217 	mutex_enter(&rp->r_statelock);
2218 	cred = rp->r_cred;
2219 	rp->r_cred = NULL;
2220 	contents = rp->r_symlink.contents;
2221 	size = rp->r_symlink.size;
2222 	rp->r_symlink.contents = NULL;
2223 	vsp = rp->r_secattr;
2224 	rp->r_secattr = NULL;
2225 	info = rp->r_pathconf;
2226 	rp->r_pathconf = NULL;
2227 	mutex_exit(&rp->r_statelock);
2228 
2229 	/*
2230 	 * Free the held credential.
2231 	 */
2232 	if (cred != NULL)
2233 		crfree(cred);
2234 
2235 	/*
2236 	 * Free the access cache entries.
2237 	 */
2238 	(void) nfs_access_purge_rp(rp);
2239 
2240 	/*
2241 	 * Free the readdir cache entries.
2242 	 */
2243 	if (HAVE_RDDIR_CACHE(rp))
2244 		nfs_purge_rddir_cache(vp);
2245 
2246 	/*
2247 	 * Free the symbolic link cache.
2248 	 */
2249 	if (contents != NULL) {
2250 
2251 		kmem_free((void *)contents, size);
2252 	}
2253 
2254 	/*
2255 	 * Free any cached ACL.
2256 	 */
2257 	if (vsp != NULL)
2258 		nfs_acl_free(vsp);
2259 
2260 	/*
2261 	 * Free any cached pathconf information.
2262 	 */
2263 	if (info != NULL)
2264 		kmem_free(info, sizeof (*info));
2265 }
2266 
2267 /*
2268  * Return a vnode for the given NFS Version 2 file handle.
2269  * If no rnode exists for this fhandle, create one and put it
2270  * into the hash queues.  If the rnode for this fhandle
2271  * already exists, return it.
2272  *
2273  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2274  */
2275 vnode_t *
2276 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2277     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2278 {
2279 	int newnode;
2280 	int index;
2281 	vnode_t *vp;
2282 	nfs_fhandle nfh;
2283 	vattr_t va;
2284 
2285 	nfh.fh_len = NFS_FHSIZE;
2286 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2287 
2288 	index = rtablehash(&nfh);
2289 	rw_enter(&rtable[index].r_lock, RW_READER);
2290 
2291 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2292 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2293 
2294 	if (attr != NULL) {
2295 		if (!newnode) {
2296 			rw_exit(&rtable[index].r_lock);
2297 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2298 		} else {
2299 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2300 				vp->v_type = VBAD;
2301 			else
2302 				vp->v_type = n2v_type(attr);
2303 			/*
2304 			 * A translation here seems to be necessary
2305 			 * because this function can be called
2306 			 * with `attr' that has come from the wire,
2307 			 * and been operated on by vattr_to_nattr().
2308 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2309 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2310 			 * ->makenfsnode().
2311 			 */
2312 			if ((attr->na_rdev & 0xffff0000) == 0)
2313 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2314 			else
2315 				vp->v_rdev = expldev(n2v_rdev(attr));
2316 			nfs_attrcache(vp, attr, t);
2317 			rw_exit(&rtable[index].r_lock);
2318 		}
2319 	} else {
2320 		if (newnode) {
2321 			PURGE_ATTRCACHE(vp);
2322 		}
2323 		rw_exit(&rtable[index].r_lock);
2324 	}
2325 
2326 	return (vp);
2327 }
2328 
2329 /*
2330  * Return a vnode for the given NFS Version 3 file handle.
2331  * If no rnode exists for this fhandle, create one and put it
2332  * into the hash queues.  If the rnode for this fhandle
2333  * already exists, return it.
2334  *
2335  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2336  */
2337 vnode_t *
2338 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2339     cred_t *cr, char *dnm, char *nm)
2340 {
2341 	int newnode;
2342 	int index;
2343 	vnode_t *vp;
2344 
2345 	index = rtablehash((nfs_fhandle *)fh);
2346 	rw_enter(&rtable[index].r_lock, RW_READER);
2347 
2348 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2349 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2350 	    dnm, nm);
2351 
2352 	if (vap == NULL) {
2353 		if (newnode) {
2354 			PURGE_ATTRCACHE(vp);
2355 		}
2356 		rw_exit(&rtable[index].r_lock);
2357 		return (vp);
2358 	}
2359 
2360 	if (!newnode) {
2361 		rw_exit(&rtable[index].r_lock);
2362 		nfs_attr_cache(vp, vap, t, cr);
2363 	} else {
2364 		rnode_t *rp = VTOR(vp);
2365 
2366 		vp->v_type = vap->va_type;
2367 		vp->v_rdev = vap->va_rdev;
2368 
2369 		mutex_enter(&rp->r_statelock);
2370 		if (rp->r_mtime <= t)
2371 			nfs_attrcache_va(vp, vap);
2372 		mutex_exit(&rp->r_statelock);
2373 		rw_exit(&rtable[index].r_lock);
2374 	}
2375 
2376 	return (vp);
2377 }
2378 
2379 vnode_t *
2380 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2381     cred_t *cr, char *dnm, char *nm)
2382 {
2383 	int newnode;
2384 	int index;
2385 	vnode_t *vp;
2386 	vattr_t va;
2387 
2388 	index = rtablehash((nfs_fhandle *)fh);
2389 	rw_enter(&rtable[index].r_lock, RW_READER);
2390 
2391 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2392 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2393 	    dnm, nm);
2394 
2395 	if (attr == NULL) {
2396 		if (newnode) {
2397 			PURGE_ATTRCACHE(vp);
2398 		}
2399 		rw_exit(&rtable[index].r_lock);
2400 		return (vp);
2401 	}
2402 
2403 	if (!newnode) {
2404 		rw_exit(&rtable[index].r_lock);
2405 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2406 	} else {
2407 		if (attr->type < NF3REG || attr->type > NF3FIFO)
2408 			vp->v_type = VBAD;
2409 		else
2410 			vp->v_type = nf3_to_vt[attr->type];
2411 		vp->v_rdev = makedevice(attr->rdev.specdata1,
2412 		    attr->rdev.specdata2);
2413 		nfs3_attrcache(vp, attr, t);
2414 		rw_exit(&rtable[index].r_lock);
2415 	}
2416 
2417 	return (vp);
2418 }
2419 
2420 /*
2421  * Read this comment before making changes to rtablehash()!
2422  * This is a hash function in which seemingly obvious and harmless
2423  * changes can cause escalations costing million dollars!
2424  * Know what you are doing.
2425  *
2426  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2427  * algorithm is currently detailed here:
2428  *
2429  *   http://burtleburtle.net/bob/hash/doobs.html
2430  *
2431  * Of course, the above link may not be valid by the time you are reading
2432  * this, but suffice it to say that the one-at-a-time algorithm works well in
2433  * almost all cases.  If you are changing the algorithm be sure to verify that
2434  * the hash algorithm still provides even distribution in all cases and with
2435  * any server returning filehandles in whatever order (sequential or random).
2436  */
2437 static int
2438 rtablehash(nfs_fhandle *fh)
2439 {
2440 	ulong_t hash, len, i;
2441 	char *key;
2442 
2443 	key = fh->fh_buf;
2444 	len = (ulong_t)fh->fh_len;
2445 	for (hash = 0, i = 0; i < len; i++) {
2446 		hash += key[i];
2447 		hash += (hash << 10);
2448 		hash ^= (hash >> 6);
2449 	}
2450 	hash += (hash << 3);
2451 	hash ^= (hash >> 11);
2452 	hash += (hash << 15);
2453 	return (hash & rtablemask);
2454 }
2455 
/*
 * Find the rnode for file handle fh on vfsp in hash bucket rhtp, or
 * build a new one and insert it into that bucket.  Returns the vnode
 * and sets *newnode to 1 if a new rnode was hashed, 0 if an existing
 * one was found.
 *
 * vops, putapage and compar are installed on a newly built rnode as
 * its vnode operations, r_putapage routine and readdir cache AVL
 * comparator.  dnm and nm are only used on failover mounts, to record
 * "dnm/nm" (or "." when either is NULL) in r_path.  cr is passed on
 * to rinactive() and rp_addfree().
 *
 * Locking: the caller must hold rhtp->r_lock as reader.  The lock is
 * still held on return: as reader when an existing rnode was found
 * (either immediately or after losing the creation race), and as
 * writer when a new rnode was inserted.  The lock is dropped while
 * the rnode is being obtained and initialized.
 */
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int (*compar)(const void *, const void *),
    int *newnode, cred_t *cr, char *dnm, char *nm)
{
	rnode_t *rp;
	rnode_t *trp;
	vnode_t *vp;
	mntinfo_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI(vfsp);
start:
	/* Fast path: the rnode is already hashed. */
	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV(rp);
		nfs_set_vroot(vp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	/*
	 * Recycle the rnode at the head of the freelist when one is
	 * available and rnew has reached nrnode; otherwise allocate
	 * a fresh rnode and vnode.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist != NULL && rnew >= nrnode) {
		rp = rpfreelist;
		rp_rmfree(rp);
		mutex_exit(&rpfreelist_lock);

		vp = RTOV(rp);

		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * More than one reference exists: give up
			 * ours, retake the caller's bucket lock and
			 * restart the lookup.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the reference count now that rinactive()
		 * has run; if another reference exists, give up
		 * ours and restart the lookup.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);
		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		nfs_rw_destroy(&rp->r_rwlock);
		nfs_rw_destroy(&rp->r_lkserlock);
		mutex_destroy(&rp->r_statelock);
		cv_destroy(&rp->r_cv);
		cv_destroy(&rp->r_commit.c_cv);
		nfs_free_r_path(rp);
		avl_destroy(&rp->r_dir);
		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rpfreelist_lock);

		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		/* one more rnode is now allocated */
		atomic_inc_ulong((ulong_t *)&rnew);
#ifdef DEBUG
		clstat_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	/*
	 * Common initialization for both a recycled and a freshly
	 * allocated rnode: wipe it, recreate its locks and condition
	 * variables, and record the file handle and current server.
	 */
	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_fh.fh_len = fh->fh_len;
	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
	rp->r_server = mi->mi_curr_serv;
	if (FAILOVER_MOUNT(mi)) {
		/*
		 * If replicated servers, stash pathnames
		 */
		if (dnm != NULL && nm != NULL) {
			char *s, *p;
			uint_t len;

			/* room for dnm, '/', nm and the terminating NUL */
			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
			rp->r_path = kmem_alloc(len, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += len;
#endif
			s = rp->r_path;
			for (p = dnm; *p; p++)
				*s++ = *p;
			*s++ = '/';
			for (p = nm; *p; p++)
				*s++ = *p;
			*s = '\0';
		} else {
			/* special case for root */
			rp->r_path = kmem_alloc(2, KM_SLEEP);
#ifdef DEBUG
			clstat_debug.rpath.value.ui64 += 2;
#endif
			*rp->r_path = '.';
			*(rp->r_path + 1) = '\0';
		}
	}
	/* the rnode holds a reference on its vfs */
	VFS_HOLD(vfsp);
	rp->r_putapage = putapage;
	rp->r_hashq = rhtp;
	rp->r_flags = RREADDIRPLUS;
	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	vp->v_type = VNON;
	vp->v_flag |= VMODSORT;
	nfs_set_vroot(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
		/*
		 * Lost the race: return the rnode that is already
		 * hashed, hand ours to rp_addfree(), and give the
		 * caller back the bucket lock as reader.
		 */
		vp = RTOV(trp);
		nfs_set_vroot(vp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	/* Won the race: hash ours; bucket lock stays held as writer. */
	rp_addhash(rp);
	*newnode = 1;
	return (vp);
}
2619 
2620 /*
2621  * Callback function to check if the page should be marked as
2622  * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2623  */
2624 int
2625 nfs_setmod_check(page_t *pp)
2626 {
2627 	if (pp->p_fsdata != C_NOCOMMIT) {
2628 		pp->p_fsdata = C_NOCOMMIT;
2629 		return (1);
2630 	}
2631 	return (0);
2632 }
2633 
2634 static void
2635 nfs_set_vroot(vnode_t *vp)
2636 {
2637 	rnode_t *rp;
2638 	nfs_fhandle *rootfh;
2639 
2640 	rp = VTOR(vp);
2641 	rootfh = &rp->r_server->sv_fhandle;
2642 	if (rootfh->fh_len == rp->r_fh.fh_len &&
2643 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2644 		if (!(vp->v_flag & VROOT)) {
2645 			mutex_enter(&vp->v_lock);
2646 			vp->v_flag |= VROOT;
2647 			mutex_exit(&vp->v_lock);
2648 		}
2649 	}
2650 }
2651 
2652 static void
2653 nfs_free_r_path(rnode_t *rp)
2654 {
2655 	char *path;
2656 	size_t len;
2657 
2658 	path = rp->r_path;
2659 	if (path) {
2660 		rp->r_path = NULL;
2661 		len = strlen(path) + 1;
2662 		kmem_free(path, len);
2663 #ifdef DEBUG
2664 		clstat_debug.rpath.value.ui64 -= len;
2665 #endif
2666 	}
2667 }
2668 
/*
 * Put an rnode on the free list.
 *
 * Rnodes which were allocated above and beyond the normal limit
 * are immediately freed.
 *
 * The caller passes in a reference on the rnode's vnode (v_count is
 * asserted to be at least 1) and the rnode must not already be on the
 * freelist.  That reference is either dropped (when other references
 * exist), kept with the rnode on the freelist, or consumed when the
 * rnode is destroyed.  cr is passed on to rinactive().
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * or if the vfs has been marked VFS_UNMOUNTED,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			/*
			 * Another reference exists: just drop ours
			 * and leave the rnode hashed.
			 */
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE because the rnode was marked
		 * with RDIRTY or for a modified page.  This
		 * reference may have been acquired before our call
		 * to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode can not be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other threads have acquired a reference
	 * to indicate that the rnode should not be placed on the
	 * freelist.  If another reference has been acquired, then
	 * just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
	 */
	mutex_enter(&rpfreelist_lock);
	if (rpfreelist == NULL) {
		/* empty freelist: rp becomes a one-element circular list */
		rp->r_freef = rp;
		rp->r_freeb = rp;
		rpfreelist = rp;
	} else {
		/* link rp in just behind the current head, i.e. at the tail */
		rp->r_freef = rpfreelist;
		rp->r_freeb = rpfreelist->r_freeb;
		rpfreelist->r_freeb->r_freef = rp;
		rpfreelist->r_freeb = rp;
		/* nothing cached: make rp the new head instead */
		if (!vn_has_cached_data(vp) &&
		    !HAVE_RDDIR_CACHE(rp) &&
		    rp->r_symlink.contents == NULL &&
		    rp->r_secattr == NULL &&
		    rp->r_pathconf == NULL)
			rpfreelist = rp;
	}
	mutex_exit(&rpfreelist_lock);

	rw_exit(&rp->r_hashq->r_lock);
}
2789 
2790 /*
2791  * Remove an rnode from the free list.
2792  *
2793  * The caller must be holding rpfreelist_lock and the rnode
2794  * must be on the freelist.
2795  */
2796 static void
2797 rp_rmfree(rnode_t *rp)
2798 {
2799 
2800 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2801 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2802 
2803 	if (rp == rpfreelist) {
2804 		rpfreelist = rp->r_freef;
2805 		if (rp == rpfreelist)
2806 			rpfreelist = NULL;
2807 	}
2808 
2809 	rp->r_freeb->r_freef = rp->r_freef;
2810 	rp->r_freef->r_freeb = rp->r_freeb;
2811 
2812 	rp->r_freef = rp->r_freeb = NULL;
2813 }
2814 
2815 /*
2816  * Put a rnode in the hash table.
2817  *
2818  * The caller must be holding the exclusive hash queue lock.
2819  */
2820 static void
2821 rp_addhash(rnode_t *rp)
2822 {
2823 
2824 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2825 	ASSERT(!(rp->r_flags & RHASHED));
2826 
2827 	rp->r_hashf = rp->r_hashq->r_hashf;
2828 	rp->r_hashq->r_hashf = rp;
2829 	rp->r_hashb = (rnode_t *)rp->r_hashq;
2830 	rp->r_hashf->r_hashb = rp;
2831 
2832 	mutex_enter(&rp->r_statelock);
2833 	rp->r_flags |= RHASHED;
2834 	mutex_exit(&rp->r_statelock);
2835 }
2836 
2837 /*
2838  * Remove a rnode from the hash table.
2839  *
2840  * The caller must be holding the hash queue lock.
2841  */
2842 static void
2843 rp_rmhash_locked(rnode_t *rp)
2844 {
2845 
2846 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2847 	ASSERT(rp->r_flags & RHASHED);
2848 
2849 	rp->r_hashb->r_hashf = rp->r_hashf;
2850 	rp->r_hashf->r_hashb = rp->r_hashb;
2851 
2852 	mutex_enter(&rp->r_statelock);
2853 	rp->r_flags &= ~RHASHED;
2854 	mutex_exit(&rp->r_statelock);
2855 }
2856 
2857 /*
2858  * Remove a rnode from the hash table.
2859  *
2860  * The caller must not be holding the hash queue lock.
2861  */
2862 void
2863 rp_rmhash(rnode_t *rp)
2864 {
2865 
2866 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2867 	rp_rmhash_locked(rp);
2868 	rw_exit(&rp->r_hashq->r_lock);
2869 }
2870 
2871 /*
2872  * Lookup a rnode by fhandle.
2873  *
2874  * The caller must be holding the hash queue lock, either shared or exclusive.
2875  */
2876 static rnode_t *
2877 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2878 {
2879 	rnode_t *rp;
2880 	vnode_t *vp;
2881 
2882 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2883 
2884 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2885 		vp = RTOV(rp);
2886 		if (vp->v_vfsp == vfsp &&
2887 		    rp->r_fh.fh_len == fh->fh_len &&
2888 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2889 			/*
2890 			 * remove rnode from free list, if necessary.
2891 			 */
2892 			if (rp->r_freef != NULL) {
2893 				mutex_enter(&rpfreelist_lock);
2894 				/*
2895 				 * If the rnode is on the freelist,
2896 				 * then remove it and use that reference
2897 				 * as the new reference.  Otherwise,
2898 				 * need to increment the reference count.
2899 				 */
2900 				if (rp->r_freef != NULL) {
2901 					rp_rmfree(rp);
2902 					mutex_exit(&rpfreelist_lock);
2903 				} else {
2904 					mutex_exit(&rpfreelist_lock);
2905 					VN_HOLD(vp);
2906 				}
2907 			} else
2908 				VN_HOLD(vp);
2909 			return (rp);
2910 		}
2911 	}
2912 	return (NULL);
2913 }
2914 
2915 /*
2916  * Return 1 if there is a active vnode belonging to this vfs in the
2917  * rtable cache.
2918  *
2919  * Several of these checks are done without holding the usual
2920  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2921  * etc. will redo the necessary checks before actually destroying
2922  * any rnodes.
2923  */
2924 int
2925 check_rtable(struct vfs *vfsp)
2926 {
2927 	int index;
2928 	rnode_t *rp;
2929 	vnode_t *vp;
2930 
2931 	for (index = 0; index < rtablesize; index++) {
2932 		rw_enter(&rtable[index].r_lock, RW_READER);
2933 		for (rp = rtable[index].r_hashf;
2934 		    rp != (rnode_t *)(&rtable[index]);
2935 		    rp = rp->r_hashf) {
2936 			vp = RTOV(rp);
2937 			if (vp->v_vfsp == vfsp) {
2938 				if (rp->r_freef == NULL ||
2939 				    (vn_has_cached_data(vp) &&
2940 				    (rp->r_flags & RDIRTY)) ||
2941 				    rp->r_count > 0) {
2942 					rw_exit(&rtable[index].r_lock);
2943 					return (1);
2944 				}
2945 			}
2946 		}
2947 		rw_exit(&rtable[index].r_lock);
2948 	}
2949 	return (0);
2950 }
2951 
2952 /*
2953  * Destroy inactive vnodes from the hash queues which belong to this
2954  * vfs.  It is essential that we destroy all inactive vnodes during a
2955  * forced unmount as well as during a normal unmount.
2956  */
2957 void
2958 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2959 {
2960 	int index;
2961 	rnode_t *rp;
2962 	rnode_t *rlist;
2963 	rnode_t *r_hashf;
2964 	vnode_t *vp;
2965 
2966 	rlist = NULL;
2967 
2968 	for (index = 0; index < rtablesize; index++) {
2969 		rw_enter(&rtable[index].r_lock, RW_WRITER);
2970 		for (rp = rtable[index].r_hashf;
2971 		    rp != (rnode_t *)(&rtable[index]);
2972 		    rp = r_hashf) {
2973 			/* save the hash pointer before destroying */
2974 			r_hashf = rp->r_hashf;
2975 			vp = RTOV(rp);
2976 			if (vp->v_vfsp == vfsp) {
2977 				mutex_enter(&rpfreelist_lock);
2978 				if (rp->r_freef != NULL) {
2979 					rp_rmfree(rp);
2980 					mutex_exit(&rpfreelist_lock);
2981 					rp_rmhash_locked(rp);
2982 					rp->r_hashf = rlist;
2983 					rlist = rp;
2984 				} else
2985 					mutex_exit(&rpfreelist_lock);
2986 			}
2987 		}
2988 		rw_exit(&rtable[index].r_lock);
2989 	}
2990 
2991 	for (rp = rlist; rp != NULL; rp = rlist) {
2992 		rlist = rp->r_hashf;
2993 		/*
2994 		 * This call to rp_addfree will end up destroying the
2995 		 * rnode, but in a safe way with the appropriate set
2996 		 * of checks done.
2997 		 */
2998 		rp_addfree(rp, cr);
2999 	}
3000 
3001 }
3002 
3003 /*
3004  * This routine destroys all the resources associated with the rnode
3005  * and then the rnode itself.
3006  */
3007 static void
3008 destroy_rnode(rnode_t *rp)
3009 {
3010 	vnode_t *vp;
3011 	vfs_t *vfsp;
3012 
3013 	vp = RTOV(rp);
3014 	vfsp = vp->v_vfsp;
3015 
3016 	ASSERT(vp->v_count == 1);
3017 	ASSERT(rp->r_count == 0);
3018 	ASSERT(rp->r_lmpl == NULL);
3019 	ASSERT(rp->r_mapcnt == 0);
3020 	ASSERT(!(rp->r_flags & RHASHED));
3021 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3022 	atomic_dec_ulong((ulong_t *)&rnew);
3023 #ifdef DEBUG
3024 	clstat_debug.nrnode.value.ui64--;
3025 #endif
3026 	nfs_rw_destroy(&rp->r_rwlock);
3027 	nfs_rw_destroy(&rp->r_lkserlock);
3028 	mutex_destroy(&rp->r_statelock);
3029 	cv_destroy(&rp->r_cv);
3030 	cv_destroy(&rp->r_commit.c_cv);
3031 	if (rp->r_flags & RDELMAPLIST)
3032 		list_destroy(&rp->r_indelmap);
3033 	nfs_free_r_path(rp);
3034 	avl_destroy(&rp->r_dir);
3035 	vn_invalid(vp);
3036 	vn_free(vp);
3037 	kmem_cache_free(rnode_cache, rp);
3038 	VFS_RELE(vfsp);
3039 }
3040 
3041 /*
3042  * Flush all vnodes in this (or every) vfs.
3043  * Used by nfs_sync and by nfs_unmount.
3044  */
3045 void
3046 rflush(struct vfs *vfsp, cred_t *cr)
3047 {
3048 	int index;
3049 	rnode_t *rp;
3050 	vnode_t *vp, **vplist;
3051 	long num, cnt;
3052 
3053 	/*
3054 	 * Check to see whether there is anything to do.
3055 	 */
3056 	num = rnew;
3057 	if (num == 0)
3058 		return;
3059 
3060 	/*
3061 	 * Allocate a slot for all currently active rnodes on the
3062 	 * supposition that they all may need flushing.
3063 	 */
3064 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3065 	cnt = 0;
3066 
3067 	/*
3068 	 * Walk the hash queues looking for rnodes with page
3069 	 * lists associated with them.  Make a list of these
3070 	 * files.
3071 	 */
3072 	for (index = 0; index < rtablesize; index++) {
3073 		rw_enter(&rtable[index].r_lock, RW_READER);
3074 		for (rp = rtable[index].r_hashf;
3075 		    rp != (rnode_t *)(&rtable[index]);
3076 		    rp = rp->r_hashf) {
3077 			vp = RTOV(rp);
3078 			/*
3079 			 * Don't bother sync'ing a vp if it
3080 			 * is part of virtual swap device or
3081 			 * if VFS is read-only
3082 			 */
3083 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3084 				continue;
3085 			/*
3086 			 * If flushing all mounted file systems or
3087 			 * the vnode belongs to this vfs, has pages
3088 			 * and is marked as either dirty or mmap'd,
3089 			 * hold and add this vnode to the list of
3090 			 * vnodes to flush.
3091 			 */
3092 			if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3093 			    vn_has_cached_data(vp) &&
3094 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3095 				VN_HOLD(vp);
3096 				vplist[cnt++] = vp;
3097 				if (cnt == num) {
3098 					rw_exit(&rtable[index].r_lock);
3099 					goto toomany;
3100 				}
3101 			}
3102 		}
3103 		rw_exit(&rtable[index].r_lock);
3104 	}
3105 toomany:
3106 
3107 	/*
3108 	 * Flush and release all of the files on the list.
3109 	 */
3110 	while (cnt-- > 0) {
3111 		vp = vplist[cnt];
3112 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3113 		VN_RELE(vp);
3114 	}
3115 
3116 	/*
3117 	 * Free the space allocated to hold the list.
3118 	 */
3119 	kmem_free(vplist, num * sizeof (*vplist));
3120 }
3121 
3122 /*
3123  * This probably needs to be larger than or equal to
3124  * log2(sizeof (struct rnode)) due to the way that rnodes are
3125  * allocated.
3126  */
3127 #define	ACACHE_SHIFT_BITS	9
3128 
3129 static int
3130 acachehash(rnode_t *rp, cred_t *cr)
3131 {
3132 
3133 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3134 	    acachemask);
3135 }
3136 
#ifdef DEBUG
/*
 * DEBUG-only access cache statistics, updated by nfs_access_check():
 * a hit is counted when a matching entry already knows all of the
 * requested access bits; a miss is counted when a matching entry does
 * not know them all, or when no matching entry is found.
 */
static long nfs_access_cache_hits = 0;
static long nfs_access_cache_misses = 0;
#endif
3141 
3142 nfs_access_type_t
3143 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3144 {
3145 	vnode_t *vp;
3146 	acache_t *ap;
3147 	acache_hash_t *hp;
3148 	nfs_access_type_t all;
3149 
3150 	vp = RTOV(rp);
3151 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3152 		return (NFS_ACCESS_UNKNOWN);
3153 
3154 	if (rp->r_acache != NULL) {
3155 		hp = &acache[acachehash(rp, cr)];
3156 		rw_enter(&hp->lock, RW_READER);
3157 		ap = hp->next;
3158 		while (ap != (acache_t *)hp) {
3159 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3160 				if ((ap->known & acc) == acc) {
3161 #ifdef DEBUG
3162 					nfs_access_cache_hits++;
3163 #endif
3164 					if ((ap->allowed & acc) == acc)
3165 						all = NFS_ACCESS_ALLOWED;
3166 					else
3167 						all = NFS_ACCESS_DENIED;
3168 				} else {
3169 #ifdef DEBUG
3170 					nfs_access_cache_misses++;
3171 #endif
3172 					all = NFS_ACCESS_UNKNOWN;
3173 				}
3174 				rw_exit(&hp->lock);
3175 				return (all);
3176 			}
3177 			ap = ap->next;
3178 		}
3179 		rw_exit(&hp->lock);
3180 	}
3181 
3182 #ifdef DEBUG
3183 	nfs_access_cache_misses++;
3184 #endif
3185 	return (NFS_ACCESS_UNKNOWN);
3186 }
3187 
3188 void
3189 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3190 {
3191 	acache_t *ap;
3192 	acache_t *nap;
3193 	acache_hash_t *hp;
3194 
3195 	hp = &acache[acachehash(rp, cr)];
3196 
3197 	/*
3198 	 * Allocate now assuming that mostly an allocation will be
3199 	 * required.  This allows the allocation to happen without
3200 	 * holding the hash bucket locked.
3201 	 */
3202 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3203 	if (nap != NULL) {
3204 		nap->known = acc;
3205 		nap->allowed = resacc;
3206 		nap->rnode = rp;
3207 		crhold(cr);
3208 		nap->cred = cr;
3209 		nap->hashq = hp;
3210 	}
3211 
3212 	rw_enter(&hp->lock, RW_WRITER);
3213 
3214 	if (rp->r_acache != NULL) {
3215 		ap = hp->next;
3216 		while (ap != (acache_t *)hp) {
3217 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3218 				ap->known |= acc;
3219 				ap->allowed &= ~acc;
3220 				ap->allowed |= resacc;
3221 				rw_exit(&hp->lock);
3222 				if (nap != NULL) {
3223 					crfree(nap->cred);
3224 					kmem_cache_free(acache_cache, nap);
3225 				}
3226 				return;
3227 			}
3228 			ap = ap->next;
3229 		}
3230 	}
3231 
3232 	if (nap != NULL) {
3233 #ifdef DEBUG
3234 		clstat_debug.access.value.ui64++;
3235 #endif
3236 		nap->next = hp->next;
3237 		hp->next = nap;
3238 		nap->next->prev = nap;
3239 		nap->prev = (acache_t *)hp;
3240 
3241 		mutex_enter(&rp->r_statelock);
3242 		nap->list = rp->r_acache;
3243 		rp->r_acache = nap;
3244 		mutex_exit(&rp->r_statelock);
3245 	}
3246 
3247 	rw_exit(&hp->lock);
3248 }
3249 
3250 int
3251 nfs_access_purge_rp(rnode_t *rp)
3252 {
3253 	acache_t *ap;
3254 	acache_t *tmpap;
3255 	acache_t *rplist;
3256 
3257 	/*
3258 	 * If there aren't any cached entries, then there is nothing
3259 	 * to free.
3260 	 */
3261 	if (rp->r_acache == NULL)
3262 		return (0);
3263 
3264 	mutex_enter(&rp->r_statelock);
3265 	rplist = rp->r_acache;
3266 	rp->r_acache = NULL;
3267 	mutex_exit(&rp->r_statelock);
3268 
3269 	/*
3270 	 * Loop through each entry in the list pointed to in the
3271 	 * rnode.  Remove each of these entries from the hash
3272 	 * queue that it is on and remove it from the list in
3273 	 * the rnode.
3274 	 */
3275 	for (ap = rplist; ap != NULL; ap = tmpap) {
3276 		rw_enter(&ap->hashq->lock, RW_WRITER);
3277 		ap->prev->next = ap->next;
3278 		ap->next->prev = ap->prev;
3279 		rw_exit(&ap->hashq->lock);
3280 
3281 		tmpap = ap->list;
3282 		crfree(ap->cred);
3283 		kmem_cache_free(acache_cache, ap);
3284 #ifdef DEBUG
3285 		clstat_debug.access.value.ui64--;
3286 #endif
3287 	}
3288 
3289 	return (1);
3290 }
3291 
3292 static const char prefix[] = ".nfs";
3293 
3294 static kmutex_t newnum_lock;
3295 
3296 int
3297 newnum(void)
3298 {
3299 	static uint_t newnum = 0;
3300 	uint_t id;
3301 
3302 	mutex_enter(&newnum_lock);
3303 	if (newnum == 0)
3304 		newnum = gethrestime_sec() & 0xffff;
3305 	id = newnum++;
3306 	mutex_exit(&newnum_lock);
3307 	return (id);
3308 }
3309 
3310 char *
3311 newname(void)
3312 {
3313 	char *news;
3314 	char *s;
3315 	const char *p;
3316 	uint_t id;
3317 
3318 	id = newnum();
3319 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3320 	s = news;
3321 	p = prefix;
3322 	while (*p != '\0')
3323 		*s++ = *p++;
3324 	while (id != 0) {
3325 		*s++ = "0123456789ABCDEF"[id & 0x0f];
3326 		id >>= 4;
3327 	}
3328 	*s = '\0';
3329 	return (news);
3330 }
3331 
3332 /*
3333  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3334  * framework.
3335  */
3336 static int
3337 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3338 {
3339 	ksp->ks_snaptime = gethrtime();
3340 	if (rw == KSTAT_WRITE) {
3341 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3342 #ifdef DEBUG
3343 		/*
3344 		 * Currently only the global zone can write to kstats, but we
3345 		 * add the check just for paranoia.
3346 		 */
3347 		if (INGLOBALZONE(curproc))
3348 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3349 			    sizeof (clstat_debug));
3350 #endif
3351 	} else {
3352 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3353 #ifdef DEBUG
3354 		/*
3355 		 * If we're displaying the "global" debug kstat values, we
3356 		 * display them as-is to all zones since in fact they apply to
3357 		 * the system as a whole.
3358 		 */
3359 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3360 		    sizeof (clstat_debug));
3361 #endif
3362 	}
3363 	return (0);
3364 }
3365 
3366 static void *
3367 clinit_zone(zoneid_t zoneid)
3368 {
3369 	kstat_t *nfs_client_kstat;
3370 	struct nfs_clnt *nfscl;
3371 	uint_t ndata;
3372 
3373 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3374 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3375 	nfscl->nfscl_chtable = NULL;
3376 	nfscl->nfscl_zoneid = zoneid;
3377 
3378 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3379 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3380 #ifdef DEBUG
3381 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3382 #endif
3383 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3384 	    "misc", KSTAT_TYPE_NAMED, ndata,
3385 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3386 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3387 		nfs_client_kstat->ks_snapshot = cl_snapshot;
3388 		kstat_install(nfs_client_kstat);
3389 	}
3390 	mutex_enter(&nfs_clnt_list_lock);
3391 	list_insert_head(&nfs_clnt_list, nfscl);
3392 	mutex_exit(&nfs_clnt_list_lock);
3393 	return (nfscl);
3394 }
3395 
3396 /*ARGSUSED*/
3397 static void
3398 clfini_zone(zoneid_t zoneid, void *arg)
3399 {
3400 	struct nfs_clnt *nfscl = arg;
3401 	chhead_t *chp, *next;
3402 
3403 	if (nfscl == NULL)
3404 		return;
3405 	mutex_enter(&nfs_clnt_list_lock);
3406 	list_remove(&nfs_clnt_list, nfscl);
3407 	mutex_exit(&nfs_clnt_list_lock);
3408 	clreclaim_zone(nfscl, 0);
3409 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3410 		ASSERT(chp->ch_list == NULL);
3411 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3412 		next = chp->ch_next;
3413 		kmem_free(chp, sizeof (*chp));
3414 	}
3415 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3416 	mutex_destroy(&nfscl->nfscl_chtable_lock);
3417 	kmem_free(nfscl, sizeof (*nfscl));
3418 }
3419 
3420 /*
3421  * Called by endpnt_destructor to make sure the client handles are
3422  * cleaned up before the RPC endpoints.  This becomes a no-op if
3423  * clfini_zone (above) is called first.  This function is needed
3424  * (rather than relying on clfini_zone to clean up) because the ZSD
3425  * callbacks have no ordering mechanism, so we have no way to ensure
3426  * that clfini_zone is called before endpnt_destructor.
3427  */
3428 void
3429 clcleanup_zone(zoneid_t zoneid)
3430 {
3431 	struct nfs_clnt *nfscl;
3432 
3433 	mutex_enter(&nfs_clnt_list_lock);
3434 	nfscl = list_head(&nfs_clnt_list);
3435 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3436 		if (nfscl->nfscl_zoneid == zoneid) {
3437 			clreclaim_zone(nfscl, 0);
3438 			break;
3439 		}
3440 	}
3441 	mutex_exit(&nfs_clnt_list_lock);
3442 }
3443 
3444 int
3445 nfs_subrinit(void)
3446 {
3447 	int i;
3448 	ulong_t nrnode_max;
3449 
3450 	/*
3451 	 * Allocate and initialize the rnode hash queues
3452 	 */
3453 	if (nrnode <= 0)
3454 		nrnode = ncsize;
3455 	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3456 	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3457 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3458 		    "!setting nrnode to max value of %ld", nrnode_max);
3459 		nrnode = nrnode_max;
3460 	}
3461 
3462 	rtablesize = 1 << highbit(nrnode / hashlen);
3463 	rtablemask = rtablesize - 1;
3464 	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3465 	for (i = 0; i < rtablesize; i++) {
3466 		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3467 		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3468 		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3469 	}
3470 	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3471 	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3472 
3473 	/*
3474 	 * Allocate and initialize the access cache
3475 	 */
3476 
3477 	/*
3478 	 * Initial guess is one access cache entry per rnode unless
3479 	 * nacache is set to a non-zero value and then it is used to
3480 	 * indicate a guess at the number of access cache entries.
3481 	 */
3482 	if (nacache > 0)
3483 		acachesize = 1 << highbit(nacache / hashlen);
3484 	else
3485 		acachesize = rtablesize;
3486 	acachemask = acachesize - 1;
3487 	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3488 	for (i = 0; i < acachesize; i++) {
3489 		acache[i].next = (acache_t *)&acache[i];
3490 		acache[i].prev = (acache_t *)&acache[i];
3491 		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3492 	}
3493 	acache_cache = kmem_cache_create("nfs_access_cache",
3494 	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3495 	/*
3496 	 * Allocate and initialize the client handle cache
3497 	 */
3498 	chtab_cache = kmem_cache_create("client_handle_cache",
3499 	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3500 	/*
3501 	 * Initialize the list of per-zone client handles (and associated data).
3502 	 * This needs to be done before we call zone_key_create().
3503 	 */
3504 	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3505 	    offsetof(struct nfs_clnt, nfscl_node));
3506 	/*
3507 	 * Initialize the zone_key for per-zone client handle lists.
3508 	 */
3509 	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3510 	/*
3511 	 * Initialize the various mutexes and reader/writer locks
3512 	 */
3513 	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3514 	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3515 	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3516 
3517 	/*
3518 	 * Assign unique major number for all nfs mounts
3519 	 */
3520 	if ((nfs_major = getudev()) == -1) {
3521 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
3522 		    "nfs: init: can't get unique device number");
3523 		nfs_major = 0;
3524 	}
3525 	nfs_minor = 0;
3526 
3527 	if (nfs3_jukebox_delay == 0)
3528 		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3529 
3530 	return (0);
3531 }
3532 
3533 void
3534 nfs_subrfini(void)
3535 {
3536 	int i;
3537 
3538 	/*
3539 	 * Deallocate the rnode hash queues
3540 	 */
3541 	kmem_cache_destroy(rnode_cache);
3542 
3543 	for (i = 0; i < rtablesize; i++)
3544 		rw_destroy(&rtable[i].r_lock);
3545 	kmem_free(rtable, rtablesize * sizeof (*rtable));
3546 
3547 	/*
3548 	 * Deallocated the access cache
3549 	 */
3550 	kmem_cache_destroy(acache_cache);
3551 
3552 	for (i = 0; i < acachesize; i++)
3553 		rw_destroy(&acache[i].lock);
3554 	kmem_free(acache, acachesize * sizeof (*acache));
3555 
3556 	/*
3557 	 * Deallocate the client handle cache
3558 	 */
3559 	kmem_cache_destroy(chtab_cache);
3560 
3561 	/*
3562 	 * Destroy the various mutexes and reader/writer locks
3563 	 */
3564 	mutex_destroy(&rpfreelist_lock);
3565 	mutex_destroy(&newnum_lock);
3566 	mutex_destroy(&nfs_minor_lock);
3567 	(void) zone_key_delete(nfsclnt_zone_key);
3568 }
3569 
3570 enum nfsstat
3571 puterrno(int error)
3572 {
3573 
3574 	switch (error) {
3575 	case EOPNOTSUPP:
3576 		return (NFSERR_OPNOTSUPP);
3577 	case ENAMETOOLONG:
3578 		return (NFSERR_NAMETOOLONG);
3579 	case ENOTEMPTY:
3580 		return (NFSERR_NOTEMPTY);
3581 	case EDQUOT:
3582 		return (NFSERR_DQUOT);
3583 	case ESTALE:
3584 		return (NFSERR_STALE);
3585 	case EREMOTE:
3586 		return (NFSERR_REMOTE);
3587 	case ENOSYS:
3588 		return (NFSERR_OPNOTSUPP);
3589 	case EOVERFLOW:
3590 		return (NFSERR_INVAL);
3591 	default:
3592 		return ((enum nfsstat)error);
3593 	}
3594 	/* NOTREACHED */
3595 }
3596 
3597 int
3598 geterrno(enum nfsstat status)
3599 {
3600 
3601 	switch (status) {
3602 	case NFSERR_OPNOTSUPP:
3603 		return (EOPNOTSUPP);
3604 	case NFSERR_NAMETOOLONG:
3605 		return (ENAMETOOLONG);
3606 	case NFSERR_NOTEMPTY:
3607 		return (ENOTEMPTY);
3608 	case NFSERR_DQUOT:
3609 		return (EDQUOT);
3610 	case NFSERR_STALE:
3611 		return (ESTALE);
3612 	case NFSERR_REMOTE:
3613 		return (EREMOTE);
3614 	case NFSERR_WFLUSH:
3615 		return (EIO);
3616 	default:
3617 		return ((int)status);
3618 	}
3619 	/* NOTREACHED */
3620 }
3621 
3622 enum nfsstat3
3623 puterrno3(int error)
3624 {
3625 
3626 #ifdef DEBUG
3627 	switch (error) {
3628 	case 0:
3629 		return (NFS3_OK);
3630 	case EPERM:
3631 		return (NFS3ERR_PERM);
3632 	case ENOENT:
3633 		return (NFS3ERR_NOENT);
3634 	case EIO:
3635 		return (NFS3ERR_IO);
3636 	case ENXIO:
3637 		return (NFS3ERR_NXIO);
3638 	case EACCES:
3639 		return (NFS3ERR_ACCES);
3640 	case EEXIST:
3641 		return (NFS3ERR_EXIST);
3642 	case EXDEV:
3643 		return (NFS3ERR_XDEV);
3644 	case ENODEV:
3645 		return (NFS3ERR_NODEV);
3646 	case ENOTDIR:
3647 		return (NFS3ERR_NOTDIR);
3648 	case EISDIR:
3649 		return (NFS3ERR_ISDIR);
3650 	case EINVAL:
3651 		return (NFS3ERR_INVAL);
3652 	case EFBIG:
3653 		return (NFS3ERR_FBIG);
3654 	case ENOSPC:
3655 		return (NFS3ERR_NOSPC);
3656 	case EROFS:
3657 		return (NFS3ERR_ROFS);
3658 	case EMLINK:
3659 		return (NFS3ERR_MLINK);
3660 	case ENAMETOOLONG:
3661 		return (NFS3ERR_NAMETOOLONG);
3662 	case ENOTEMPTY:
3663 		return (NFS3ERR_NOTEMPTY);
3664 	case EDQUOT:
3665 		return (NFS3ERR_DQUOT);
3666 	case ESTALE:
3667 		return (NFS3ERR_STALE);
3668 	case EREMOTE:
3669 		return (NFS3ERR_REMOTE);
3670 	case ENOSYS:
3671 	case EOPNOTSUPP:
3672 		return (NFS3ERR_NOTSUPP);
3673 	case EOVERFLOW:
3674 		return (NFS3ERR_INVAL);
3675 	default:
3676 		zcmn_err(getzoneid(), CE_WARN,
3677 		    "puterrno3: got error %d", error);
3678 		return ((enum nfsstat3)error);
3679 	}
3680 #else
3681 	switch (error) {
3682 	case ENAMETOOLONG:
3683 		return (NFS3ERR_NAMETOOLONG);
3684 	case ENOTEMPTY:
3685 		return (NFS3ERR_NOTEMPTY);
3686 	case EDQUOT:
3687 		return (NFS3ERR_DQUOT);
3688 	case ESTALE:
3689 		return (NFS3ERR_STALE);
3690 	case ENOSYS:
3691 	case EOPNOTSUPP:
3692 		return (NFS3ERR_NOTSUPP);
3693 	case EREMOTE:
3694 		return (NFS3ERR_REMOTE);
3695 	case EOVERFLOW:
3696 		return (NFS3ERR_INVAL);
3697 	default:
3698 		return ((enum nfsstat3)error);
3699 	}
3700 #endif
3701 }
3702 
3703 int
3704 geterrno3(enum nfsstat3 status)
3705 {
3706 
3707 #ifdef DEBUG
3708 	switch (status) {
3709 	case NFS3_OK:
3710 		return (0);
3711 	case NFS3ERR_PERM:
3712 		return (EPERM);
3713 	case NFS3ERR_NOENT:
3714 		return (ENOENT);
3715 	case NFS3ERR_IO:
3716 		return (EIO);
3717 	case NFS3ERR_NXIO:
3718 		return (ENXIO);
3719 	case NFS3ERR_ACCES:
3720 		return (EACCES);
3721 	case NFS3ERR_EXIST:
3722 		return (EEXIST);
3723 	case NFS3ERR_XDEV:
3724 		return (EXDEV);
3725 	case NFS3ERR_NODEV:
3726 		return (ENODEV);
3727 	case NFS3ERR_NOTDIR:
3728 		return (ENOTDIR);
3729 	case NFS3ERR_ISDIR:
3730 		return (EISDIR);
3731 	case NFS3ERR_INVAL:
3732 		return (EINVAL);
3733 	case NFS3ERR_FBIG:
3734 		return (EFBIG);
3735 	case NFS3ERR_NOSPC:
3736 		return (ENOSPC);
3737 	case NFS3ERR_ROFS:
3738 		return (EROFS);
3739 	case NFS3ERR_MLINK:
3740 		return (EMLINK);
3741 	case NFS3ERR_NAMETOOLONG:
3742 		return (ENAMETOOLONG);
3743 	case NFS3ERR_NOTEMPTY:
3744 		return (ENOTEMPTY);
3745 	case NFS3ERR_DQUOT:
3746 		return (EDQUOT);
3747 	case NFS3ERR_STALE:
3748 		return (ESTALE);
3749 	case NFS3ERR_REMOTE:
3750 		return (EREMOTE);
3751 	case NFS3ERR_BADHANDLE:
3752 		return (ESTALE);
3753 	case NFS3ERR_NOT_SYNC:
3754 		return (EINVAL);
3755 	case NFS3ERR_BAD_COOKIE:
3756 		return (ENOENT);
3757 	case NFS3ERR_NOTSUPP:
3758 		return (EOPNOTSUPP);
3759 	case NFS3ERR_TOOSMALL:
3760 		return (EINVAL);
3761 	case NFS3ERR_SERVERFAULT:
3762 		return (EIO);
3763 	case NFS3ERR_BADTYPE:
3764 		return (EINVAL);
3765 	case NFS3ERR_JUKEBOX:
3766 		return (ENXIO);
3767 	default:
3768 		zcmn_err(getzoneid(), CE_WARN,
3769 		    "geterrno3: got status %d", status);
3770 		return ((int)status);
3771 	}
3772 #else
3773 	switch (status) {
3774 	case NFS3ERR_NAMETOOLONG:
3775 		return (ENAMETOOLONG);
3776 	case NFS3ERR_NOTEMPTY:
3777 		return (ENOTEMPTY);
3778 	case NFS3ERR_DQUOT:
3779 		return (EDQUOT);
3780 	case NFS3ERR_STALE:
3781 	case NFS3ERR_BADHANDLE:
3782 		return (ESTALE);
3783 	case NFS3ERR_NOTSUPP:
3784 		return (EOPNOTSUPP);
3785 	case NFS3ERR_REMOTE:
3786 		return (EREMOTE);
3787 	case NFS3ERR_NOT_SYNC:
3788 	case NFS3ERR_TOOSMALL:
3789 	case NFS3ERR_BADTYPE:
3790 		return (EINVAL);
3791 	case NFS3ERR_BAD_COOKIE:
3792 		return (ENOENT);
3793 	case NFS3ERR_SERVERFAULT:
3794 		return (EIO);
3795 	case NFS3ERR_JUKEBOX:
3796 		return (ENXIO);
3797 	default:
3798 		return ((int)status);
3799 	}
3800 #endif
3801 }
3802 
3803 rddir_cache *
3804 rddir_cache_alloc(int flags)
3805 {
3806 	rddir_cache *rc;
3807 
3808 	rc = kmem_alloc(sizeof (*rc), flags);
3809 	if (rc != NULL) {
3810 		rc->entries = NULL;
3811 		rc->flags = RDDIR;
3812 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3813 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3814 		rc->count = 1;
3815 #ifdef DEBUG
3816 		atomic_inc_64(&clstat_debug.dirent.value.ui64);
3817 #endif
3818 	}
3819 	return (rc);
3820 }
3821 
3822 static void
3823 rddir_cache_free(rddir_cache *rc)
3824 {
3825 
3826 #ifdef DEBUG
3827 	atomic_dec_64(&clstat_debug.dirent.value.ui64);
3828 #endif
3829 	if (rc->entries != NULL) {
3830 #ifdef DEBUG
3831 		rddir_cache_buf_free(rc->entries, rc->buflen);
3832 #else
3833 		kmem_free(rc->entries, rc->buflen);
3834 #endif
3835 	}
3836 	cv_destroy(&rc->cv);
3837 	mutex_destroy(&rc->lock);
3838 	kmem_free(rc, sizeof (*rc));
3839 }
3840 
3841 void
3842 rddir_cache_hold(rddir_cache *rc)
3843 {
3844 
3845 	mutex_enter(&rc->lock);
3846 	rc->count++;
3847 	mutex_exit(&rc->lock);
3848 }
3849 
3850 void
3851 rddir_cache_rele(rddir_cache *rc)
3852 {
3853 
3854 	mutex_enter(&rc->lock);
3855 	ASSERT(rc->count > 0);
3856 	if (--rc->count == 0) {
3857 		mutex_exit(&rc->lock);
3858 		rddir_cache_free(rc);
3859 	} else
3860 		mutex_exit(&rc->lock);
3861 }
3862 
3863 #ifdef DEBUG
3864 char *
3865 rddir_cache_buf_alloc(size_t size, int flags)
3866 {
3867 	char *rc;
3868 
3869 	rc = kmem_alloc(size, flags);
3870 	if (rc != NULL)
3871 		atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3872 	return (rc);
3873 }
3874 
3875 void
3876 rddir_cache_buf_free(void *addr, size_t size)
3877 {
3878 
3879 	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3880 	kmem_free(addr, size);
3881 }
3882 #endif
3883 
3884 static int
3885 nfs_free_data_reclaim(rnode_t *rp)
3886 {
3887 	char *contents;
3888 	int size;
3889 	vsecattr_t *vsp;
3890 	nfs3_pathconf_info *info;
3891 	int freed;
3892 	cred_t *cred;
3893 
3894 	/*
3895 	 * Free any held credentials and caches which
3896 	 * may be associated with this rnode.
3897 	 */
3898 	mutex_enter(&rp->r_statelock);
3899 	cred = rp->r_cred;
3900 	rp->r_cred = NULL;
3901 	contents = rp->r_symlink.contents;
3902 	size = rp->r_symlink.size;
3903 	rp->r_symlink.contents = NULL;
3904 	vsp = rp->r_secattr;
3905 	rp->r_secattr = NULL;
3906 	info = rp->r_pathconf;
3907 	rp->r_pathconf = NULL;
3908 	mutex_exit(&rp->r_statelock);
3909 
3910 	if (cred != NULL)
3911 		crfree(cred);
3912 
3913 	/*
3914 	 * Free the access cache entries.
3915 	 */
3916 	freed = nfs_access_purge_rp(rp);
3917 
3918 	if (!HAVE_RDDIR_CACHE(rp) &&
3919 	    contents == NULL &&
3920 	    vsp == NULL &&
3921 	    info == NULL)
3922 		return (freed);
3923 
3924 	/*
3925 	 * Free the readdir cache entries
3926 	 */
3927 	if (HAVE_RDDIR_CACHE(rp))
3928 		nfs_purge_rddir_cache(RTOV(rp));
3929 
3930 	/*
3931 	 * Free the symbolic link cache.
3932 	 */
3933 	if (contents != NULL) {
3934 
3935 		kmem_free((void *)contents, size);
3936 	}
3937 
3938 	/*
3939 	 * Free any cached ACL.
3940 	 */
3941 	if (vsp != NULL)
3942 		nfs_acl_free(vsp);
3943 
3944 	/*
3945 	 * Free any cached pathconf information.
3946 	 */
3947 	if (info != NULL)
3948 		kmem_free(info, sizeof (*info));
3949 
3950 	return (1);
3951 }
3952 
3953 static int
3954 nfs_active_data_reclaim(rnode_t *rp)
3955 {
3956 	char *contents;
3957 	int size;
3958 	vsecattr_t *vsp;
3959 	nfs3_pathconf_info *info;
3960 	int freed;
3961 
3962 	/*
3963 	 * Free any held credentials and caches which
3964 	 * may be associated with this rnode.
3965 	 */
3966 	if (!mutex_tryenter(&rp->r_statelock))
3967 		return (0);
3968 	contents = rp->r_symlink.contents;
3969 	size = rp->r_symlink.size;
3970 	rp->r_symlink.contents = NULL;
3971 	vsp = rp->r_secattr;
3972 	rp->r_secattr = NULL;
3973 	info = rp->r_pathconf;
3974 	rp->r_pathconf = NULL;
3975 	mutex_exit(&rp->r_statelock);
3976 
3977 	/*
3978 	 * Free the access cache entries.
3979 	 */
3980 	freed = nfs_access_purge_rp(rp);
3981 
3982 	if (!HAVE_RDDIR_CACHE(rp) &&
3983 	    contents == NULL &&
3984 	    vsp == NULL &&
3985 	    info == NULL)
3986 		return (freed);
3987 
3988 	/*
3989 	 * Free the readdir cache entries
3990 	 */
3991 	if (HAVE_RDDIR_CACHE(rp))
3992 		nfs_purge_rddir_cache(RTOV(rp));
3993 
3994 	/*
3995 	 * Free the symbolic link cache.
3996 	 */
3997 	if (contents != NULL) {
3998 
3999 		kmem_free((void *)contents, size);
4000 	}
4001 
4002 	/*
4003 	 * Free any cached ACL.
4004 	 */
4005 	if (vsp != NULL)
4006 		nfs_acl_free(vsp);
4007 
4008 	/*
4009 	 * Free any cached pathconf information.
4010 	 */
4011 	if (info != NULL)
4012 		kmem_free(info, sizeof (*info));
4013 
4014 	return (1);
4015 }
4016 
4017 static int
4018 nfs_free_reclaim(void)
4019 {
4020 	int freed;
4021 	rnode_t *rp;
4022 
4023 #ifdef DEBUG
4024 	clstat_debug.f_reclaim.value.ui64++;
4025 #endif
4026 	freed = 0;
4027 	mutex_enter(&rpfreelist_lock);
4028 	rp = rpfreelist;
4029 	if (rp != NULL) {
4030 		do {
4031 			if (nfs_free_data_reclaim(rp))
4032 				freed = 1;
4033 		} while ((rp = rp->r_freef) != rpfreelist);
4034 	}
4035 	mutex_exit(&rpfreelist_lock);
4036 	return (freed);
4037 }
4038 
4039 static int
4040 nfs_active_reclaim(void)
4041 {
4042 	int freed;
4043 	int index;
4044 	rnode_t *rp;
4045 
4046 #ifdef DEBUG
4047 	clstat_debug.a_reclaim.value.ui64++;
4048 #endif
4049 	freed = 0;
4050 	for (index = 0; index < rtablesize; index++) {
4051 		rw_enter(&rtable[index].r_lock, RW_READER);
4052 		for (rp = rtable[index].r_hashf;
4053 		    rp != (rnode_t *)(&rtable[index]);
4054 		    rp = rp->r_hashf) {
4055 			if (nfs_active_data_reclaim(rp))
4056 				freed = 1;
4057 		}
4058 		rw_exit(&rtable[index].r_lock);
4059 	}
4060 	return (freed);
4061 }
4062 
4063 static int
4064 nfs_rnode_reclaim(void)
4065 {
4066 	int freed;
4067 	rnode_t *rp;
4068 	vnode_t *vp;
4069 
4070 #ifdef DEBUG
4071 	clstat_debug.r_reclaim.value.ui64++;
4072 #endif
4073 	freed = 0;
4074 	mutex_enter(&rpfreelist_lock);
4075 	while ((rp = rpfreelist) != NULL) {
4076 		rp_rmfree(rp);
4077 		mutex_exit(&rpfreelist_lock);
4078 		if (rp->r_flags & RHASHED) {
4079 			vp = RTOV(rp);
4080 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4081 			mutex_enter(&vp->v_lock);
4082 			if (vp->v_count > 1) {
4083 				vp->v_count--;
4084 				mutex_exit(&vp->v_lock);
4085 				rw_exit(&rp->r_hashq->r_lock);
4086 				mutex_enter(&rpfreelist_lock);
4087 				continue;
4088 			}
4089 			mutex_exit(&vp->v_lock);
4090 			rp_rmhash_locked(rp);
4091 			rw_exit(&rp->r_hashq->r_lock);
4092 		}
4093 		/*
4094 		 * This call to rp_addfree will end up destroying the
4095 		 * rnode, but in a safe way with the appropriate set
4096 		 * of checks done.
4097 		 */
4098 		rp_addfree(rp, CRED());
4099 		mutex_enter(&rpfreelist_lock);
4100 	}
4101 	mutex_exit(&rpfreelist_lock);
4102 	return (freed);
4103 }
4104 
/*
 * Reclaim routine passed to kmem_cache_create() for the rnode cache.
 * Tries progressively more drastic measures and stops at the first
 * stage that reports freeing something: cached data on free rnodes,
 * then cached data on active rnodes, then the free rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat_debug.reclaim.value.ui64++;
#endif
	if (!nfs_free_reclaim() && !nfs_active_reclaim())
		(void) nfs_rnode_reclaim();
}
4121 
4122 /*
4123  * NFS client failover support
4124  *
4125  * Routines to copy filehandles
4126  */
4127 void
4128 nfscopyfh(caddr_t fhp, vnode_t *vp)
4129 {
4130 	fhandle_t *dest = (fhandle_t *)fhp;
4131 
4132 	if (dest != NULL)
4133 		*dest = *VTOFH(vp);
4134 }
4135 
4136 void
4137 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4138 {
4139 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4140 
4141 	if (dest != NULL)
4142 		*dest = *VTOFH3(vp);
4143 }
4144 
4145 /*
4146  * NFS client failover support
4147  *
4148  * failover_safe() will test various conditions to ensure that
4149  * failover is permitted for this vnode.  It will be denied
4150  * if:
4151  *	1) the operation in progress does not support failover (NULL fi)
4152  *	2) there are no available replicas (NULL mi_servers->sv_next)
4153  *	3) any locks are outstanding on this file
4154  */
4155 static int
4156 failover_safe(failinfo_t *fi)
4157 {
4158 
4159 	/*
4160 	 * Does this op permit failover?
4161 	 */
4162 	if (fi == NULL || fi->vp == NULL)
4163 		return (0);
4164 
4165 	/*
4166 	 * Are there any alternates to failover to?
4167 	 */
4168 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4169 		return (0);
4170 
4171 	/*
4172 	 * Disable check; we've forced local locking
4173 	 *
4174 	 * if (flk_has_remote_locks(fi->vp))
4175 	 *	return (0);
4176 	 */
4177 
4178 	/*
4179 	 * If we have no partial path, we can't do anything
4180 	 */
4181 	if (VTOR(fi->vp)->r_path == NULL)
4182 		return (0);
4183 
4184 	return (1);
4185 }
4186 
4187 #include <sys/thread.h>
4188 
4189 /*
4190  * NFS client failover support
4191  *
4192  * failover_newserver() will start a search for a new server,
4193  * preferably by starting an async thread to do the work.  If
4194  * someone is already doing this (recognizable by MI_BINDINPROG
4195  * being set), it will simply return and the calling thread
4196  * will queue on the mi_failover_cv condition variable.
4197  */
4198 static void
4199 failover_newserver(mntinfo_t *mi)
4200 {
4201 	/*
4202 	 * Check if someone else is doing this already
4203 	 */
4204 	mutex_enter(&mi->mi_lock);
4205 	if (mi->mi_flags & MI_BINDINPROG) {
4206 		mutex_exit(&mi->mi_lock);
4207 		return;
4208 	}
4209 	mi->mi_flags |= MI_BINDINPROG;
4210 
4211 	/*
4212 	 * Need to hold the vfs struct so that it can't be released
4213 	 * while the failover thread is selecting a new server.
4214 	 */
4215 	VFS_HOLD(mi->mi_vfsp);
4216 
4217 	/*
4218 	 * Start a thread to do the real searching.
4219 	 */
4220 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4221 
4222 	mutex_exit(&mi->mi_lock);
4223 }
4224 
4225 /*
4226  * NFS client failover support
4227  *
4228  * failover_thread() will find a new server to replace the one
4229  * currently in use, wake up other threads waiting on this mount
4230  * point, and die.  It will start at the head of the server list
4231  * and poll servers until it finds one with an NFS server which is
4232  * registered and responds to a NULL procedure ping.
4233  *
4234  * XXX failover_thread is unsafe within the scope of the
4235  * present model defined for cpr to suspend the system.
4236  * Specifically, over-the-wire calls made by the thread
4237  * are unsafe. The thread needs to be reevaluated in case of
4238  * future updates to the cpr suspend model.
4239  */
4240 static void
4241 failover_thread(mntinfo_t *mi)
4242 {
4243 	servinfo_t *svp = NULL;
4244 	CLIENT *cl;
4245 	enum clnt_stat status;
4246 	struct timeval tv;
4247 	int error;
4248 	int oncethru = 0;
4249 	callb_cpr_t cprinfo;
4250 	rnode_t *rp;
4251 	int index;
4252 	char *srvnames;
4253 	size_t srvnames_len;
4254 	struct nfs_clnt *nfscl = NULL;
4255 	zoneid_t zoneid = getzoneid();
4256 
4257 #ifdef DEBUG
4258 	/*
4259 	 * This is currently only needed to access counters which exist on
4260 	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4261 	 * on non-DEBUG kernels.
4262 	 */
4263 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4264 	ASSERT(nfscl != NULL);
4265 #endif
4266 
4267 	/*
4268 	 * Its safe to piggyback on the mi_lock since failover_newserver()
4269 	 * code guarantees that there will be only one failover thread
4270 	 * per mountinfo at any instance.
4271 	 */
4272 	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4273 	    "failover_thread");
4274 
4275 	mutex_enter(&mi->mi_lock);
4276 	while (mi->mi_readers) {
4277 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4278 		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4279 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4280 	}
4281 	mutex_exit(&mi->mi_lock);
4282 
4283 	tv.tv_sec = 2;
4284 	tv.tv_usec = 0;
4285 
4286 	/*
4287 	 * Ping the null NFS procedure of every server in
4288 	 * the list until one responds.  We always start
4289 	 * at the head of the list and always skip the one
4290 	 * that is current, since it's caused us a problem.
4291 	 */
4292 	while (svp == NULL) {
4293 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4294 			if (!oncethru && svp == mi->mi_curr_serv)
4295 				continue;
4296 
4297 			/*
4298 			 * If the file system was forcibly umounted
4299 			 * while trying to do a failover, then just
4300 			 * give up on the failover.  It won't matter
4301 			 * what the server is.
4302 			 */
4303 			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4304 				svp = NULL;
4305 				goto done;
4306 			}
4307 
4308 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4309 			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4310 			if (error)
4311 				continue;
4312 
4313 			if (!(mi->mi_flags & MI_INT))
4314 				cl->cl_nosignal = TRUE;
4315 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4316 			    xdr_void, NULL, tv);
4317 			if (!(mi->mi_flags & MI_INT))
4318 				cl->cl_nosignal = FALSE;
4319 			AUTH_DESTROY(cl->cl_auth);
4320 			CLNT_DESTROY(cl);
4321 			if (status == RPC_SUCCESS) {
4322 				if (svp == mi->mi_curr_serv) {
4323 #ifdef DEBUG
4324 					zcmn_err(zoneid, CE_NOTE,
4325 			"NFS%d: failing over: selecting original server %s",
4326 					    mi->mi_vers, svp->sv_hostname);
4327 #else
4328 					zcmn_err(zoneid, CE_NOTE,
4329 			"NFS: failing over: selecting original server %s",
4330 					    svp->sv_hostname);
4331 #endif
4332 				} else {
4333 #ifdef DEBUG
4334 					zcmn_err(zoneid, CE_NOTE,
4335 				    "NFS%d: failing over from %s to %s",
4336 					    mi->mi_vers,
4337 					    mi->mi_curr_serv->sv_hostname,
4338 					    svp->sv_hostname);
4339 #else
4340 					zcmn_err(zoneid, CE_NOTE,
4341 				    "NFS: failing over from %s to %s",
4342 					    mi->mi_curr_serv->sv_hostname,
4343 					    svp->sv_hostname);
4344 #endif
4345 				}
4346 				break;
4347 			}
4348 		}
4349 
4350 		if (svp == NULL) {
4351 			if (!oncethru) {
4352 				srvnames = nfs_getsrvnames(mi, &srvnames_len);
4353 #ifdef DEBUG
4354 				zprintf(zoneid,
4355 				    "NFS%d servers %s not responding "
4356 				    "still trying\n", mi->mi_vers, srvnames);
4357 #else
4358 				zprintf(zoneid, "NFS servers %s not responding "
4359 				    "still trying\n", srvnames);
4360 #endif
4361 				oncethru = 1;
4362 			}
4363 			mutex_enter(&mi->mi_lock);
4364 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
4365 			mutex_exit(&mi->mi_lock);
4366 			delay(hz);
4367 			mutex_enter(&mi->mi_lock);
4368 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4369 			mutex_exit(&mi->mi_lock);
4370 		}
4371 	}
4372 
4373 	if (oncethru) {
4374 #ifdef DEBUG
4375 		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4376 #else
4377 		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4378 #endif
4379 	}
4380 
4381 	if (svp != mi->mi_curr_serv) {
4382 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4383 		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4384 		rw_enter(&rtable[index].r_lock, RW_WRITER);
4385 		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4386 		    mi->mi_vfsp);
4387 		if (rp != NULL) {
4388 			if (rp->r_flags & RHASHED)
4389 				rp_rmhash_locked(rp);
4390 			rw_exit(&rtable[index].r_lock);
4391 			rp->r_server = svp;
4392 			rp->r_fh = svp->sv_fhandle;
4393 			(void) nfs_free_data_reclaim(rp);
4394 			index = rtablehash(&rp->r_fh);
4395 			rp->r_hashq = &rtable[index];
4396 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4397 			vn_exists(RTOV(rp));
4398 			rp_addhash(rp);
4399 			rw_exit(&rp->r_hashq->r_lock);
4400 			VN_RELE(RTOV(rp));
4401 		} else
4402 			rw_exit(&rtable[index].r_lock);
4403 	}
4404 
4405 done:
4406 	if (oncethru)
4407 		kmem_free(srvnames, srvnames_len);
4408 	mutex_enter(&mi->mi_lock);
4409 	mi->mi_flags &= ~MI_BINDINPROG;
4410 	if (svp != NULL) {
4411 		mi->mi_curr_serv = svp;
4412 		mi->mi_failover++;
4413 #ifdef DEBUG
4414 	nfscl->nfscl_stat.failover.value.ui64++;
4415 #endif
4416 	}
4417 	cv_broadcast(&mi->mi_failover_cv);
4418 	CALLB_CPR_EXIT(&cprinfo);
4419 	VFS_RELE(mi->mi_vfsp);
4420 	zthread_exit();
4421 	/* NOTREACHED */
4422 }
4423 
4424 /*
4425  * NFS client failover support
4426  *
4427  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4428  * is cleared, meaning that failover is complete.  Called with
4429  * mi_lock mutex held.
4430  */
4431 static int
4432 failover_wait(mntinfo_t *mi)
4433 {
4434 	k_sigset_t smask;
4435 
4436 	/*
4437 	 * If someone else is hunting for a living server,
4438 	 * sleep until it's done.  After our sleep, we may
4439 	 * be bound to the right server and get off cheaply.
4440 	 */
4441 	while (mi->mi_flags & MI_BINDINPROG) {
4442 		/*
4443 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4444 		 * and SIGTERM. (Preserving the existing masks).
4445 		 * Mask out SIGINT if mount option nointr is specified.
4446 		 */
4447 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4448 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4449 			/*
4450 			 * restore original signal mask
4451 			 */
4452 			sigunintr(&smask);
4453 			return (EINTR);
4454 		}
4455 		/*
4456 		 * restore original signal mask
4457 		 */
4458 		sigunintr(&smask);
4459 	}
4460 	return (0);
4461 }
4462 
4463 /*
4464  * NFS client failover support
4465  *
4466  * failover_remap() will do a partial pathname lookup and find the
4467  * desired vnode on the current server.  The interim vnode will be
4468  * discarded after we pilfer the new filehandle.
4469  *
4470  * Side effects:
4471  * - This routine will also update the filehandle in the args structure
4472  *    pointed to by the fi->fhp pointer if it is non-NULL.
4473  */
4474 
4475 static int
4476 failover_remap(failinfo_t *fi)
4477 {
4478 	vnode_t *vp, *nvp, *rootvp;
4479 	rnode_t *rp, *nrp;
4480 	mntinfo_t *mi;
4481 	int error;
4482 #ifdef DEBUG
4483 	struct nfs_clnt *nfscl;
4484 
4485 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4486 	ASSERT(nfscl != NULL);
4487 #endif
4488 	/*
4489 	 * Sanity check
4490 	 */
4491 	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4492 		return (EINVAL);
4493 	vp = fi->vp;
4494 	rp = VTOR(vp);
4495 	mi = VTOMI(vp);
4496 
4497 	if (!(vp->v_flag & VROOT)) {
4498 		/*
4499 		 * Given the root fh, use the path stored in
4500 		 * the rnode to find the fh for the new server.
4501 		 */
4502 		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4503 		if (error)
4504 			return (error);
4505 
4506 		error = failover_lookup(rp->r_path, rootvp,
4507 		    fi->lookupproc, fi->xattrdirproc, &nvp);
4508 
4509 		VN_RELE(rootvp);
4510 
4511 		if (error)
4512 			return (error);
4513 
4514 		/*
4515 		 * If we found the same rnode, we're done now
4516 		 */
4517 		if (nvp == vp) {
4518 			/*
4519 			 * Failed and the new server may physically be same
4520 			 * OR may share a same disk subsystem. In this case
4521 			 * file handle for a particular file path is not going
4522 			 * to change, given the same filehandle lookup will
4523 			 * always locate the same rnode as the existing one.
4524 			 * All we might need to do is to update the r_server
4525 			 * with the current servinfo.
4526 			 */
4527 			if (!VALID_FH(fi)) {
4528 				rp->r_server = mi->mi_curr_serv;
4529 			}
4530 			VN_RELE(nvp);
4531 			return (0);
4532 		}
4533 
4534 		/*
4535 		 * Try to make it so that no one else will find this
4536 		 * vnode because it is just a temporary to hold the
4537 		 * new file handle until that file handle can be
4538 		 * copied to the original vnode/rnode.
4539 		 */
4540 		nrp = VTOR(nvp);
4541 		mutex_enter(&mi->mi_remap_lock);
4542 		/*
4543 		 * Some other thread could have raced in here and could
4544 		 * have done the remap for this particular rnode before
4545 		 * this thread here. Check for rp->r_server and
4546 		 * mi->mi_curr_serv and return if they are same.
4547 		 */
4548 		if (VALID_FH(fi)) {
4549 			mutex_exit(&mi->mi_remap_lock);
4550 			VN_RELE(nvp);
4551 			return (0);
4552 		}
4553 
4554 		if (nrp->r_flags & RHASHED)
4555 			rp_rmhash(nrp);
4556 
4557 		/*
4558 		 * As a heuristic check on the validity of the new
4559 		 * file, check that the size and type match against
4560 		 * that we remember from the old version.
4561 		 */
4562 		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4563 			mutex_exit(&mi->mi_remap_lock);
4564 			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4565 			    "NFS replicas %s and %s: file %s not same.",
4566 			    rp->r_server->sv_hostname,
4567 			    nrp->r_server->sv_hostname, rp->r_path);
4568 			VN_RELE(nvp);
4569 			return (EINVAL);
4570 		}
4571 
4572 		/*
4573 		 * snarf the filehandle from the new rnode
4574 		 * then release it, again while updating the
4575 		 * hash queues for the rnode.
4576 		 */
4577 		if (rp->r_flags & RHASHED)
4578 			rp_rmhash(rp);
4579 		rp->r_server = mi->mi_curr_serv;
4580 		rp->r_fh = nrp->r_fh;
4581 		rp->r_hashq = nrp->r_hashq;
4582 		/*
4583 		 * Copy the attributes from the new rnode to the old
4584 		 * rnode.  This will help to reduce unnecessary page
4585 		 * cache flushes.
4586 		 */
4587 		rp->r_attr = nrp->r_attr;
4588 		rp->r_attrtime = nrp->r_attrtime;
4589 		rp->r_mtime = nrp->r_mtime;
4590 		(void) nfs_free_data_reclaim(rp);
4591 		nfs_setswaplike(vp, &rp->r_attr);
4592 		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4593 		rp_addhash(rp);
4594 		rw_exit(&rp->r_hashq->r_lock);
4595 		mutex_exit(&mi->mi_remap_lock);
4596 		VN_RELE(nvp);
4597 	}
4598 
4599 	/*
4600 	 * Update successful failover remap count
4601 	 */
4602 	mutex_enter(&mi->mi_lock);
4603 	mi->mi_remap++;
4604 	mutex_exit(&mi->mi_lock);
4605 #ifdef DEBUG
4606 	nfscl->nfscl_stat.remap.value.ui64++;
4607 #endif
4608 
4609 	/*
4610 	 * If we have a copied filehandle to update, do it now.
4611 	 */
4612 	if (fi->fhp != NULL && fi->copyproc != NULL)
4613 		(*fi->copyproc)(fi->fhp, vp);
4614 
4615 	return (0);
4616 }
4617 
4618 /*
4619  * NFS client failover support
4620  *
4621  * We want a simple pathname lookup routine to parse the pieces
4622  * of path in rp->r_path.  We know that the path was a created
4623  * as rnodes were made, so we know we have only to deal with
4624  * paths that look like:
4625  *	dir1/dir2/dir3/file
4626  * Any evidence of anything like .., symlinks, and ENOTDIR
4627  * are hard errors, because they mean something in this filesystem
4628  * is different from the one we came from, or has changed under
4629  * us in some way.  If this is true, we want the failure.
4630  *
4631  * Extended attributes: if the filesystem is mounted with extended
4632  * attributes enabled (-o xattr), the attribute directory will be
4633  * represented in the r_path as the magic name XATTR_RPATH. So if
4634  * we see that name in the pathname, is must be because this node
4635  * is an extended attribute.  Therefore, look it up that way.
4636  */
4637 static int
4638 failover_lookup(char *path, vnode_t *root,
4639     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4640     vnode_t *, cred_t *, int),
4641     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4642     vnode_t **new)
4643 {
4644 	vnode_t *dvp, *nvp;
4645 	int error = EINVAL;
4646 	char *s, *p, *tmppath;
4647 	size_t len;
4648 	mntinfo_t *mi;
4649 	bool_t xattr;
4650 
4651 	/* Make local copy of path */
4652 	len = strlen(path) + 1;
4653 	tmppath = kmem_alloc(len, KM_SLEEP);
4654 	(void) strcpy(tmppath, path);
4655 	s = tmppath;
4656 
4657 	dvp = root;
4658 	VN_HOLD(dvp);
4659 	mi = VTOMI(root);
4660 	xattr = mi->mi_flags & MI_EXTATTR;
4661 
4662 	do {
4663 		p = strchr(s, '/');
4664 		if (p != NULL)
4665 			*p = '\0';
4666 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4667 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4668 			    RFSCALL_SOFT);
4669 		} else {
4670 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4671 			    CRED(), RFSCALL_SOFT);
4672 		}
4673 		if (p != NULL)
4674 			*p++ = '/';
4675 		if (error) {
4676 			VN_RELE(dvp);
4677 			kmem_free(tmppath, len);
4678 			return (error);
4679 		}
4680 		s = p;
4681 		VN_RELE(dvp);
4682 		dvp = nvp;
4683 	} while (p != NULL);
4684 
4685 	if (nvp != NULL && new != NULL)
4686 		*new = nvp;
4687 	kmem_free(tmppath, len);
4688 	return (0);
4689 }
4690 
4691 /*
4692  * NFS client failover support
4693  *
4694  * sv_free() frees the malloc'd portion of a "servinfo_t".
4695  */
4696 void
4697 sv_free(servinfo_t *svp)
4698 {
4699 	servinfo_t *next;
4700 	struct knetconfig *knconf;
4701 
4702 	while (svp != NULL) {
4703 		next = svp->sv_next;
4704 		if (svp->sv_secdata)
4705 			sec_clnt_freeinfo(svp->sv_secdata);
4706 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4707 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4708 		knconf = svp->sv_knconf;
4709 		if (knconf != NULL) {
4710 			if (knconf->knc_protofmly != NULL)
4711 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4712 			if (knconf->knc_proto != NULL)
4713 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4714 			kmem_free(knconf, sizeof (*knconf));
4715 		}
4716 		knconf = svp->sv_origknconf;
4717 		if (knconf != NULL) {
4718 			if (knconf->knc_protofmly != NULL)
4719 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4720 			if (knconf->knc_proto != NULL)
4721 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4722 			kmem_free(knconf, sizeof (*knconf));
4723 		}
4724 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4725 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4726 		mutex_destroy(&svp->sv_lock);
4727 		kmem_free(svp, sizeof (*svp));
4728 		svp = next;
4729 	}
4730 }
4731 
4732 /*
4733  * Only can return non-zero if intr != 0.
4734  */
4735 int
4736 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4737 {
4738 
4739 	mutex_enter(&l->lock);
4740 
4741 	/*
4742 	 * If this is a nested enter, then allow it.  There
4743 	 * must be as many exits as enters through.
4744 	 */
4745 	if (l->owner == curthread) {
4746 		/* lock is held for writing by current thread */
4747 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4748 		l->count--;
4749 	} else if (rw == RW_READER) {
4750 		/*
4751 		 * While there is a writer active or writers waiting,
4752 		 * then wait for them to finish up and move on.  Then,
4753 		 * increment the count to indicate that a reader is
4754 		 * active.
4755 		 */
4756 		while (l->count < 0 || l->waiters > 0) {
4757 			if (intr) {
4758 				klwp_t *lwp = ttolwp(curthread);
4759 
4760 				if (lwp != NULL)
4761 					lwp->lwp_nostop++;
4762 				if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4763 					if (lwp != NULL)
4764 						lwp->lwp_nostop--;
4765 					mutex_exit(&l->lock);
4766 					return (EINTR);
4767 				}
4768 				if (lwp != NULL)
4769 					lwp->lwp_nostop--;
4770 			} else
4771 				cv_wait(&l->cv_rd, &l->lock);
4772 		}
4773 		ASSERT(l->count < INT_MAX);
4774 #ifdef	DEBUG
4775 		if ((l->count % 10000) == 9999)
4776 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4777 			    "rwlock @ %p\n", l->count, (void *)&l);
4778 #endif
4779 		l->count++;
4780 	} else {
4781 		ASSERT(rw == RW_WRITER);
4782 		/*
4783 		 * While there are readers active or a writer
4784 		 * active, then wait for all of the readers
4785 		 * to finish or for the writer to finish.
4786 		 * Then, set the owner field to curthread and
4787 		 * decrement count to indicate that a writer
4788 		 * is active.
4789 		 */
4790 		while (l->count != 0) {
4791 			l->waiters++;
4792 			if (intr) {
4793 				klwp_t *lwp = ttolwp(curthread);
4794 
4795 				if (lwp != NULL)
4796 					lwp->lwp_nostop++;
4797 				if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4798 					if (lwp != NULL)
4799 						lwp->lwp_nostop--;
4800 					l->waiters--;
4801 					/*
4802 					 * If there are readers active and no
4803 					 * writers waiting then wake up all of
4804 					 * the waiting readers (if any).
4805 					 */
4806 					if (l->count > 0 && l->waiters == 0)
4807 						cv_broadcast(&l->cv_rd);
4808 					mutex_exit(&l->lock);
4809 					return (EINTR);
4810 				}
4811 				if (lwp != NULL)
4812 					lwp->lwp_nostop--;
4813 			} else
4814 				cv_wait(&l->cv, &l->lock);
4815 			l->waiters--;
4816 		}
4817 		ASSERT(l->owner == NULL);
4818 		l->owner = curthread;
4819 		l->count--;
4820 	}
4821 
4822 	mutex_exit(&l->lock);
4823 
4824 	return (0);
4825 }
4826 
4827 /*
4828  * If the lock is available, obtain it and return non-zero.  If there is
4829  * already a conflicting lock, return 0 immediately.
4830  */
4831 
4832 int
4833 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4834 {
4835 	mutex_enter(&l->lock);
4836 
4837 	/*
4838 	 * If this is a nested enter, then allow it.  There
4839 	 * must be as many exits as enters through.
4840 	 */
4841 	if (l->owner == curthread) {
4842 		/* lock is held for writing by current thread */
4843 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4844 		l->count--;
4845 	} else if (rw == RW_READER) {
4846 		/*
4847 		 * If there is a writer active or writers waiting, deny the
4848 		 * lock.  Otherwise, bump the count of readers.
4849 		 */
4850 		if (l->count < 0 || l->waiters > 0) {
4851 			mutex_exit(&l->lock);
4852 			return (0);
4853 		}
4854 		l->count++;
4855 	} else {
4856 		ASSERT(rw == RW_WRITER);
4857 		/*
4858 		 * If there are readers active or a writer active, deny the
4859 		 * lock.  Otherwise, set the owner field to curthread and
4860 		 * decrement count to indicate that a writer is active.
4861 		 */
4862 		if (l->count != 0) {
4863 			mutex_exit(&l->lock);
4864 			return (0);
4865 		}
4866 		ASSERT(l->owner == NULL);
4867 		l->owner = curthread;
4868 		l->count--;
4869 	}
4870 
4871 	mutex_exit(&l->lock);
4872 
4873 	return (1);
4874 }
4875 
4876 void
4877 nfs_rw_exit(nfs_rwlock_t *l)
4878 {
4879 
4880 	mutex_enter(&l->lock);
4881 
4882 	if (l->owner != NULL) {
4883 		ASSERT(l->owner == curthread);
4884 
4885 		/*
4886 		 * To release a writer lock increment count to indicate that
4887 		 * there is one less writer active.  If this was the last of
4888 		 * possibly nested writer locks, then clear the owner field as
4889 		 * well to indicate that there is no writer active.
4890 		 */
4891 		ASSERT(l->count < 0);
4892 		l->count++;
4893 		if (l->count == 0) {
4894 			l->owner = NULL;
4895 
4896 			/*
4897 			 * If there are no writers waiting then wakeup all of
4898 			 * the waiting readers (if any).
4899 			 */
4900 			if (l->waiters == 0)
4901 				cv_broadcast(&l->cv_rd);
4902 		}
4903 	} else {
4904 		/*
4905 		 * To release a reader lock just decrement count to indicate
4906 		 * that there is one less reader active.
4907 		 */
4908 		ASSERT(l->count > 0);
4909 		l->count--;
4910 	}
4911 
4912 	/*
4913 	 * If there are no readers active nor a writer active and there is a
4914 	 * writer waiting we need to wake up it.
4915 	 */
4916 	if (l->count == 0 && l->waiters > 0)
4917 		cv_signal(&l->cv);
4918 	mutex_exit(&l->lock);
4919 }
4920 
4921 int
4922 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4923 {
4924 
4925 	if (rw == RW_READER)
4926 		return (l->count > 0);
4927 	ASSERT(rw == RW_WRITER);
4928 	return (l->count < 0);
4929 }
4930 
4931 /* ARGSUSED */
4932 void
4933 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4934 {
4935 
4936 	l->count = 0;
4937 	l->waiters = 0;
4938 	l->owner = NULL;
4939 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4940 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4941 	cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
4942 }
4943 
4944 void
4945 nfs_rw_destroy(nfs_rwlock_t *l)
4946 {
4947 
4948 	mutex_destroy(&l->lock);
4949 	cv_destroy(&l->cv);
4950 	cv_destroy(&l->cv_rd);
4951 }
4952 
4953 int
4954 nfs3_rddir_compar(const void *x, const void *y)
4955 {
4956 	rddir_cache *a = (rddir_cache *)x;
4957 	rddir_cache *b = (rddir_cache *)y;
4958 
4959 	if (a->nfs3_cookie == b->nfs3_cookie) {
4960 		if (a->buflen == b->buflen)
4961 			return (0);
4962 		if (a->buflen < b->buflen)
4963 			return (-1);
4964 		return (1);
4965 	}
4966 
4967 	if (a->nfs3_cookie < b->nfs3_cookie)
4968 		return (-1);
4969 
4970 	return (1);
4971 }
4972 
4973 int
4974 nfs_rddir_compar(const void *x, const void *y)
4975 {
4976 	rddir_cache *a = (rddir_cache *)x;
4977 	rddir_cache *b = (rddir_cache *)y;
4978 
4979 	if (a->nfs_cookie == b->nfs_cookie) {
4980 		if (a->buflen == b->buflen)
4981 			return (0);
4982 		if (a->buflen < b->buflen)
4983 			return (-1);
4984 		return (1);
4985 	}
4986 
4987 	if (a->nfs_cookie < b->nfs_cookie)
4988 		return (-1);
4989 
4990 	return (1);
4991 }
4992 
4993 static char *
4994 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4995 {
4996 	servinfo_t *s;
4997 	char *srvnames;
4998 	char *namep;
4999 	size_t length;
5000 
5001 	/*
5002 	 * Calculate the length of the string required to hold all
5003 	 * of the server names plus either a comma or a null
5004 	 * character following each individual one.
5005 	 */
5006 	length = 0;
5007 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5008 		length += s->sv_hostnamelen;
5009 
5010 	srvnames = kmem_alloc(length, KM_SLEEP);
5011 
5012 	namep = srvnames;
5013 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5014 		(void) strcpy(namep, s->sv_hostname);
5015 		namep += s->sv_hostnamelen - 1;
5016 		*namep++ = ',';
5017 	}
5018 	*--namep = '\0';
5019 
5020 	*len = length;
5021 
5022 	return (srvnames);
5023 }
5024 
5025 /*
5026  * These two functions are temporary and designed for the upgrade-workaround
5027  * only.  They cannot be used for general zone-crossing NFS client support, and
5028  * will be removed shortly.
5029  *
5030  * When the workaround is enabled, all NFS traffic is forced into the global
5031  * zone.  These functions are called when the code needs to refer to the state
5032  * of the underlying network connection.  They're not called when the function
5033  * needs to refer to the state of the process that invoked the system call.
5034  * (E.g., when checking whether the zone is shutting down during the mount()
5035  * call.)
5036  */
5037 
5038 struct zone *
5039 nfs_zone(void)
5040 {
5041 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5042 }
5043 
5044 zoneid_t
5045 nfs_zoneid(void)
5046 {
5047 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5048 }
5049 
5050 /*
5051  * nfs_mount_label_policy:
5052  *	Determine whether the mount is allowed according to MAC check,
5053  *	by comparing (where appropriate) label of the remote server
5054  *	against the label of the zone being mounted into.
5055  *
5056  *	Returns:
5057  *		 0 :	access allowed
5058  *		-1 :	read-only access allowed (i.e., read-down)
5059  *		>0 :	error code, such as EACCES
5060  */
5061 int
5062 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5063     struct knetconfig *knconf, cred_t *cr)
5064 {
5065 	int		addr_type;
5066 	void		*ipaddr;
5067 	bslabel_t	*server_sl, *mntlabel;
5068 	zone_t		*mntzone = NULL;
5069 	ts_label_t	*zlabel;
5070 	tsol_tpc_t	*tp;
5071 	ts_label_t	*tsl = NULL;
5072 	int		retv;
5073 
5074 	/*
5075 	 * Get the zone's label.  Each zone on a labeled system has a label.
5076 	 */
5077 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5078 	zlabel = mntzone->zone_slabel;
5079 	ASSERT(zlabel != NULL);
5080 	label_hold(zlabel);
5081 
5082 	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5083 		addr_type = IPV4_VERSION;
5084 		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5085 	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5086 		addr_type = IPV6_VERSION;
5087 		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5088 	} else {
5089 		retv = 0;
5090 		goto out;
5091 	}
5092 
5093 	retv = EACCES;				/* assume the worst */
5094 
5095 	/*
5096 	 * Next, get the assigned label of the remote server.
5097 	 */
5098 	tp = find_tpc(ipaddr, addr_type, B_FALSE);
5099 	if (tp == NULL)
5100 		goto out;			/* error getting host entry */
5101 
5102 	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5103 		goto rel_tpc;			/* invalid domain */
5104 	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5105 	    (tp->tpc_tp.host_type != UNLABELED))
5106 		goto rel_tpc;			/* invalid hosttype */
5107 
5108 	if (tp->tpc_tp.host_type == SUN_CIPSO) {
5109 		tsl = getflabel_cipso(vfsp);
5110 		if (tsl == NULL)
5111 			goto rel_tpc;		/* error getting server lbl */
5112 
5113 		server_sl = label2bslabel(tsl);
5114 	} else {	/* UNLABELED */
5115 		server_sl = &tp->tpc_tp.tp_def_label;
5116 	}
5117 
5118 	mntlabel = label2bslabel(zlabel);
5119 
5120 	/*
5121 	 * Now compare labels to complete the MAC check.  If the labels
5122 	 * are equal or if the requestor is in the global zone and has
5123 	 * NET_MAC_AWARE, then allow read-write access.   (Except for
5124 	 * mounts into the global zone itself; restrict these to
5125 	 * read-only.)
5126 	 *
5127 	 * If the requestor is in some other zone, but his label
5128 	 * dominates the server, then allow read-down.
5129 	 *
5130 	 * Otherwise, access is denied.
5131 	 */
5132 	if (blequal(mntlabel, server_sl) ||
5133 	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
5134 	    getpflags(NET_MAC_AWARE, cr) != 0)) {
5135 		if ((mntzone == global_zone) ||
5136 		    !blequal(mntlabel, server_sl))
5137 			retv = -1;		/* read-only */
5138 		else
5139 			retv = 0;		/* access OK */
5140 	} else if (bldominates(mntlabel, server_sl)) {
5141 		retv = -1;			/* read-only */
5142 	} else {
5143 		retv = EACCES;
5144 	}
5145 
5146 	if (tsl != NULL)
5147 		label_rele(tsl);
5148 
5149 rel_tpc:
5150 	TPC_RELE(tp);
5151 out:
5152 	if (mntzone)
5153 		zone_rele(mntzone);
5154 	label_rele(zlabel);
5155 	return (retv);
5156 }
5157 
5158 boolean_t
5159 nfs_has_ctty(void)
5160 {
5161 	boolean_t rv;
5162 	mutex_enter(&curproc->p_splock);
5163 	rv = (curproc->p_sessp->s_vp != NULL);
5164 	mutex_exit(&curproc->p_splock);
5165 	return (rv);
5166 }
5167 
5168 /*
5169  * See if xattr directory to see if it has any generic user attributes
5170  */
5171 int
5172 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5173 {
5174 	struct uio uio;
5175 	struct iovec iov;
5176 	char *dbuf;
5177 	struct dirent64 *dp;
5178 	size_t dlen = 8 * 1024;
5179 	size_t dbuflen;
5180 	int eof = 0;
5181 	int error;
5182 
5183 	*valp = 0;
5184 	dbuf = kmem_alloc(dlen, KM_SLEEP);
5185 	uio.uio_iov = &iov;
5186 	uio.uio_iovcnt = 1;
5187 	uio.uio_segflg = UIO_SYSSPACE;
5188 	uio.uio_fmode = 0;
5189 	uio.uio_extflg = UIO_COPY_CACHED;
5190 	uio.uio_loffset = 0;
5191 	uio.uio_resid = dlen;
5192 	iov.iov_base = dbuf;
5193 	iov.iov_len = dlen;
5194 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5195 	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5196 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5197 
5198 	dbuflen = dlen - uio.uio_resid;
5199 
5200 	if (error || dbuflen == 0) {
5201 		kmem_free(dbuf, dlen);
5202 		return (error);
5203 	}
5204 
5205 	dp = (dirent64_t *)dbuf;
5206 
5207 	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5208 		if (strcmp(dp->d_name, ".") == 0 ||
5209 		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5210 		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5211 		    VIEW_READONLY) == 0) {
5212 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5213 			continue;
5214 		}
5215 
5216 		*valp = 1;
5217 		break;
5218 	}
5219 	kmem_free(dbuf, dlen);
5220 	return (0);
5221 }
5222