xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs_subr.c (revision 13b136d3061155363c62c9f6568d25b8b27da8f6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28  * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/proc.h>
36 #include <sys/user.h>
37 #include <sys/time.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/socket.h>
42 #include <sys/uio.h>
43 #include <sys/tiuser.h>
44 #include <sys/swap.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
47 #include <sys/kmem.h>
48 #include <sys/kstat.h>
49 #include <sys/cmn_err.h>
50 #include <sys/vtrace.h>
51 #include <sys/session.h>
52 #include <sys/dnlc.h>
53 #include <sys/bitmap.h>
54 #include <sys/acl.h>
55 #include <sys/ddi.h>
56 #include <sys/pathname.h>
57 #include <sys/flock.h>
58 #include <sys/dirent.h>
59 #include <sys/flock.h>
60 #include <sys/callb.h>
61 #include <sys/atomic.h>
62 #include <sys/list.h>
63 #include <sys/tsol/tnet.h>
64 #include <sys/priv.h>
65 #include <sys/sdt.h>
66 #include <sys/attr.h>
67 
68 #include <inet/ip6.h>
69 
70 #include <rpc/types.h>
71 #include <rpc/xdr.h>
72 #include <rpc/auth.h>
73 #include <rpc/clnt.h>
74 
75 #include <nfs/nfs.h>
76 #include <nfs/nfs4.h>
77 #include <nfs/nfs_clnt.h>
78 #include <nfs/rnode.h>
79 #include <nfs/nfs_acl.h>
80 
81 #include <sys/tsol/label.h>
82 
83 /*
84  * The hash queues for the access to active and cached rnodes
85  * are organized as doubly linked lists.  A reader/writer lock
86  * for each hash bucket is used to control access and to synchronize
87  * lookups, additions, and deletions from the hash queue.
88  *
89  * The rnode freelist is organized as a doubly linked list with
90  * a head pointer.  Additions and deletions are synchronized via
91  * a single mutex.
92  *
93  * In order to add an rnode to the free list, it must be hashed into
94  * a hash queue and the exclusive lock to the hash queue be held.
95  * If an rnode is not hashed into a hash queue, then it is destroyed
96  * because it represents no valuable information that can be reused
97  * about the file.  The exclusive lock to the hash queue must be
98  * held in order to prevent a lookup in the hash queue from finding
99  * the rnode and using it and assuming that the rnode is not on the
100  * freelist.  The lookup in the hash queue will have the hash queue
101  * locked, either exclusive or shared.
102  *
103  * The vnode reference count for each rnode is not allowed to drop
104  * below 1.  This prevents external entities, such as the VM
105  * subsystem, from acquiring references to vnodes already on the
106  * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
108  * rnode is looked up in the hash queues, then either the rnode
109  * is removed from the freelist and that reference is transferred to
110  * the new reference or the vnode reference count must be incremented
111  * accordingly.  The mutex for the freelist must be held in order to
112  * accurately test to see if the rnode is on the freelist or not.
113  * The hash queue lock might be held shared and it is possible that
114  * two different threads may race to remove the rnode from the
115  * freelist.  This race can be resolved by holding the mutex for the
116  * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
118  * placed on the freelist due to the requirement that the thread
119  * putting the rnode on the freelist must hold the exclusive lock
120  * to the hash queue and the thread doing the lookup in the hash
121  * queue is holding either a shared or exclusive lock to the hash
122  * queue.
123  *
124  * The lock ordering is:
125  *
126  *	hash bucket lock -> vnode lock
127  *	hash bucket lock -> freelist lock
128  */
129 static rhashq_t *rtable;
130 
131 static kmutex_t rpfreelist_lock;
132 static rnode_t *rpfreelist = NULL;
133 static long rnew = 0;
134 long nrnode = 0;
135 
136 static int rtablesize;
137 static int rtablemask;
138 
139 static int hashlen = 4;
140 
141 static struct kmem_cache *rnode_cache;
142 
143 /*
144  * Mutex to protect the following variables:
145  *	nfs_major
146  *	nfs_minor
147  */
148 kmutex_t nfs_minor_lock;
149 int nfs_major;
150 int nfs_minor;
151 
152 /* Do we allow preepoch (negative) time values otw? */
153 bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */
154 
155 /*
156  * Access cache
157  */
158 static acache_hash_t *acache;
159 static long nacache;	/* used strictly to size the number of hash queues */
160 
161 static int acachesize;
162 static int acachemask;
163 static struct kmem_cache *acache_cache;
164 
165 /*
166  * Client side utilities
167  */
168 
/*
 * Client side statistics.
 *
 * clstat_tmpl is a constant template of named 64-bit kstat entries.  The
 * initializers are positional, so their order must follow the field order
 * of struct clstat.  The last four entries are present only in DEBUG
 * builds.
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};
184 
/*
 * The following statistics describe the behavior of the system as a whole
 * and do not correspond to any one particular zone.  They exist only in
 * DEBUG builds.
 */
#ifdef DEBUG
struct clstat_debug {
	/* number of allocated rnodes */
	kstat_named_t	nrnode;
	/* size of access cache */
	kstat_named_t	access;
	/* size of readdir cache */
	kstat_named_t	dirent;
	/* size of readdir buf cache */
	kstat_named_t	dirents;
	/* number of reclaims */
	kstat_named_t	reclaim;
	/* number of cl reclaims */
	kstat_named_t	clreclaim;
	/* number of free reclaims */
	kstat_named_t	f_reclaim;
	/* number of active reclaims */
	kstat_named_t	a_reclaim;
	/* number of rnode reclaims */
	kstat_named_t	r_reclaim;
	/* bytes used to store rpaths (published under the name "r_path") */
	kstat_named_t	rpath;
};

/* One positional initializer per field above, in declaration order. */
static struct clstat_debug clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */
214 
215 /*
216  * We keep a global list of per-zone client data, so we can clean up all zones
217  * if we get low on memory.
218  */
219 static list_t nfs_clnt_list;
220 static kmutex_t nfs_clnt_list_lock;
221 static zone_key_t nfsclnt_zone_key;
222 
223 static struct kmem_cache *chtab_cache;
224 
/*
 * nfs_disable_rddir_cache exists for interoperability with broken
 * servers that do not properly update the attributes of a directory when
 * changes are made to it.  It defaults to 0 and must be set in
 * /etc/system to take effect.
 */
int nfs_disable_rddir_cache = 0;
232 
233 int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
234 		    struct chtab **);
235 void		clfree(CLIENT *, struct chtab *);
236 static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
237 		    struct chtab **, struct nfs_clnt *);
238 static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
239 		    struct chtab **, struct nfs_clnt *);
240 static void	clreclaim(void *);
241 static int	nfs_feedback(int, int, mntinfo_t *);
242 static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
243 		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
244 		    failinfo_t *);
245 static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
246 		    caddr_t, cred_t *, int *, int, failinfo_t *);
247 static void	rinactive(rnode_t *, cred_t *);
248 static int	rtablehash(nfs_fhandle *);
249 static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
250 		    struct vnodeops *,
251 		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
252 			cred_t *),
253 		    int (*)(const void *, const void *), int *, cred_t *,
254 		    char *, char *);
255 static void	rp_rmfree(rnode_t *);
256 static void	rp_addhash(rnode_t *);
257 static void	rp_rmhash_locked(rnode_t *);
258 static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
259 static void	destroy_rnode(rnode_t *);
260 static void	rddir_cache_free(rddir_cache *);
261 static int	nfs_free_data_reclaim(rnode_t *);
262 static int	nfs_active_data_reclaim(rnode_t *);
263 static int	nfs_free_reclaim(void);
264 static int	nfs_active_reclaim(void);
265 static int	nfs_rnode_reclaim(void);
266 static void	nfs_reclaim(void *);
267 static int	failover_safe(failinfo_t *);
268 static void	failover_newserver(mntinfo_t *mi);
269 static void	failover_thread(mntinfo_t *mi);
270 static int	failover_wait(mntinfo_t *);
271 static int	failover_remap(failinfo_t *);
272 static int	failover_lookup(char *, vnode_t *,
273 		    int (*)(vnode_t *, char *, vnode_t **,
274 			struct pathname *, int, vnode_t *, cred_t *, int),
275 		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
276 		    vnode_t **);
277 static void	nfs_free_r_path(rnode_t *);
278 static void	nfs_set_vroot(vnode_t *);
279 static char	*nfs_getsrvnames(mntinfo_t *, size_t *);
280 
281 /*
282  * from rpcsec module (common/rpcsec)
283  */
284 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
285 extern void sec_clnt_freeh(AUTH *);
286 extern void sec_clnt_freeinfo(struct sec_data *);
287 
288 /*
289  * used in mount policy
290  */
291 extern ts_label_t *getflabel_cipso(vfs_t *);
292 
/*
 * EIO or EINTR are not recoverable errors.
 *
 * IS_RECOVERABLE_ERROR(error) is non-zero for every other value.  The
 * argument and the whole expansion are parenthesized so that the macro
 * behaves like a function call whatever expression is passed in and
 * whatever operators surround it.  Note that "error" is evaluated twice,
 * so it must not have side effects.
 */
#define	IS_RECOVERABLE_ERROR(error)	\
	(!(((error) == EINTR) || ((error) == EIO)))
297 
/*
 * Console message formats used while a server is unreachable.  The DEBUG
 * variants take an extra leading %d argument (printed right after "NFS")
 * ahead of the %s server name; the non-DEBUG variants take only the name.
 */
#ifndef DEBUG
#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
#else
#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
#endif
305 /*
306  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
307  */
308 static int
309 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
310     struct chtab **chp, struct nfs_clnt *nfscl)
311 {
312 	struct chhead *ch, *newch;
313 	struct chhead **plistp;
314 	struct chtab *cp;
315 	int error;
316 	k_sigset_t smask;
317 
318 	if (newcl == NULL || chp == NULL || ci == NULL)
319 		return (EINVAL);
320 
321 	*newcl = NULL;
322 	*chp = NULL;
323 
324 	/*
325 	 * Find an unused handle or create one
326 	 */
327 	newch = NULL;
328 	nfscl->nfscl_stat.clgets.value.ui64++;
329 top:
330 	/*
331 	 * Find the correct entry in the cache to check for free
332 	 * client handles.  The search is based on the RPC program
333 	 * number, program version number, dev_t for the transport
334 	 * device, and the protocol family.
335 	 */
336 	mutex_enter(&nfscl->nfscl_chtable_lock);
337 	plistp = &nfscl->nfscl_chtable;
338 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
339 		if (ch->ch_prog == ci->cl_prog &&
340 		    ch->ch_vers == ci->cl_vers &&
341 		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
342 		    (strcmp(ch->ch_protofmly,
343 		    svp->sv_knconf->knc_protofmly) == 0))
344 			break;
345 		plistp = &ch->ch_next;
346 	}
347 
348 	/*
349 	 * If we didn't find a cache entry for this quadruple, then
350 	 * create one.  If we don't have one already preallocated,
351 	 * then drop the cache lock, create one, and then start over.
352 	 * If we did have a preallocated entry, then just add it to
353 	 * the front of the list.
354 	 */
355 	if (ch == NULL) {
356 		if (newch == NULL) {
357 			mutex_exit(&nfscl->nfscl_chtable_lock);
358 			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
359 			newch->ch_timesused = 0;
360 			newch->ch_prog = ci->cl_prog;
361 			newch->ch_vers = ci->cl_vers;
362 			newch->ch_dev = svp->sv_knconf->knc_rdev;
363 			newch->ch_protofmly = kmem_alloc(
364 			    strlen(svp->sv_knconf->knc_protofmly) + 1,
365 			    KM_SLEEP);
366 			(void) strcpy(newch->ch_protofmly,
367 			    svp->sv_knconf->knc_protofmly);
368 			newch->ch_list = NULL;
369 			goto top;
370 		}
371 		ch = newch;
372 		newch = NULL;
373 		ch->ch_next = nfscl->nfscl_chtable;
374 		nfscl->nfscl_chtable = ch;
375 	/*
376 	 * We found a cache entry, but if it isn't on the front of the
377 	 * list, then move it to the front of the list to try to take
378 	 * advantage of locality of operations.
379 	 */
380 	} else if (ch != nfscl->nfscl_chtable) {
381 		*plistp = ch->ch_next;
382 		ch->ch_next = nfscl->nfscl_chtable;
383 		nfscl->nfscl_chtable = ch;
384 	}
385 
386 	/*
387 	 * If there was a free client handle cached, then remove it
388 	 * from the list, init it, and use it.
389 	 */
390 	if (ch->ch_list != NULL) {
391 		cp = ch->ch_list;
392 		ch->ch_list = cp->ch_list;
393 		mutex_exit(&nfscl->nfscl_chtable_lock);
394 		if (newch != NULL) {
395 			kmem_free(newch->ch_protofmly,
396 			    strlen(newch->ch_protofmly) + 1);
397 			kmem_free(newch, sizeof (*newch));
398 		}
399 		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
400 		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
401 		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
402 		    &cp->ch_client->cl_auth);
403 		if (error || cp->ch_client->cl_auth == NULL) {
404 			CLNT_DESTROY(cp->ch_client);
405 			kmem_cache_free(chtab_cache, cp);
406 			return ((error != 0) ? error : EINTR);
407 		}
408 		ch->ch_timesused++;
409 		*newcl = cp->ch_client;
410 		*chp = cp;
411 		return (0);
412 	}
413 
414 	/*
415 	 * There weren't any free client handles which fit, so allocate
416 	 * a new one and use that.
417 	 */
418 #ifdef DEBUG
419 	atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
420 #endif
421 	mutex_exit(&nfscl->nfscl_chtable_lock);
422 
423 	nfscl->nfscl_stat.cltoomany.value.ui64++;
424 	if (newch != NULL) {
425 		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
426 		kmem_free(newch, sizeof (*newch));
427 	}
428 
429 	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
430 	cp->ch_head = ch;
431 
432 	sigintr(&smask, (int)ci->cl_flags & MI_INT);
433 	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
434 	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
435 	sigunintr(&smask);
436 
437 	if (error != 0) {
438 		kmem_cache_free(chtab_cache, cp);
439 #ifdef DEBUG
440 		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
441 #endif
442 		/*
443 		 * Warning is unnecessary if error is EINTR.
444 		 */
445 		if (error != EINTR) {
446 			nfs_cmn_err(error, CE_WARN,
447 			    "clget: couldn't create handle: %m\n");
448 		}
449 		return (error);
450 	}
451 	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
452 	auth_destroy(cp->ch_client->cl_auth);
453 	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
454 	    &cp->ch_client->cl_auth);
455 	if (error || cp->ch_client->cl_auth == NULL) {
456 		CLNT_DESTROY(cp->ch_client);
457 		kmem_cache_free(chtab_cache, cp);
458 #ifdef DEBUG
459 		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
460 #endif
461 		return ((error != 0) ? error : EINTR);
462 	}
463 	ch->ch_timesused++;
464 	*newcl = cp->ch_client;
465 	ASSERT(cp->ch_client->cl_nosignal == FALSE);
466 	*chp = cp;
467 	return (0);
468 }
469 
470 int
471 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
472     struct chtab **chp)
473 {
474 	struct nfs_clnt *nfscl;
475 
476 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
477 	ASSERT(nfscl != NULL);
478 
479 	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
480 }
481 
482 static int
483 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
484     struct chtab **chp, struct nfs_clnt *nfscl)
485 {
486 	clinfo_t ci;
487 	int error;
488 
489 	/*
490 	 * Set read buffer size to rsize
491 	 * and add room for RPC headers.
492 	 */
493 	ci.cl_readsize = mi->mi_tsize;
494 	if (ci.cl_readsize != 0)
495 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
496 
497 	/*
498 	 * If soft mount and server is down just try once.
499 	 * meaning: do not retransmit.
500 	 */
501 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
502 		ci.cl_retrans = 0;
503 	else
504 		ci.cl_retrans = mi->mi_retrans;
505 
506 	ci.cl_prog = NFS_ACL_PROGRAM;
507 	ci.cl_vers = mi->mi_vers;
508 	ci.cl_flags = mi->mi_flags;
509 
510 	/*
511 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
512 	 * security flavor, the client tries to establish a security context
513 	 * by contacting the server. If the connection is timed out or reset,
514 	 * e.g. server reboot, we will try again.
515 	 */
516 	do {
517 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
518 
519 		if (error == 0)
520 			break;
521 
522 		/*
523 		 * For forced unmount or zone shutdown, bail out, no retry.
524 		 */
525 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
526 			error = EIO;
527 			break;
528 		}
529 
530 		/* do not retry for softmount */
531 		if (!(mi->mi_flags & MI_HARD))
532 			break;
533 
534 		/* let the caller deal with the failover case */
535 		if (FAILOVER_MOUNT(mi))
536 			break;
537 
538 	} while (error == ETIMEDOUT || error == ECONNRESET);
539 
540 	return (error);
541 }
542 
543 static int
544 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
545     struct chtab **chp, struct nfs_clnt *nfscl)
546 {
547 	clinfo_t ci;
548 	int error;
549 
550 	/*
551 	 * Set read buffer size to rsize
552 	 * and add room for RPC headers.
553 	 */
554 	ci.cl_readsize = mi->mi_tsize;
555 	if (ci.cl_readsize != 0)
556 		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
557 
558 	/*
559 	 * If soft mount and server is down just try once.
560 	 * meaning: do not retransmit.
561 	 */
562 	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
563 		ci.cl_retrans = 0;
564 	else
565 		ci.cl_retrans = mi->mi_retrans;
566 
567 	ci.cl_prog = mi->mi_prog;
568 	ci.cl_vers = mi->mi_vers;
569 	ci.cl_flags = mi->mi_flags;
570 
571 	/*
572 	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
573 	 * security flavor, the client tries to establish a security context
574 	 * by contacting the server. If the connection is timed out or reset,
575 	 * e.g. server reboot, we will try again.
576 	 */
577 	do {
578 		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
579 
580 		if (error == 0)
581 			break;
582 
583 		/*
584 		 * For forced unmount or zone shutdown, bail out, no retry.
585 		 */
586 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
587 			error = EIO;
588 			break;
589 		}
590 
591 		/* do not retry for softmount */
592 		if (!(mi->mi_flags & MI_HARD))
593 			break;
594 
595 		/* let the caller deal with the failover case */
596 		if (FAILOVER_MOUNT(mi))
597 			break;
598 
599 	} while (error == ETIMEDOUT || error == ECONNRESET);
600 
601 	return (error);
602 }
603 
604 static void
605 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
606 {
607 	if (cl->cl_auth != NULL) {
608 		sec_clnt_freeh(cl->cl_auth);
609 		cl->cl_auth = NULL;
610 	}
611 
612 	/*
613 	 * Timestamp this cache entry so that we know when it was last
614 	 * used.
615 	 */
616 	cp->ch_freed = gethrestime_sec();
617 
618 	/*
619 	 * Add the free client handle to the front of the list.
620 	 * This way, the list will be sorted in youngest to oldest
621 	 * order.
622 	 */
623 	mutex_enter(&nfscl->nfscl_chtable_lock);
624 	cp->ch_list = cp->ch_head->ch_list;
625 	cp->ch_head->ch_list = cp;
626 	mutex_exit(&nfscl->nfscl_chtable_lock);
627 }
628 
629 void
630 clfree(CLIENT *cl, struct chtab *cp)
631 {
632 	struct nfs_clnt *nfscl;
633 
634 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
635 	ASSERT(nfscl != NULL);
636 
637 	clfree_impl(cl, cp, nfscl);
638 }
639 
/* Seconds a freed client handle is held before it may be reclaimed. */
#define	CL_HOLDTIME	60
641 
642 static void
643 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
644 {
645 	struct chhead *ch;
646 	struct chtab *cp;	/* list of objects that can be reclaimed */
647 	struct chtab *cpe;
648 	struct chtab *cpl;
649 	struct chtab **cpp;
650 #ifdef DEBUG
651 	int n = 0;
652 #endif
653 
654 	/*
655 	 * Need to reclaim some memory, so step through the cache
656 	 * looking through the lists for entries which can be freed.
657 	 */
658 	cp = NULL;
659 
660 	mutex_enter(&nfscl->nfscl_chtable_lock);
661 
662 	/*
663 	 * Here we step through each non-NULL quadruple and start to
664 	 * construct the reclaim list pointed to by cp.  Note that
665 	 * cp will contain all eligible chtab entries.  When this traversal
666 	 * completes, chtab entries from the last quadruple will be at the
667 	 * front of cp and entries from previously inspected quadruples have
668 	 * been appended to the rear of cp.
669 	 */
670 	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
671 		if (ch->ch_list == NULL)
672 			continue;
673 		/*
674 		 * Search each list for entries older then
675 		 * cl_holdtime seconds.  The lists are maintained
676 		 * in youngest to oldest order so that when the
677 		 * first entry is found which is old enough, then
678 		 * all of the rest of the entries on the list will
679 		 * be old enough as well.
680 		 */
681 		cpl = ch->ch_list;
682 		cpp = &ch->ch_list;
683 		while (cpl != NULL &&
684 		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
685 			cpp = &cpl->ch_list;
686 			cpl = cpl->ch_list;
687 		}
688 		if (cpl != NULL) {
689 			*cpp = NULL;
690 			if (cp != NULL) {
691 				cpe = cpl;
692 				while (cpe->ch_list != NULL)
693 					cpe = cpe->ch_list;
694 				cpe->ch_list = cp;
695 			}
696 			cp = cpl;
697 		}
698 	}
699 
700 	mutex_exit(&nfscl->nfscl_chtable_lock);
701 
702 	/*
703 	 * If cp is empty, then there is nothing to reclaim here.
704 	 */
705 	if (cp == NULL)
706 		return;
707 
708 	/*
709 	 * Step through the list of entries to free, destroying each client
710 	 * handle and kmem_free'ing the memory for each entry.
711 	 */
712 	while (cp != NULL) {
713 #ifdef DEBUG
714 		n++;
715 #endif
716 		CLNT_DESTROY(cp->ch_client);
717 		cpl = cp->ch_list;
718 		kmem_cache_free(chtab_cache, cp);
719 		cp = cpl;
720 	}
721 
722 #ifdef DEBUG
723 	/*
724 	 * Update clalloc so that nfsstat shows the current number
725 	 * of allocated client handles.
726 	 */
727 	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
728 #endif
729 }
730 
731 /* ARGSUSED */
732 static void
733 clreclaim(void *all)
734 {
735 	struct nfs_clnt *nfscl;
736 
737 #ifdef DEBUG
738 	clstat_debug.clreclaim.value.ui64++;
739 #endif
740 	/*
741 	 * The system is low on memory; go through and try to reclaim some from
742 	 * every zone on the system.
743 	 */
744 	mutex_enter(&nfs_clnt_list_lock);
745 	nfscl = list_head(&nfs_clnt_list);
746 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
747 		clreclaim_zone(nfscl, CL_HOLDTIME);
748 	mutex_exit(&nfs_clnt_list_lock);
749 }
750 
/*
 * Minimum time-out values, indexed by call type.
 * The units are eighths of a second, which avoids multiplies.
 */
static unsigned int minimum_timeo[] = {
	6,
	7,
	10
};
758 
/*
 * Back off for the retransmission timeout.
 *
 * MAXTIMO is the ceiling, 20 * hz.
 * dobackoff(tim) doubles tim and clamps the result to MAXTIMO.
 * backoff(tim) does the same, except that a value already at or above
 * MAXTIMO is returned unchanged.
 *
 * Both macros evaluate their argument more than once.
 */
#define	MAXTIMO	(20*hz)
#define	dobackoff(tim)	((((tim) << 1) <= MAXTIMO) ? ((tim) << 1) : MAXTIMO)
#define	backoff(tim)	(((tim) >= MAXTIMO) ? (tim) : dobackoff(tim))
765 
/* minimum "chunk" of NFS IO */
#define	MIN_NFS_TSIZE	512
/* rtxcur we try to keep under */
#define	REDUCE_NFS_TIME	(hz / 2)
/* srtt we try to keep under (scaled*8) */
#define	INCREASE_NFS_TIME	((hz / 3) * 8)
769 
770 /*
771  * Function called when rfscall notices that we have been
772  * re-transmitting, or when we get a response without retransmissions.
773  * Return 1 if the transfer size was adjusted down - 0 if no change.
774  */
775 static int
776 nfs_feedback(int flag, int which, mntinfo_t *mi)
777 {
778 	int kind;
779 	int r = 0;
780 
781 	mutex_enter(&mi->mi_lock);
782 	if (flag == FEEDBACK_REXMIT1) {
783 		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
784 		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
785 			goto done;
786 		if (mi->mi_curread > MIN_NFS_TSIZE) {
787 			mi->mi_curread /= 2;
788 			if (mi->mi_curread < MIN_NFS_TSIZE)
789 				mi->mi_curread = MIN_NFS_TSIZE;
790 			r = 1;
791 		}
792 
793 		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
794 			mi->mi_curwrite /= 2;
795 			if (mi->mi_curwrite < MIN_NFS_TSIZE)
796 				mi->mi_curwrite = MIN_NFS_TSIZE;
797 			r = 1;
798 		}
799 	} else if (flag == FEEDBACK_OK) {
800 		kind = mi->mi_timer_type[which];
801 		if (kind == 0 ||
802 		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
803 			goto done;
804 		if (kind == 1) {
805 			if (mi->mi_curread >= mi->mi_tsize)
806 				goto done;
807 			mi->mi_curread +=  MIN_NFS_TSIZE;
808 			if (mi->mi_curread > mi->mi_tsize/2)
809 				mi->mi_curread = mi->mi_tsize;
810 		} else if (kind == 2) {
811 			if (mi->mi_curwrite >= mi->mi_stsize)
812 				goto done;
813 			mi->mi_curwrite += MIN_NFS_TSIZE;
814 			if (mi->mi_curwrite > mi->mi_stsize/2)
815 				mi->mi_curwrite = mi->mi_stsize;
816 		}
817 	}
818 done:
819 	mutex_exit(&mi->mi_lock);
820 	return (r);
821 }
822 
823 #ifdef DEBUG
824 static int rfs2call_hits = 0;
825 static int rfs2call_misses = 0;
826 #endif
827 
828 int
829 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
830     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
831     enum nfsstat *statusp, int flags, failinfo_t *fi)
832 {
833 	int rpcerror;
834 	enum clnt_stat rpc_status;
835 
836 	ASSERT(statusp != NULL);
837 
838 	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
839 	    cr, douprintf, &rpc_status, flags, fi);
840 	if (!rpcerror) {
841 		/*
842 		 * See crnetadjust() for comments.
843 		 */
844 		if (*statusp == NFSERR_ACCES &&
845 		    (cr = crnetadjust(cr)) != NULL) {
846 #ifdef DEBUG
847 			rfs2call_hits++;
848 #endif
849 			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
850 			    resp, cr, douprintf, NULL, flags, fi);
851 			crfree(cr);
852 #ifdef DEBUG
853 			if (*statusp == NFSERR_ACCES)
854 				rfs2call_misses++;
855 #endif
856 		}
857 	} else if (rpc_status == RPC_PROCUNAVAIL) {
858 		*statusp = NFSERR_OPNOTSUPP;
859 		rpcerror = 0;
860 	}
861 
862 	return (rpcerror);
863 }
864 
/*
 * 10 * hz.  The expansion is parenthesized so that it stays one value
 * when the macro is used inside a larger expression.
 */
#define	NFS3_JUKEBOX_DELAY	(10 * hz)
866 
867 static clock_t nfs3_jukebox_delay = 0;
868 
869 #ifdef DEBUG
870 static int rfs3call_hits = 0;
871 static int rfs3call_misses = 0;
872 #endif
873 
874 int
875 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
876     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
877     nfsstat3 *statusp, int flags, failinfo_t *fi)
878 {
879 	int rpcerror;
880 	int user_informed;
881 
882 	user_informed = 0;
883 	do {
884 		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
885 		    cr, douprintf, NULL, flags, fi);
886 		if (!rpcerror) {
887 			cred_t *crr;
888 			if (*statusp == NFS3ERR_JUKEBOX) {
889 				if (ttoproc(curthread) == &p0) {
890 					rpcerror = EAGAIN;
891 					break;
892 				}
893 				if (!user_informed) {
894 					user_informed = 1;
895 					uprintf(
896 		"file temporarily unavailable on the server, retrying...\n");
897 				}
898 				delay(nfs3_jukebox_delay);
899 			}
900 			/*
901 			 * See crnetadjust() for comments.
902 			 */
903 			else if (*statusp == NFS3ERR_ACCES &&
904 			    (crr = crnetadjust(cr)) != NULL) {
905 #ifdef DEBUG
906 				rfs3call_hits++;
907 #endif
908 				rpcerror = rfscall(mi, which, xdrargs, argsp,
909 				    xdrres, resp, crr, douprintf,
910 				    NULL, flags, fi);
911 
912 				crfree(crr);
913 #ifdef DEBUG
914 				if (*statusp == NFS3ERR_ACCES)
915 					rfs3call_misses++;
916 #endif
917 			}
918 		}
919 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
920 
921 	return (rpcerror);
922 }
923 
/*
 * VALID_FH(fi) is true when the rnode behind fi->vp refers to the same
 * server as the current server of the mount that fi->vp belongs to.
 *
 * INC_READERS(mi) and DEC_READERS(mi) adjust mi->mi_readers;
 * DEC_READERS also broadcasts on mi->mi_failover_cv when the count
 * reaches zero.  The users in rfscall() invoke both with mi->mi_lock
 * held.
 *
 * Every use of a macro argument is parenthesized so that any pointer
 * expression can be passed.  The brace-block form of the two statement
 * macros is kept as it was, so existing uses compile unchanged.
 */
#define	VALID_FH(fi)	\
	(VTOR((fi)->vp)->r_server == VTOMI((fi)->vp)->mi_curr_serv)
#define	INC_READERS(mi)		{ \
	(mi)->mi_readers++; \
}
#define	DEC_READERS(mi)		{ \
	(mi)->mi_readers--; \
	if ((mi)->mi_readers == 0) \
		cv_broadcast(&(mi)->mi_failover_cv); \
}
933 
934 static int
935 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
936     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
937     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
938 {
939 	CLIENT *client;
940 	struct chtab *ch;
941 	cred_t *cr = icr;
942 	enum clnt_stat status;
943 	struct rpc_err rpcerr, rpcerr_tmp;
944 	struct timeval wait;
945 	int timeo;		/* in units of hz */
946 	int my_rsize, my_wsize;
947 	bool_t tryagain;
948 	bool_t cred_cloned = FALSE;
949 	k_sigset_t smask;
950 	servinfo_t *svp;
951 	struct nfs_clnt *nfscl;
952 	zoneid_t zoneid = getzoneid();
953 	char *msg;
954 #ifdef DEBUG
955 	char *bufp;
956 #endif
957 
958 
959 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
960 	    "rfscall_start:which %d mi %p", which, mi);
961 
962 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
963 	ASSERT(nfscl != NULL);
964 
965 	nfscl->nfscl_stat.calls.value.ui64++;
966 	mi->mi_reqs[which].value.ui64++;
967 
968 	rpcerr.re_status = RPC_SUCCESS;
969 
970 	/*
971 	 * In case of forced unmount or zone shutdown, return EIO.
972 	 */
973 
974 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
975 		rpcerr.re_status = RPC_FAILED;
976 		rpcerr.re_errno = EIO;
977 		return (rpcerr.re_errno);
978 	}
979 
980 	/*
981 	 * Remember the transfer sizes in case
982 	 * nfs_feedback changes them underneath us.
983 	 */
984 	my_rsize = mi->mi_curread;
985 	my_wsize = mi->mi_curwrite;
986 
987 	/*
988 	 * NFS client failover support
989 	 *
990 	 * If this rnode is not in sync with the current server (VALID_FH),
991 	 * we'd like to do a remap to get in sync.  We can be interrupted
992 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
993 	 * use the best info we have to try the RPC.  Part of that is
994 	 * unconditionally updating the filehandle copy kept for V3.
995 	 *
	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
997 	 * rw_enter(); we're trying to keep the current server from being
998 	 * changed on us until we're done with the remapping and have a
	 * matching client handle.  We don't want to send a filehandle
1000 	 * to the wrong host.
1001 	 */
1002 failoverretry:
1003 	if (FAILOVER_MOUNT(mi)) {
1004 		mutex_enter(&mi->mi_lock);
1005 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1006 			if (failover_wait(mi)) {
1007 				mutex_exit(&mi->mi_lock);
1008 				return (EINTR);
1009 			}
1010 		}
1011 		INC_READERS(mi);
1012 		mutex_exit(&mi->mi_lock);
1013 		if (fi) {
1014 			if (!VALID_FH(fi) &&
1015 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1016 				int remaperr;
1017 
1018 				svp = mi->mi_curr_serv;
1019 				remaperr = failover_remap(fi);
1020 				if (remaperr != 0) {
1021 #ifdef DEBUG
1022 					if (remaperr != EINTR)
1023 						nfs_cmn_err(remaperr, CE_WARN,
1024 					    "rfscall couldn't failover: %m");
1025 #endif
1026 					mutex_enter(&mi->mi_lock);
1027 					DEC_READERS(mi);
1028 					mutex_exit(&mi->mi_lock);
1029 					/*
1030 					 * If failover_remap returns ETIMEDOUT
1031 					 * and the filesystem is hard mounted
1032 					 * we have to retry the call with a new
1033 					 * server.
1034 					 */
1035 					if ((mi->mi_flags & MI_HARD) &&
1036 					    IS_RECOVERABLE_ERROR(remaperr)) {
1037 						if (svp == mi->mi_curr_serv)
1038 							failover_newserver(mi);
1039 						rpcerr.re_status = RPC_SUCCESS;
1040 						goto failoverretry;
1041 					}
1042 					rpcerr.re_errno = remaperr;
1043 					return (remaperr);
1044 				}
1045 			}
1046 			if (fi->fhp && fi->copyproc)
1047 				(*fi->copyproc)(fi->fhp, fi->vp);
1048 		}
1049 	}
1050 
1051 	/* For TSOL, use a new cred which has net_mac_aware flag */
1052 	if (!cred_cloned && is_system_labeled()) {
1053 		cred_cloned = TRUE;
1054 		cr = crdup(icr);
1055 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1056 	}
1057 
1058 	/*
1059 	 * clget() calls clnt_tli_kinit() which clears the xid, so we
1060 	 * are guaranteed to reprocess the retry as a new request.
1061 	 */
1062 	svp = mi->mi_curr_serv;
1063 	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1064 
1065 	if (FAILOVER_MOUNT(mi)) {
1066 		mutex_enter(&mi->mi_lock);
1067 		DEC_READERS(mi);
1068 		mutex_exit(&mi->mi_lock);
1069 
1070 		if ((rpcerr.re_errno == ETIMEDOUT ||
1071 		    rpcerr.re_errno == ECONNRESET) &&
1072 		    failover_safe(fi)) {
1073 			if (svp == mi->mi_curr_serv)
1074 				failover_newserver(mi);
1075 			goto failoverretry;
1076 		}
1077 	}
1078 	if (rpcerr.re_errno != 0)
1079 		return (rpcerr.re_errno);
1080 
1081 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1082 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1083 		timeo = (mi->mi_timeo * hz) / 10;
1084 	} else {
1085 		mutex_enter(&mi->mi_lock);
1086 		timeo = CLNT_SETTIMERS(client,
1087 		    &(mi->mi_timers[mi->mi_timer_type[which]]),
1088 		    &(mi->mi_timers[NFS_CALLTYPES]),
1089 		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1090 		    (void (*)())NULL, (caddr_t)mi, 0);
1091 		mutex_exit(&mi->mi_lock);
1092 	}
1093 
1094 	/*
1095 	 * If hard mounted fs, retry call forever unless hard error occurs.
1096 	 */
1097 	do {
1098 		tryagain = FALSE;
1099 
1100 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1101 			status = RPC_FAILED;
1102 			rpcerr.re_status = RPC_FAILED;
1103 			rpcerr.re_errno = EIO;
1104 			break;
1105 		}
1106 
1107 		TICK_TO_TIMEVAL(timeo, &wait);
1108 
1109 		/*
1110 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1111 		 * and SIGTERM. (Preserving the existing masks).
1112 		 * Mask out SIGINT if mount option nointr is specified.
1113 		 */
1114 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1115 		if (!(mi->mi_flags & MI_INT))
1116 			client->cl_nosignal = TRUE;
1117 
1118 		/*
1119 		 * If there is a current signal, then don't bother
1120 		 * even trying to send out the request because we
1121 		 * won't be able to block waiting for the response.
1122 		 * Simply assume RPC_INTR and get on with it.
1123 		 */
1124 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1125 			status = RPC_INTR;
1126 		else {
1127 			status = CLNT_CALL(client, which, xdrargs, argsp,
1128 			    xdrres, resp, wait);
1129 		}
1130 
1131 		if (!(mi->mi_flags & MI_INT))
1132 			client->cl_nosignal = FALSE;
1133 		/*
1134 		 * restore original signal mask
1135 		 */
1136 		sigunintr(&smask);
1137 
1138 		switch (status) {
1139 		case RPC_SUCCESS:
1140 			if ((mi->mi_flags & MI_DYNAMIC) &&
1141 			    mi->mi_timer_type[which] != 0 &&
1142 			    (mi->mi_curread != my_rsize ||
1143 			    mi->mi_curwrite != my_wsize))
1144 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1145 			break;
1146 
1147 		case RPC_INTR:
1148 			/*
1149 			 * There is no way to recover from this error,
1150 			 * even if mount option nointr is specified.
1151 			 * SIGKILL, for example, cannot be blocked.
1152 			 */
1153 			rpcerr.re_status = RPC_INTR;
1154 			rpcerr.re_errno = EINTR;
1155 			break;
1156 
1157 		case RPC_UDERROR:
1158 			/*
1159 			 * If the NFS server is local (vold) and
1160 			 * it goes away then we get RPC_UDERROR.
1161 			 * This is a retryable error, so we would
1162 			 * loop, so check to see if the specific
1163 			 * error was ECONNRESET, indicating that
1164 			 * target did not exist at all.  If so,
1165 			 * return with RPC_PROGUNAVAIL and
1166 			 * ECONNRESET to indicate why.
1167 			 */
1168 			CLNT_GETERR(client, &rpcerr);
1169 			if (rpcerr.re_errno == ECONNRESET) {
1170 				rpcerr.re_status = RPC_PROGUNAVAIL;
1171 				rpcerr.re_errno = ECONNRESET;
1172 				break;
1173 			}
1174 			/*FALLTHROUGH*/
1175 
1176 		default:		/* probably RPC_TIMEDOUT */
1177 			if (IS_UNRECOVERABLE_RPC(status))
1178 				break;
1179 
1180 			/*
1181 			 * increment server not responding count
1182 			 */
1183 			mutex_enter(&mi->mi_lock);
1184 			mi->mi_noresponse++;
1185 			mutex_exit(&mi->mi_lock);
1186 #ifdef DEBUG
1187 			nfscl->nfscl_stat.noresponse.value.ui64++;
1188 #endif
1189 
1190 			if (!(mi->mi_flags & MI_HARD)) {
1191 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1192 				    (mi->mi_ss_call_type[which] == 0))
1193 					break;
1194 			}
1195 
1196 			/*
1197 			 * The call is in progress (over COTS).
1198 			 * Try the CLNT_CALL again, but don't
1199 			 * print a noisy error message.
1200 			 */
1201 			if (status == RPC_INPROGRESS) {
1202 				tryagain = TRUE;
1203 				break;
1204 			}
1205 
1206 			if (flags & RFSCALL_SOFT)
1207 				break;
1208 
1209 			/*
1210 			 * On zone shutdown, just move on.
1211 			 */
1212 			if (zone_status_get(curproc->p_zone) >=
1213 			    ZONE_IS_SHUTTING_DOWN) {
1214 				rpcerr.re_status = RPC_FAILED;
1215 				rpcerr.re_errno = EIO;
1216 				break;
1217 			}
1218 
1219 			/*
1220 			 * NFS client failover support
1221 			 *
1222 			 * If the current server just failed us, we'll
1223 			 * start the process of finding a new server.
1224 			 * After that, we can just retry.
1225 			 */
1226 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1227 				if (svp == mi->mi_curr_serv)
1228 					failover_newserver(mi);
1229 				clfree_impl(client, ch, nfscl);
1230 				goto failoverretry;
1231 			}
1232 
1233 			tryagain = TRUE;
1234 			timeo = backoff(timeo);
1235 
1236 			CLNT_GETERR(client, &rpcerr_tmp);
1237 			if ((status == RPC_CANTSEND) &&
1238 			    (rpcerr_tmp.re_errno == ENOBUFS))
1239 				msg = SRV_QFULL_MSG;
1240 			else
1241 				msg = SRV_NOTRESP_MSG;
1242 
1243 			mutex_enter(&mi->mi_lock);
1244 			if (!(mi->mi_flags & MI_PRINTED)) {
1245 				mi->mi_flags |= MI_PRINTED;
1246 				mutex_exit(&mi->mi_lock);
1247 #ifdef DEBUG
1248 				zprintf(zoneid, msg, mi->mi_vers,
1249 				    svp->sv_hostname);
1250 #else
1251 				zprintf(zoneid, msg, svp->sv_hostname);
1252 #endif
1253 			} else
1254 				mutex_exit(&mi->mi_lock);
1255 			if (*douprintf && nfs_has_ctty()) {
1256 				*douprintf = 0;
1257 				if (!(mi->mi_flags & MI_NOPRINT))
1258 #ifdef DEBUG
1259 					uprintf(msg, mi->mi_vers,
1260 					    svp->sv_hostname);
1261 #else
1262 					uprintf(msg, svp->sv_hostname);
1263 #endif
1264 			}
1265 
1266 			/*
1267 			 * If doing dynamic adjustment of transfer
1268 			 * size and if it's a read or write call
1269 			 * and if the transfer size changed while
1270 			 * retransmitting or if the feedback routine
1271 			 * changed the transfer size,
1272 			 * then exit rfscall so that the transfer
1273 			 * size can be adjusted at the vnops level.
1274 			 */
1275 			if ((mi->mi_flags & MI_DYNAMIC) &&
1276 			    mi->mi_timer_type[which] != 0 &&
1277 			    (mi->mi_curread != my_rsize ||
1278 			    mi->mi_curwrite != my_wsize ||
1279 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1280 				/*
1281 				 * On read or write calls, return
1282 				 * back to the vnode ops level if
1283 				 * the transfer size changed.
1284 				 */
1285 				clfree_impl(client, ch, nfscl);
1286 				if (cred_cloned)
1287 					crfree(cr);
1288 				return (ENFS_TRYAGAIN);
1289 			}
1290 		}
1291 	} while (tryagain);
1292 
1293 	if (status != RPC_SUCCESS) {
1294 		/*
1295 		 * Let soft mounts use the timed out message.
1296 		 */
1297 		if (status == RPC_INPROGRESS)
1298 			status = RPC_TIMEDOUT;
1299 		nfscl->nfscl_stat.badcalls.value.ui64++;
1300 		if (status != RPC_INTR) {
1301 			mutex_enter(&mi->mi_lock);
1302 			mi->mi_flags |= MI_DOWN;
1303 			mutex_exit(&mi->mi_lock);
1304 			CLNT_GETERR(client, &rpcerr);
1305 #ifdef DEBUG
1306 			bufp = clnt_sperror(client, svp->sv_hostname);
1307 			zprintf(zoneid, "NFS%d %s failed for %s\n",
1308 			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
1309 			if (nfs_has_ctty()) {
1310 				if (!(mi->mi_flags & MI_NOPRINT)) {
1311 					uprintf("NFS%d %s failed for %s\n",
1312 					    mi->mi_vers, mi->mi_rfsnames[which],
1313 					    bufp);
1314 				}
1315 			}
1316 			kmem_free(bufp, MAXPATHLEN);
1317 #else
1318 			zprintf(zoneid,
1319 			    "NFS %s failed for server %s: error %d (%s)\n",
1320 			    mi->mi_rfsnames[which], svp->sv_hostname,
1321 			    status, clnt_sperrno(status));
1322 			if (nfs_has_ctty()) {
1323 				if (!(mi->mi_flags & MI_NOPRINT)) {
1324 					uprintf(
1325 				"NFS %s failed for server %s: error %d (%s)\n",
1326 					    mi->mi_rfsnames[which],
1327 					    svp->sv_hostname, status,
1328 					    clnt_sperrno(status));
1329 				}
1330 			}
1331 #endif
1332 			/*
1333 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1334 			 * re_errno is set appropriately depending on
1335 			 * the authentication error
1336 			 */
1337 			if (status == RPC_VERSMISMATCH ||
1338 			    status == RPC_PROGVERSMISMATCH)
1339 				rpcerr.re_errno = EIO;
1340 		}
1341 	} else {
1342 		/*
1343 		 * Test the value of mi_down and mi_printed without
1344 		 * holding the mi_lock mutex.  If they are both zero,
1345 		 * then it is okay to skip the down and printed
1346 		 * processing.  This saves on a mutex_enter and
1347 		 * mutex_exit pair for a normal, successful RPC.
1348 		 * This was just complete overhead.
1349 		 */
1350 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1351 			mutex_enter(&mi->mi_lock);
1352 			mi->mi_flags &= ~MI_DOWN;
1353 			if (mi->mi_flags & MI_PRINTED) {
1354 				mi->mi_flags &= ~MI_PRINTED;
1355 				mutex_exit(&mi->mi_lock);
1356 #ifdef DEBUG
1357 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1358 				zprintf(zoneid, "NFS%d server %s ok\n",
1359 				    mi->mi_vers, svp->sv_hostname);
1360 #else
1361 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362 				zprintf(zoneid, "NFS server %s ok\n",
1363 				    svp->sv_hostname);
1364 #endif
1365 			} else
1366 				mutex_exit(&mi->mi_lock);
1367 		}
1368 
1369 		if (*douprintf == 0) {
1370 			if (!(mi->mi_flags & MI_NOPRINT))
1371 #ifdef DEBUG
1372 				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1373 					uprintf("NFS%d server %s ok\n",
1374 					    mi->mi_vers, svp->sv_hostname);
1375 #else
1376 			if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1377 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1378 #endif
1379 			*douprintf = 1;
1380 		}
1381 	}
1382 
1383 	clfree_impl(client, ch, nfscl);
1384 	if (cred_cloned)
1385 		crfree(cr);
1386 
1387 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1388 
1389 	if (rpc_status != NULL)
1390 		*rpc_status = rpcerr.re_status;
1391 
1392 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1393 	    rpcerr.re_errno);
1394 
1395 	return (rpcerr.re_errno);
1396 }
1397 
1398 #ifdef DEBUG
1399 static int acl2call_hits = 0;
1400 static int acl2call_misses = 0;
1401 #endif
1402 
1403 int
1404 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1405     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1406     enum nfsstat *statusp, int flags, failinfo_t *fi)
1407 {
1408 	int rpcerror;
1409 
1410 	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1411 	    cr, douprintf, flags, fi);
1412 	if (!rpcerror) {
1413 		/*
1414 		 * See comments with crnetadjust().
1415 		 */
1416 		if (*statusp == NFSERR_ACCES &&
1417 		    (cr = crnetadjust(cr)) != NULL) {
1418 #ifdef DEBUG
1419 			acl2call_hits++;
1420 #endif
1421 			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1422 			    resp, cr, douprintf, flags, fi);
1423 			crfree(cr);
1424 #ifdef DEBUG
1425 			if (*statusp == NFSERR_ACCES)
1426 				acl2call_misses++;
1427 #endif
1428 		}
1429 	}
1430 
1431 	return (rpcerror);
1432 }
1433 
1434 #ifdef DEBUG
1435 static int acl3call_hits = 0;
1436 static int acl3call_misses = 0;
1437 #endif
1438 
1439 int
1440 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1441     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1442     nfsstat3 *statusp, int flags, failinfo_t *fi)
1443 {
1444 	int rpcerror;
1445 	int user_informed;
1446 
1447 	user_informed = 0;
1448 
1449 	do {
1450 		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1451 		    cr, douprintf, flags, fi);
1452 		if (!rpcerror) {
1453 			cred_t *crr;
1454 			if (*statusp == NFS3ERR_JUKEBOX) {
1455 				if (!user_informed) {
1456 					user_informed = 1;
1457 					uprintf(
1458 		"file temporarily unavailable on the server, retrying...\n");
1459 				}
1460 				delay(nfs3_jukebox_delay);
1461 			}
1462 			/*
1463 			 * See crnetadjust() for comments.
1464 			 */
1465 			else if (*statusp == NFS3ERR_ACCES &&
1466 			    (crr = crnetadjust(cr)) != NULL) {
1467 #ifdef DEBUG
1468 				acl3call_hits++;
1469 #endif
1470 				rpcerror = aclcall(mi, which, xdrargs, argsp,
1471 				    xdrres, resp, crr, douprintf, flags, fi);
1472 
1473 				crfree(crr);
1474 #ifdef DEBUG
1475 				if (*statusp == NFS3ERR_ACCES)
1476 					acl3call_misses++;
1477 #endif
1478 			}
1479 		}
1480 	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1481 
1482 	return (rpcerror);
1483 }
1484 
1485 static int
1486 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1487     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1488     int flags, failinfo_t *fi)
1489 {
1490 	CLIENT *client;
1491 	struct chtab *ch;
1492 	cred_t *cr = icr;
1493 	bool_t cred_cloned = FALSE;
1494 	enum clnt_stat status;
1495 	struct rpc_err rpcerr;
1496 	struct timeval wait;
1497 	int timeo;		/* in units of hz */
1498 #if 0 /* notyet */
1499 	int my_rsize, my_wsize;
1500 #endif
1501 	bool_t tryagain;
1502 	k_sigset_t smask;
1503 	servinfo_t *svp;
1504 	struct nfs_clnt *nfscl;
1505 	zoneid_t zoneid = getzoneid();
1506 #ifdef DEBUG
1507 	char *bufp;
1508 #endif
1509 
1510 #if 0 /* notyet */
1511 	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1512 	    "rfscall_start:which %d mi %p", which, mi);
1513 #endif
1514 
1515 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1516 	ASSERT(nfscl != NULL);
1517 
1518 	nfscl->nfscl_stat.calls.value.ui64++;
1519 	mi->mi_aclreqs[which].value.ui64++;
1520 
1521 	rpcerr.re_status = RPC_SUCCESS;
1522 
1523 	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1524 		rpcerr.re_status = RPC_FAILED;
1525 		rpcerr.re_errno = EIO;
1526 		return (rpcerr.re_errno);
1527 	}
1528 
1529 #if 0 /* notyet */
1530 	/*
1531 	 * Remember the transfer sizes in case
1532 	 * nfs_feedback changes them underneath us.
1533 	 */
1534 	my_rsize = mi->mi_curread;
1535 	my_wsize = mi->mi_curwrite;
1536 #endif
1537 
1538 	/*
1539 	 * NFS client failover support
1540 	 *
1541 	 * If this rnode is not in sync with the current server (VALID_FH),
1542 	 * we'd like to do a remap to get in sync.  We can be interrupted
1543 	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1544 	 * use the best info we have to try the RPC.  Part of that is
1545 	 * unconditionally updating the filehandle copy kept for V3.
1546 	 *
1547 	 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible
1548 	 * rw_enter(); we're trying to keep the current server from being
1549 	 * changed on us until we're done with the remapping and have a
1550 	 * matching client handle.  We don't want to sending a filehandle
1551 	 * to the wrong host.
1552 	 */
1553 failoverretry:
1554 	if (FAILOVER_MOUNT(mi)) {
1555 		mutex_enter(&mi->mi_lock);
1556 		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1557 			if (failover_wait(mi)) {
1558 				mutex_exit(&mi->mi_lock);
1559 				return (EINTR);
1560 			}
1561 		}
1562 		INC_READERS(mi);
1563 		mutex_exit(&mi->mi_lock);
1564 		if (fi) {
1565 			if (!VALID_FH(fi) &&
1566 			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1567 				int remaperr;
1568 
1569 				svp = mi->mi_curr_serv;
1570 				remaperr = failover_remap(fi);
1571 				if (remaperr != 0) {
1572 #ifdef DEBUG
1573 					if (remaperr != EINTR)
1574 						nfs_cmn_err(remaperr, CE_WARN,
1575 					    "aclcall couldn't failover: %m");
1576 #endif
1577 					mutex_enter(&mi->mi_lock);
1578 					DEC_READERS(mi);
1579 					mutex_exit(&mi->mi_lock);
1580 
1581 					/*
1582 					 * If failover_remap returns ETIMEDOUT
1583 					 * and the filesystem is hard mounted
1584 					 * we have to retry the call with a new
1585 					 * server.
1586 					 */
1587 					if ((mi->mi_flags & MI_HARD) &&
1588 					    IS_RECOVERABLE_ERROR(remaperr)) {
1589 						if (svp == mi->mi_curr_serv)
1590 							failover_newserver(mi);
1591 						rpcerr.re_status = RPC_SUCCESS;
1592 						goto failoverretry;
1593 					}
1594 					return (remaperr);
1595 				}
1596 			}
1597 			if (fi->fhp && fi->copyproc)
1598 				(*fi->copyproc)(fi->fhp, fi->vp);
1599 		}
1600 	}
1601 
1602 	/* For TSOL, use a new cred which has net_mac_aware flag */
1603 	if (!cred_cloned && is_system_labeled()) {
1604 		cred_cloned = TRUE;
1605 		cr = crdup(icr);
1606 		(void) setpflags(NET_MAC_AWARE, 1, cr);
1607 	}
1608 
1609 	/*
1610 	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1611 	 * are guaranteed to reprocess the retry as a new request.
1612 	 */
1613 	svp = mi->mi_curr_serv;
1614 	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1615 	if (FAILOVER_MOUNT(mi)) {
1616 		mutex_enter(&mi->mi_lock);
1617 		DEC_READERS(mi);
1618 		mutex_exit(&mi->mi_lock);
1619 
1620 		if ((rpcerr.re_errno == ETIMEDOUT ||
1621 		    rpcerr.re_errno == ECONNRESET) &&
1622 		    failover_safe(fi)) {
1623 			if (svp == mi->mi_curr_serv)
1624 				failover_newserver(mi);
1625 			goto failoverretry;
1626 		}
1627 	}
1628 	if (rpcerr.re_errno != 0) {
1629 		if (cred_cloned)
1630 			crfree(cr);
1631 		return (rpcerr.re_errno);
1632 	}
1633 
1634 	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1635 	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1636 		timeo = (mi->mi_timeo * hz) / 10;
1637 	} else {
1638 		mutex_enter(&mi->mi_lock);
1639 		timeo = CLNT_SETTIMERS(client,
1640 		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1641 		    &(mi->mi_timers[NFS_CALLTYPES]),
1642 		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1643 		    (void (*)()) 0, (caddr_t)mi, 0);
1644 		mutex_exit(&mi->mi_lock);
1645 	}
1646 
1647 	/*
1648 	 * If hard mounted fs, retry call forever unless hard error occurs.
1649 	 */
1650 	do {
1651 		tryagain = FALSE;
1652 
1653 		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1654 			status = RPC_FAILED;
1655 			rpcerr.re_status = RPC_FAILED;
1656 			rpcerr.re_errno = EIO;
1657 			break;
1658 		}
1659 
1660 		TICK_TO_TIMEVAL(timeo, &wait);
1661 
1662 		/*
1663 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1664 		 * and SIGTERM. (Preserving the existing masks).
1665 		 * Mask out SIGINT if mount option nointr is specified.
1666 		 */
1667 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
1668 		if (!(mi->mi_flags & MI_INT))
1669 			client->cl_nosignal = TRUE;
1670 
1671 		/*
1672 		 * If there is a current signal, then don't bother
1673 		 * even trying to send out the request because we
1674 		 * won't be able to block waiting for the response.
1675 		 * Simply assume RPC_INTR and get on with it.
1676 		 */
1677 		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1678 			status = RPC_INTR;
1679 		else {
1680 			status = CLNT_CALL(client, which, xdrargs, argsp,
1681 			    xdrres, resp, wait);
1682 		}
1683 
1684 		if (!(mi->mi_flags & MI_INT))
1685 			client->cl_nosignal = FALSE;
1686 		/*
1687 		 * restore original signal mask
1688 		 */
1689 		sigunintr(&smask);
1690 
1691 		switch (status) {
1692 		case RPC_SUCCESS:
1693 #if 0 /* notyet */
1694 			if ((mi->mi_flags & MI_DYNAMIC) &&
1695 			    mi->mi_timer_type[which] != 0 &&
1696 			    (mi->mi_curread != my_rsize ||
1697 			    mi->mi_curwrite != my_wsize))
1698 				(void) nfs_feedback(FEEDBACK_OK, which, mi);
1699 #endif
1700 			break;
1701 
1702 		/*
1703 		 * Unfortunately, there are servers in the world which
1704 		 * are not coded correctly.  They are not prepared to
1705 		 * handle RPC requests to the NFS port which are not
1706 		 * NFS requests.  Thus, they may try to process the
1707 		 * NFS_ACL request as if it were an NFS request.  This
1708 		 * does not work.  Generally, an error will be generated
1709 		 * on the client because it will not be able to decode
1710 		 * the response from the server.  However, it seems
1711 		 * possible that the server may not be able to decode
1712 		 * the arguments.  Thus, the criteria for deciding
1713 		 * whether the server supports NFS_ACL or not is whether
1714 		 * the following RPC errors are returned from CLNT_CALL.
1715 		 */
1716 		case RPC_CANTDECODERES:
1717 		case RPC_PROGUNAVAIL:
1718 		case RPC_CANTDECODEARGS:
1719 		case RPC_PROGVERSMISMATCH:
1720 			mutex_enter(&mi->mi_lock);
1721 			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1722 			mutex_exit(&mi->mi_lock);
1723 			break;
1724 
1725 		/*
1726 		 * If the server supports NFS_ACL but not the new ops
1727 		 * for extended attributes, make sure we don't retry.
1728 		 */
1729 		case RPC_PROCUNAVAIL:
1730 			mutex_enter(&mi->mi_lock);
1731 			mi->mi_flags &= ~MI_EXTATTR;
1732 			mutex_exit(&mi->mi_lock);
1733 			break;
1734 
1735 		case RPC_INTR:
1736 			/*
1737 			 * There is no way to recover from this error,
1738 			 * even if mount option nointr is specified.
1739 			 * SIGKILL, for example, cannot be blocked.
1740 			 */
1741 			rpcerr.re_status = RPC_INTR;
1742 			rpcerr.re_errno = EINTR;
1743 			break;
1744 
1745 		case RPC_UDERROR:
1746 			/*
1747 			 * If the NFS server is local (vold) and
1748 			 * it goes away then we get RPC_UDERROR.
1749 			 * This is a retryable error, so we would
1750 			 * loop, so check to see if the specific
1751 			 * error was ECONNRESET, indicating that
1752 			 * target did not exist at all.  If so,
1753 			 * return with RPC_PROGUNAVAIL and
1754 			 * ECONNRESET to indicate why.
1755 			 */
1756 			CLNT_GETERR(client, &rpcerr);
1757 			if (rpcerr.re_errno == ECONNRESET) {
1758 				rpcerr.re_status = RPC_PROGUNAVAIL;
1759 				rpcerr.re_errno = ECONNRESET;
1760 				break;
1761 			}
1762 			/*FALLTHROUGH*/
1763 
1764 		default:		/* probably RPC_TIMEDOUT */
1765 			if (IS_UNRECOVERABLE_RPC(status))
1766 				break;
1767 
1768 			/*
1769 			 * increment server not responding count
1770 			 */
1771 			mutex_enter(&mi->mi_lock);
1772 			mi->mi_noresponse++;
1773 			mutex_exit(&mi->mi_lock);
1774 #ifdef DEBUG
1775 			nfscl->nfscl_stat.noresponse.value.ui64++;
1776 #endif
1777 
1778 			if (!(mi->mi_flags & MI_HARD)) {
1779 				if (!(mi->mi_flags & MI_SEMISOFT) ||
1780 				    (mi->mi_acl_ss_call_type[which] == 0))
1781 					break;
1782 			}
1783 
1784 			/*
1785 			 * The call is in progress (over COTS).
1786 			 * Try the CLNT_CALL again, but don't
1787 			 * print a noisy error message.
1788 			 */
1789 			if (status == RPC_INPROGRESS) {
1790 				tryagain = TRUE;
1791 				break;
1792 			}
1793 
1794 			if (flags & RFSCALL_SOFT)
1795 				break;
1796 
1797 			/*
1798 			 * On zone shutdown, just move on.
1799 			 */
1800 			if (zone_status_get(curproc->p_zone) >=
1801 			    ZONE_IS_SHUTTING_DOWN) {
1802 				rpcerr.re_status = RPC_FAILED;
1803 				rpcerr.re_errno = EIO;
1804 				break;
1805 			}
1806 
1807 			/*
1808 			 * NFS client failover support
1809 			 *
1810 			 * If the current server just failed us, we'll
1811 			 * start the process of finding a new server.
1812 			 * After that, we can just retry.
1813 			 */
1814 			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1815 				if (svp == mi->mi_curr_serv)
1816 					failover_newserver(mi);
1817 				clfree_impl(client, ch, nfscl);
1818 				goto failoverretry;
1819 			}
1820 
1821 			tryagain = TRUE;
1822 			timeo = backoff(timeo);
1823 			mutex_enter(&mi->mi_lock);
1824 			if (!(mi->mi_flags & MI_PRINTED)) {
1825 				mi->mi_flags |= MI_PRINTED;
1826 				mutex_exit(&mi->mi_lock);
1827 #ifdef DEBUG
1828 				zprintf(zoneid,
1829 			"NFS_ACL%d server %s not responding still trying\n",
1830 				    mi->mi_vers, svp->sv_hostname);
1831 #else
1832 				zprintf(zoneid,
1833 			    "NFS server %s not responding still trying\n",
1834 				    svp->sv_hostname);
1835 #endif
1836 			} else
1837 				mutex_exit(&mi->mi_lock);
1838 			if (*douprintf && nfs_has_ctty()) {
1839 				*douprintf = 0;
1840 				if (!(mi->mi_flags & MI_NOPRINT))
1841 #ifdef DEBUG
1842 					uprintf(
1843 			"NFS_ACL%d server %s not responding still trying\n",
1844 					    mi->mi_vers, svp->sv_hostname);
1845 #else
1846 					uprintf(
1847 			    "NFS server %s not responding still trying\n",
1848 					    svp->sv_hostname);
1849 #endif
1850 			}
1851 
1852 #if 0 /* notyet */
1853 			/*
1854 			 * If doing dynamic adjustment of transfer
1855 			 * size and if it's a read or write call
1856 			 * and if the transfer size changed while
1857 			 * retransmitting or if the feedback routine
1858 			 * changed the transfer size,
1859 			 * then exit rfscall so that the transfer
1860 			 * size can be adjusted at the vnops level.
1861 			 */
1862 			if ((mi->mi_flags & MI_DYNAMIC) &&
1863 			    mi->mi_acl_timer_type[which] != 0 &&
1864 			    (mi->mi_curread != my_rsize ||
1865 			    mi->mi_curwrite != my_wsize ||
1866 			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1867 				/*
1868 				 * On read or write calls, return
1869 				 * back to the vnode ops level if
1870 				 * the transfer size changed.
1871 				 */
1872 				clfree_impl(client, ch, nfscl);
1873 				if (cred_cloned)
1874 					crfree(cr);
1875 				return (ENFS_TRYAGAIN);
1876 			}
1877 #endif
1878 		}
1879 	} while (tryagain);
1880 
1881 	if (status != RPC_SUCCESS) {
1882 		/*
1883 		 * Let soft mounts use the timed out message.
1884 		 */
1885 		if (status == RPC_INPROGRESS)
1886 			status = RPC_TIMEDOUT;
1887 		nfscl->nfscl_stat.badcalls.value.ui64++;
1888 		if (status == RPC_CANTDECODERES ||
1889 		    status == RPC_PROGUNAVAIL ||
1890 		    status == RPC_PROCUNAVAIL ||
1891 		    status == RPC_CANTDECODEARGS ||
1892 		    status == RPC_PROGVERSMISMATCH)
1893 			CLNT_GETERR(client, &rpcerr);
1894 		else if (status != RPC_INTR) {
1895 			mutex_enter(&mi->mi_lock);
1896 			mi->mi_flags |= MI_DOWN;
1897 			mutex_exit(&mi->mi_lock);
1898 			CLNT_GETERR(client, &rpcerr);
1899 #ifdef DEBUG
1900 			bufp = clnt_sperror(client, svp->sv_hostname);
1901 			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1902 			    mi->mi_vers, mi->mi_aclnames[which], bufp);
1903 			if (nfs_has_ctty()) {
1904 				if (!(mi->mi_flags & MI_NOPRINT)) {
1905 					uprintf("NFS_ACL%d %s failed for %s\n",
1906 					    mi->mi_vers, mi->mi_aclnames[which],
1907 					    bufp);
1908 				}
1909 			}
1910 			kmem_free(bufp, MAXPATHLEN);
1911 #else
1912 			zprintf(zoneid,
1913 			    "NFS %s failed for server %s: error %d (%s)\n",
1914 			    mi->mi_aclnames[which], svp->sv_hostname,
1915 			    status, clnt_sperrno(status));
1916 			if (nfs_has_ctty()) {
1917 				if (!(mi->mi_flags & MI_NOPRINT))
1918 					uprintf(
1919 				"NFS %s failed for server %s: error %d (%s)\n",
1920 					    mi->mi_aclnames[which],
1921 					    svp->sv_hostname, status,
1922 					    clnt_sperrno(status));
1923 			}
1924 #endif
1925 			/*
1926 			 * when CLNT_CALL() fails with RPC_AUTHERROR,
1927 			 * re_errno is set appropriately depending on
1928 			 * the authentication error
1929 			 */
1930 			if (status == RPC_VERSMISMATCH ||
1931 			    status == RPC_PROGVERSMISMATCH)
1932 				rpcerr.re_errno = EIO;
1933 		}
1934 	} else {
1935 		/*
1936 		 * Test the value of mi_down and mi_printed without
1937 		 * holding the mi_lock mutex.  If they are both zero,
1938 		 * then it is okay to skip the down and printed
1939 		 * processing.  This saves on a mutex_enter and
1940 		 * mutex_exit pair for a normal, successful RPC.
1941 		 * This was just complete overhead.
1942 		 */
1943 		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1944 			mutex_enter(&mi->mi_lock);
1945 			mi->mi_flags &= ~MI_DOWN;
1946 			if (mi->mi_flags & MI_PRINTED) {
1947 				mi->mi_flags &= ~MI_PRINTED;
1948 				mutex_exit(&mi->mi_lock);
1949 #ifdef DEBUG
1950 				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1951 				    mi->mi_vers, svp->sv_hostname);
1952 #else
1953 				zprintf(zoneid, "NFS server %s ok\n",
1954 				    svp->sv_hostname);
1955 #endif
1956 			} else
1957 				mutex_exit(&mi->mi_lock);
1958 		}
1959 
1960 		if (*douprintf == 0) {
1961 			if (!(mi->mi_flags & MI_NOPRINT))
1962 #ifdef DEBUG
1963 				uprintf("NFS_ACL%d server %s ok\n",
1964 				    mi->mi_vers, svp->sv_hostname);
1965 #else
1966 				uprintf("NFS server %s ok\n", svp->sv_hostname);
1967 #endif
1968 			*douprintf = 1;
1969 		}
1970 	}
1971 
1972 	clfree_impl(client, ch, nfscl);
1973 	if (cred_cloned)
1974 		crfree(cr);
1975 
1976 	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1977 
1978 #if 0 /* notyet */
1979 	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1980 	    rpcerr.re_errno);
1981 #endif
1982 
1983 	return (rpcerr.re_errno);
1984 }
1985 
1986 int
1987 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1988 {
1989 	uint_t mask = vap->va_mask;
1990 
1991 	if (!(mask & AT_MODE))
1992 		sa->sa_mode = (uint32_t)-1;
1993 	else
1994 		sa->sa_mode = vap->va_mode;
1995 	if (!(mask & AT_UID))
1996 		sa->sa_uid = (uint32_t)-1;
1997 	else
1998 		sa->sa_uid = (uint32_t)vap->va_uid;
1999 	if (!(mask & AT_GID))
2000 		sa->sa_gid = (uint32_t)-1;
2001 	else
2002 		sa->sa_gid = (uint32_t)vap->va_gid;
2003 	if (!(mask & AT_SIZE))
2004 		sa->sa_size = (uint32_t)-1;
2005 	else
2006 		sa->sa_size = (uint32_t)vap->va_size;
2007 	if (!(mask & AT_ATIME))
2008 		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2009 	else {
2010 		/* check time validity */
2011 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2012 			return (EOVERFLOW);
2013 		}
2014 		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2015 		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2016 	}
2017 	if (!(mask & AT_MTIME))
2018 		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2019 	else {
2020 		/* check time validity */
2021 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2022 			return (EOVERFLOW);
2023 		}
2024 		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2025 		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2026 	}
2027 	return (0);
2028 }
2029 
2030 int
2031 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2032 {
2033 	uint_t mask = vap->va_mask;
2034 
2035 	if (!(mask & AT_MODE))
2036 		sa->mode.set_it = FALSE;
2037 	else {
2038 		sa->mode.set_it = TRUE;
2039 		sa->mode.mode = (mode3)vap->va_mode;
2040 	}
2041 	if (!(mask & AT_UID))
2042 		sa->uid.set_it = FALSE;
2043 	else {
2044 		sa->uid.set_it = TRUE;
2045 		sa->uid.uid = (uid3)vap->va_uid;
2046 	}
2047 	if (!(mask & AT_GID))
2048 		sa->gid.set_it = FALSE;
2049 	else {
2050 		sa->gid.set_it = TRUE;
2051 		sa->gid.gid = (gid3)vap->va_gid;
2052 	}
2053 	if (!(mask & AT_SIZE))
2054 		sa->size.set_it = FALSE;
2055 	else {
2056 		sa->size.set_it = TRUE;
2057 		sa->size.size = (size3)vap->va_size;
2058 	}
2059 	if (!(mask & AT_ATIME))
2060 		sa->atime.set_it = DONT_CHANGE;
2061 	else {
2062 		/* check time validity */
2063 		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2064 			return (EOVERFLOW);
2065 		}
2066 		sa->atime.set_it = SET_TO_CLIENT_TIME;
2067 		sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2068 		sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2069 	}
2070 	if (!(mask & AT_MTIME))
2071 		sa->mtime.set_it = DONT_CHANGE;
2072 	else {
2073 		/* check time validity */
2074 		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2075 			return (EOVERFLOW);
2076 		}
2077 		sa->mtime.set_it = SET_TO_CLIENT_TIME;
2078 		sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2079 		sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2080 	}
2081 	return (0);
2082 }
2083 
2084 void
2085 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2086 {
2087 
2088 	da->da_fhandle = VTOFH(dvp);
2089 	da->da_name = nm;
2090 	da->da_flags = 0;
2091 }
2092 
2093 void
2094 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2095 {
2096 
2097 	da->dirp = VTOFH3(dvp);
2098 	da->name = nm;
2099 }
2100 
2101 int
2102 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2103 {
2104 	int error;
2105 	rnode_t *rp;
2106 	struct vattr va;
2107 
2108 	va.va_mask = AT_MODE | AT_GID;
2109 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2110 	if (error)
2111 		return (error);
2112 
2113 	/*
2114 	 * To determine the expected group-id of the created file:
2115 	 *  1)	If the filesystem was not mounted with the Old-BSD-compatible
2116 	 *	GRPID option, and the directory's set-gid bit is clear,
2117 	 *	then use the process's gid.
2118 	 *  2)	Otherwise, set the group-id to the gid of the parent directory.
2119 	 */
2120 	rp = VTOR(dvp);
2121 	mutex_enter(&rp->r_statelock);
2122 	if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2123 		*gidp = crgetgid(cr);
2124 	else
2125 		*gidp = va.va_gid;
2126 	mutex_exit(&rp->r_statelock);
2127 	return (0);
2128 }
2129 
2130 int
2131 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2132 {
2133 	int error;
2134 	struct vattr va;
2135 
2136 	va.va_mask = AT_MODE;
2137 	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2138 	if (error)
2139 		return (error);
2140 
2141 	/*
2142 	 * Modify the expected mode (om) so that the set-gid bit matches
2143 	 * that of the parent directory (dvp).
2144 	 */
2145 	if (va.va_mode & VSGID)
2146 		*omp |= VSGID;
2147 	else
2148 		*omp &= ~VSGID;
2149 	return (0);
2150 }
2151 
2152 void
2153 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2154 {
2155 
2156 	if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2157 		if (!(vp->v_flag & VSWAPLIKE)) {
2158 			mutex_enter(&vp->v_lock);
2159 			vp->v_flag |= VSWAPLIKE;
2160 			mutex_exit(&vp->v_lock);
2161 		}
2162 	} else {
2163 		if (vp->v_flag & VSWAPLIKE) {
2164 			mutex_enter(&vp->v_lock);
2165 			vp->v_flag &= ~VSWAPLIKE;
2166 			mutex_exit(&vp->v_lock);
2167 		}
2168 	}
2169 }
2170 
2171 /*
2172  * Free the resources associated with an rnode.
2173  */
2174 static void
2175 rinactive(rnode_t *rp, cred_t *cr)
2176 {
2177 	vnode_t *vp;
2178 	cred_t *cred;
2179 	char *contents;
2180 	int size;
2181 	vsecattr_t *vsp;
2182 	int error;
2183 	nfs3_pathconf_info *info;
2184 
2185 	/*
2186 	 * Before freeing anything, wait until all asynchronous
2187 	 * activity is done on this rnode.  This will allow all
2188 	 * asynchronous read ahead and write behind i/o's to
2189 	 * finish.
2190 	 */
2191 	mutex_enter(&rp->r_statelock);
2192 	while (rp->r_count > 0)
2193 		cv_wait(&rp->r_cv, &rp->r_statelock);
2194 	mutex_exit(&rp->r_statelock);
2195 
2196 	/*
2197 	 * Flush and invalidate all pages associated with the vnode.
2198 	 */
2199 	vp = RTOV(rp);
2200 	if (vn_has_cached_data(vp)) {
2201 		ASSERT(vp->v_type != VCHR);
2202 		if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2203 			error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2204 			if (error && (error == ENOSPC || error == EDQUOT)) {
2205 				mutex_enter(&rp->r_statelock);
2206 				if (!rp->r_error)
2207 					rp->r_error = error;
2208 				mutex_exit(&rp->r_statelock);
2209 			}
2210 		}
2211 		nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2212 	}
2213 
2214 	/*
2215 	 * Free any held credentials and caches which may be associated
2216 	 * with this rnode.
2217 	 */
2218 	mutex_enter(&rp->r_statelock);
2219 	cred = rp->r_cred;
2220 	rp->r_cred = NULL;
2221 	contents = rp->r_symlink.contents;
2222 	size = rp->r_symlink.size;
2223 	rp->r_symlink.contents = NULL;
2224 	vsp = rp->r_secattr;
2225 	rp->r_secattr = NULL;
2226 	info = rp->r_pathconf;
2227 	rp->r_pathconf = NULL;
2228 	mutex_exit(&rp->r_statelock);
2229 
2230 	/*
2231 	 * Free the held credential.
2232 	 */
2233 	if (cred != NULL)
2234 		crfree(cred);
2235 
2236 	/*
2237 	 * Free the access cache entries.
2238 	 */
2239 	(void) nfs_access_purge_rp(rp);
2240 
2241 	/*
2242 	 * Free the readdir cache entries.
2243 	 */
2244 	if (HAVE_RDDIR_CACHE(rp))
2245 		nfs_purge_rddir_cache(vp);
2246 
2247 	/*
2248 	 * Free the symbolic link cache.
2249 	 */
2250 	if (contents != NULL) {
2251 
2252 		kmem_free((void *)contents, size);
2253 	}
2254 
2255 	/*
2256 	 * Free any cached ACL.
2257 	 */
2258 	if (vsp != NULL)
2259 		nfs_acl_free(vsp);
2260 
2261 	/*
2262 	 * Free any cached pathconf information.
2263 	 */
2264 	if (info != NULL)
2265 		kmem_free(info, sizeof (*info));
2266 }
2267 
2268 /*
2269  * Return a vnode for the given NFS Version 2 file handle.
2270  * If no rnode exists for this fhandle, create one and put it
2271  * into the hash queues.  If the rnode for this fhandle
2272  * already exists, return it.
2273  *
2274  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2275  */
2276 vnode_t *
2277 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2278     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2279 {
2280 	int newnode;
2281 	int index;
2282 	vnode_t *vp;
2283 	nfs_fhandle nfh;
2284 	vattr_t va;
2285 
2286 	nfh.fh_len = NFS_FHSIZE;
2287 	bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2288 
2289 	index = rtablehash(&nfh);
2290 	rw_enter(&rtable[index].r_lock, RW_READER);
2291 
2292 	vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2293 	    nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2294 
2295 	if (attr != NULL) {
2296 		if (!newnode) {
2297 			rw_exit(&rtable[index].r_lock);
2298 			(void) nfs_cache_fattr(vp, attr, &va, t, cr);
2299 		} else {
2300 			if (attr->na_type < NFNON || attr->na_type > NFSOC)
2301 				vp->v_type = VBAD;
2302 			else
2303 				vp->v_type = n2v_type(attr);
2304 			/*
2305 			 * A translation here seems to be necessary
2306 			 * because this function can be called
2307 			 * with `attr' that has come from the wire,
2308 			 * and been operated on by vattr_to_nattr().
2309 			 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr()
2310 			 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2311 			 * ->makenfsnode().
2312 			 */
2313 			if ((attr->na_rdev & 0xffff0000) == 0)
2314 				vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2315 			else
2316 				vp->v_rdev = expldev(n2v_rdev(attr));
2317 			nfs_attrcache(vp, attr, t);
2318 			rw_exit(&rtable[index].r_lock);
2319 		}
2320 	} else {
2321 		if (newnode) {
2322 			PURGE_ATTRCACHE(vp);
2323 		}
2324 		rw_exit(&rtable[index].r_lock);
2325 	}
2326 
2327 	return (vp);
2328 }
2329 
2330 /*
2331  * Return a vnode for the given NFS Version 3 file handle.
2332  * If no rnode exists for this fhandle, create one and put it
2333  * into the hash queues.  If the rnode for this fhandle
2334  * already exists, return it.
2335  *
2336  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2337  */
2338 vnode_t *
2339 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2340     cred_t *cr, char *dnm, char *nm)
2341 {
2342 	int newnode;
2343 	int index;
2344 	vnode_t *vp;
2345 
2346 	index = rtablehash((nfs_fhandle *)fh);
2347 	rw_enter(&rtable[index].r_lock, RW_READER);
2348 
2349 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2350 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2351 	    dnm, nm);
2352 
2353 	if (vap == NULL) {
2354 		if (newnode) {
2355 			PURGE_ATTRCACHE(vp);
2356 		}
2357 		rw_exit(&rtable[index].r_lock);
2358 		return (vp);
2359 	}
2360 
2361 	if (!newnode) {
2362 		rw_exit(&rtable[index].r_lock);
2363 		nfs_attr_cache(vp, vap, t, cr);
2364 	} else {
2365 		rnode_t *rp = VTOR(vp);
2366 
2367 		vp->v_type = vap->va_type;
2368 		vp->v_rdev = vap->va_rdev;
2369 
2370 		mutex_enter(&rp->r_statelock);
2371 		if (rp->r_mtime <= t)
2372 			nfs_attrcache_va(vp, vap);
2373 		mutex_exit(&rp->r_statelock);
2374 		rw_exit(&rtable[index].r_lock);
2375 	}
2376 
2377 	return (vp);
2378 }
2379 
2380 vnode_t *
2381 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2382     cred_t *cr, char *dnm, char *nm)
2383 {
2384 	int newnode;
2385 	int index;
2386 	vnode_t *vp;
2387 	vattr_t va;
2388 
2389 	index = rtablehash((nfs_fhandle *)fh);
2390 	rw_enter(&rtable[index].r_lock, RW_READER);
2391 
2392 	vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2393 	    nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2394 	    dnm, nm);
2395 
2396 	if (attr == NULL) {
2397 		if (newnode) {
2398 			PURGE_ATTRCACHE(vp);
2399 		}
2400 		rw_exit(&rtable[index].r_lock);
2401 		return (vp);
2402 	}
2403 
2404 	if (!newnode) {
2405 		rw_exit(&rtable[index].r_lock);
2406 		(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2407 	} else {
2408 		if (attr->type < NF3REG || attr->type > NF3FIFO)
2409 			vp->v_type = VBAD;
2410 		else
2411 			vp->v_type = nf3_to_vt[attr->type];
2412 		vp->v_rdev = makedevice(attr->rdev.specdata1,
2413 		    attr->rdev.specdata2);
2414 		nfs3_attrcache(vp, attr, t);
2415 		rw_exit(&rtable[index].r_lock);
2416 	}
2417 
2418 	return (vp);
2419 }
2420 
2421 /*
2422  * Read this comment before making changes to rtablehash()!
2423  * This is a hash function in which seemingly obvious and harmless
2424  * changes can cause escalations costing million dollars!
2425  * Know what you are doing.
2426  *
2427  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2428  * algorithm is currently detailed here:
2429  *
2430  *   http://burtleburtle.net/bob/hash/doobs.html
2431  *
2432  * Of course, the above link may not be valid by the time you are reading
2433  * this, but suffice it to say that the one-at-a-time algorithm works well in
2434  * almost all cases.  If you are changing the algorithm be sure to verify that
2435  * the hash algorithm still provides even distribution in all cases and with
2436  * any server returning filehandles in whatever order (sequential or random).
2437  */
2438 static int
2439 rtablehash(nfs_fhandle *fh)
2440 {
2441 	ulong_t hash, len, i;
2442 	char *key;
2443 
2444 	key = fh->fh_buf;
2445 	len = (ulong_t)fh->fh_len;
2446 	for (hash = 0, i = 0; i < len; i++) {
2447 		hash += key[i];
2448 		hash += (hash << 10);
2449 		hash ^= (hash >> 6);
2450 	}
2451 	hash += (hash << 3);
2452 	hash ^= (hash >> 11);
2453 	hash += (hash << 15);
2454 	return (hash & rtablemask);
2455 }
2456 
2457 static vnode_t *
2458 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2459     struct vnodeops *vops,
2460     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2461     int (*compar)(const void *, const void *),
2462     int *newnode, cred_t *cr, char *dnm, char *nm)
2463 {
2464 	rnode_t *rp;
2465 	rnode_t *trp;
2466 	vnode_t *vp;
2467 	mntinfo_t *mi;
2468 
2469 	ASSERT(RW_READ_HELD(&rhtp->r_lock));
2470 
2471 	mi = VFTOMI(vfsp);
2472 start:
2473 	if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2474 		vp = RTOV(rp);
2475 		nfs_set_vroot(vp);
2476 		*newnode = 0;
2477 		return (vp);
2478 	}
2479 	rw_exit(&rhtp->r_lock);
2480 
2481 	mutex_enter(&rpfreelist_lock);
2482 	if (rpfreelist != NULL && rnew >= nrnode) {
2483 		rp = rpfreelist;
2484 		rp_rmfree(rp);
2485 		mutex_exit(&rpfreelist_lock);
2486 
2487 		vp = RTOV(rp);
2488 
2489 		if (rp->r_flags & RHASHED) {
2490 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2491 			mutex_enter(&vp->v_lock);
2492 			if (vp->v_count > 1) {
2493 				VN_RELE_LOCKED(vp);
2494 				mutex_exit(&vp->v_lock);
2495 				rw_exit(&rp->r_hashq->r_lock);
2496 				rw_enter(&rhtp->r_lock, RW_READER);
2497 				goto start;
2498 			}
2499 			mutex_exit(&vp->v_lock);
2500 			rp_rmhash_locked(rp);
2501 			rw_exit(&rp->r_hashq->r_lock);
2502 		}
2503 
2504 		rinactive(rp, cr);
2505 
2506 		mutex_enter(&vp->v_lock);
2507 		if (vp->v_count > 1) {
2508 			VN_RELE_LOCKED(vp);
2509 			mutex_exit(&vp->v_lock);
2510 			rw_enter(&rhtp->r_lock, RW_READER);
2511 			goto start;
2512 		}
2513 		mutex_exit(&vp->v_lock);
2514 		vn_invalid(vp);
2515 		/*
2516 		 * destroy old locks before bzero'ing and
2517 		 * recreating the locks below.
2518 		 */
2519 		nfs_rw_destroy(&rp->r_rwlock);
2520 		nfs_rw_destroy(&rp->r_lkserlock);
2521 		mutex_destroy(&rp->r_statelock);
2522 		cv_destroy(&rp->r_cv);
2523 		cv_destroy(&rp->r_commit.c_cv);
2524 		nfs_free_r_path(rp);
2525 		avl_destroy(&rp->r_dir);
2526 		/*
2527 		 * Make sure that if rnode is recycled then
2528 		 * VFS count is decremented properly before
2529 		 * reuse.
2530 		 */
2531 		VFS_RELE(vp->v_vfsp);
2532 		vn_reinit(vp);
2533 	} else {
2534 		vnode_t *new_vp;
2535 
2536 		mutex_exit(&rpfreelist_lock);
2537 
2538 		rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2539 		new_vp = vn_alloc(KM_SLEEP);
2540 
2541 		atomic_inc_ulong((ulong_t *)&rnew);
2542 #ifdef DEBUG
2543 		clstat_debug.nrnode.value.ui64++;
2544 #endif
2545 		vp = new_vp;
2546 	}
2547 
2548 	bzero(rp, sizeof (*rp));
2549 	rp->r_vnode = vp;
2550 	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2551 	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2552 	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2553 	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2554 	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2555 	rp->r_fh.fh_len = fh->fh_len;
2556 	bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2557 	rp->r_server = mi->mi_curr_serv;
2558 	if (FAILOVER_MOUNT(mi)) {
2559 		/*
2560 		 * If replicated servers, stash pathnames
2561 		 */
2562 		if (dnm != NULL && nm != NULL) {
2563 			char *s, *p;
2564 			uint_t len;
2565 
2566 			len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2567 			rp->r_path = kmem_alloc(len, KM_SLEEP);
2568 #ifdef DEBUG
2569 			clstat_debug.rpath.value.ui64 += len;
2570 #endif
2571 			s = rp->r_path;
2572 			for (p = dnm; *p; p++)
2573 				*s++ = *p;
2574 			*s++ = '/';
2575 			for (p = nm; *p; p++)
2576 				*s++ = *p;
2577 			*s = '\0';
2578 		} else {
2579 			/* special case for root */
2580 			rp->r_path = kmem_alloc(2, KM_SLEEP);
2581 #ifdef DEBUG
2582 			clstat_debug.rpath.value.ui64 += 2;
2583 #endif
2584 			*rp->r_path = '.';
2585 			*(rp->r_path + 1) = '\0';
2586 		}
2587 	}
2588 	VFS_HOLD(vfsp);
2589 	rp->r_putapage = putapage;
2590 	rp->r_hashq = rhtp;
2591 	rp->r_flags = RREADDIRPLUS;
2592 	avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2593 	    offsetof(rddir_cache, tree));
2594 	vn_setops(vp, vops);
2595 	vp->v_data = (caddr_t)rp;
2596 	vp->v_vfsp = vfsp;
2597 	vp->v_type = VNON;
2598 	vp->v_flag |= VMODSORT;
2599 	nfs_set_vroot(vp);
2600 
2601 	/*
2602 	 * There is a race condition if someone else
2603 	 * alloc's the rnode while no locks are held, so we
2604 	 * check again and recover if found.
2605 	 */
2606 	rw_enter(&rhtp->r_lock, RW_WRITER);
2607 	if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2608 		vp = RTOV(trp);
2609 		nfs_set_vroot(vp);
2610 		*newnode = 0;
2611 		rw_exit(&rhtp->r_lock);
2612 		rp_addfree(rp, cr);
2613 		rw_enter(&rhtp->r_lock, RW_READER);
2614 		return (vp);
2615 	}
2616 	rp_addhash(rp);
2617 	*newnode = 1;
2618 	return (vp);
2619 }
2620 
2621 /*
2622  * Callback function to check if the page should be marked as
2623  * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2624  */
2625 int
2626 nfs_setmod_check(page_t *pp)
2627 {
2628 	if (pp->p_fsdata != C_NOCOMMIT) {
2629 		pp->p_fsdata = C_NOCOMMIT;
2630 		return (1);
2631 	}
2632 	return (0);
2633 }
2634 
2635 static void
2636 nfs_set_vroot(vnode_t *vp)
2637 {
2638 	rnode_t *rp;
2639 	nfs_fhandle *rootfh;
2640 
2641 	rp = VTOR(vp);
2642 	rootfh = &rp->r_server->sv_fhandle;
2643 	if (rootfh->fh_len == rp->r_fh.fh_len &&
2644 	    bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2645 		if (!(vp->v_flag & VROOT)) {
2646 			mutex_enter(&vp->v_lock);
2647 			vp->v_flag |= VROOT;
2648 			mutex_exit(&vp->v_lock);
2649 		}
2650 	}
2651 }
2652 
2653 static void
2654 nfs_free_r_path(rnode_t *rp)
2655 {
2656 	char *path;
2657 	size_t len;
2658 
2659 	path = rp->r_path;
2660 	if (path) {
2661 		rp->r_path = NULL;
2662 		len = strlen(path) + 1;
2663 		kmem_free(path, len);
2664 #ifdef DEBUG
2665 		clstat_debug.rpath.value.ui64 -= len;
2666 #endif
2667 	}
2668 }
2669 
2670 /*
2671  * Put an rnode on the free list.
2672  *
2673  * Rnodes which were allocated above and beyond the normal limit
2674  * are immediately freed.
2675  */
2676 void
2677 rp_addfree(rnode_t *rp, cred_t *cr)
2678 {
2679 	vnode_t *vp;
2680 	struct vfs *vfsp;
2681 
2682 	vp = RTOV(rp);
2683 	ASSERT(vp->v_count >= 1);
2684 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2685 
2686 	/*
2687 	 * If we have too many rnodes allocated and there are no
2688 	 * references to this rnode, or if the rnode is no longer
2689 	 * accessible by it does not reside in the hash queues,
2690 	 * or if an i/o error occurred while writing to the file,
2691 	 * then just free it instead of putting it on the rnode
2692 	 * freelist.
2693 	 */
2694 	vfsp = vp->v_vfsp;
2695 	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2696 	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2697 		if (rp->r_flags & RHASHED) {
2698 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2699 			mutex_enter(&vp->v_lock);
2700 			if (vp->v_count > 1) {
2701 				VN_RELE_LOCKED(vp);
2702 				mutex_exit(&vp->v_lock);
2703 				rw_exit(&rp->r_hashq->r_lock);
2704 				return;
2705 			}
2706 			mutex_exit(&vp->v_lock);
2707 			rp_rmhash_locked(rp);
2708 			rw_exit(&rp->r_hashq->r_lock);
2709 		}
2710 
2711 		rinactive(rp, cr);
2712 
2713 		/*
2714 		 * Recheck the vnode reference count.  We need to
2715 		 * make sure that another reference has not been
2716 		 * acquired while we were not holding v_lock.  The
2717 		 * rnode is not in the rnode hash queues, so the
2718 		 * only way for a reference to have been acquired
2719 		 * is for a VOP_PUTPAGE because the rnode was marked
2720 		 * with RDIRTY or for a modified page.  This
2721 		 * reference may have been acquired before our call
2722 		 * to rinactive.  The i/o may have been completed,
2723 		 * thus allowing rinactive to complete, but the
2724 		 * reference to the vnode may not have been released
2725 		 * yet.  In any case, the rnode can not be destroyed
2726 		 * until the other references to this vnode have been
2727 		 * released.  The other references will take care of
2728 		 * either destroying the rnode or placing it on the
2729 		 * rnode freelist.  If there are no other references,
2730 		 * then the rnode may be safely destroyed.
2731 		 */
2732 		mutex_enter(&vp->v_lock);
2733 		if (vp->v_count > 1) {
2734 			VN_RELE_LOCKED(vp);
2735 			mutex_exit(&vp->v_lock);
2736 			return;
2737 		}
2738 		mutex_exit(&vp->v_lock);
2739 
2740 		destroy_rnode(rp);
2741 		return;
2742 	}
2743 
2744 	/*
2745 	 * Lock the hash queue and then recheck the reference count
2746 	 * to ensure that no other threads have acquired a reference
2747 	 * to indicate that the rnode should not be placed on the
2748 	 * freelist.  If another reference has been acquired, then
2749 	 * just release this one and let the other thread complete
2750 	 * the processing of adding this rnode to the freelist.
2751 	 */
2752 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2753 
2754 	mutex_enter(&vp->v_lock);
2755 	if (vp->v_count > 1) {
2756 		VN_RELE_LOCKED(vp);
2757 		mutex_exit(&vp->v_lock);
2758 		rw_exit(&rp->r_hashq->r_lock);
2759 		return;
2760 	}
2761 	mutex_exit(&vp->v_lock);
2762 
2763 	/*
2764 	 * If there is no cached data or metadata for this file, then
2765 	 * put the rnode on the front of the freelist so that it will
2766 	 * be reused before other rnodes which may have cached data or
2767 	 * metadata associated with them.
2768 	 */
2769 	mutex_enter(&rpfreelist_lock);
2770 	if (rpfreelist == NULL) {
2771 		rp->r_freef = rp;
2772 		rp->r_freeb = rp;
2773 		rpfreelist = rp;
2774 	} else {
2775 		rp->r_freef = rpfreelist;
2776 		rp->r_freeb = rpfreelist->r_freeb;
2777 		rpfreelist->r_freeb->r_freef = rp;
2778 		rpfreelist->r_freeb = rp;
2779 		if (!vn_has_cached_data(vp) &&
2780 		    !HAVE_RDDIR_CACHE(rp) &&
2781 		    rp->r_symlink.contents == NULL &&
2782 		    rp->r_secattr == NULL &&
2783 		    rp->r_pathconf == NULL)
2784 			rpfreelist = rp;
2785 	}
2786 	mutex_exit(&rpfreelist_lock);
2787 
2788 	rw_exit(&rp->r_hashq->r_lock);
2789 }
2790 
2791 /*
2792  * Remove an rnode from the free list.
2793  *
2794  * The caller must be holding rpfreelist_lock and the rnode
2795  * must be on the freelist.
2796  */
2797 static void
2798 rp_rmfree(rnode_t *rp)
2799 {
2800 
2801 	ASSERT(MUTEX_HELD(&rpfreelist_lock));
2802 	ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2803 
2804 	if (rp == rpfreelist) {
2805 		rpfreelist = rp->r_freef;
2806 		if (rp == rpfreelist)
2807 			rpfreelist = NULL;
2808 	}
2809 
2810 	rp->r_freeb->r_freef = rp->r_freef;
2811 	rp->r_freef->r_freeb = rp->r_freeb;
2812 
2813 	rp->r_freef = rp->r_freeb = NULL;
2814 }
2815 
2816 /*
2817  * Put a rnode in the hash table.
2818  *
2819  * The caller must be holding the exclusive hash queue lock.
2820  */
2821 static void
2822 rp_addhash(rnode_t *rp)
2823 {
2824 	mntinfo_t *mi;
2825 
2826 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2827 	ASSERT(!(rp->r_flags & RHASHED));
2828 
2829 	rp->r_hashf = rp->r_hashq->r_hashf;
2830 	rp->r_hashq->r_hashf = rp;
2831 	rp->r_hashb = (rnode_t *)rp->r_hashq;
2832 	rp->r_hashf->r_hashb = rp;
2833 
2834 	mutex_enter(&rp->r_statelock);
2835 	rp->r_flags |= RHASHED;
2836 	mutex_exit(&rp->r_statelock);
2837 
2838 	mi = VTOMI(RTOV(rp));
2839 	mutex_enter(&mi->mi_rnodes_lock);
2840 	list_insert_tail(&mi->mi_rnodes, rp);
2841 	mutex_exit(&mi->mi_rnodes_lock);
2842 }
2843 
2844 /*
2845  * Remove a rnode from the hash table.
2846  *
2847  * The caller must be holding the hash queue lock.
2848  */
2849 static void
2850 rp_rmhash_locked(rnode_t *rp)
2851 {
2852 	mntinfo_t *mi;
2853 
2854 	ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2855 	ASSERT(rp->r_flags & RHASHED);
2856 
2857 	rp->r_hashb->r_hashf = rp->r_hashf;
2858 	rp->r_hashf->r_hashb = rp->r_hashb;
2859 
2860 	mutex_enter(&rp->r_statelock);
2861 	rp->r_flags &= ~RHASHED;
2862 	mutex_exit(&rp->r_statelock);
2863 
2864 	mi = VTOMI(RTOV(rp));
2865 	mutex_enter(&mi->mi_rnodes_lock);
2866 	if (list_link_active(&rp->r_mi_link))
2867 		list_remove(&mi->mi_rnodes, rp);
2868 	mutex_exit(&mi->mi_rnodes_lock);
2869 }
2870 
2871 /*
2872  * Remove a rnode from the hash table.
2873  *
2874  * The caller must not be holding the hash queue lock.
2875  */
2876 void
2877 rp_rmhash(rnode_t *rp)
2878 {
2879 
2880 	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2881 	rp_rmhash_locked(rp);
2882 	rw_exit(&rp->r_hashq->r_lock);
2883 }
2884 
2885 /*
2886  * Lookup a rnode by fhandle.
2887  *
2888  * The caller must be holding the hash queue lock, either shared or exclusive.
2889  */
2890 static rnode_t *
2891 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2892 {
2893 	rnode_t *rp;
2894 	vnode_t *vp;
2895 
2896 	ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2897 
2898 	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2899 		vp = RTOV(rp);
2900 		if (vp->v_vfsp == vfsp &&
2901 		    rp->r_fh.fh_len == fh->fh_len &&
2902 		    bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2903 			/*
2904 			 * remove rnode from free list, if necessary.
2905 			 */
2906 			if (rp->r_freef != NULL) {
2907 				mutex_enter(&rpfreelist_lock);
2908 				/*
2909 				 * If the rnode is on the freelist,
2910 				 * then remove it and use that reference
2911 				 * as the new reference.  Otherwise,
2912 				 * need to increment the reference count.
2913 				 */
2914 				if (rp->r_freef != NULL) {
2915 					rp_rmfree(rp);
2916 					mutex_exit(&rpfreelist_lock);
2917 				} else {
2918 					mutex_exit(&rpfreelist_lock);
2919 					VN_HOLD(vp);
2920 				}
2921 			} else
2922 				VN_HOLD(vp);
2923 			return (rp);
2924 		}
2925 	}
2926 	return (NULL);
2927 }
2928 
2929 /*
2930  * Return 1 if there is an active vnode belonging to this vfs in the
2931  * rtable cache.
2932  *
2933  * Several of these checks are done without holding the usual
2934  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2935  * etc. will redo the necessary checks before actually destroying
2936  * any rnodes.
2937  */
2938 int
2939 check_rtable(struct vfs *vfsp)
2940 {
2941 	rnode_t *rp;
2942 	vnode_t *vp;
2943 	mntinfo_t *mi;
2944 
2945 	ASSERT(vfsp != NULL);
2946 	mi = VFTOMI(vfsp);
2947 
2948 	mutex_enter(&mi->mi_rnodes_lock);
2949 	for (rp = list_head(&mi->mi_rnodes); rp != NULL;
2950 	    rp = list_next(&mi->mi_rnodes, rp)) {
2951 		vp = RTOV(rp);
2952 
2953 		if (rp->r_freef == NULL ||
2954 		    (vn_has_cached_data(vp) && (rp->r_flags & RDIRTY)) ||
2955 		    rp->r_count > 0) {
2956 			mutex_exit(&mi->mi_rnodes_lock);
2957 			return (1);
2958 		}
2959 	}
2960 	mutex_exit(&mi->mi_rnodes_lock);
2961 
2962 	return (0);
2963 }
2964 
2965 /*
2966  * Destroy inactive vnodes from the hash queues which belong to this
2967  * vfs.  It is essential that we destroy all inactive vnodes during a
2968  * forced unmount as well as during a normal unmount.
2969  */
2970 void
2971 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2972 {
2973 	rnode_t *rp;
2974 	mntinfo_t *mi;
2975 
2976 	ASSERT(vfsp != NULL);
2977 
2978 	mi = VFTOMI(vfsp);
2979 
2980 	mutex_enter(&rpfreelist_lock);
2981 	mutex_enter(&mi->mi_rnodes_lock);
2982 	while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
2983 		/*
2984 		 * If the rnode is no longer on the freelist it is not
2985 		 * ours and it will be handled by some other thread, so
2986 		 * skip it.
2987 		 */
2988 		if (rp->r_freef == NULL)
2989 			continue;
2990 		mutex_exit(&mi->mi_rnodes_lock);
2991 
2992 		rp_rmfree(rp);
2993 		mutex_exit(&rpfreelist_lock);
2994 
2995 		rp_rmhash(rp);
2996 
2997 		/*
2998 		 * This call to rp_addfree will end up destroying the
2999 		 * rnode, but in a safe way with the appropriate set
3000 		 * of checks done.
3001 		 */
3002 		rp_addfree(rp, cr);
3003 
3004 		mutex_enter(&rpfreelist_lock);
3005 		mutex_enter(&mi->mi_rnodes_lock);
3006 	}
3007 	mutex_exit(&mi->mi_rnodes_lock);
3008 	mutex_exit(&rpfreelist_lock);
3009 }
3010 
3011 /*
3012  * This routine destroys all the resources associated with the rnode
3013  * and then the rnode itself.
3014  */
3015 static void
3016 destroy_rnode(rnode_t *rp)
3017 {
3018 	vnode_t *vp;
3019 	vfs_t *vfsp;
3020 
3021 	vp = RTOV(rp);
3022 	vfsp = vp->v_vfsp;
3023 
3024 	ASSERT(vp->v_count == 1);
3025 	ASSERT(rp->r_count == 0);
3026 	ASSERT(rp->r_lmpl == NULL);
3027 	ASSERT(rp->r_mapcnt == 0);
3028 	ASSERT(!(rp->r_flags & RHASHED));
3029 	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3030 	atomic_dec_ulong((ulong_t *)&rnew);
3031 #ifdef DEBUG
3032 	clstat_debug.nrnode.value.ui64--;
3033 #endif
3034 	nfs_rw_destroy(&rp->r_rwlock);
3035 	nfs_rw_destroy(&rp->r_lkserlock);
3036 	mutex_destroy(&rp->r_statelock);
3037 	cv_destroy(&rp->r_cv);
3038 	cv_destroy(&rp->r_commit.c_cv);
3039 	if (rp->r_flags & RDELMAPLIST)
3040 		list_destroy(&rp->r_indelmap);
3041 	nfs_free_r_path(rp);
3042 	avl_destroy(&rp->r_dir);
3043 	vn_invalid(vp);
3044 	vn_free(vp);
3045 	kmem_cache_free(rnode_cache, rp);
3046 	VFS_RELE(vfsp);
3047 }
3048 
3049 /*
3050  * Flush all vnodes in this (or every) vfs.
3051  * Used by nfs_sync and by nfs_unmount.
3052  */
3053 void
3054 rflush(struct vfs *vfsp, cred_t *cr)
3055 {
3056 	int index;
3057 	rnode_t *rp;
3058 	vnode_t *vp, **vplist;
3059 	long num, cnt;
3060 
3061 	/*
3062 	 * Check to see whether there is anything to do.
3063 	 */
3064 	num = rnew;
3065 	if (num == 0)
3066 		return;
3067 
3068 	/*
3069 	 * Allocate a slot for all currently active rnodes on the
3070 	 * supposition that they all may need flushing.
3071 	 */
3072 	vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3073 	cnt = 0;
3074 
3075 	/*
3076 	 * If the vfs is known we can do fast path by iterating all rnodes that
3077 	 * belongs to this vfs.  This is much faster than the traditional way
3078 	 * of iterating rtable (below) in a case there is a lot of rnodes that
3079 	 * does not belong to our vfs.
3080 	 */
3081 	if (vfsp != NULL) {
3082 		mntinfo_t *mi = VFTOMI(vfsp);
3083 
3084 		mutex_enter(&mi->mi_rnodes_lock);
3085 		for (rp = list_head(&mi->mi_rnodes); rp != NULL;
3086 		    rp = list_next(&mi->mi_rnodes, rp)) {
3087 			vp = RTOV(rp);
3088 			/*
3089 			 * Don't bother sync'ing a vp if it
3090 			 * is part of virtual swap device or
3091 			 * if VFS is read-only
3092 			 */
3093 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3094 				continue;
3095 			/*
3096 			 * If the vnode has pages and is marked as either dirty
3097 			 * or mmap'd, hold and add this vnode to the list of
3098 			 * vnodes to flush.
3099 			 */
3100 			ASSERT(vp->v_vfsp == vfsp);
3101 			if (vn_has_cached_data(vp) &&
3102 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3103 				VN_HOLD(vp);
3104 				vplist[cnt++] = vp;
3105 				if (cnt == num) {
3106 					/*
3107 					 * The vplist is full because there is
3108 					 * too many rnodes.  We are done for
3109 					 * now.
3110 					 */
3111 					break;
3112 				}
3113 			}
3114 		}
3115 		mutex_exit(&mi->mi_rnodes_lock);
3116 
3117 		goto done;
3118 	}
3119 
3120 	ASSERT(vfsp == NULL);
3121 
3122 	/*
3123 	 * Walk the hash queues looking for rnodes with page
3124 	 * lists associated with them.  Make a list of these
3125 	 * files.
3126 	 */
3127 	for (index = 0; index < rtablesize; index++) {
3128 		rw_enter(&rtable[index].r_lock, RW_READER);
3129 		for (rp = rtable[index].r_hashf;
3130 		    rp != (rnode_t *)(&rtable[index]);
3131 		    rp = rp->r_hashf) {
3132 			vp = RTOV(rp);
3133 			/*
3134 			 * Don't bother sync'ing a vp if it
3135 			 * is part of virtual swap device or
3136 			 * if VFS is read-only
3137 			 */
3138 			if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3139 				continue;
3140 			/*
3141 			 * If the vnode has pages and is marked as either dirty
3142 			 * or mmap'd, hold and add this vnode to the list of
3143 			 * vnodes to flush.
3144 			 */
3145 			if (vn_has_cached_data(vp) &&
3146 			    ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3147 				VN_HOLD(vp);
3148 				vplist[cnt++] = vp;
3149 				if (cnt == num) {
3150 					rw_exit(&rtable[index].r_lock);
3151 					/*
3152 					 * The vplist is full because there is
3153 					 * too many rnodes.  We are done for
3154 					 * now.
3155 					 */
3156 					goto done;
3157 				}
3158 			}
3159 		}
3160 		rw_exit(&rtable[index].r_lock);
3161 	}
3162 
3163 done:
3164 
3165 	/*
3166 	 * Flush and release all of the files on the list.
3167 	 */
3168 	while (cnt-- > 0) {
3169 		vp = vplist[cnt];
3170 		(void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3171 		VN_RELE(vp);
3172 	}
3173 
3174 	/*
3175 	 * Free the space allocated to hold the list.
3176 	 */
3177 	kmem_free(vplist, num * sizeof (*vplist));
3178 }
3179 
3180 /*
3181  * This probably needs to be larger than or equal to
3182  * log2(sizeof (struct rnode)) due to the way that rnodes are
3183  * allocated.
3184  */
3185 #define	ACACHE_SHIFT_BITS	9
3186 
3187 static int
3188 acachehash(rnode_t *rp, cred_t *cr)
3189 {
3190 
3191 	return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3192 	    acachemask);
3193 }
3194 
3195 #ifdef DEBUG
3196 static long nfs_access_cache_hits = 0;
3197 static long nfs_access_cache_misses = 0;
3198 #endif
3199 
3200 nfs_access_type_t
3201 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3202 {
3203 	vnode_t *vp;
3204 	acache_t *ap;
3205 	acache_hash_t *hp;
3206 	nfs_access_type_t all;
3207 
3208 	vp = RTOV(rp);
3209 	if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3210 		return (NFS_ACCESS_UNKNOWN);
3211 
3212 	if (rp->r_acache != NULL) {
3213 		hp = &acache[acachehash(rp, cr)];
3214 		rw_enter(&hp->lock, RW_READER);
3215 		ap = hp->next;
3216 		while (ap != (acache_t *)hp) {
3217 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3218 				if ((ap->known & acc) == acc) {
3219 #ifdef DEBUG
3220 					nfs_access_cache_hits++;
3221 #endif
3222 					if ((ap->allowed & acc) == acc)
3223 						all = NFS_ACCESS_ALLOWED;
3224 					else
3225 						all = NFS_ACCESS_DENIED;
3226 				} else {
3227 #ifdef DEBUG
3228 					nfs_access_cache_misses++;
3229 #endif
3230 					all = NFS_ACCESS_UNKNOWN;
3231 				}
3232 				rw_exit(&hp->lock);
3233 				return (all);
3234 			}
3235 			ap = ap->next;
3236 		}
3237 		rw_exit(&hp->lock);
3238 	}
3239 
3240 #ifdef DEBUG
3241 	nfs_access_cache_misses++;
3242 #endif
3243 	return (NFS_ACCESS_UNKNOWN);
3244 }
3245 
3246 void
3247 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3248 {
3249 	acache_t *ap;
3250 	acache_t *nap;
3251 	acache_hash_t *hp;
3252 
3253 	hp = &acache[acachehash(rp, cr)];
3254 
3255 	/*
3256 	 * Allocate now assuming that mostly an allocation will be
3257 	 * required.  This allows the allocation to happen without
3258 	 * holding the hash bucket locked.
3259 	 */
3260 	nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3261 	if (nap != NULL) {
3262 		nap->known = acc;
3263 		nap->allowed = resacc;
3264 		nap->rnode = rp;
3265 		crhold(cr);
3266 		nap->cred = cr;
3267 		nap->hashq = hp;
3268 	}
3269 
3270 	rw_enter(&hp->lock, RW_WRITER);
3271 
3272 	if (rp->r_acache != NULL) {
3273 		ap = hp->next;
3274 		while (ap != (acache_t *)hp) {
3275 			if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3276 				ap->known |= acc;
3277 				ap->allowed &= ~acc;
3278 				ap->allowed |= resacc;
3279 				rw_exit(&hp->lock);
3280 				if (nap != NULL) {
3281 					crfree(nap->cred);
3282 					kmem_cache_free(acache_cache, nap);
3283 				}
3284 				return;
3285 			}
3286 			ap = ap->next;
3287 		}
3288 	}
3289 
3290 	if (nap != NULL) {
3291 #ifdef DEBUG
3292 		clstat_debug.access.value.ui64++;
3293 #endif
3294 		nap->next = hp->next;
3295 		hp->next = nap;
3296 		nap->next->prev = nap;
3297 		nap->prev = (acache_t *)hp;
3298 
3299 		mutex_enter(&rp->r_statelock);
3300 		nap->list = rp->r_acache;
3301 		rp->r_acache = nap;
3302 		mutex_exit(&rp->r_statelock);
3303 	}
3304 
3305 	rw_exit(&hp->lock);
3306 }
3307 
3308 int
3309 nfs_access_purge_rp(rnode_t *rp)
3310 {
3311 	acache_t *ap;
3312 	acache_t *tmpap;
3313 	acache_t *rplist;
3314 
3315 	/*
3316 	 * If there aren't any cached entries, then there is nothing
3317 	 * to free.
3318 	 */
3319 	if (rp->r_acache == NULL)
3320 		return (0);
3321 
3322 	mutex_enter(&rp->r_statelock);
3323 	rplist = rp->r_acache;
3324 	rp->r_acache = NULL;
3325 	mutex_exit(&rp->r_statelock);
3326 
3327 	/*
3328 	 * Loop through each entry in the list pointed to in the
3329 	 * rnode.  Remove each of these entries from the hash
3330 	 * queue that it is on and remove it from the list in
3331 	 * the rnode.
3332 	 */
3333 	for (ap = rplist; ap != NULL; ap = tmpap) {
3334 		rw_enter(&ap->hashq->lock, RW_WRITER);
3335 		ap->prev->next = ap->next;
3336 		ap->next->prev = ap->prev;
3337 		rw_exit(&ap->hashq->lock);
3338 
3339 		tmpap = ap->list;
3340 		crfree(ap->cred);
3341 		kmem_cache_free(acache_cache, ap);
3342 #ifdef DEBUG
3343 		clstat_debug.access.value.ui64--;
3344 #endif
3345 	}
3346 
3347 	return (1);
3348 }
3349 
3350 static const char prefix[] = ".nfs";
3351 
3352 static kmutex_t newnum_lock;
3353 
3354 int
3355 newnum(void)
3356 {
3357 	static uint_t newnum = 0;
3358 	uint_t id;
3359 
3360 	mutex_enter(&newnum_lock);
3361 	if (newnum == 0)
3362 		newnum = gethrestime_sec() & 0xffff;
3363 	id = newnum++;
3364 	mutex_exit(&newnum_lock);
3365 	return (id);
3366 }
3367 
3368 char *
3369 newname(void)
3370 {
3371 	char *news;
3372 	char *s;
3373 	const char *p;
3374 	uint_t id;
3375 
3376 	id = newnum();
3377 	news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3378 	s = news;
3379 	p = prefix;
3380 	while (*p != '\0')
3381 		*s++ = *p++;
3382 	while (id != 0) {
3383 		*s++ = "0123456789ABCDEF"[id & 0x0f];
3384 		id >>= 4;
3385 	}
3386 	*s = '\0';
3387 	return (news);
3388 }
3389 
3390 /*
3391  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3392  * framework.
3393  */
3394 static int
3395 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3396 {
3397 	ksp->ks_snaptime = gethrtime();
3398 	if (rw == KSTAT_WRITE) {
3399 		bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3400 #ifdef DEBUG
3401 		/*
3402 		 * Currently only the global zone can write to kstats, but we
3403 		 * add the check just for paranoia.
3404 		 */
3405 		if (INGLOBALZONE(curproc))
3406 			bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3407 			    sizeof (clstat_debug));
3408 #endif
3409 	} else {
3410 		bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3411 #ifdef DEBUG
3412 		/*
3413 		 * If we're displaying the "global" debug kstat values, we
3414 		 * display them as-is to all zones since in fact they apply to
3415 		 * the system as a whole.
3416 		 */
3417 		bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3418 		    sizeof (clstat_debug));
3419 #endif
3420 	}
3421 	return (0);
3422 }
3423 
3424 static void *
3425 clinit_zone(zoneid_t zoneid)
3426 {
3427 	kstat_t *nfs_client_kstat;
3428 	struct nfs_clnt *nfscl;
3429 	uint_t ndata;
3430 
3431 	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3432 	mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3433 	nfscl->nfscl_chtable = NULL;
3434 	nfscl->nfscl_zoneid = zoneid;
3435 
3436 	bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3437 	ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3438 #ifdef DEBUG
3439 	ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3440 #endif
3441 	if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3442 	    "misc", KSTAT_TYPE_NAMED, ndata,
3443 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3444 		nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3445 		nfs_client_kstat->ks_snapshot = cl_snapshot;
3446 		kstat_install(nfs_client_kstat);
3447 	}
3448 	mutex_enter(&nfs_clnt_list_lock);
3449 	list_insert_head(&nfs_clnt_list, nfscl);
3450 	mutex_exit(&nfs_clnt_list_lock);
3451 	return (nfscl);
3452 }
3453 
3454 /*ARGSUSED*/
3455 static void
3456 clfini_zone(zoneid_t zoneid, void *arg)
3457 {
3458 	struct nfs_clnt *nfscl = arg;
3459 	chhead_t *chp, *next;
3460 
3461 	if (nfscl == NULL)
3462 		return;
3463 	mutex_enter(&nfs_clnt_list_lock);
3464 	list_remove(&nfs_clnt_list, nfscl);
3465 	mutex_exit(&nfs_clnt_list_lock);
3466 	clreclaim_zone(nfscl, 0);
3467 	for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3468 		ASSERT(chp->ch_list == NULL);
3469 		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3470 		next = chp->ch_next;
3471 		kmem_free(chp, sizeof (*chp));
3472 	}
3473 	kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3474 	mutex_destroy(&nfscl->nfscl_chtable_lock);
3475 	kmem_free(nfscl, sizeof (*nfscl));
3476 }
3477 
3478 /*
3479  * Called by endpnt_destructor to make sure the client handles are
3480  * cleaned up before the RPC endpoints.  This becomes a no-op if
3481  * clfini_zone (above) is called first.  This function is needed
3482  * (rather than relying on clfini_zone to clean up) because the ZSD
3483  * callbacks have no ordering mechanism, so we have no way to ensure
3484  * that clfini_zone is called before endpnt_destructor.
3485  */
3486 void
3487 clcleanup_zone(zoneid_t zoneid)
3488 {
3489 	struct nfs_clnt *nfscl;
3490 
3491 	mutex_enter(&nfs_clnt_list_lock);
3492 	nfscl = list_head(&nfs_clnt_list);
3493 	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3494 		if (nfscl->nfscl_zoneid == zoneid) {
3495 			clreclaim_zone(nfscl, 0);
3496 			break;
3497 		}
3498 	}
3499 	mutex_exit(&nfs_clnt_list_lock);
3500 }
3501 
3502 int
3503 nfs_subrinit(void)
3504 {
3505 	int i;
3506 	ulong_t nrnode_max;
3507 
3508 	/*
3509 	 * Allocate and initialize the rnode hash queues
3510 	 */
3511 	if (nrnode <= 0)
3512 		nrnode = ncsize;
3513 	nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3514 	if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3515 		zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3516 		    "!setting nrnode to max value of %ld", nrnode_max);
3517 		nrnode = nrnode_max;
3518 	}
3519 
3520 	rtablesize = 1 << highbit(nrnode / hashlen);
3521 	rtablemask = rtablesize - 1;
3522 	rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3523 	for (i = 0; i < rtablesize; i++) {
3524 		rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3525 		rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3526 		rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3527 	}
3528 	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3529 	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3530 
3531 	/*
3532 	 * Allocate and initialize the access cache
3533 	 */
3534 
3535 	/*
3536 	 * Initial guess is one access cache entry per rnode unless
3537 	 * nacache is set to a non-zero value and then it is used to
3538 	 * indicate a guess at the number of access cache entries.
3539 	 */
3540 	if (nacache > 0)
3541 		acachesize = 1 << highbit(nacache / hashlen);
3542 	else
3543 		acachesize = rtablesize;
3544 	acachemask = acachesize - 1;
3545 	acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3546 	for (i = 0; i < acachesize; i++) {
3547 		acache[i].next = (acache_t *)&acache[i];
3548 		acache[i].prev = (acache_t *)&acache[i];
3549 		rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3550 	}
3551 	acache_cache = kmem_cache_create("nfs_access_cache",
3552 	    sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3553 	/*
3554 	 * Allocate and initialize the client handle cache
3555 	 */
3556 	chtab_cache = kmem_cache_create("client_handle_cache",
3557 	    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3558 	/*
3559 	 * Initialize the list of per-zone client handles (and associated data).
3560 	 * This needs to be done before we call zone_key_create().
3561 	 */
3562 	list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3563 	    offsetof(struct nfs_clnt, nfscl_node));
3564 	/*
3565 	 * Initialize the zone_key for per-zone client handle lists.
3566 	 */
3567 	zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3568 	/*
3569 	 * Initialize the various mutexes and reader/writer locks
3570 	 */
3571 	mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3572 	mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3573 	mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3574 
3575 	/*
3576 	 * Assign unique major number for all nfs mounts
3577 	 */
3578 	if ((nfs_major = getudev()) == -1) {
3579 		zcmn_err(GLOBAL_ZONEID, CE_WARN,
3580 		    "nfs: init: can't get unique device number");
3581 		nfs_major = 0;
3582 	}
3583 	nfs_minor = 0;
3584 
3585 	if (nfs3_jukebox_delay == 0)
3586 		nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3587 
3588 	return (0);
3589 }
3590 
3591 void
3592 nfs_subrfini(void)
3593 {
3594 	int i;
3595 
3596 	/*
3597 	 * Deallocate the rnode hash queues
3598 	 */
3599 	kmem_cache_destroy(rnode_cache);
3600 
3601 	for (i = 0; i < rtablesize; i++)
3602 		rw_destroy(&rtable[i].r_lock);
3603 	kmem_free(rtable, rtablesize * sizeof (*rtable));
3604 
3605 	/*
3606 	 * Deallocated the access cache
3607 	 */
3608 	kmem_cache_destroy(acache_cache);
3609 
3610 	for (i = 0; i < acachesize; i++)
3611 		rw_destroy(&acache[i].lock);
3612 	kmem_free(acache, acachesize * sizeof (*acache));
3613 
3614 	/*
3615 	 * Deallocate the client handle cache
3616 	 */
3617 	kmem_cache_destroy(chtab_cache);
3618 
3619 	/*
3620 	 * Destroy the various mutexes and reader/writer locks
3621 	 */
3622 	mutex_destroy(&rpfreelist_lock);
3623 	mutex_destroy(&newnum_lock);
3624 	mutex_destroy(&nfs_minor_lock);
3625 	(void) zone_key_delete(nfsclnt_zone_key);
3626 }
3627 
3628 enum nfsstat
3629 puterrno(int error)
3630 {
3631 
3632 	switch (error) {
3633 	case EOPNOTSUPP:
3634 		return (NFSERR_OPNOTSUPP);
3635 	case ENAMETOOLONG:
3636 		return (NFSERR_NAMETOOLONG);
3637 	case ENOTEMPTY:
3638 		return (NFSERR_NOTEMPTY);
3639 	case EDQUOT:
3640 		return (NFSERR_DQUOT);
3641 	case ESTALE:
3642 		return (NFSERR_STALE);
3643 	case EREMOTE:
3644 		return (NFSERR_REMOTE);
3645 	case ENOSYS:
3646 		return (NFSERR_OPNOTSUPP);
3647 	case EOVERFLOW:
3648 		return (NFSERR_INVAL);
3649 	default:
3650 		return ((enum nfsstat)error);
3651 	}
3652 	/* NOTREACHED */
3653 }
3654 
3655 int
3656 geterrno(enum nfsstat status)
3657 {
3658 
3659 	switch (status) {
3660 	case NFSERR_OPNOTSUPP:
3661 		return (EOPNOTSUPP);
3662 	case NFSERR_NAMETOOLONG:
3663 		return (ENAMETOOLONG);
3664 	case NFSERR_NOTEMPTY:
3665 		return (ENOTEMPTY);
3666 	case NFSERR_DQUOT:
3667 		return (EDQUOT);
3668 	case NFSERR_STALE:
3669 		return (ESTALE);
3670 	case NFSERR_REMOTE:
3671 		return (EREMOTE);
3672 	case NFSERR_WFLUSH:
3673 		return (EIO);
3674 	default:
3675 		return ((int)status);
3676 	}
3677 	/* NOTREACHED */
3678 }
3679 
3680 enum nfsstat3
3681 puterrno3(int error)
3682 {
3683 
3684 #ifdef DEBUG
3685 	switch (error) {
3686 	case 0:
3687 		return (NFS3_OK);
3688 	case EPERM:
3689 		return (NFS3ERR_PERM);
3690 	case ENOENT:
3691 		return (NFS3ERR_NOENT);
3692 	case EIO:
3693 		return (NFS3ERR_IO);
3694 	case ENXIO:
3695 		return (NFS3ERR_NXIO);
3696 	case EACCES:
3697 		return (NFS3ERR_ACCES);
3698 	case EEXIST:
3699 		return (NFS3ERR_EXIST);
3700 	case EXDEV:
3701 		return (NFS3ERR_XDEV);
3702 	case ENODEV:
3703 		return (NFS3ERR_NODEV);
3704 	case ENOTDIR:
3705 		return (NFS3ERR_NOTDIR);
3706 	case EISDIR:
3707 		return (NFS3ERR_ISDIR);
3708 	case EINVAL:
3709 		return (NFS3ERR_INVAL);
3710 	case EFBIG:
3711 		return (NFS3ERR_FBIG);
3712 	case ENOSPC:
3713 		return (NFS3ERR_NOSPC);
3714 	case EROFS:
3715 		return (NFS3ERR_ROFS);
3716 	case EMLINK:
3717 		return (NFS3ERR_MLINK);
3718 	case ENAMETOOLONG:
3719 		return (NFS3ERR_NAMETOOLONG);
3720 	case ENOTEMPTY:
3721 		return (NFS3ERR_NOTEMPTY);
3722 	case EDQUOT:
3723 		return (NFS3ERR_DQUOT);
3724 	case ESTALE:
3725 		return (NFS3ERR_STALE);
3726 	case EREMOTE:
3727 		return (NFS3ERR_REMOTE);
3728 	case ENOSYS:
3729 	case EOPNOTSUPP:
3730 		return (NFS3ERR_NOTSUPP);
3731 	case EOVERFLOW:
3732 		return (NFS3ERR_INVAL);
3733 	default:
3734 		zcmn_err(getzoneid(), CE_WARN,
3735 		    "puterrno3: got error %d", error);
3736 		return ((enum nfsstat3)error);
3737 	}
3738 #else
3739 	switch (error) {
3740 	case ENAMETOOLONG:
3741 		return (NFS3ERR_NAMETOOLONG);
3742 	case ENOTEMPTY:
3743 		return (NFS3ERR_NOTEMPTY);
3744 	case EDQUOT:
3745 		return (NFS3ERR_DQUOT);
3746 	case ESTALE:
3747 		return (NFS3ERR_STALE);
3748 	case ENOSYS:
3749 	case EOPNOTSUPP:
3750 		return (NFS3ERR_NOTSUPP);
3751 	case EREMOTE:
3752 		return (NFS3ERR_REMOTE);
3753 	case EOVERFLOW:
3754 		return (NFS3ERR_INVAL);
3755 	default:
3756 		return ((enum nfsstat3)error);
3757 	}
3758 #endif
3759 }
3760 
3761 int
3762 geterrno3(enum nfsstat3 status)
3763 {
3764 
3765 #ifdef DEBUG
3766 	switch (status) {
3767 	case NFS3_OK:
3768 		return (0);
3769 	case NFS3ERR_PERM:
3770 		return (EPERM);
3771 	case NFS3ERR_NOENT:
3772 		return (ENOENT);
3773 	case NFS3ERR_IO:
3774 		return (EIO);
3775 	case NFS3ERR_NXIO:
3776 		return (ENXIO);
3777 	case NFS3ERR_ACCES:
3778 		return (EACCES);
3779 	case NFS3ERR_EXIST:
3780 		return (EEXIST);
3781 	case NFS3ERR_XDEV:
3782 		return (EXDEV);
3783 	case NFS3ERR_NODEV:
3784 		return (ENODEV);
3785 	case NFS3ERR_NOTDIR:
3786 		return (ENOTDIR);
3787 	case NFS3ERR_ISDIR:
3788 		return (EISDIR);
3789 	case NFS3ERR_INVAL:
3790 		return (EINVAL);
3791 	case NFS3ERR_FBIG:
3792 		return (EFBIG);
3793 	case NFS3ERR_NOSPC:
3794 		return (ENOSPC);
3795 	case NFS3ERR_ROFS:
3796 		return (EROFS);
3797 	case NFS3ERR_MLINK:
3798 		return (EMLINK);
3799 	case NFS3ERR_NAMETOOLONG:
3800 		return (ENAMETOOLONG);
3801 	case NFS3ERR_NOTEMPTY:
3802 		return (ENOTEMPTY);
3803 	case NFS3ERR_DQUOT:
3804 		return (EDQUOT);
3805 	case NFS3ERR_STALE:
3806 		return (ESTALE);
3807 	case NFS3ERR_REMOTE:
3808 		return (EREMOTE);
3809 	case NFS3ERR_BADHANDLE:
3810 		return (ESTALE);
3811 	case NFS3ERR_NOT_SYNC:
3812 		return (EINVAL);
3813 	case NFS3ERR_BAD_COOKIE:
3814 		return (ENOENT);
3815 	case NFS3ERR_NOTSUPP:
3816 		return (EOPNOTSUPP);
3817 	case NFS3ERR_TOOSMALL:
3818 		return (EINVAL);
3819 	case NFS3ERR_SERVERFAULT:
3820 		return (EIO);
3821 	case NFS3ERR_BADTYPE:
3822 		return (EINVAL);
3823 	case NFS3ERR_JUKEBOX:
3824 		return (ENXIO);
3825 	default:
3826 		zcmn_err(getzoneid(), CE_WARN,
3827 		    "geterrno3: got status %d", status);
3828 		return ((int)status);
3829 	}
3830 #else
3831 	switch (status) {
3832 	case NFS3ERR_NAMETOOLONG:
3833 		return (ENAMETOOLONG);
3834 	case NFS3ERR_NOTEMPTY:
3835 		return (ENOTEMPTY);
3836 	case NFS3ERR_DQUOT:
3837 		return (EDQUOT);
3838 	case NFS3ERR_STALE:
3839 	case NFS3ERR_BADHANDLE:
3840 		return (ESTALE);
3841 	case NFS3ERR_NOTSUPP:
3842 		return (EOPNOTSUPP);
3843 	case NFS3ERR_REMOTE:
3844 		return (EREMOTE);
3845 	case NFS3ERR_NOT_SYNC:
3846 	case NFS3ERR_TOOSMALL:
3847 	case NFS3ERR_BADTYPE:
3848 		return (EINVAL);
3849 	case NFS3ERR_BAD_COOKIE:
3850 		return (ENOENT);
3851 	case NFS3ERR_SERVERFAULT:
3852 		return (EIO);
3853 	case NFS3ERR_JUKEBOX:
3854 		return (ENXIO);
3855 	default:
3856 		return ((int)status);
3857 	}
3858 #endif
3859 }
3860 
3861 rddir_cache *
3862 rddir_cache_alloc(int flags)
3863 {
3864 	rddir_cache *rc;
3865 
3866 	rc = kmem_alloc(sizeof (*rc), flags);
3867 	if (rc != NULL) {
3868 		rc->entries = NULL;
3869 		rc->flags = RDDIR;
3870 		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3871 		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3872 		rc->count = 1;
3873 #ifdef DEBUG
3874 		atomic_inc_64(&clstat_debug.dirent.value.ui64);
3875 #endif
3876 	}
3877 	return (rc);
3878 }
3879 
3880 static void
3881 rddir_cache_free(rddir_cache *rc)
3882 {
3883 
3884 #ifdef DEBUG
3885 	atomic_dec_64(&clstat_debug.dirent.value.ui64);
3886 #endif
3887 	if (rc->entries != NULL) {
3888 #ifdef DEBUG
3889 		rddir_cache_buf_free(rc->entries, rc->buflen);
3890 #else
3891 		kmem_free(rc->entries, rc->buflen);
3892 #endif
3893 	}
3894 	cv_destroy(&rc->cv);
3895 	mutex_destroy(&rc->lock);
3896 	kmem_free(rc, sizeof (*rc));
3897 }
3898 
3899 void
3900 rddir_cache_hold(rddir_cache *rc)
3901 {
3902 
3903 	mutex_enter(&rc->lock);
3904 	rc->count++;
3905 	mutex_exit(&rc->lock);
3906 }
3907 
3908 void
3909 rddir_cache_rele(rddir_cache *rc)
3910 {
3911 
3912 	mutex_enter(&rc->lock);
3913 	ASSERT(rc->count > 0);
3914 	if (--rc->count == 0) {
3915 		mutex_exit(&rc->lock);
3916 		rddir_cache_free(rc);
3917 	} else
3918 		mutex_exit(&rc->lock);
3919 }
3920 
#ifdef DEBUG
/*
 * DEBUG-only wrapper around kmem_alloc() for readdir cache buffers.
 * A successful allocation adds size to the dirents debug counter.
 * Returns NULL if the allocation fails.
 */
char *
rddir_cache_buf_alloc(size_t size, int flags)
{
	char *buf = kmem_alloc(size, flags);

	if (buf == NULL)
		return (NULL);

	atomic_add_64(&clstat_debug.dirents.value.ui64, size);
	return (buf);
}

/*
 * DEBUG-only counterpart of rddir_cache_buf_alloc(): subtracts size
 * from the dirents debug counter and frees the buffer.
 */
void
rddir_cache_buf_free(void *addr, size_t size)
{

	atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
	kmem_free(addr, size);
}
#endif
3941 
3942 static int
3943 nfs_free_data_reclaim(rnode_t *rp)
3944 {
3945 	char *contents;
3946 	int size;
3947 	vsecattr_t *vsp;
3948 	nfs3_pathconf_info *info;
3949 	int freed;
3950 	cred_t *cred;
3951 
3952 	/*
3953 	 * Free any held credentials and caches which
3954 	 * may be associated with this rnode.
3955 	 */
3956 	mutex_enter(&rp->r_statelock);
3957 	cred = rp->r_cred;
3958 	rp->r_cred = NULL;
3959 	contents = rp->r_symlink.contents;
3960 	size = rp->r_symlink.size;
3961 	rp->r_symlink.contents = NULL;
3962 	vsp = rp->r_secattr;
3963 	rp->r_secattr = NULL;
3964 	info = rp->r_pathconf;
3965 	rp->r_pathconf = NULL;
3966 	mutex_exit(&rp->r_statelock);
3967 
3968 	if (cred != NULL)
3969 		crfree(cred);
3970 
3971 	/*
3972 	 * Free the access cache entries.
3973 	 */
3974 	freed = nfs_access_purge_rp(rp);
3975 
3976 	if (!HAVE_RDDIR_CACHE(rp) &&
3977 	    contents == NULL &&
3978 	    vsp == NULL &&
3979 	    info == NULL)
3980 		return (freed);
3981 
3982 	/*
3983 	 * Free the readdir cache entries
3984 	 */
3985 	if (HAVE_RDDIR_CACHE(rp))
3986 		nfs_purge_rddir_cache(RTOV(rp));
3987 
3988 	/*
3989 	 * Free the symbolic link cache.
3990 	 */
3991 	if (contents != NULL) {
3992 
3993 		kmem_free((void *)contents, size);
3994 	}
3995 
3996 	/*
3997 	 * Free any cached ACL.
3998 	 */
3999 	if (vsp != NULL)
4000 		nfs_acl_free(vsp);
4001 
4002 	/*
4003 	 * Free any cached pathconf information.
4004 	 */
4005 	if (info != NULL)
4006 		kmem_free(info, sizeof (*info));
4007 
4008 	return (1);
4009 }
4010 
4011 static int
4012 nfs_active_data_reclaim(rnode_t *rp)
4013 {
4014 	char *contents;
4015 	int size;
4016 	vsecattr_t *vsp;
4017 	nfs3_pathconf_info *info;
4018 	int freed;
4019 
4020 	/*
4021 	 * Free any held credentials and caches which
4022 	 * may be associated with this rnode.
4023 	 */
4024 	if (!mutex_tryenter(&rp->r_statelock))
4025 		return (0);
4026 	contents = rp->r_symlink.contents;
4027 	size = rp->r_symlink.size;
4028 	rp->r_symlink.contents = NULL;
4029 	vsp = rp->r_secattr;
4030 	rp->r_secattr = NULL;
4031 	info = rp->r_pathconf;
4032 	rp->r_pathconf = NULL;
4033 	mutex_exit(&rp->r_statelock);
4034 
4035 	/*
4036 	 * Free the access cache entries.
4037 	 */
4038 	freed = nfs_access_purge_rp(rp);
4039 
4040 	if (!HAVE_RDDIR_CACHE(rp) &&
4041 	    contents == NULL &&
4042 	    vsp == NULL &&
4043 	    info == NULL)
4044 		return (freed);
4045 
4046 	/*
4047 	 * Free the readdir cache entries
4048 	 */
4049 	if (HAVE_RDDIR_CACHE(rp))
4050 		nfs_purge_rddir_cache(RTOV(rp));
4051 
4052 	/*
4053 	 * Free the symbolic link cache.
4054 	 */
4055 	if (contents != NULL) {
4056 
4057 		kmem_free((void *)contents, size);
4058 	}
4059 
4060 	/*
4061 	 * Free any cached ACL.
4062 	 */
4063 	if (vsp != NULL)
4064 		nfs_acl_free(vsp);
4065 
4066 	/*
4067 	 * Free any cached pathconf information.
4068 	 */
4069 	if (info != NULL)
4070 		kmem_free(info, sizeof (*info));
4071 
4072 	return (1);
4073 }
4074 
4075 static int
4076 nfs_free_reclaim(void)
4077 {
4078 	int freed;
4079 	rnode_t *rp;
4080 
4081 #ifdef DEBUG
4082 	clstat_debug.f_reclaim.value.ui64++;
4083 #endif
4084 	freed = 0;
4085 	mutex_enter(&rpfreelist_lock);
4086 	rp = rpfreelist;
4087 	if (rp != NULL) {
4088 		do {
4089 			if (nfs_free_data_reclaim(rp))
4090 				freed = 1;
4091 		} while ((rp = rp->r_freef) != rpfreelist);
4092 	}
4093 	mutex_exit(&rpfreelist_lock);
4094 	return (freed);
4095 }
4096 
4097 static int
4098 nfs_active_reclaim(void)
4099 {
4100 	int freed;
4101 	int index;
4102 	rnode_t *rp;
4103 
4104 #ifdef DEBUG
4105 	clstat_debug.a_reclaim.value.ui64++;
4106 #endif
4107 	freed = 0;
4108 	for (index = 0; index < rtablesize; index++) {
4109 		rw_enter(&rtable[index].r_lock, RW_READER);
4110 		for (rp = rtable[index].r_hashf;
4111 		    rp != (rnode_t *)(&rtable[index]);
4112 		    rp = rp->r_hashf) {
4113 			if (nfs_active_data_reclaim(rp))
4114 				freed = 1;
4115 		}
4116 		rw_exit(&rtable[index].r_lock);
4117 	}
4118 	return (freed);
4119 }
4120 
4121 static int
4122 nfs_rnode_reclaim(void)
4123 {
4124 	int freed;
4125 	rnode_t *rp;
4126 	vnode_t *vp;
4127 
4128 #ifdef DEBUG
4129 	clstat_debug.r_reclaim.value.ui64++;
4130 #endif
4131 	freed = 0;
4132 	mutex_enter(&rpfreelist_lock);
4133 	while ((rp = rpfreelist) != NULL) {
4134 		rp_rmfree(rp);
4135 		mutex_exit(&rpfreelist_lock);
4136 		if (rp->r_flags & RHASHED) {
4137 			vp = RTOV(rp);
4138 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4139 			mutex_enter(&vp->v_lock);
4140 			if (vp->v_count > 1) {
4141 				VN_RELE_LOCKED(vp);
4142 				mutex_exit(&vp->v_lock);
4143 				rw_exit(&rp->r_hashq->r_lock);
4144 				mutex_enter(&rpfreelist_lock);
4145 				continue;
4146 			}
4147 			mutex_exit(&vp->v_lock);
4148 			rp_rmhash_locked(rp);
4149 			rw_exit(&rp->r_hashq->r_lock);
4150 		}
4151 		/*
4152 		 * This call to rp_addfree will end up destroying the
4153 		 * rnode, but in a safe way with the appropriate set
4154 		 * of checks done.
4155 		 */
4156 		rp_addfree(rp, CRED());
4157 		mutex_enter(&rpfreelist_lock);
4158 	}
4159 	mutex_exit(&rpfreelist_lock);
4160 	return (freed);
4161 }
4162 
/*
 * Reclaim callback registered for rnode_cache.  Three strategies are
 * tried in order, stopping at the first that reports success: release
 * cached data of rnodes on the freelist, release cached data of
 * rnodes on the hash queues, and finally destroy rnodes on the
 * freelist.
 */
/*ARGSUSED*/
static void
nfs_reclaim(void *cdrarg)
{

#ifdef DEBUG
	clstat_debug.reclaim.value.ui64++;
#endif
	if (!nfs_free_reclaim() && !nfs_active_reclaim())
		(void) nfs_rnode_reclaim();
}
4179 
4180 /*
4181  * NFS client failover support
4182  *
4183  * Routines to copy filehandles
4184  */
4185 void
4186 nfscopyfh(caddr_t fhp, vnode_t *vp)
4187 {
4188 	fhandle_t *dest = (fhandle_t *)fhp;
4189 
4190 	if (dest != NULL)
4191 		*dest = *VTOFH(vp);
4192 }
4193 
4194 void
4195 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4196 {
4197 	nfs_fh3 *dest = (nfs_fh3 *)fhp;
4198 
4199 	if (dest != NULL)
4200 		*dest = *VTOFH3(vp);
4201 }
4202 
4203 /*
4204  * NFS client failover support
4205  *
4206  * failover_safe() will test various conditions to ensure that
4207  * failover is permitted for this vnode.  It will be denied
4208  * if:
4209  *	1) the operation in progress does not support failover (NULL fi)
4210  *	2) there are no available replicas (NULL mi_servers->sv_next)
4211  *	3) any locks are outstanding on this file
4212  */
4213 static int
4214 failover_safe(failinfo_t *fi)
4215 {
4216 
4217 	/*
4218 	 * Does this op permit failover?
4219 	 */
4220 	if (fi == NULL || fi->vp == NULL)
4221 		return (0);
4222 
4223 	/*
4224 	 * Are there any alternates to failover to?
4225 	 */
4226 	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4227 		return (0);
4228 
4229 	/*
4230 	 * Disable check; we've forced local locking
4231 	 *
4232 	 * if (flk_has_remote_locks(fi->vp))
4233 	 *	return (0);
4234 	 */
4235 
4236 	/*
4237 	 * If we have no partial path, we can't do anything
4238 	 */
4239 	if (VTOR(fi->vp)->r_path == NULL)
4240 		return (0);
4241 
4242 	return (1);
4243 }
4244 
4245 #include <sys/thread.h>
4246 
4247 /*
4248  * NFS client failover support
4249  *
4250  * failover_newserver() will start a search for a new server,
4251  * preferably by starting an async thread to do the work.  If
4252  * someone is already doing this (recognizable by MI_BINDINPROG
4253  * being set), it will simply return and the calling thread
4254  * will queue on the mi_failover_cv condition variable.
4255  */
4256 static void
4257 failover_newserver(mntinfo_t *mi)
4258 {
4259 	/*
4260 	 * Check if someone else is doing this already
4261 	 */
4262 	mutex_enter(&mi->mi_lock);
4263 	if (mi->mi_flags & MI_BINDINPROG) {
4264 		mutex_exit(&mi->mi_lock);
4265 		return;
4266 	}
4267 	mi->mi_flags |= MI_BINDINPROG;
4268 
4269 	/*
4270 	 * Need to hold the vfs struct so that it can't be released
4271 	 * while the failover thread is selecting a new server.
4272 	 */
4273 	VFS_HOLD(mi->mi_vfsp);
4274 
4275 	/*
4276 	 * Start a thread to do the real searching.
4277 	 */
4278 	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4279 
4280 	mutex_exit(&mi->mi_lock);
4281 }
4282 
4283 /*
4284  * NFS client failover support
4285  *
4286  * failover_thread() will find a new server to replace the one
4287  * currently in use, wake up other threads waiting on this mount
4288  * point, and die.  It will start at the head of the server list
4289  * and poll servers until it finds one with an NFS server which is
4290  * registered and responds to a NULL procedure ping.
4291  *
4292  * XXX failover_thread is unsafe within the scope of the
4293  * present model defined for cpr to suspend the system.
4294  * Specifically, over-the-wire calls made by the thread
4295  * are unsafe. The thread needs to be reevaluated in case of
4296  * future updates to the cpr suspend model.
4297  */
4298 static void
4299 failover_thread(mntinfo_t *mi)
4300 {
4301 	servinfo_t *svp = NULL;
4302 	CLIENT *cl;
4303 	enum clnt_stat status;
4304 	struct timeval tv;
4305 	int error;
4306 	int oncethru = 0;
4307 	callb_cpr_t cprinfo;
4308 	rnode_t *rp;
4309 	int index;
4310 	char *srvnames;
4311 	size_t srvnames_len;
4312 	struct nfs_clnt *nfscl = NULL;
4313 	zoneid_t zoneid = getzoneid();
4314 
4315 #ifdef DEBUG
4316 	/*
4317 	 * This is currently only needed to access counters which exist on
4318 	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4319 	 * on non-DEBUG kernels.
4320 	 */
4321 	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4322 	ASSERT(nfscl != NULL);
4323 #endif
4324 
4325 	/*
4326 	 * Its safe to piggyback on the mi_lock since failover_newserver()
4327 	 * code guarantees that there will be only one failover thread
4328 	 * per mountinfo at any instance.
4329 	 */
4330 	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4331 	    "failover_thread");
4332 
4333 	mutex_enter(&mi->mi_lock);
4334 	while (mi->mi_readers) {
4335 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
4336 		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4337 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4338 	}
4339 	mutex_exit(&mi->mi_lock);
4340 
4341 	tv.tv_sec = 2;
4342 	tv.tv_usec = 0;
4343 
4344 	/*
4345 	 * Ping the null NFS procedure of every server in
4346 	 * the list until one responds.  We always start
4347 	 * at the head of the list and always skip the one
4348 	 * that is current, since it's caused us a problem.
4349 	 */
4350 	while (svp == NULL) {
4351 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4352 			if (!oncethru && svp == mi->mi_curr_serv)
4353 				continue;
4354 
4355 			/*
4356 			 * If the file system was forcibly umounted
4357 			 * while trying to do a failover, then just
4358 			 * give up on the failover.  It won't matter
4359 			 * what the server is.
4360 			 */
4361 			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4362 				svp = NULL;
4363 				goto done;
4364 			}
4365 
4366 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4367 			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4368 			if (error)
4369 				continue;
4370 
4371 			if (!(mi->mi_flags & MI_INT))
4372 				cl->cl_nosignal = TRUE;
4373 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4374 			    xdr_void, NULL, tv);
4375 			if (!(mi->mi_flags & MI_INT))
4376 				cl->cl_nosignal = FALSE;
4377 			AUTH_DESTROY(cl->cl_auth);
4378 			CLNT_DESTROY(cl);
4379 			if (status == RPC_SUCCESS) {
4380 				if (svp == mi->mi_curr_serv) {
4381 #ifdef DEBUG
4382 					zcmn_err(zoneid, CE_NOTE,
4383 			"NFS%d: failing over: selecting original server %s",
4384 					    mi->mi_vers, svp->sv_hostname);
4385 #else
4386 					zcmn_err(zoneid, CE_NOTE,
4387 			"NFS: failing over: selecting original server %s",
4388 					    svp->sv_hostname);
4389 #endif
4390 				} else {
4391 #ifdef DEBUG
4392 					zcmn_err(zoneid, CE_NOTE,
4393 				    "NFS%d: failing over from %s to %s",
4394 					    mi->mi_vers,
4395 					    mi->mi_curr_serv->sv_hostname,
4396 					    svp->sv_hostname);
4397 #else
4398 					zcmn_err(zoneid, CE_NOTE,
4399 				    "NFS: failing over from %s to %s",
4400 					    mi->mi_curr_serv->sv_hostname,
4401 					    svp->sv_hostname);
4402 #endif
4403 				}
4404 				break;
4405 			}
4406 		}
4407 
4408 		if (svp == NULL) {
4409 			if (!oncethru) {
4410 				srvnames = nfs_getsrvnames(mi, &srvnames_len);
4411 #ifdef DEBUG
4412 				zprintf(zoneid,
4413 				    "NFS%d servers %s not responding "
4414 				    "still trying\n", mi->mi_vers, srvnames);
4415 #else
4416 				zprintf(zoneid, "NFS servers %s not responding "
4417 				    "still trying\n", srvnames);
4418 #endif
4419 				oncethru = 1;
4420 			}
4421 			mutex_enter(&mi->mi_lock);
4422 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
4423 			mutex_exit(&mi->mi_lock);
4424 			delay(hz);
4425 			mutex_enter(&mi->mi_lock);
4426 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4427 			mutex_exit(&mi->mi_lock);
4428 		}
4429 	}
4430 
4431 	if (oncethru) {
4432 #ifdef DEBUG
4433 		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4434 #else
4435 		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4436 #endif
4437 	}
4438 
4439 	if (svp != mi->mi_curr_serv) {
4440 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4441 		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4442 		rw_enter(&rtable[index].r_lock, RW_WRITER);
4443 		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4444 		    mi->mi_vfsp);
4445 		if (rp != NULL) {
4446 			if (rp->r_flags & RHASHED)
4447 				rp_rmhash_locked(rp);
4448 			rw_exit(&rtable[index].r_lock);
4449 			rp->r_server = svp;
4450 			rp->r_fh = svp->sv_fhandle;
4451 			(void) nfs_free_data_reclaim(rp);
4452 			index = rtablehash(&rp->r_fh);
4453 			rp->r_hashq = &rtable[index];
4454 			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4455 			vn_exists(RTOV(rp));
4456 			rp_addhash(rp);
4457 			rw_exit(&rp->r_hashq->r_lock);
4458 			VN_RELE(RTOV(rp));
4459 		} else
4460 			rw_exit(&rtable[index].r_lock);
4461 	}
4462 
4463 done:
4464 	if (oncethru)
4465 		kmem_free(srvnames, srvnames_len);
4466 	mutex_enter(&mi->mi_lock);
4467 	mi->mi_flags &= ~MI_BINDINPROG;
4468 	if (svp != NULL) {
4469 		mi->mi_curr_serv = svp;
4470 		mi->mi_failover++;
4471 #ifdef DEBUG
4472 	nfscl->nfscl_stat.failover.value.ui64++;
4473 #endif
4474 	}
4475 	cv_broadcast(&mi->mi_failover_cv);
4476 	CALLB_CPR_EXIT(&cprinfo);
4477 	VFS_RELE(mi->mi_vfsp);
4478 	zthread_exit();
4479 	/* NOTREACHED */
4480 }
4481 
4482 /*
4483  * NFS client failover support
4484  *
4485  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4486  * is cleared, meaning that failover is complete.  Called with
4487  * mi_lock mutex held.
4488  */
4489 static int
4490 failover_wait(mntinfo_t *mi)
4491 {
4492 	k_sigset_t smask;
4493 
4494 	/*
4495 	 * If someone else is hunting for a living server,
4496 	 * sleep until it's done.  After our sleep, we may
4497 	 * be bound to the right server and get off cheaply.
4498 	 */
4499 	while (mi->mi_flags & MI_BINDINPROG) {
4500 		/*
4501 		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4502 		 * and SIGTERM. (Preserving the existing masks).
4503 		 * Mask out SIGINT if mount option nointr is specified.
4504 		 */
4505 		sigintr(&smask, (int)mi->mi_flags & MI_INT);
4506 		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4507 			/*
4508 			 * restore original signal mask
4509 			 */
4510 			sigunintr(&smask);
4511 			return (EINTR);
4512 		}
4513 		/*
4514 		 * restore original signal mask
4515 		 */
4516 		sigunintr(&smask);
4517 	}
4518 	return (0);
4519 }
4520 
/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Returns:
 * - EINVAL if fi, fi->vp or fi->lookupproc is NULL, or if the file
 *   found on the new server differs in size or type from the one we
 *   remember;
 * - the error from VFS_ROOT() or failover_lookup() if either fails;
 * - 0 otherwise.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *    pointed to by the fi->fhp pointer if it is non-NULL (and
 *    fi->copyproc is non-NULL).
 * - On a completed remap, mi->mi_remap is incremented.  The two early
 *   "nothing to do" returns below (same rnode found, or VALID_FH(fi)
 *   already true) return 0 without touching the counter or calling
 *   fi->copyproc.
 */

static int
failover_remap(failinfo_t *fi)
{
	vnode_t *vp, *nvp, *rootvp;
	rnode_t *rp, *nrp;
	mntinfo_t *mi;
	int error;
#ifdef DEBUG
	struct nfs_clnt *nfscl;

	/* Per-zone NFS client state; only used for the DEBUG remap kstat. */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif
	/*
	 * Sanity check
	 */
	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
		return (EINVAL);
	vp = fi->vp;
	rp = VTOR(vp);
	mi = VTOMI(vp);

	/*
	 * A vnode flagged VROOT skips the lookup entirely; only the
	 * counter update and filehandle copy at the bottom are done.
	 */
	if (!(vp->v_flag & VROOT)) {
		/*
		 * Given the root fh, use the path stored in
		 * the rnode to find the fh for the new server.
		 */
		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (error)
			return (error);

		error = failover_lookup(rp->r_path, rootvp,
		    fi->lookupproc, fi->xattrdirproc, &nvp);

		/* The root vnode was only needed as the lookup start. */
		VN_RELE(rootvp);

		if (error)
			return (error);

		/*
		 * If we found the same rnode, we're done now
		 */
		if (nvp == vp) {
			/*
			 * The failed server and the new server may
			 * physically be the same machine OR may share the
			 * same disk subsystem.  In this case the file handle
			 * for a particular file path is not going to change,
			 * so a lookup with the same filehandle will always
			 * locate the same rnode as the existing one.
			 * All we might need to do is to update the r_server
			 * with the current servinfo.
			 */
			if (!VALID_FH(fi)) {
				rp->r_server = mi->mi_curr_serv;
			}
			VN_RELE(nvp);
			return (0);
		}

		/*
		 * Try to make it so that no one else will find this
		 * vnode because it is just a temporary to hold the
		 * new file handle until that file handle can be
		 * copied to the original vnode/rnode.
		 */
		nrp = VTOR(nvp);
		mutex_enter(&mi->mi_remap_lock);
		/*
		 * Some other thread could have raced in here and could
		 * have done the remap for this particular rnode before
		 * this thread here. Check for rp->r_server and
		 * mi->mi_curr_serv and return if they are same.
		 */
		if (VALID_FH(fi)) {
			mutex_exit(&mi->mi_remap_lock);
			VN_RELE(nvp);
			return (0);
		}

		/* Unhash the temporary rnode so lookups cannot find it. */
		if (nrp->r_flags & RHASHED)
			rp_rmhash(nrp);

		/*
		 * As a heuristic check on the validity of the new
		 * file, check that the size and type match against
		 * that we remember from the old version.
		 */
		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
			mutex_exit(&mi->mi_remap_lock);
			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
			    "NFS replicas %s and %s: file %s not same.",
			    rp->r_server->sv_hostname,
			    nrp->r_server->sv_hostname, rp->r_path);
			VN_RELE(nvp);
			return (EINVAL);
		}

		/*
		 * snarf the filehandle from the new rnode
		 * then release it, again while updating the
		 * hash queues for the rnode.
		 */
		if (rp->r_flags & RHASHED)
			rp_rmhash(rp);
		rp->r_server = mi->mi_curr_serv;
		rp->r_fh = nrp->r_fh;
		rp->r_hashq = nrp->r_hashq;
		/*
		 * Copy the attributes from the new rnode to the old
		 * rnode.  This will help to reduce unnecessary page
		 * cache flushes.
		 */
		rp->r_attr = nrp->r_attr;
		rp->r_attrtime = nrp->r_attrtime;
		rp->r_mtime = nrp->r_mtime;
		(void) nfs_free_data_reclaim(rp);
		nfs_setswaplike(vp, &rp->r_attr);
		/*
		 * Re-insert the original rnode on the hash queue it just
		 * inherited from the temporary rnode.
		 */
		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
		rp_addhash(rp);
		rw_exit(&rp->r_hashq->r_lock);
		mutex_exit(&mi->mi_remap_lock);
		VN_RELE(nvp);
	}

	/*
	 * Update successful failover remap count
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_remap++;
	mutex_exit(&mi->mi_lock);
#ifdef DEBUG
	nfscl->nfscl_stat.remap.value.ui64++;
#endif

	/*
	 * If we have a copied filehandle to update, do it now.
	 */
	if (fi->fhp != NULL && fi->copyproc != NULL)
		(*fi->copyproc)(fi->fhp, vp);

	return (0);
}
4675 
4676 /*
4677  * NFS client failover support
4678  *
4679  * We want a simple pathname lookup routine to parse the pieces
4680  * of path in rp->r_path.  We know that the path was a created
4681  * as rnodes were made, so we know we have only to deal with
4682  * paths that look like:
4683  *	dir1/dir2/dir3/file
4684  * Any evidence of anything like .., symlinks, and ENOTDIR
4685  * are hard errors, because they mean something in this filesystem
4686  * is different from the one we came from, or has changed under
4687  * us in some way.  If this is true, we want the failure.
4688  *
4689  * Extended attributes: if the filesystem is mounted with extended
4690  * attributes enabled (-o xattr), the attribute directory will be
4691  * represented in the r_path as the magic name XATTR_RPATH. So if
4692  * we see that name in the pathname, is must be because this node
4693  * is an extended attribute.  Therefore, look it up that way.
4694  */
4695 static int
4696 failover_lookup(char *path, vnode_t *root,
4697     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4698     vnode_t *, cred_t *, int),
4699     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4700     vnode_t **new)
4701 {
4702 	vnode_t *dvp, *nvp;
4703 	int error = EINVAL;
4704 	char *s, *p, *tmppath;
4705 	size_t len;
4706 	mntinfo_t *mi;
4707 	bool_t xattr;
4708 
4709 	/* Make local copy of path */
4710 	len = strlen(path) + 1;
4711 	tmppath = kmem_alloc(len, KM_SLEEP);
4712 	(void) strcpy(tmppath, path);
4713 	s = tmppath;
4714 
4715 	dvp = root;
4716 	VN_HOLD(dvp);
4717 	mi = VTOMI(root);
4718 	xattr = mi->mi_flags & MI_EXTATTR;
4719 
4720 	do {
4721 		p = strchr(s, '/');
4722 		if (p != NULL)
4723 			*p = '\0';
4724 		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4725 			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4726 			    RFSCALL_SOFT);
4727 		} else {
4728 			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4729 			    CRED(), RFSCALL_SOFT);
4730 		}
4731 		if (p != NULL)
4732 			*p++ = '/';
4733 		if (error) {
4734 			VN_RELE(dvp);
4735 			kmem_free(tmppath, len);
4736 			return (error);
4737 		}
4738 		s = p;
4739 		VN_RELE(dvp);
4740 		dvp = nvp;
4741 	} while (p != NULL);
4742 
4743 	if (nvp != NULL && new != NULL)
4744 		*new = nvp;
4745 	kmem_free(tmppath, len);
4746 	return (0);
4747 }
4748 
4749 /*
4750  * NFS client failover support
4751  *
4752  * sv_free() frees the malloc'd portion of a "servinfo_t".
4753  */
4754 void
4755 sv_free(servinfo_t *svp)
4756 {
4757 	servinfo_t *next;
4758 	struct knetconfig *knconf;
4759 
4760 	while (svp != NULL) {
4761 		next = svp->sv_next;
4762 		if (svp->sv_secdata)
4763 			sec_clnt_freeinfo(svp->sv_secdata);
4764 		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4765 			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4766 		knconf = svp->sv_knconf;
4767 		if (knconf != NULL) {
4768 			if (knconf->knc_protofmly != NULL)
4769 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4770 			if (knconf->knc_proto != NULL)
4771 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4772 			kmem_free(knconf, sizeof (*knconf));
4773 		}
4774 		knconf = svp->sv_origknconf;
4775 		if (knconf != NULL) {
4776 			if (knconf->knc_protofmly != NULL)
4777 				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4778 			if (knconf->knc_proto != NULL)
4779 				kmem_free(knconf->knc_proto, KNC_STRSIZE);
4780 			kmem_free(knconf, sizeof (*knconf));
4781 		}
4782 		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4783 			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4784 		mutex_destroy(&svp->sv_lock);
4785 		kmem_free(svp, sizeof (*svp));
4786 		svp = next;
4787 	}
4788 }
4789 
4790 /*
4791  * Only can return non-zero if intr != 0.
4792  */
4793 int
4794 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4795 {
4796 
4797 	mutex_enter(&l->lock);
4798 
4799 	/*
4800 	 * If this is a nested enter, then allow it.  There
4801 	 * must be as many exits as enters through.
4802 	 */
4803 	if (l->owner == curthread) {
4804 		/* lock is held for writing by current thread */
4805 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4806 		l->count--;
4807 	} else if (rw == RW_READER) {
4808 		/*
4809 		 * While there is a writer active or writers waiting,
4810 		 * then wait for them to finish up and move on.  Then,
4811 		 * increment the count to indicate that a reader is
4812 		 * active.
4813 		 */
4814 		while (l->count < 0 || l->waiters > 0) {
4815 			if (intr) {
4816 				klwp_t *lwp = ttolwp(curthread);
4817 
4818 				if (lwp != NULL)
4819 					lwp->lwp_nostop++;
4820 				if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4821 					if (lwp != NULL)
4822 						lwp->lwp_nostop--;
4823 					mutex_exit(&l->lock);
4824 					return (EINTR);
4825 				}
4826 				if (lwp != NULL)
4827 					lwp->lwp_nostop--;
4828 			} else
4829 				cv_wait(&l->cv_rd, &l->lock);
4830 		}
4831 		ASSERT(l->count < INT_MAX);
4832 #ifdef	DEBUG
4833 		if ((l->count % 10000) == 9999)
4834 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on"
4835 			    "rwlock @ %p\n", l->count, (void *)&l);
4836 #endif
4837 		l->count++;
4838 	} else {
4839 		ASSERT(rw == RW_WRITER);
4840 		/*
4841 		 * While there are readers active or a writer
4842 		 * active, then wait for all of the readers
4843 		 * to finish or for the writer to finish.
4844 		 * Then, set the owner field to curthread and
4845 		 * decrement count to indicate that a writer
4846 		 * is active.
4847 		 */
4848 		while (l->count != 0) {
4849 			l->waiters++;
4850 			if (intr) {
4851 				klwp_t *lwp = ttolwp(curthread);
4852 
4853 				if (lwp != NULL)
4854 					lwp->lwp_nostop++;
4855 				if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4856 					if (lwp != NULL)
4857 						lwp->lwp_nostop--;
4858 					l->waiters--;
4859 					/*
4860 					 * If there are readers active and no
4861 					 * writers waiting then wake up all of
4862 					 * the waiting readers (if any).
4863 					 */
4864 					if (l->count > 0 && l->waiters == 0)
4865 						cv_broadcast(&l->cv_rd);
4866 					mutex_exit(&l->lock);
4867 					return (EINTR);
4868 				}
4869 				if (lwp != NULL)
4870 					lwp->lwp_nostop--;
4871 			} else
4872 				cv_wait(&l->cv, &l->lock);
4873 			l->waiters--;
4874 		}
4875 		ASSERT(l->owner == NULL);
4876 		l->owner = curthread;
4877 		l->count--;
4878 	}
4879 
4880 	mutex_exit(&l->lock);
4881 
4882 	return (0);
4883 }
4884 
4885 /*
4886  * If the lock is available, obtain it and return non-zero.  If there is
4887  * already a conflicting lock, return 0 immediately.
4888  */
4889 
4890 int
4891 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4892 {
4893 	mutex_enter(&l->lock);
4894 
4895 	/*
4896 	 * If this is a nested enter, then allow it.  There
4897 	 * must be as many exits as enters through.
4898 	 */
4899 	if (l->owner == curthread) {
4900 		/* lock is held for writing by current thread */
4901 		ASSERT(rw == RW_READER || rw == RW_WRITER);
4902 		l->count--;
4903 	} else if (rw == RW_READER) {
4904 		/*
4905 		 * If there is a writer active or writers waiting, deny the
4906 		 * lock.  Otherwise, bump the count of readers.
4907 		 */
4908 		if (l->count < 0 || l->waiters > 0) {
4909 			mutex_exit(&l->lock);
4910 			return (0);
4911 		}
4912 		l->count++;
4913 	} else {
4914 		ASSERT(rw == RW_WRITER);
4915 		/*
4916 		 * If there are readers active or a writer active, deny the
4917 		 * lock.  Otherwise, set the owner field to curthread and
4918 		 * decrement count to indicate that a writer is active.
4919 		 */
4920 		if (l->count != 0) {
4921 			mutex_exit(&l->lock);
4922 			return (0);
4923 		}
4924 		ASSERT(l->owner == NULL);
4925 		l->owner = curthread;
4926 		l->count--;
4927 	}
4928 
4929 	mutex_exit(&l->lock);
4930 
4931 	return (1);
4932 }
4933 
4934 void
4935 nfs_rw_exit(nfs_rwlock_t *l)
4936 {
4937 
4938 	mutex_enter(&l->lock);
4939 
4940 	if (l->owner != NULL) {
4941 		ASSERT(l->owner == curthread);
4942 
4943 		/*
4944 		 * To release a writer lock increment count to indicate that
4945 		 * there is one less writer active.  If this was the last of
4946 		 * possibly nested writer locks, then clear the owner field as
4947 		 * well to indicate that there is no writer active.
4948 		 */
4949 		ASSERT(l->count < 0);
4950 		l->count++;
4951 		if (l->count == 0) {
4952 			l->owner = NULL;
4953 
4954 			/*
4955 			 * If there are no writers waiting then wakeup all of
4956 			 * the waiting readers (if any).
4957 			 */
4958 			if (l->waiters == 0)
4959 				cv_broadcast(&l->cv_rd);
4960 		}
4961 	} else {
4962 		/*
4963 		 * To release a reader lock just decrement count to indicate
4964 		 * that there is one less reader active.
4965 		 */
4966 		ASSERT(l->count > 0);
4967 		l->count--;
4968 	}
4969 
4970 	/*
4971 	 * If there are no readers active nor a writer active and there is a
4972 	 * writer waiting we need to wake up it.
4973 	 */
4974 	if (l->count == 0 && l->waiters > 0)
4975 		cv_signal(&l->cv);
4976 	mutex_exit(&l->lock);
4977 }
4978 
4979 int
4980 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4981 {
4982 
4983 	if (rw == RW_READER)
4984 		return (l->count > 0);
4985 	ASSERT(rw == RW_WRITER);
4986 	return (l->count < 0);
4987 }
4988 
4989 /* ARGSUSED */
4990 void
4991 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4992 {
4993 
4994 	l->count = 0;
4995 	l->waiters = 0;
4996 	l->owner = NULL;
4997 	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4998 	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4999 	cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
5000 }
5001 
5002 void
5003 nfs_rw_destroy(nfs_rwlock_t *l)
5004 {
5005 
5006 	mutex_destroy(&l->lock);
5007 	cv_destroy(&l->cv);
5008 	cv_destroy(&l->cv_rd);
5009 }
5010 
5011 int
5012 nfs3_rddir_compar(const void *x, const void *y)
5013 {
5014 	rddir_cache *a = (rddir_cache *)x;
5015 	rddir_cache *b = (rddir_cache *)y;
5016 
5017 	if (a->nfs3_cookie == b->nfs3_cookie) {
5018 		if (a->buflen == b->buflen)
5019 			return (0);
5020 		if (a->buflen < b->buflen)
5021 			return (-1);
5022 		return (1);
5023 	}
5024 
5025 	if (a->nfs3_cookie < b->nfs3_cookie)
5026 		return (-1);
5027 
5028 	return (1);
5029 }
5030 
5031 int
5032 nfs_rddir_compar(const void *x, const void *y)
5033 {
5034 	rddir_cache *a = (rddir_cache *)x;
5035 	rddir_cache *b = (rddir_cache *)y;
5036 
5037 	if (a->nfs_cookie == b->nfs_cookie) {
5038 		if (a->buflen == b->buflen)
5039 			return (0);
5040 		if (a->buflen < b->buflen)
5041 			return (-1);
5042 		return (1);
5043 	}
5044 
5045 	if (a->nfs_cookie < b->nfs_cookie)
5046 		return (-1);
5047 
5048 	return (1);
5049 }
5050 
5051 static char *
5052 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
5053 {
5054 	servinfo_t *s;
5055 	char *srvnames;
5056 	char *namep;
5057 	size_t length;
5058 
5059 	/*
5060 	 * Calculate the length of the string required to hold all
5061 	 * of the server names plus either a comma or a null
5062 	 * character following each individual one.
5063 	 */
5064 	length = 0;
5065 	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5066 		length += s->sv_hostnamelen;
5067 
5068 	srvnames = kmem_alloc(length, KM_SLEEP);
5069 
5070 	namep = srvnames;
5071 	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5072 		(void) strcpy(namep, s->sv_hostname);
5073 		namep += s->sv_hostnamelen - 1;
5074 		*namep++ = ',';
5075 	}
5076 	*--namep = '\0';
5077 
5078 	*len = length;
5079 
5080 	return (srvnames);
5081 }
5082 
5083 /*
5084  * These two functions are temporary and designed for the upgrade-workaround
5085  * only.  They cannot be used for general zone-crossing NFS client support, and
5086  * will be removed shortly.
5087  *
5088  * When the workaround is enabled, all NFS traffic is forced into the global
5089  * zone.  These functions are called when the code needs to refer to the state
5090  * of the underlying network connection.  They're not called when the function
5091  * needs to refer to the state of the process that invoked the system call.
5092  * (E.g., when checking whether the zone is shutting down during the mount()
5093  * call.)
5094  */
5095 
5096 struct zone *
5097 nfs_zone(void)
5098 {
5099 	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5100 }
5101 
5102 zoneid_t
5103 nfs_zoneid(void)
5104 {
5105 	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5106 }
5107 
5108 /*
5109  * nfs_mount_label_policy:
5110  *	Determine whether the mount is allowed according to MAC check,
5111  *	by comparing (where appropriate) label of the remote server
5112  *	against the label of the zone being mounted into.
5113  *
5114  *	Returns:
5115  *		 0 :	access allowed
5116  *		-1 :	read-only access allowed (i.e., read-down)
5117  *		>0 :	error code, such as EACCES
5118  */
5119 int
5120 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5121     struct knetconfig *knconf, cred_t *cr)
5122 {
5123 	int		addr_type;
5124 	void		*ipaddr;
5125 	bslabel_t	*server_sl, *mntlabel;
5126 	zone_t		*mntzone = NULL;
5127 	ts_label_t	*zlabel;
5128 	tsol_tpc_t	*tp;
5129 	ts_label_t	*tsl = NULL;
5130 	int		retv;
5131 
5132 	/*
5133 	 * Get the zone's label.  Each zone on a labeled system has a label.
5134 	 */
5135 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5136 	zlabel = mntzone->zone_slabel;
5137 	ASSERT(zlabel != NULL);
5138 	label_hold(zlabel);
5139 
5140 	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5141 		addr_type = IPV4_VERSION;
5142 		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5143 	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5144 		addr_type = IPV6_VERSION;
5145 		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5146 	} else {
5147 		retv = 0;
5148 		goto out;
5149 	}
5150 
5151 	retv = EACCES;				/* assume the worst */
5152 
5153 	/*
5154 	 * Next, get the assigned label of the remote server.
5155 	 */
5156 	tp = find_tpc(ipaddr, addr_type, B_FALSE);
5157 	if (tp == NULL)
5158 		goto out;			/* error getting host entry */
5159 
5160 	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5161 		goto rel_tpc;			/* invalid domain */
5162 	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5163 	    (tp->tpc_tp.host_type != UNLABELED))
5164 		goto rel_tpc;			/* invalid hosttype */
5165 
5166 	if (tp->tpc_tp.host_type == SUN_CIPSO) {
5167 		tsl = getflabel_cipso(vfsp);
5168 		if (tsl == NULL)
5169 			goto rel_tpc;		/* error getting server lbl */
5170 
5171 		server_sl = label2bslabel(tsl);
5172 	} else {	/* UNLABELED */
5173 		server_sl = &tp->tpc_tp.tp_def_label;
5174 	}
5175 
5176 	mntlabel = label2bslabel(zlabel);
5177 
5178 	/*
5179 	 * Now compare labels to complete the MAC check.  If the labels
5180 	 * are equal or if the requestor is in the global zone and has
5181 	 * NET_MAC_AWARE, then allow read-write access.   (Except for
5182 	 * mounts into the global zone itself; restrict these to
5183 	 * read-only.)
5184 	 *
5185 	 * If the requestor is in some other zone, but their label
5186 	 * dominates the server, then allow read-down.
5187 	 *
5188 	 * Otherwise, access is denied.
5189 	 */
5190 	if (blequal(mntlabel, server_sl) ||
5191 	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
5192 	    getpflags(NET_MAC_AWARE, cr) != 0)) {
5193 		if ((mntzone == global_zone) ||
5194 		    !blequal(mntlabel, server_sl))
5195 			retv = -1;		/* read-only */
5196 		else
5197 			retv = 0;		/* access OK */
5198 	} else if (bldominates(mntlabel, server_sl)) {
5199 		retv = -1;			/* read-only */
5200 	} else {
5201 		retv = EACCES;
5202 	}
5203 
5204 	if (tsl != NULL)
5205 		label_rele(tsl);
5206 
5207 rel_tpc:
5208 	TPC_RELE(tp);
5209 out:
5210 	if (mntzone)
5211 		zone_rele(mntzone);
5212 	label_rele(zlabel);
5213 	return (retv);
5214 }
5215 
5216 boolean_t
5217 nfs_has_ctty(void)
5218 {
5219 	boolean_t rv;
5220 	mutex_enter(&curproc->p_splock);
5221 	rv = (curproc->p_sessp->s_vp != NULL);
5222 	mutex_exit(&curproc->p_splock);
5223 	return (rv);
5224 }
5225 
5226 /*
5227  * See if xattr directory to see if it has any generic user attributes
5228  */
5229 int
5230 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5231 {
5232 	struct uio uio;
5233 	struct iovec iov;
5234 	char *dbuf;
5235 	struct dirent64 *dp;
5236 	size_t dlen = 8 * 1024;
5237 	size_t dbuflen;
5238 	int eof = 0;
5239 	int error;
5240 
5241 	*valp = 0;
5242 	dbuf = kmem_alloc(dlen, KM_SLEEP);
5243 	uio.uio_iov = &iov;
5244 	uio.uio_iovcnt = 1;
5245 	uio.uio_segflg = UIO_SYSSPACE;
5246 	uio.uio_fmode = 0;
5247 	uio.uio_extflg = UIO_COPY_CACHED;
5248 	uio.uio_loffset = 0;
5249 	uio.uio_resid = dlen;
5250 	iov.iov_base = dbuf;
5251 	iov.iov_len = dlen;
5252 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5253 	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5254 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5255 
5256 	dbuflen = dlen - uio.uio_resid;
5257 
5258 	if (error || dbuflen == 0) {
5259 		kmem_free(dbuf, dlen);
5260 		return (error);
5261 	}
5262 
5263 	dp = (dirent64_t *)dbuf;
5264 
5265 	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5266 		if (strcmp(dp->d_name, ".") == 0 ||
5267 		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5268 		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5269 		    VIEW_READONLY) == 0) {
5270 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5271 			continue;
5272 		}
5273 
5274 		*valp = 1;
5275 		break;
5276 	}
5277 	kmem_free(dbuf, dlen);
5278 	return (0);
5279 }
5280